Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_sqrt_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53 #define INCLUDED_volk_32f_sqrt_32f_a_H
54 
55 #include <inttypes.h>
56 #include <math.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_GENERIC
60 
61 static inline void
62 volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
63 {
64  float* cPtr = cVector;
65  const float* aPtr = aVector;
66  unsigned int number = 0;
67 
68  for (number = 0; number < num_points; number++) {
69  *cPtr++ = sqrtf(*aPtr++);
70  }
71 }
72 
73 #endif /* LV_HAVE_GENERIC */
74 
75 
76 #ifdef LV_HAVE_SSE
77 #include <xmmintrin.h>
78 
79 static inline void
80 volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
81 {
82  unsigned int number = 0;
83  const unsigned int quarterPoints = num_points / 4;
84 
85  float* cPtr = cVector;
86  const float* aPtr = aVector;
87 
88  __m128 aVal, cVal;
89  for (; number < quarterPoints; number++) {
90  aVal = _mm_load_ps(aPtr);
91 
92  cVal = _mm_sqrt_ps(aVal);
93 
94  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
95 
96  aPtr += 4;
97  cPtr += 4;
98  }
99 
100  number = quarterPoints * 4;
101  for (; number < num_points; number++) {
102  *cPtr++ = sqrtf(*aPtr++);
103  }
104 }
105 
106 #endif /* LV_HAVE_SSE */
107 
108 #if LV_HAVE_AVX512F
109 #include <immintrin.h>
110 
111 static inline void
112 volk_32f_sqrt_32f_a_avx512(float* cVector, const float* aVector, unsigned int num_points)
113 {
114  unsigned int number = 0;
115  const unsigned int sixteenthPoints = num_points / 16;
116 
117  float* cPtr = cVector;
118  const float* aPtr = aVector;
119 
120  __m512 aVal, cVal;
121  for (; number < sixteenthPoints; number++) {
122  aVal = _mm512_load_ps(aPtr);
123  cVal = _mm512_sqrt_ps(aVal);
124  _mm512_store_ps(cPtr, cVal);
125 
126  aPtr += 16;
127  cPtr += 16;
128  }
129 
130  number = sixteenthPoints * 16;
131  for (; number < num_points; number++) {
132  *cPtr++ = sqrtf(*aPtr++);
133  }
134 }
135 
136 #endif /* LV_HAVE_AVX512F */
137 
138 
139 #ifdef LV_HAVE_AVX2
140 #include <immintrin.h>
141 
142 static inline void
143 volk_32f_sqrt_32f_a_avx2(float* cVector, const float* aVector, unsigned int num_points)
144 {
145  unsigned int number = 0;
146  const unsigned int eighthPoints = num_points / 8;
147 
148  float* cPtr = cVector;
149  const float* aPtr = aVector;
150 
151  __m256 aVal, cVal;
152  for (; number < eighthPoints; number++) {
153  aVal = _mm256_load_ps(aPtr);
154  cVal = _mm256_sqrt_ps(aVal);
155  _mm256_store_ps(cPtr, cVal);
156 
157  aPtr += 8;
158  cPtr += 8;
159  }
160 
161  number = eighthPoints * 8;
162  for (; number < num_points; number++) {
163  *cPtr++ = sqrtf(*aPtr++);
164  }
165 }
166 
167 #endif /* LV_HAVE_AVX2 */
168 
169 
170 #ifdef LV_HAVE_AVX
171 #include <immintrin.h>
172 
173 static inline void
174 volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
175 {
176  unsigned int number = 0;
177  const unsigned int eighthPoints = num_points / 8;
178 
179  float* cPtr = cVector;
180  const float* aPtr = aVector;
181 
182  __m256 aVal, cVal;
183  for (; number < eighthPoints; number++) {
184  aVal = _mm256_load_ps(aPtr);
185 
186  cVal = _mm256_sqrt_ps(aVal);
187 
188  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
189 
190  aPtr += 8;
191  cPtr += 8;
192  }
193 
194  number = eighthPoints * 8;
195  for (; number < num_points; number++) {
196  *cPtr++ = sqrtf(*aPtr++);
197  }
198 }
199 
200 #endif /* LV_HAVE_AVX */
201 
202 
203 #ifdef LV_HAVE_NEON
204 #include <arm_neon.h>
205 
/*!
 * \brief ARMv7 NEON kernel: element-wise single-precision square root.
 *
 * \param cVector    Output buffer; receives num_points values.
 * \param aVector    Input buffer; num_points values are read.
 * \param num_points Number of elements to process.
 *
 * ARMv7 NEON has no vector square-root instruction, so full groups of four
 * are computed as x * (1 / sqrt(x)). The reciprocal square root starts from
 * the hardware estimate (vrsqrteq_f32), which is only a coarse
 * approximation, and is sharpened with two Newton-Raphson steps
 * (vrsqrtsq_f32). This keeps the vector results close to what the sqrtf()
 * tail loop produces instead of differing visibly depending on an element's
 * position in the buffer. Vector results are not guaranteed to be
 * bit-identical to sqrtf().
 *
 * The remaining (num_points % 4) elements are computed with sqrtf().
 */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    const float32x4_t zero_vec = vdupq_n_f32(0.0f);
    const float32x4_t inf_vec = vdupq_n_f32(INFINITY);

    for (number = 0; number < quarter_points; number++) {
        const float32x4_t in_vec = vld1q_f32(aPtr);

        // note that armv8 has vsqrt_f32 which will be much better
        float32x4_t rsqrt_vec = vrsqrteq_f32(in_vec);

        // Newton-Raphson: r <- r * (3 - (x * r) * r) / 2. Multiplying x * r
        // first keeps the intermediate near sqrt(x), so it neither overflows
        // nor underflows for finite, non-zero inputs.
        rsqrt_vec = vmulq_f32(rsqrt_vec,
                              vrsqrtsq_f32(vmulq_f32(in_vec, rsqrt_vec), rsqrt_vec));
        rsqrt_vec = vmulq_f32(rsqrt_vec,
                              vrsqrtsq_f32(vmulq_f32(in_vec, rsqrt_vec), rsqrt_vec));

        // sqrt(x) = x * (1 / sqrt(x))
        const float32x4_t sqrt_vec = vmulq_f32(in_vec, rsqrt_vec);

        // For x == 0 and x == +inf the product above is 0 * inf = NaN. In
        // both cases sqrt(x) == x, so pass the input through for those lanes.
        const uint32x4_t passthrough =
            vorrq_u32(vceqq_f32(in_vec, zero_vec), vceqq_f32(in_vec, inf_vec));
        const float32x4_t out_vec = vbslq_f32(passthrough, in_vec, sqrt_vec);

        vst1q_f32(cPtr, out_vec);
        aPtr += 4;
        cPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
228 
229 #endif /* LV_HAVE_NEON */
230 
231 #ifdef LV_HAVE_NEONV8
232 #include <arm_neon.h>
233 
/*!
 * \brief ARMv8 NEON kernel: element-wise single-precision square root.
 *
 * \param cVector    Output buffer; receives num_points values.
 * \param aVector    Input buffer; num_points values are read.
 * \param num_points Number of elements to process.
 *
 * Full groups of four floats use the vsqrtq_f32 intrinsic; the remaining
 * (num_points % 4) elements are computed with sqrtf().
 */
static inline void
volk_32f_sqrt_32f_neonv8(float* cVector, const float* aVector, unsigned int num_points)
{
    /* Index of the first element that does not belong to a full group of 4. */
    const unsigned int vector_end = (num_points / 4) * 4;
    unsigned int i;

    for (i = 0; i < vector_end; i += 4) {
        vst1q_f32(cVector + i, vsqrtq_f32(vld1q_f32(aVector + i)));
    }

    /* Scalar clean-up for the remaining elements. */
    for (; i < num_points; i++) {
        cVector[i] = sqrtf(aVector[i]);
    }
}
254 
255 #endif /* LV_HAVE_NEONV8 */
256 
257 #endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
258 
259 #ifndef INCLUDED_volk_32f_sqrt_32f_u_H
260 #define INCLUDED_volk_32f_sqrt_32f_u_H
261 
262 #include <inttypes.h>
263 #include <math.h>
264 #include <stdio.h>
265 
266 #ifdef LV_HAVE_SSE
267 #include <xmmintrin.h>
268 
269 static inline void
270 volk_32f_sqrt_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
271 {
272  unsigned int number = 0;
273  const unsigned int quarterPoints = num_points / 4;
274 
275  float* cPtr = cVector;
276  const float* aPtr = aVector;
277 
278  __m128 aVal, cVal;
279  for (; number < quarterPoints; number++) {
280  aVal = _mm_loadu_ps(aPtr);
281  cVal = _mm_sqrt_ps(aVal);
282  _mm_storeu_ps(cPtr, cVal);
283 
284  aPtr += 4;
285  cPtr += 4;
286  }
287 
288  number = quarterPoints * 4;
289  for (; number < num_points; number++) {
290  *cPtr++ = sqrtf(*aPtr++);
291  }
292 }
293 
294 #endif /* LV_HAVE_SSE */
295 
296 
297 #if LV_HAVE_AVX512F
298 #include <immintrin.h>
299 
300 static inline void
301 volk_32f_sqrt_32f_u_avx512(float* cVector, const float* aVector, unsigned int num_points)
302 {
303  unsigned int number = 0;
304  const unsigned int sixteenthPoints = num_points / 16;
305 
306  float* cPtr = cVector;
307  const float* aPtr = aVector;
308 
309  __m512 aVal, cVal;
310  for (; number < sixteenthPoints; number++) {
311  aVal = _mm512_loadu_ps(aPtr);
312  cVal = _mm512_sqrt_ps(aVal);
313  _mm512_storeu_ps(cPtr, cVal);
314 
315  aPtr += 16;
316  cPtr += 16;
317  }
318 
319  number = sixteenthPoints * 16;
320  for (; number < num_points; number++) {
321  *cPtr++ = sqrtf(*aPtr++);
322  }
323 }
324 
325 #endif /* LV_HAVE_AVX512F */
326 
327 
328 #ifdef LV_HAVE_AVX2
329 #include <immintrin.h>
330 
331 static inline void
332 volk_32f_sqrt_32f_u_avx2(float* cVector, const float* aVector, unsigned int num_points)
333 {
334  unsigned int number = 0;
335  const unsigned int eighthPoints = num_points / 8;
336 
337  float* cPtr = cVector;
338  const float* aPtr = aVector;
339 
340  __m256 aVal, cVal;
341  for (; number < eighthPoints; number++) {
342  aVal = _mm256_loadu_ps(aPtr);
343  cVal = _mm256_sqrt_ps(aVal);
344  _mm256_storeu_ps(cPtr, cVal);
345 
346  aPtr += 8;
347  cPtr += 8;
348  }
349 
350  number = eighthPoints * 8;
351  for (; number < num_points; number++) {
352  *cPtr++ = sqrtf(*aPtr++);
353  }
354 }
355 
356 #endif /* LV_HAVE_AVX2 */
357 
358 
359 #ifdef LV_HAVE_AVX
360 #include <immintrin.h>
361 
362 static inline void
363 volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
364 {
365  unsigned int number = 0;
366  const unsigned int eighthPoints = num_points / 8;
367 
368  float* cPtr = cVector;
369  const float* aPtr = aVector;
370 
371  __m256 aVal, cVal;
372  for (; number < eighthPoints; number++) {
373  aVal = _mm256_loadu_ps(aPtr);
374 
375  cVal = _mm256_sqrt_ps(aVal);
376 
377  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
378 
379  aPtr += 8;
380  cPtr += 8;
381  }
382 
383  number = eighthPoints * 8;
384  for (; number < num_points; number++) {
385  *cPtr++ = sqrtf(*aPtr++);
386  }
387 }
388 
389 #endif /* LV_HAVE_AVX */
390 
391 #ifdef LV_HAVE_RVV
392 #include <riscv_vector.h>
393 
/*!
 * \brief RISC-V Vector kernel: element-wise single-precision square root.
 *
 * \param cVector    Output buffer; receives num_points values.
 * \param aVector    Input buffer; num_points values are read.
 * \param num_points Number of elements to process.
 *
 * Each pass asks __riscv_vsetvl_e32m8 how many elements to handle, processes
 * exactly that many, and advances both buffers by the same amount until
 * nothing is left. There is no separate scalar tail loop.
 */
static inline void
volk_32f_sqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vfloat32m8_t input = __riscv_vle32_v_f32m8(aVector, vl);
        const vfloat32m8_t roots = __riscv_vfsqrt(input, vl);
        __riscv_vse32(cVector, roots, vl);

        remaining -= vl;
        aVector += vl;
        cVector += vl;
    }
}
404 #endif /*LV_HAVE_RVV*/
405 
406 #endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
volk_32f_sqrt_32f_a_avx
static void volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:174
volk_32f_sqrt_32f_u_sse
static void volk_32f_sqrt_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:270
volk_32f_sqrt_32f_u_avx
static void volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:363
volk_32f_sqrt_32f_a_sse
static void volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:80
volk_32f_sqrt_32f_neon
static void volk_32f_sqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:207
volk_32f_sqrt_32f_generic
static void volk_32f_sqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:62