Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_magnitude_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
42 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
43 #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
44 
45 #include <inttypes.h>
46 #include <math.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
54  const lv_16sc_t* complexVector,
55  const float scalar,
56  unsigned int num_points)
57 {
58  unsigned int number = 0;
59  const unsigned int eighthPoints = num_points / 8;
60 
61  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
62  float* magnitudeVectorPtr = magnitudeVector;
63 
64  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
65 
66  __m256 cplxValue1, cplxValue2, result;
67  __m256i int1, int2;
68  __m128i short1, short2;
69  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
70 
71  for (; number < eighthPoints; number++) {
72 
73  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
74  complexVectorPtr += 16;
75  short1 = _mm256_extracti128_si256(int1, 0);
76  short2 = _mm256_extracti128_si256(int1, 1);
77 
78  int1 = _mm256_cvtepi16_epi32(short1);
79  int2 = _mm256_cvtepi16_epi32(short2);
80  cplxValue1 = _mm256_cvtepi32_ps(int1);
81  cplxValue2 = _mm256_cvtepi32_ps(int2);
82 
83  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
84  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
85 
86  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
87  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
88 
89  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
90  result = _mm256_permutevar8x32_ps(result, idx);
91 
92  result = _mm256_sqrt_ps(result); // Square root the values
93 
94  _mm256_store_ps(magnitudeVectorPtr, result);
95 
96  magnitudeVectorPtr += 8;
97  }
98 
99  number = eighthPoints * 8;
100  magnitudeVectorPtr = &magnitudeVector[number];
101  complexVectorPtr = (const int16_t*)&complexVector[number];
102  for (; number < num_points; number++) {
103  float val1Real = (float)(*complexVectorPtr++) / scalar;
104  float val1Imag = (float)(*complexVectorPtr++) / scalar;
105  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
106  }
107 }
108 #endif /* LV_HAVE_AVX2 */
109 
110 
111 #ifdef LV_HAVE_SSE3
112 #include <pmmintrin.h>
113 
114 static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
115  const lv_16sc_t* complexVector,
116  const float scalar,
117  unsigned int num_points)
118 {
119  unsigned int number = 0;
120  const unsigned int quarterPoints = num_points / 4;
121 
122  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
123  float* magnitudeVectorPtr = magnitudeVector;
124 
125  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
126 
127  __m128 cplxValue1, cplxValue2, result;
128 
129  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
130 
131  for (; number < quarterPoints; number++) {
132 
133  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
134  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
135  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
136  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
137 
138  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
139  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
140  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
141  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
142 
143  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
144  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
145 
146  complexVectorPtr += 8;
147 
148  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
149  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
150 
151  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
152  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
153 
154  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
155 
156  result = _mm_sqrt_ps(result); // Square root the values
157 
158  _mm_store_ps(magnitudeVectorPtr, result);
159 
160  magnitudeVectorPtr += 4;
161  }
162 
163  number = quarterPoints * 4;
164  magnitudeVectorPtr = &magnitudeVector[number];
165  complexVectorPtr = (const int16_t*)&complexVector[number];
166  for (; number < num_points; number++) {
167  float val1Real = (float)(*complexVectorPtr++) / scalar;
168  float val1Imag = (float)(*complexVectorPtr++) / scalar;
169  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
170  }
171 }
172 #endif /* LV_HAVE_SSE3 */
173 
174 #ifdef LV_HAVE_SSE
175 #include <xmmintrin.h>
176 
177 static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
178  const lv_16sc_t* complexVector,
179  const float scalar,
180  unsigned int num_points)
181 {
182  unsigned int number = 0;
183  const unsigned int quarterPoints = num_points / 4;
184 
185  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
186  float* magnitudeVectorPtr = magnitudeVector;
187 
188  const float iScalar = 1.0 / scalar;
189  __m128 invScalar = _mm_set_ps1(iScalar);
190 
191  __m128 cplxValue1, cplxValue2, result, re, im;
192 
193  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
194 
195  for (; number < quarterPoints; number++) {
196  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
197  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
198  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
199  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
200 
201  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
202  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
203  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
204  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
205 
206  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
207  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
208 
209  re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
210  im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
211 
212  complexVectorPtr += 8;
213 
214  cplxValue1 = _mm_mul_ps(re, invScalar);
215  cplxValue2 = _mm_mul_ps(im, invScalar);
216 
217  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
218  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
219 
220  result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
221 
222  result = _mm_sqrt_ps(result); // Square root the values
223 
224  _mm_store_ps(magnitudeVectorPtr, result);
225 
226  magnitudeVectorPtr += 4;
227  }
228 
229  number = quarterPoints * 4;
230  magnitudeVectorPtr = &magnitudeVector[number];
231  complexVectorPtr = (const int16_t*)&complexVector[number];
232  for (; number < num_points; number++) {
233  float val1Real = (float)(*complexVectorPtr++) * iScalar;
234  float val1Imag = (float)(*complexVectorPtr++) * iScalar;
235  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
236  }
237 }
238 
239 
240 #endif /* LV_HAVE_SSE */
241 
242 #ifdef LV_HAVE_GENERIC
243 
244 static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
245  const lv_16sc_t* complexVector,
246  const float scalar,
247  unsigned int num_points)
248 {
249  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
250  float* magnitudeVectorPtr = magnitudeVector;
251  unsigned int number = 0;
252  const float invScalar = 1.0 / scalar;
253  for (number = 0; number < num_points; number++) {
254  float real = ((float)(*complexVectorPtr++)) * invScalar;
255  float imag = ((float)(*complexVectorPtr++)) * invScalar;
256  *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
257  }
258 }
259 #endif /* LV_HAVE_GENERIC */
260 
261 
262 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
263 
264 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
265 #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
266 
267 #include <inttypes.h>
268 #include <math.h>
269 #include <stdio.h>
270 #include <volk/volk_common.h>
271 
272 #ifdef LV_HAVE_AVX2
273 #include <immintrin.h>
274 
275 static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
276  const lv_16sc_t* complexVector,
277  const float scalar,
278  unsigned int num_points)
279 {
280  unsigned int number = 0;
281  const unsigned int eighthPoints = num_points / 8;
282 
283  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
284  float* magnitudeVectorPtr = magnitudeVector;
285 
286  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
287 
288  __m256 cplxValue1, cplxValue2, result;
289  __m256i int1, int2;
290  __m128i short1, short2;
291  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
292 
293  for (; number < eighthPoints; number++) {
294 
295  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
296  complexVectorPtr += 16;
297  short1 = _mm256_extracti128_si256(int1, 0);
298  short2 = _mm256_extracti128_si256(int1, 1);
299 
300  int1 = _mm256_cvtepi16_epi32(short1);
301  int2 = _mm256_cvtepi16_epi32(short2);
302  cplxValue1 = _mm256_cvtepi32_ps(int1);
303  cplxValue2 = _mm256_cvtepi32_ps(int2);
304 
305  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
306  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
307 
308  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
309  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
310 
311  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
312  result = _mm256_permutevar8x32_ps(result, idx);
313 
314  result = _mm256_sqrt_ps(result); // Square root the values
315 
316  _mm256_storeu_ps(magnitudeVectorPtr, result);
317 
318  magnitudeVectorPtr += 8;
319  }
320 
321  number = eighthPoints * 8;
322  magnitudeVectorPtr = &magnitudeVector[number];
323  complexVectorPtr = (const int16_t*)&complexVector[number];
324  for (; number < num_points; number++) {
325  float val1Real = (float)(*complexVectorPtr++) / scalar;
326  float val1Imag = (float)(*complexVectorPtr++) / scalar;
327  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
328  }
329 }
330 #endif /* LV_HAVE_AVX2 */
331 
332 #ifdef LV_HAVE_NEON
333 #include <arm_neon.h>
334 
335 static inline void volk_16ic_s32f_magnitude_32f_neon(float* magnitudeVector,
336  const lv_16sc_t* complexVector,
337  const float scalar,
338  unsigned int num_points)
339 {
340  unsigned int number = 0;
341  const unsigned int quarter_points = num_points / 4;
342 
343  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
344  float* magnitudeVectorPtr = magnitudeVector;
345  const float invScalar = 1.0f / scalar;
346  float32x4_t vInvScalar = vdupq_n_f32(invScalar);
347 
348  for (; number < quarter_points; number++) {
349  int16x4x2_t input = vld2_s16(complexVectorPtr);
350  complexVectorPtr += 8;
351 
352  int32x4_t realInt = vmovl_s16(input.val[0]);
353  int32x4_t imagInt = vmovl_s16(input.val[1]);
354 
355  float32x4_t realFloat = vcvtq_f32_s32(realInt);
356  float32x4_t imagFloat = vcvtq_f32_s32(imagInt);
357 
358  realFloat = vmulq_f32(realFloat, vInvScalar);
359  imagFloat = vmulq_f32(imagFloat, vInvScalar);
360 
361  float32x4_t realSquared = vmulq_f32(realFloat, realFloat);
362  float32x4_t imagSquared = vmulq_f32(imagFloat, imagFloat);
363  float32x4_t sumSquared = vaddq_f32(realSquared, imagSquared);
364 
365  /* Use reciprocal square root estimate with Newton-Raphson refinement */
366  float32x4_t rsqrt = vrsqrteq_f32(sumSquared);
367  rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
368  rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
369  float32x4_t result = vmulq_f32(sumSquared, rsqrt);
370 
371  /* Handle zero case - if sumSquared is 0, result should be 0 */
372  uint32x4_t zero_mask = vceqq_f32(sumSquared, vdupq_n_f32(0.0f));
373  result = vbslq_f32(zero_mask, sumSquared, result);
374 
375  vst1q_f32(magnitudeVectorPtr, result);
376  magnitudeVectorPtr += 4;
377  }
378 
379  number = quarter_points * 4;
380  complexVectorPtr = (const int16_t*)&complexVector[number];
381  for (; number < num_points; number++) {
382  float real = ((float)(*complexVectorPtr++)) * invScalar;
383  float imag = ((float)(*complexVectorPtr++)) * invScalar;
384  *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
385  }
386 }
387 #endif /* LV_HAVE_NEON */
388 
389 #ifdef LV_HAVE_NEONV8
390 #include <arm_neon.h>
391 
392 static inline void volk_16ic_s32f_magnitude_32f_neonv8(float* magnitudeVector,
393  const lv_16sc_t* complexVector,
394  const float scalar,
395  unsigned int num_points)
396 {
397  unsigned int number = 0;
398  const unsigned int eighth_points = num_points / 8;
399 
400  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
401  float* magnitudeVectorPtr = magnitudeVector;
402  const float invScalar = 1.0f / scalar;
403  float32x4_t vInvScalar = vdupq_n_f32(invScalar);
404 
405  for (; number < eighth_points; number++) {
406  int16x8x2_t input = vld2q_s16(complexVectorPtr);
407  complexVectorPtr += 16;
408  __VOLK_PREFETCH(complexVectorPtr + 16);
409 
410  /* First 4 elements */
411  int32x4_t realInt0 = vmovl_s16(vget_low_s16(input.val[0]));
412  int32x4_t imagInt0 = vmovl_s16(vget_low_s16(input.val[1]));
413 
414  float32x4_t realFloat0 = vcvtq_f32_s32(realInt0);
415  float32x4_t imagFloat0 = vcvtq_f32_s32(imagInt0);
416 
417  realFloat0 = vmulq_f32(realFloat0, vInvScalar);
418  imagFloat0 = vmulq_f32(imagFloat0, vInvScalar);
419 
420  float32x4_t sumSquared0 =
421  vfmaq_f32(vmulq_f32(imagFloat0, imagFloat0), realFloat0, realFloat0);
422  float32x4_t result0 = vsqrtq_f32(sumSquared0);
423 
424  /* Second 4 elements */
425  int32x4_t realInt1 = vmovl_s16(vget_high_s16(input.val[0]));
426  int32x4_t imagInt1 = vmovl_s16(vget_high_s16(input.val[1]));
427 
428  float32x4_t realFloat1 = vcvtq_f32_s32(realInt1);
429  float32x4_t imagFloat1 = vcvtq_f32_s32(imagInt1);
430 
431  realFloat1 = vmulq_f32(realFloat1, vInvScalar);
432  imagFloat1 = vmulq_f32(imagFloat1, vInvScalar);
433 
434  float32x4_t sumSquared1 =
435  vfmaq_f32(vmulq_f32(imagFloat1, imagFloat1), realFloat1, realFloat1);
436  float32x4_t result1 = vsqrtq_f32(sumSquared1);
437 
438  vst1q_f32(magnitudeVectorPtr, result0);
439  vst1q_f32(magnitudeVectorPtr + 4, result1);
440  magnitudeVectorPtr += 8;
441  }
442 
443  number = eighth_points * 8;
444  complexVectorPtr = (const int16_t*)&complexVector[number];
445  for (; number < num_points; number++) {
446  float real = ((float)(*complexVectorPtr++)) * invScalar;
447  float imag = ((float)(*complexVectorPtr++)) * invScalar;
448  *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
449  }
450 }
451 #endif /* LV_HAVE_NEONV8 */
452 
453 #ifdef LV_HAVE_RVV
454 #include <riscv_vector.h>
455 
456 static inline void volk_16ic_s32f_magnitude_32f_rvv(float* magnitudeVector,
457  const lv_16sc_t* complexVector,
458  const float scalar,
459  unsigned int num_points)
460 {
461  size_t n = num_points;
462  for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
463  vl = __riscv_vsetvl_e16m4(n);
464  vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl);
465  vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
466  vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
467  vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
468  vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
469  vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
470  __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
471  }
472 }
473 #endif /*LV_HAVE_RVV*/
474 
475 #ifdef LV_HAVE_RVVSEG
476 #include <riscv_vector.h>
477 
478 static inline void volk_16ic_s32f_magnitude_32f_rvvseg(float* magnitudeVector,
479  const lv_16sc_t* complexVector,
480  const float scalar,
481  unsigned int num_points)
482 {
483  size_t n = num_points;
484  for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
485  vl = __riscv_vsetvl_e16m4(n);
486  vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl);
487  vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
488  vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
489  vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
490  vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
491  vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
492  __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
493  }
494 }
495 #endif /*LV_HAVE_RVVSEG*/
496 
497 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
volk_16ic_s32f_magnitude_32f_a_sse3
static void volk_16ic_s32f_magnitude_32f_a_sse3(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:114
volk_16ic_s32f_magnitude_32f_generic
static void volk_16ic_s32f_magnitude_32f_generic(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:244
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_common.h
volk_16ic_s32f_magnitude_32f_neon
static void volk_16ic_s32f_magnitude_32f_neon(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:335
volk_16ic_s32f_magnitude_32f_a_sse
static void volk_16ic_s32f_magnitude_32f_a_sse(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:177