Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_magnitude_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
61 #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
62 
63 #include <inttypes.h>
64 #include <math.h>
65 #include <stdio.h>
66 #include <volk/volk_common.h>
67 
68 #ifdef LV_HAVE_GENERIC
69 
70 static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
71  const lv_32fc_t* complexVector,
72  const float scalar,
73  unsigned int num_points)
74 {
75  const float* complexVectorPtr = (float*)complexVector;
76  int16_t* magnitudeVectorPtr = magnitudeVector;
77  unsigned int number = 0;
78  for (number = 0; number < num_points; number++) {
79  float real = *complexVectorPtr++;
80  float imag = *complexVectorPtr++;
81  *magnitudeVectorPtr++ =
82  (int16_t)rintf(scalar * sqrtf((real * real) + (imag * imag)));
83  }
84 }
85 #endif /* LV_HAVE_GENERIC */
86 
87 #ifdef LV_HAVE_AVX2
88 #include <immintrin.h>
89 
90 static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
91  const lv_32fc_t* complexVector,
92  const float scalar,
93  unsigned int num_points)
94 {
95  unsigned int number = 0;
96  const unsigned int eighthPoints = num_points / 8;
97 
98  const float* complexVectorPtr = (const float*)complexVector;
99  int16_t* magnitudeVectorPtr = magnitudeVector;
100 
101  __m256 vScalar = _mm256_set1_ps(scalar);
102  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
103  __m256 cplxValue1, cplxValue2, result;
104  __m256i resultInt;
105  __m128i resultShort;
106 
107  for (; number < eighthPoints; number++) {
108  cplxValue1 = _mm256_load_ps(complexVectorPtr);
109  complexVectorPtr += 8;
110 
111  cplxValue2 = _mm256_load_ps(complexVectorPtr);
112  complexVectorPtr += 8;
113 
114  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
115  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
116 
117  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
118 
119  result = _mm256_sqrt_ps(result);
120 
121  result = _mm256_mul_ps(result, vScalar);
122 
123  resultInt = _mm256_cvtps_epi32(result);
124  resultInt = _mm256_packs_epi32(resultInt, resultInt);
125  resultInt = _mm256_permutevar8x32_epi32(
126  resultInt, idx); // permute to compensate for shuffling in hadd and packs
127  resultShort = _mm256_extracti128_si256(resultInt, 0);
128  _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
129  magnitudeVectorPtr += 8;
130  }
131 
132  number = eighthPoints * 8;
134  magnitudeVector + number, complexVector + number, scalar, num_points - number);
135 }
136 #endif /* LV_HAVE_AVX2 */
137 
138 #ifdef LV_HAVE_SSE3
139 #include <pmmintrin.h>
140 
141 static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
142  const lv_32fc_t* complexVector,
143  const float scalar,
144  unsigned int num_points)
145 {
146  unsigned int number = 0;
147  const unsigned int quarterPoints = num_points / 4;
148 
149  const float* complexVectorPtr = (const float*)complexVector;
150  int16_t* magnitudeVectorPtr = magnitudeVector;
151 
152  __m128 vScalar = _mm_set_ps1(scalar);
153 
154  __m128 cplxValue1, cplxValue2, result;
155 
156  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
157 
158  for (; number < quarterPoints; number++) {
159  cplxValue1 = _mm_load_ps(complexVectorPtr);
160  complexVectorPtr += 4;
161 
162  cplxValue2 = _mm_load_ps(complexVectorPtr);
163  complexVectorPtr += 4;
164 
165  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
166  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
167 
168  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
169 
170  result = _mm_sqrt_ps(result);
171 
172  result = _mm_mul_ps(result, vScalar);
173 
174  _mm_store_ps(floatBuffer, result);
175  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
176  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
177  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
178  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
179  }
180 
181  number = quarterPoints * 4;
183  magnitudeVector + number, complexVector + number, scalar, num_points - number);
184 }
185 #endif /* LV_HAVE_SSE3 */
186 
187 
188 #ifdef LV_HAVE_SSE
189 #include <xmmintrin.h>
190 
191 static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
192  const lv_32fc_t* complexVector,
193  const float scalar,
194  unsigned int num_points)
195 {
196  unsigned int number = 0;
197  const unsigned int quarterPoints = num_points / 4;
198 
199  const float* complexVectorPtr = (const float*)complexVector;
200  int16_t* magnitudeVectorPtr = magnitudeVector;
201 
202  __m128 vScalar = _mm_set_ps1(scalar);
203 
204  __m128 cplxValue1, cplxValue2, result;
205  __m128 iValue, qValue;
206 
207  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
208 
209  for (; number < quarterPoints; number++) {
210  cplxValue1 = _mm_load_ps(complexVectorPtr);
211  complexVectorPtr += 4;
212 
213  cplxValue2 = _mm_load_ps(complexVectorPtr);
214  complexVectorPtr += 4;
215 
216  // Arrange in i1i2i3i4 format
217  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
218  // Arrange in q1q2q3q4 format
219  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
220 
221  __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values
222  __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values
223 
224  result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
225 
226  result = _mm_sqrt_ps(result);
227 
228  result = _mm_mul_ps(result, vScalar);
229 
230  _mm_store_ps(floatBuffer, result);
231  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
232  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
233  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
234  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
235  }
236 
237  number = quarterPoints * 4;
239  magnitudeVector + number, complexVector + number, scalar, num_points - number);
240 }
241 #endif /* LV_HAVE_SSE */
242 
243 
244 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
245 
246 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
247 #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
248 
249 #include <inttypes.h>
250 #include <math.h>
251 #include <stdio.h>
252 #include <volk/volk_common.h>
253 
254 #ifdef LV_HAVE_AVX2
255 #include <immintrin.h>
256 
257 static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
258  const lv_32fc_t* complexVector,
259  const float scalar,
260  unsigned int num_points)
261 {
262  unsigned int number = 0;
263  const unsigned int eighthPoints = num_points / 8;
264 
265  const float* complexVectorPtr = (const float*)complexVector;
266  int16_t* magnitudeVectorPtr = magnitudeVector;
267 
268  __m256 vScalar = _mm256_set1_ps(scalar);
269  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
270  __m256 cplxValue1, cplxValue2, result;
271  __m256i resultInt;
272  __m128i resultShort;
273 
274  for (; number < eighthPoints; number++) {
275  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
276  complexVectorPtr += 8;
277 
278  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
279  complexVectorPtr += 8;
280 
281  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
282  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
283 
284  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
285 
286  result = _mm256_sqrt_ps(result);
287 
288  result = _mm256_mul_ps(result, vScalar);
289 
290  resultInt = _mm256_cvtps_epi32(result);
291  resultInt = _mm256_packs_epi32(resultInt, resultInt);
292  resultInt = _mm256_permutevar8x32_epi32(
293  resultInt, idx); // permute to compensate for shuffling in hadd and packs
294  resultShort = _mm256_extracti128_si256(resultInt, 0);
295  _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
296  magnitudeVectorPtr += 8;
297  }
298 
299  number = eighthPoints * 8;
301  magnitudeVector + number, complexVector + number, scalar, num_points - number);
302 }
303 #endif /* LV_HAVE_AVX2 */
304 
305 #ifdef LV_HAVE_NEON
306 #include <arm_neon.h>
307 
308 static inline void volk_32fc_s32f_magnitude_16i_neon(int16_t* magnitudeVector,
309  const lv_32fc_t* complexVector,
310  const float scalar,
311  unsigned int num_points)
312 {
313  unsigned int number = 0;
314  const unsigned int quarter_points = num_points / 4;
315 
316  const float* complexVectorPtr = (const float*)complexVector;
317  int16_t* magnitudeVectorPtr = magnitudeVector;
318  float32x4_t vScalar = vdupq_n_f32(scalar);
319 
320  float32x4_t half = vdupq_n_f32(0.5f);
321 
322  for (; number < quarter_points; number++) {
323  float32x4x2_t input = vld2q_f32(complexVectorPtr);
324  complexVectorPtr += 8;
325 
326  float32x4_t realSquared = vmulq_f32(input.val[0], input.val[0]);
327  float32x4_t imagSquared = vmulq_f32(input.val[1], input.val[1]);
328  float32x4_t sumSquared = vaddq_f32(realSquared, imagSquared);
329 
330  /* Use reciprocal square root estimate with Newton-Raphson refinement */
331  float32x4_t rsqrt = vrsqrteq_f32(sumSquared);
332  rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
333  rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
334  float32x4_t magnitude = vmulq_f32(sumSquared, rsqrt);
335 
336  /* Handle zero case */
337  uint32x4_t zero_mask = vceqq_f32(sumSquared, vdupq_n_f32(0.0f));
338  magnitude = vbslq_f32(zero_mask, sumSquared, magnitude);
339 
340  // Magnitude is always non-negative, so just add 0.5 for rounding
341  float32x4_t scaled = vaddq_f32(vmulq_f32(magnitude, vScalar), half);
342  int32x4_t intVal = vcvtq_s32_f32(scaled);
343  int16x4_t shortVal = vqmovn_s32(intVal);
344 
345  vst1_s16(magnitudeVectorPtr, shortVal);
346  magnitudeVectorPtr += 4;
347  }
348 
349  number = quarter_points * 4;
350  for (; number < num_points; number++) {
351  float real = *complexVectorPtr++;
352  float imag = *complexVectorPtr++;
353  *magnitudeVectorPtr++ =
354  (int16_t)rintf(scalar * sqrtf((real * real) + (imag * imag)));
355  }
356 }
357 #endif /* LV_HAVE_NEON */
358 
359 #ifdef LV_HAVE_NEONV8
360 #include <arm_neon.h>
361 
362 static inline void volk_32fc_s32f_magnitude_16i_neonv8(int16_t* magnitudeVector,
363  const lv_32fc_t* complexVector,
364  const float scalar,
365  unsigned int num_points)
366 {
367  unsigned int number = 0;
368  const unsigned int eighth_points = num_points / 8;
369 
370  const float* complexVectorPtr = (const float*)complexVector;
371  int16_t* magnitudeVectorPtr = magnitudeVector;
372  float32x4_t vScalar = vdupq_n_f32(scalar);
373 
374  for (; number < eighth_points; number++) {
375  float32x4x2_t input0 = vld2q_f32(complexVectorPtr);
376  float32x4x2_t input1 = vld2q_f32(complexVectorPtr + 8);
377  complexVectorPtr += 16;
378  __VOLK_PREFETCH(complexVectorPtr + 16);
379 
380  float32x4_t sumSquared0 = vfmaq_f32(
381  vmulq_f32(input0.val[1], input0.val[1]), input0.val[0], input0.val[0]);
382  float32x4_t sumSquared1 = vfmaq_f32(
383  vmulq_f32(input1.val[1], input1.val[1]), input1.val[0], input1.val[0]);
384 
385  float32x4_t magnitude0 = vsqrtq_f32(sumSquared0);
386  float32x4_t magnitude1 = vsqrtq_f32(sumSquared1);
387 
388  float32x4_t scaled0 = vmulq_f32(magnitude0, vScalar);
389  float32x4_t scaled1 = vmulq_f32(magnitude1, vScalar);
390 
391  int32x4_t intVal0 = vcvtnq_s32_f32(scaled0);
392  int32x4_t intVal1 = vcvtnq_s32_f32(scaled1);
393 
394  int16x4_t shortVal0 = vqmovn_s32(intVal0);
395  int16x4_t shortVal1 = vqmovn_s32(intVal1);
396 
397  vst1_s16(magnitudeVectorPtr, shortVal0);
398  vst1_s16(magnitudeVectorPtr + 4, shortVal1);
399  magnitudeVectorPtr += 8;
400  }
401 
402  number = eighth_points * 8;
403  for (; number < num_points; number++) {
404  float real = *complexVectorPtr++;
405  float imag = *complexVectorPtr++;
406  *magnitudeVectorPtr++ =
407  (int16_t)rintf(scalar * sqrtf((real * real) + (imag * imag)));
408  }
409 }
410 #endif /* LV_HAVE_NEONV8 */
411 
412 #ifdef LV_HAVE_RVV
413 #include <riscv_vector.h>
414 
415 static inline void volk_32fc_s32f_magnitude_16i_rvv(int16_t* magnitudeVector,
416  const lv_32fc_t* complexVector,
417  const float scalar,
418  unsigned int num_points)
419 {
420  size_t n = num_points;
421  for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
422  vl = __riscv_vsetvl_e32m4(n);
423  vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
424  vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
425  vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
426  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
427  v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl);
428  __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl);
429  }
430 }
431 #endif /*LV_HAVE_RVV*/
432 
433 #ifdef LV_HAVE_RVVSEG
434 #include <riscv_vector.h>
435 
436 static inline void volk_32fc_s32f_magnitude_16i_rvvseg(int16_t* magnitudeVector,
437  const lv_32fc_t* complexVector,
438  const float scalar,
439  unsigned int num_points)
440 {
441  size_t n = num_points;
442  for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
443  vl = __riscv_vsetvl_e32m4(n);
444  vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
445  vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
446  vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
447  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
448  v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl);
449  __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl);
450  }
451 }
452 #endif /*LV_HAVE_RVVSEG*/
453 
454 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_common.h
volk_32fc_s32f_magnitude_16i_generic
static void volk_32fc_s32f_magnitude_16i_generic(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:70
volk_32fc_s32f_magnitude_16i_a_sse
static void volk_32fc_s32f_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:191
volk_32fc_s32f_magnitude_16i_a_sse3
static void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:141
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
bit128::f
float f[4]
Definition: volk_common.h:120
volk_32fc_s32f_magnitude_16i_neon
static void volk_32fc_s32f_magnitude_16i_neon(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:308
rintf
static float rintf(float x)
Definition: config.h:45