Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_x2_s32f_interleave_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
62 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
63 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
64 
65 #include <inttypes.h>
66 #include <stdio.h>
67 #include <volk/volk_common.h>
68 
69 #ifdef LV_HAVE_AVX2
70 #include <immintrin.h>
71 
72 static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
73  const float* iBuffer,
74  const float* qBuffer,
75  const float scalar,
76  unsigned int num_points)
77 {
78  unsigned int number = 0;
79  const float* iBufferPtr = iBuffer;
80  const float* qBufferPtr = qBuffer;
81 
82  __m256 vScalar = _mm256_set1_ps(scalar);
83 
84  const unsigned int eighthPoints = num_points / 8;
85 
86  __m256 iValue, qValue, cplxValue1, cplxValue2;
87  __m256i intValue1, intValue2;
88 
89  int16_t* complexVectorPtr = (int16_t*)complexVector;
90 
91  for (; number < eighthPoints; number++) {
92  iValue = _mm256_load_ps(iBufferPtr);
93  qValue = _mm256_load_ps(qBufferPtr);
94 
95  // Interleaves the lower two values in the i and q variables into one buffer
96  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
97  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
98 
99  // Interleaves the upper two values in the i and q variables into one buffer
100  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
101  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
102 
103  intValue1 = _mm256_cvtps_epi32(cplxValue1);
104  intValue2 = _mm256_cvtps_epi32(cplxValue2);
105 
106  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
107 
108  _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
109  complexVectorPtr += 16;
110 
111  iBufferPtr += 8;
112  qBufferPtr += 8;
113  }
114 
115  number = eighthPoints * 8;
116  complexVectorPtr = (int16_t*)(&complexVector[number]);
117  for (; number < num_points; number++) {
118  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
119  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
120  }
121 }
122 #endif /* LV_HAVE_AVX2 */
123 
124 
125 #ifdef LV_HAVE_SSE2
126 #include <emmintrin.h>
127 
128 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
129  const float* iBuffer,
130  const float* qBuffer,
131  const float scalar,
132  unsigned int num_points)
133 {
134  unsigned int number = 0;
135  const float* iBufferPtr = iBuffer;
136  const float* qBufferPtr = qBuffer;
137 
138  __m128 vScalar = _mm_set_ps1(scalar);
139 
140  const unsigned int quarterPoints = num_points / 4;
141 
142  __m128 iValue, qValue, cplxValue1, cplxValue2;
143  __m128i intValue1, intValue2;
144 
145  int16_t* complexVectorPtr = (int16_t*)complexVector;
146 
147  for (; number < quarterPoints; number++) {
148  iValue = _mm_load_ps(iBufferPtr);
149  qValue = _mm_load_ps(qBufferPtr);
150 
151  // Interleaves the lower two values in the i and q variables into one buffer
152  cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
153  cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
154 
155  // Interleaves the upper two values in the i and q variables into one buffer
156  cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
157  cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
158 
159  intValue1 = _mm_cvtps_epi32(cplxValue1);
160  intValue2 = _mm_cvtps_epi32(cplxValue2);
161 
162  intValue1 = _mm_packs_epi32(intValue1, intValue2);
163 
164  _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
165  complexVectorPtr += 8;
166 
167  iBufferPtr += 4;
168  qBufferPtr += 4;
169  }
170 
171  number = quarterPoints * 4;
172  complexVectorPtr = (int16_t*)(&complexVector[number]);
173  for (; number < num_points; number++) {
174  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
175  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
176  }
177 }
178 #endif /* LV_HAVE_SSE2 */
179 
180 
181 #ifdef LV_HAVE_SSE
182 #include <xmmintrin.h>
183 
184 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
185  const float* iBuffer,
186  const float* qBuffer,
187  const float scalar,
188  unsigned int num_points)
189 {
190  unsigned int number = 0;
191  const float* iBufferPtr = iBuffer;
192  const float* qBufferPtr = qBuffer;
193 
194  __m128 vScalar = _mm_set_ps1(scalar);
195 
196  const unsigned int quarterPoints = num_points / 4;
197 
198  __m128 iValue, qValue, cplxValue;
199 
200  int16_t* complexVectorPtr = (int16_t*)complexVector;
201 
202  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
203 
204  for (; number < quarterPoints; number++) {
205  iValue = _mm_load_ps(iBufferPtr);
206  qValue = _mm_load_ps(qBufferPtr);
207 
208  // Interleaves the lower two values in the i and q variables into one buffer
209  cplxValue = _mm_unpacklo_ps(iValue, qValue);
210  cplxValue = _mm_mul_ps(cplxValue, vScalar);
211 
212  _mm_store_ps(floatBuffer, cplxValue);
213 
214  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
215  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
216  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
217  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
218 
219  // Interleaves the upper two values in the i and q variables into one buffer
220  cplxValue = _mm_unpackhi_ps(iValue, qValue);
221  cplxValue = _mm_mul_ps(cplxValue, vScalar);
222 
223  _mm_store_ps(floatBuffer, cplxValue);
224 
225  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
226  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
227  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
228  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
229 
230  iBufferPtr += 4;
231  qBufferPtr += 4;
232  }
233 
234  number = quarterPoints * 4;
235  complexVectorPtr = (int16_t*)(&complexVector[number]);
236  for (; number < num_points; number++) {
237  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
238  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
239  }
240 }
241 #endif /* LV_HAVE_SSE */
242 
243 
244 #ifdef LV_HAVE_GENERIC
245 
246 static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
247  const float* iBuffer,
248  const float* qBuffer,
249  const float scalar,
250  unsigned int num_points)
251 {
252  int16_t* complexVectorPtr = (int16_t*)complexVector;
253  const float* iBufferPtr = iBuffer;
254  const float* qBufferPtr = qBuffer;
255  unsigned int number = 0;
256 
257  for (number = 0; number < num_points; number++) {
258  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
259  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
260  }
261 }
262 #endif /* LV_HAVE_GENERIC */
263 
264 
265 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
266 
267 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
268 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
269 
270 #include <inttypes.h>
271 #include <stdio.h>
272 #include <volk/volk_common.h>
273 
274 #ifdef LV_HAVE_AVX2
275 #include <immintrin.h>
276 
277 static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
278  const float* iBuffer,
279  const float* qBuffer,
280  const float scalar,
281  unsigned int num_points)
282 {
283  unsigned int number = 0;
284  const float* iBufferPtr = iBuffer;
285  const float* qBufferPtr = qBuffer;
286 
287  __m256 vScalar = _mm256_set1_ps(scalar);
288 
289  const unsigned int eighthPoints = num_points / 8;
290 
291  __m256 iValue, qValue, cplxValue1, cplxValue2;
292  __m256i intValue1, intValue2;
293 
294  int16_t* complexVectorPtr = (int16_t*)complexVector;
295 
296  for (; number < eighthPoints; number++) {
297  iValue = _mm256_loadu_ps(iBufferPtr);
298  qValue = _mm256_loadu_ps(qBufferPtr);
299 
300  // Interleaves the lower two values in the i and q variables into one buffer
301  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
302  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
303 
304  // Interleaves the upper two values in the i and q variables into one buffer
305  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
306  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
307 
308  intValue1 = _mm256_cvtps_epi32(cplxValue1);
309  intValue2 = _mm256_cvtps_epi32(cplxValue2);
310 
311  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
312 
313  _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
314  complexVectorPtr += 16;
315 
316  iBufferPtr += 8;
317  qBufferPtr += 8;
318  }
319 
320  number = eighthPoints * 8;
321  complexVectorPtr = (int16_t*)(&complexVector[number]);
322  for (; number < num_points; number++) {
323  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
324  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
325  }
326 }
327 #endif /* LV_HAVE_AVX2 */
328 
329 #ifdef LV_HAVE_NEON
330 #include <arm_neon.h>
331 
332 static inline void volk_32f_x2_s32f_interleave_16ic_neon(lv_16sc_t* complexVector,
333  const float* iBuffer,
334  const float* qBuffer,
335  const float scalar,
336  unsigned int num_points)
337 {
338  unsigned int number = 0;
339  const unsigned int quarter_points = num_points / 4;
340 
341  const float* iBufferPtr = iBuffer;
342  const float* qBufferPtr = qBuffer;
343  int16_t* complexVectorPtr = (int16_t*)complexVector;
344 
345  float32x4_t vScalar = vdupq_n_f32(scalar);
346  float32x4_t half = vdupq_n_f32(0.5f);
347  float32x4_t neg_half = vdupq_n_f32(-0.5f);
348  float32x4_t zero = vdupq_n_f32(0.0f);
349 
350  for (; number < quarter_points; number++) {
351  float32x4_t iValue = vld1q_f32(iBufferPtr);
352  float32x4_t qValue = vld1q_f32(qBufferPtr);
353 
354  iValue = vmulq_f32(iValue, vScalar);
355  qValue = vmulq_f32(qValue, vScalar);
356 
357  // Round to nearest: add copysign(0.5, x) before truncating
358  uint32x4_t iNeg = vcltq_f32(iValue, zero);
359  uint32x4_t qNeg = vcltq_f32(qValue, zero);
360  iValue = vaddq_f32(iValue, vbslq_f32(iNeg, neg_half, half));
361  qValue = vaddq_f32(qValue, vbslq_f32(qNeg, neg_half, half));
362 
363  int32x4_t iInt = vcvtq_s32_f32(iValue);
364  int32x4_t qInt = vcvtq_s32_f32(qValue);
365 
366  int16x4_t iShort = vqmovn_s32(iInt);
367  int16x4_t qShort = vqmovn_s32(qInt);
368 
369  int16x4x2_t interleaved;
370  interleaved.val[0] = iShort;
371  interleaved.val[1] = qShort;
372  vst2_s16(complexVectorPtr, interleaved);
373 
374  complexVectorPtr += 8;
375  iBufferPtr += 4;
376  qBufferPtr += 4;
377  }
378 
379  number = quarter_points * 4;
380  complexVectorPtr = (int16_t*)(&complexVector[number]);
381  for (; number < num_points; number++) {
382  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
383  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
384  }
385 }
386 #endif /* LV_HAVE_NEON */
387 
388 #ifdef LV_HAVE_NEONV8
389 #include <arm_neon.h>
390 
391 static inline void volk_32f_x2_s32f_interleave_16ic_neonv8(lv_16sc_t* complexVector,
392  const float* iBuffer,
393  const float* qBuffer,
394  const float scalar,
395  unsigned int num_points)
396 {
397  unsigned int number = 0;
398  const unsigned int eighth_points = num_points / 8;
399 
400  const float* iBufferPtr = iBuffer;
401  const float* qBufferPtr = qBuffer;
402  int16_t* complexVectorPtr = (int16_t*)complexVector;
403 
404  float32x4_t vScalar = vdupq_n_f32(scalar);
405 
406  for (; number < eighth_points; number++) {
407  float32x4_t iValue0 = vld1q_f32(iBufferPtr);
408  float32x4_t iValue1 = vld1q_f32(iBufferPtr + 4);
409  float32x4_t qValue0 = vld1q_f32(qBufferPtr);
410  float32x4_t qValue1 = vld1q_f32(qBufferPtr + 4);
411  __VOLK_PREFETCH(iBufferPtr + 8);
412  __VOLK_PREFETCH(qBufferPtr + 8);
413 
414  iValue0 = vmulq_f32(iValue0, vScalar);
415  iValue1 = vmulq_f32(iValue1, vScalar);
416  qValue0 = vmulq_f32(qValue0, vScalar);
417  qValue1 = vmulq_f32(qValue1, vScalar);
418 
419  int32x4_t iInt0 = vcvtnq_s32_f32(iValue0);
420  int32x4_t iInt1 = vcvtnq_s32_f32(iValue1);
421  int32x4_t qInt0 = vcvtnq_s32_f32(qValue0);
422  int32x4_t qInt1 = vcvtnq_s32_f32(qValue1);
423 
424  int16x4_t iShort0 = vqmovn_s32(iInt0);
425  int16x4_t iShort1 = vqmovn_s32(iInt1);
426  int16x4_t qShort0 = vqmovn_s32(qInt0);
427  int16x4_t qShort1 = vqmovn_s32(qInt1);
428 
429  int16x4x2_t interleaved0, interleaved1;
430  interleaved0.val[0] = iShort0;
431  interleaved0.val[1] = qShort0;
432  interleaved1.val[0] = iShort1;
433  interleaved1.val[1] = qShort1;
434 
435  vst2_s16(complexVectorPtr, interleaved0);
436  vst2_s16(complexVectorPtr + 8, interleaved1);
437 
438  complexVectorPtr += 16;
439  iBufferPtr += 8;
440  qBufferPtr += 8;
441  }
442 
443  number = eighth_points * 8;
444  complexVectorPtr = (int16_t*)(&complexVector[number]);
445  for (; number < num_points; number++) {
446  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
447  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
448  }
449 }
450 #endif /* LV_HAVE_NEONV8 */
451 
452 #ifdef LV_HAVE_RVV
453 #include <riscv_vector.h>
454 
455 static inline void volk_32f_x2_s32f_interleave_16ic_rvv(lv_16sc_t* complexVector,
456  const float* iBuffer,
457  const float* qBuffer,
458  const float scalar,
459  unsigned int num_points)
460 {
461  uint32_t* out = (uint32_t*)complexVector;
462  size_t n = num_points;
463  for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
464  vl = __riscv_vsetvl_e32m8(n);
465  vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
466  vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
467  vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
468  vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
469  vuint16m4_t vr = __riscv_vreinterpret_u16m4(vri);
470  vuint16m4_t vi = __riscv_vreinterpret_u16m4(vii);
471  vuint32m8_t vc = __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFF, vi, vl);
472  __riscv_vse32(out, vc, vl);
473  }
474 }
475 #endif /*LV_HAVE_RVV*/
476 
477 #ifdef LV_HAVE_RVVSEG
478 #include <riscv_vector.h>
479 
480 static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(lv_16sc_t* complexVector,
481  const float* iBuffer,
482  const float* qBuffer,
483  const float scalar,
484  unsigned int num_points)
485 {
486  size_t n = num_points;
487  for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
488  vl = __riscv_vsetvl_e32m8(n);
489  vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
490  vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
491  vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
492  vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
493  __riscv_vsseg2e16(
494  (int16_t*)complexVector, __riscv_vcreate_v_i16m4x2(vri, vii), vl);
495  }
496 }
497 #endif /*LV_HAVE_RVVSEG*/
498 
499 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
volk_32f_x2_s32f_interleave_16ic_generic
static void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:246
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_x2_s32f_interleave_16ic_a_sse
static void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:184
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_common.h
volk_32f_x2_s32f_interleave_16ic_a_sse2
static void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:128
volk_32f_x2_s32f_interleave_16ic_neon
static void volk_32f_x2_s32f_interleave_16ic_neon(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:332
rintf
static float rintf(float x)
Definition: config.h:45