Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
43 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
44 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
45 
46 #include <inttypes.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void
54 volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
55  float* qBuffer,
56  const lv_16sc_t* complexVector,
57  const float scalar,
58  unsigned int num_points)
59 {
60  float* iBufferPtr = iBuffer;
61  float* qBufferPtr = qBuffer;
62 
63  uint64_t number = 0;
64  const uint64_t eighthPoints = num_points / 8;
65  __m256 cplxValue1, cplxValue2, iValue, qValue;
66  __m256i cplxValueA, cplxValueB;
67  __m128i cplxValue128;
68 
69  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
70  int16_t* complexVectorPtr = (int16_t*)complexVector;
71  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
72 
73  for (; number < eighthPoints; number++) {
74 
75  cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
76  complexVectorPtr += 16;
77 
78  // cvt
79  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
80  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
81  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
82  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
83  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
84  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
85 
86  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
87  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
88 
89  // Arrange in i1i2i3i4 format
90  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
91  iValue = _mm256_permutevar8x32_ps(iValue, idx);
92  // Arrange in q1q2q3q4 format
93  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
94  qValue = _mm256_permutevar8x32_ps(qValue, idx);
95 
96  _mm256_store_ps(iBufferPtr, iValue);
97  _mm256_store_ps(qBufferPtr, qValue);
98 
99  iBufferPtr += 8;
100  qBufferPtr += 8;
101  }
102 
103  number = eighthPoints * 8;
104  complexVectorPtr = (int16_t*)&complexVector[number];
105  for (; number < num_points; number++) {
106  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
107  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
108  }
109 }
110 #endif /* LV_HAVE_AVX2 */
111 
112 #ifdef LV_HAVE_SSE
113 #include <xmmintrin.h>
114 
115 static inline void
117  float* qBuffer,
118  const lv_16sc_t* complexVector,
119  const float scalar,
120  unsigned int num_points)
121 {
122  float* iBufferPtr = iBuffer;
123  float* qBufferPtr = qBuffer;
124 
125  uint64_t number = 0;
126  const uint64_t quarterPoints = num_points / 4;
127  __m128 cplxValue1, cplxValue2, iValue, qValue;
128 
129  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
130  int16_t* complexVectorPtr = (int16_t*)complexVector;
131 
132  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
133 
134  for (; number < quarterPoints; number++) {
135 
136  floatBuffer[0] = (float)(complexVectorPtr[0]);
137  floatBuffer[1] = (float)(complexVectorPtr[1]);
138  floatBuffer[2] = (float)(complexVectorPtr[2]);
139  floatBuffer[3] = (float)(complexVectorPtr[3]);
140 
141  floatBuffer[4] = (float)(complexVectorPtr[4]);
142  floatBuffer[5] = (float)(complexVectorPtr[5]);
143  floatBuffer[6] = (float)(complexVectorPtr[6]);
144  floatBuffer[7] = (float)(complexVectorPtr[7]);
145 
146  cplxValue1 = _mm_load_ps(&floatBuffer[0]);
147  cplxValue2 = _mm_load_ps(&floatBuffer[4]);
148 
149  complexVectorPtr += 8;
150 
151  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
152  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
153 
154  // Arrange in i1i2i3i4 format
155  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
156  // Arrange in q1q2q3q4 format
157  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
158 
159  _mm_store_ps(iBufferPtr, iValue);
160  _mm_store_ps(qBufferPtr, qValue);
161 
162  iBufferPtr += 4;
163  qBufferPtr += 4;
164  }
165 
166  number = quarterPoints * 4;
167  complexVectorPtr = (int16_t*)&complexVector[number];
168  for (; number < num_points; number++) {
169  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
170  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
171  }
172 }
173 #endif /* LV_HAVE_SSE */
174 
175 #ifdef LV_HAVE_GENERIC
176 
177 static inline void
179  float* qBuffer,
180  const lv_16sc_t* complexVector,
181  const float scalar,
182  unsigned int num_points)
183 {
184  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
185  float* iBufferPtr = iBuffer;
186  float* qBufferPtr = qBuffer;
187  unsigned int number;
188  for (number = 0; number < num_points; number++) {
189  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
190  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
191  }
192 }
193 #endif /* LV_HAVE_GENERIC */
194 
195 #ifdef LV_HAVE_NEON
196 #include <arm_neon.h>
/*
 * NEON implementation: splits interleaved 16-bit complex samples into
 * separate I and Q float buffers, scaling every component by 1/scalar.
 *
 * iBuffer       - output: num_points in-phase values
 * qBuffer       - output: num_points quadrature values
 * complexVector - input: num_points interleaved int16 I/Q pairs (read-only)
 * scalar        - divisor applied to every component
 * num_points    - number of complex samples to convert
 */
static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
                                                           float* qBuffer,
                                                           const lv_16sc_t* complexVector,
                                                           const float scalar,
                                                           unsigned int num_points)
{
    const int16_t* src = (const int16_t*)complexVector;
    float* iDst = iBuffer;
    float* qDst = qBuffer;
    /* The vector loop consumes four complex samples per pass. */
    const unsigned int quarter_points = num_points / 4;
    const float reciprocal = 1.f / scalar;
    const float32x4_t vReciprocal = vdupq_n_f32(reciprocal);
    unsigned int n;

    for (n = 0; n < quarter_points; n++) {
        /* vld2 deinterleaves on load: val[0] holds four I, val[1] four Q. */
        const int16x4x2_t iq_s16 = vld2_s16(src);

        /* Widen int16 -> int32, then convert to float. */
        const float32x4_t i_f32 = vcvtq_f32_s32(vmovl_s16(iq_s16.val[0]));
        const float32x4_t q_f32 = vcvtq_f32_s32(vmovl_s16(iq_s16.val[1]));

        vst1q_f32(iDst, vmulq_f32(i_f32, vReciprocal));
        vst1q_f32(qDst, vmulq_f32(q_f32, vReciprocal));

        src += 8;
        iDst += 4;
        qDst += 4;
    }

    /* Scalar tail for the remaining num_points % 4 samples. */
    for (n = quarter_points * 4; n < num_points; n++) {
        *iDst++ = (float)(*src++) / scalar;
        *qDst++ = (float)(*src++) / scalar;
    }
}
236 #endif /* LV_HAVE_NEON */
237 
238 #ifdef LV_HAVE_NEONV8
239 #include <arm_neon.h>
240 
/*
 * NEONv8 implementation: splits interleaved 16-bit complex samples into
 * separate I and Q float buffers, scaling every component by 1/scalar.
 * Processes eight complex samples per vector iteration.
 *
 * iBuffer       - output: num_points in-phase values
 * qBuffer       - output: num_points quadrature values
 * complexVector - input: num_points interleaved int16 I/Q pairs (read-only)
 * scalar        - divisor applied to every component
 * num_points    - number of complex samples to convert
 */
static inline void
volk_16ic_s32f_deinterleave_32f_x2_neonv8(float* iBuffer,
                                          float* qBuffer,
                                          const lv_16sc_t* complexVector,
                                          const float scalar,
                                          unsigned int num_points)
{
    const int16_t* src = (const int16_t*)complexVector;
    float* iDst = iBuffer;
    float* qDst = qBuffer;
    const unsigned int blockCount = num_points / 8;
    const float reciprocal = 1.f / scalar;
    const float32x4_t vReciprocal = vdupq_n_f32(reciprocal);

    for (unsigned int blk = 0; blk < blockCount; blk++) {
        /* vld2q deinterleaves on load: val[0] holds eight I, val[1] eight Q. */
        const int16x8x2_t iq = vld2q_s16(src);
        __VOLK_PREFETCH(src + 32);

        /* Widen the low and high halves of each component to int32. */
        const int32x4_t iLow = vmovl_s16(vget_low_s16(iq.val[0]));
        const int32x4_t qLow = vmovl_s16(vget_low_s16(iq.val[1]));
        const int32x4_t iHigh = vmovl_s16(vget_high_s16(iq.val[0]));
        const int32x4_t qHigh = vmovl_s16(vget_high_s16(iq.val[1]));

        /* Convert to float and apply the reciprocal scale. */
        const float32x4_t iLowScaled = vmulq_f32(vcvtq_f32_s32(iLow), vReciprocal);
        const float32x4_t qLowScaled = vmulq_f32(vcvtq_f32_s32(qLow), vReciprocal);
        const float32x4_t iHighScaled = vmulq_f32(vcvtq_f32_s32(iHigh), vReciprocal);
        const float32x4_t qHighScaled = vmulq_f32(vcvtq_f32_s32(qHigh), vReciprocal);

        vst1q_f32(iDst, iLowScaled);
        vst1q_f32(iDst + 4, iHighScaled);
        vst1q_f32(qDst, qLowScaled);
        vst1q_f32(qDst + 4, qHighScaled);

        src += 16;
        iDst += 8;
        qDst += 8;
    }

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (unsigned int idx = blockCount * 8; idx < num_points; idx++) {
        *iDst++ = (float)(*src++) / scalar;
        *qDst++ = (float)(*src++) / scalar;
    }
}
285 #endif /* LV_HAVE_NEONV8 */
286 
287 #ifdef LV_HAVE_ORC
288 extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
289  float* qBuffer,
290  const lv_16sc_t* complexVector,
291  const float scalar,
292  int num_points);
293 
294 static inline void
295 volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
296  float* qBuffer,
297  const lv_16sc_t* complexVector,
298  const float scalar,
299  unsigned int num_points)
300 {
301  volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
302  iBuffer, qBuffer, complexVector, scalar, num_points);
303 }
304 #endif /* LV_HAVE_ORC */
305 
306 
307 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
308 
309 
310 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
311 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
312 
313 #include <inttypes.h>
314 #include <stdio.h>
315 #include <volk/volk_common.h>
316 
317 #ifdef LV_HAVE_AVX2
318 #include <immintrin.h>
319 
320 static inline void
321 volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
322  float* qBuffer,
323  const lv_16sc_t* complexVector,
324  const float scalar,
325  unsigned int num_points)
326 {
327  float* iBufferPtr = iBuffer;
328  float* qBufferPtr = qBuffer;
329 
330  uint64_t number = 0;
331  const uint64_t eighthPoints = num_points / 8;
332  __m256 cplxValue1, cplxValue2, iValue, qValue;
333  __m256i cplxValueA, cplxValueB;
334  __m128i cplxValue128;
335 
336  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
337  int16_t* complexVectorPtr = (int16_t*)complexVector;
338  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
339 
340  for (; number < eighthPoints; number++) {
341 
342  cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
343  complexVectorPtr += 16;
344 
345  // cvt
346  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
347  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
348  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
349  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
350  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
351  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
352 
353  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
354  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
355 
356  // Arrange in i1i2i3i4 format
357  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
358  iValue = _mm256_permutevar8x32_ps(iValue, idx);
359  // Arrange in q1q2q3q4 format
360  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
361  qValue = _mm256_permutevar8x32_ps(qValue, idx);
362 
363  _mm256_storeu_ps(iBufferPtr, iValue);
364  _mm256_storeu_ps(qBufferPtr, qValue);
365 
366  iBufferPtr += 8;
367  qBufferPtr += 8;
368  }
369 
370  number = eighthPoints * 8;
371  complexVectorPtr = (int16_t*)&complexVector[number];
372  for (; number < num_points; number++) {
373  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
374  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
375  }
376 }
377 #endif /* LV_HAVE_AVX2 */
378 
379 #ifdef LV_HAVE_RVV
380 #include <riscv_vector.h>
381 
/*
 * RISC-V Vector implementation: splits interleaved 16-bit complex samples
 * into separate I and Q float buffers, scaling every component by 1/scalar.
 *
 * iBuffer       - output: num_points in-phase values
 * qBuffer       - output: num_points quadrature values
 * complexVector - input: num_points interleaved int16 I/Q pairs (read-only)
 * scalar        - divisor applied to every component
 * num_points    - number of complex samples to convert
 */
static inline void volk_16ic_s32f_deinterleave_32f_x2_rvv(float* iBuffer,
                                                          float* qBuffer,
                                                          const lv_16sc_t* complexVector,
                                                          const float scalar,
                                                          unsigned int num_points)
{
    /* Loop-invariant reciprocal, computed once instead of twice per pass. */
    const float invScalar = 1.0f / scalar;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        /* Load each complex sample as one 32-bit element. */
        vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl);
        /* Narrowing shifts split it: the low 16 bits feed iBuffer, the high
         * 16 bits feed qBuffer. */
        vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
        vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
        /* Widening convert int16 -> float32. */
        vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl);
        vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl);
        __riscv_vse32(iBuffer, __riscv_vfmul(vrf, invScalar, vl), vl);
        __riscv_vse32(qBuffer, __riscv_vfmul(vif, invScalar, vl), vl);
    }
}
400 #endif /*LV_HAVE_RVV*/
401 
402 #ifdef LV_HAVE_RVVSEG
403 #include <riscv_vector.h>
404 
/*
 * RISC-V Vector implementation using a segmented load: splits interleaved
 * 16-bit complex samples into separate I and Q float buffers, scaling every
 * component by 1/scalar.
 *
 * iBuffer       - output: num_points in-phase values
 * qBuffer       - output: num_points quadrature values
 * complexVector - input: num_points interleaved int16 I/Q pairs (read-only)
 * scalar        - divisor applied to every component
 * num_points    - number of complex samples to convert
 */
static inline void
volk_16ic_s32f_deinterleave_32f_x2_rvvseg(float* iBuffer,
                                          float* qBuffer,
                                          const lv_16sc_t* complexVector,
                                          const float scalar,
                                          unsigned int num_points)
{
    /* Loop-invariant reciprocal, computed once instead of twice per pass. */
    const float invScalar = 1.0f / scalar;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        /* Two-field segmented load: field 0 feeds iBuffer, field 1 qBuffer. */
        vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl);
        vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
        vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
        /* Widening convert int16 -> float32. */
        vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl);
        vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl);
        __riscv_vse32(iBuffer, __riscv_vfmul(vrf, invScalar, vl), vl);
        __riscv_vse32(qBuffer, __riscv_vfmul(vif, invScalar, vl), vl);
    }
}
424 #endif /*LV_HAVE_RVVSEG*/
425 
426 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */
volk_16ic_s32f_deinterleave_32f_x2_generic
static void volk_16ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:178
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_common.h
volk_16ic_s32f_deinterleave_32f_x2_neon
static void volk_16ic_s32f_deinterleave_32f_x2_neon(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:197
volk_16ic_s32f_deinterleave_32f_x2_a_sse
static void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:116