Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16ic_convert_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
45 #ifndef INCLUDED_volk_16ic_convert_32fc_a_H
46 #define INCLUDED_volk_16ic_convert_32fc_a_H
47 
48 #include <volk/volk_complex.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
54  const lv_16sc_t* inputVector,
55  unsigned int num_points)
56 {
57  const unsigned int avx_iters = num_points / 4;
58  unsigned int number = 0;
59  const int16_t* complexVectorPtr = (int16_t*)inputVector;
60  float* outputVectorPtr = (float*)outputVector;
61  __m256 outVal;
62  __m256i outValInt;
63  __m128i cplxValue;
64 
65  for (number = 0; number < avx_iters; number++) {
66  cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
67  __VOLK_PREFETCH(complexVectorPtr + 16);
68  complexVectorPtr += 8;
69 
70  outValInt = _mm256_cvtepi16_epi32(cplxValue);
71  outVal = _mm256_cvtepi32_ps(outValInt);
72  _mm256_store_ps((float*)outputVectorPtr, outVal);
73 
74  outputVectorPtr += 8;
75  }
76 
77  number = avx_iters * 8;
78  for (; number < num_points * 2; number++) {
79  *outputVectorPtr++ = (float)*complexVectorPtr++;
80  }
81 }
82 
83 #endif /* LV_HAVE_AVX2 */
84 
85 #ifdef LV_HAVE_AVX512F
86 #include <immintrin.h>
87 
88 static inline void volk_16ic_convert_32fc_a_avx512(lv_32fc_t* outputVector,
89  const lv_16sc_t* inputVector,
90  unsigned int num_points)
91 {
92  const unsigned int avx512_iters = num_points / 8;
93  unsigned int number = 0;
94  const int16_t* complexVectorPtr = (int16_t*)inputVector;
95  float* outputVectorPtr = (float*)outputVector;
96  __m512 outVal;
97  __m512i outValInt;
98  __m256i cplxValue;
99 
100  for (number = 0; number < avx512_iters; number++) {
101  // Load 16 int16 values (8 complex = 16 floats)
102  cplxValue = _mm256_load_si256((__m256i*)complexVectorPtr);
103  __VOLK_PREFETCH(complexVectorPtr + 32);
104  complexVectorPtr += 16;
105 
106  // Convert int16 → int32 → float
107  outValInt = _mm512_cvtepi16_epi32(cplxValue);
108  outVal = _mm512_cvtepi32_ps(outValInt);
109  _mm512_store_ps((float*)outputVectorPtr, outVal);
110 
111  outputVectorPtr += 16;
112  }
113 
114  number = avx512_iters * 16;
115  for (; number < num_points * 2; number++) {
116  *outputVectorPtr++ = (float)*complexVectorPtr++;
117  }
118 }
119 
120 #endif /* LV_HAVE_AVX512F */
121 
122 #ifdef LV_HAVE_GENERIC
123 
124 static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
125  const lv_16sc_t* inputVector,
126  unsigned int num_points)
127 {
128  unsigned int i;
129  for (i = 0; i < num_points; i++) {
130  outputVector[i] =
131  lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
132  }
133 }
134 
135 #endif /* LV_HAVE_GENERIC */
136 
137 
138 #ifdef LV_HAVE_SSE2
139 #include <emmintrin.h>
140 
141 static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
142  const lv_16sc_t* inputVector,
143  unsigned int num_points)
144 {
145  const unsigned int sse_iters = num_points / 2;
146 
147  const lv_16sc_t* _in = inputVector;
148  lv_32fc_t* _out = outputVector;
149  __m128 a;
150  unsigned int number;
151 
152  for (number = 0; number < sse_iters; number++) {
153  a = _mm_set_ps(
154  (float)(lv_cimag(_in[1])),
155  (float)(lv_creal(_in[1])),
156  (float)(lv_cimag(_in[0])),
157  (float)(lv_creal(
158  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
159  _mm_store_ps((float*)_out, a);
160  _in += 2;
161  _out += 2;
162  }
163  if (num_points & 1) {
164  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
165  _in++;
166  }
167 }
168 
169 #endif /* LV_HAVE_SSE2 */
170 
171 #ifdef LV_HAVE_AVX
172 #include <immintrin.h>
173 
174 static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
175  const lv_16sc_t* inputVector,
176  unsigned int num_points)
177 {
178  const unsigned int sse_iters = num_points / 4;
179 
180  const lv_16sc_t* _in = inputVector;
181  lv_32fc_t* _out = outputVector;
182  __m256 a;
183  unsigned int i, number;
184 
185  for (number = 0; number < sse_iters; number++) {
186  a = _mm256_set_ps(
187  (float)(lv_cimag(_in[3])),
188  (float)(lv_creal(_in[3])),
189  (float)(lv_cimag(_in[2])),
190  (float)(lv_creal(_in[2])),
191  (float)(lv_cimag(_in[1])),
192  (float)(lv_creal(_in[1])),
193  (float)(lv_cimag(_in[0])),
194  (float)(lv_creal(
195  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
196  _mm256_store_ps((float*)_out, a);
197  _in += 4;
198  _out += 4;
199  }
200 
201  for (i = 0; i < (num_points % 4); ++i) {
202  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
203  _in++;
204  }
205 }
206 
207 #endif /* LV_HAVE_AVX */
208 
209 
210 #ifdef LV_HAVE_NEON
211 #include <arm_neon.h>
212 
/*
 * Convert interleaved 16-bit integer complex samples to 32-bit float complex
 * samples with NEON, eight complex samples per vector iteration.
 *
 * outputVector: destination, receives num_points complex float samples.
 * inputVector:  source, num_points complex int16 samples.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
                                               const lv_16sc_t* inputVector,
                                               unsigned int num_points)
{
    const int16_t* src = (const int16_t*)inputVector;
    float* dst = (float*)outputVector;
    const unsigned int block_count = num_points / 8;
    /* Scalars left after the vector loop: two per remaining complex sample. */
    const unsigned int tail_scalars = (num_points % 8) * 2;
    unsigned int block, k;

    for (block = 0; block < block_count; block++) {
        // Four 64-bit loads fetch 16 int16 scalars (8 complex samples);
        // loading halves directly avoids vget_low/vget_high.
        const int16x4_t s0 = vld1_s16(src);
        const int16x4_t s1 = vld1_s16(src + 4);
        const int16x4_t s2 = vld1_s16(src + 8);
        const int16x4_t s3 = vld1_s16(src + 12);
        // Widen int16 -> int32, then convert int32 -> float.
        const float32x4_t f0 = vcvtq_f32_s32(vmovl_s16(s0));
        const float32x4_t f1 = vcvtq_f32_s32(vmovl_s16(s1));
        const float32x4_t f2 = vcvtq_f32_s32(vmovl_s16(s2));
        const float32x4_t f3 = vcvtq_f32_s32(vmovl_s16(s3));
        __VOLK_PREFETCH(src + 32);

        vst1q_f32(dst, f0);
        vst1q_f32(dst + 4, f1);
        vst1q_f32(dst + 8, f2);
        vst1q_f32(dst + 12, f3);

        src += 16;
        dst += 16;
    }

    // Remaining samples, one scalar at a time.
    for (k = 0; k < tail_scalars; k++) {
        dst[k] = (float)src[k];
    }
}
246 #endif /* LV_HAVE_NEON */
247 
248 #ifdef LV_HAVE_NEONV8
249 #include <arm_neon.h>
250 
/*
 * NEONv8 kernel: converts interleaved 16-bit integer complex samples to
 * 32-bit float complex samples, eight complex samples per vector iteration.
 *
 * outputVector: destination, receives num_points complex float samples.
 * inputVector:  source, num_points complex int16 samples.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_16ic_convert_32fc_neonv8(lv_32fc_t* outputVector,
                                                 const lv_16sc_t* inputVector,
                                                 unsigned int num_points)
{
    const int16_t* in_scalars = (const int16_t*)inputVector;
    float* out_scalars = (float*)outputVector;
    unsigned int remaining;

    /* Vector part: 8 complex samples (16 scalars) via four 64-bit loads. */
    for (remaining = num_points; remaining >= 8; remaining -= 8) {
        const int16x4_t lo0 = vld1_s16(in_scalars);
        const int16x4_t lo1 = vld1_s16(in_scalars + 4);
        const int16x4_t lo2 = vld1_s16(in_scalars + 8);
        const int16x4_t lo3 = vld1_s16(in_scalars + 12);
        __VOLK_PREFETCH(in_scalars + 32);

        /* Widen int16 -> int32, convert to float, store four floats each. */
        vst1q_f32(out_scalars, vcvtq_f32_s32(vmovl_s16(lo0)));
        vst1q_f32(out_scalars + 4, vcvtq_f32_s32(vmovl_s16(lo1)));
        vst1q_f32(out_scalars + 8, vcvtq_f32_s32(vmovl_s16(lo2)));
        vst1q_f32(out_scalars + 12, vcvtq_f32_s32(vmovl_s16(lo3)));

        in_scalars += 16;
        out_scalars += 16;
    }

    /* Scalar part: real then imaginary component of each leftover sample. */
    for (; remaining > 0; remaining--) {
        out_scalars[0] = (float)in_scalars[0];
        out_scalars[1] = (float)in_scalars[1];
        in_scalars += 2;
        out_scalars += 2;
    }
}
283 #endif /* LV_HAVE_NEONV8 */
284 
285 #endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
286 
287 #ifndef INCLUDED_volk_16ic_convert_32fc_u_H
288 #define INCLUDED_volk_16ic_convert_32fc_u_H
289 
290 #include <volk/volk_complex.h>
291 
292 
293 #ifdef LV_HAVE_AVX2
294 #include <immintrin.h>
295 
296 static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
297  const lv_16sc_t* inputVector,
298  unsigned int num_points)
299 {
300  const unsigned int avx_iters = num_points / 4;
301  unsigned int number = 0;
302  const int16_t* complexVectorPtr = (int16_t*)inputVector;
303  float* outputVectorPtr = (float*)outputVector;
304  __m256 outVal;
305  __m256i outValInt;
306  __m128i cplxValue;
307 
308  for (number = 0; number < avx_iters; number++) {
309  cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
310  __VOLK_PREFETCH(complexVectorPtr + 16);
311  complexVectorPtr += 8;
312 
313  outValInt = _mm256_cvtepi16_epi32(cplxValue);
314  outVal = _mm256_cvtepi32_ps(outValInt);
315  _mm256_storeu_ps((float*)outputVectorPtr, outVal);
316 
317  outputVectorPtr += 8;
318  }
319 
320  number = avx_iters * 8;
321  for (; number < num_points * 2; number++) {
322  *outputVectorPtr++ = (float)*complexVectorPtr++;
323  }
324 }
325 
326 #endif /* LV_HAVE_AVX2 */
327 
328 #ifdef LV_HAVE_AVX512F
329 #include <immintrin.h>
330 
331 static inline void volk_16ic_convert_32fc_u_avx512(lv_32fc_t* outputVector,
332  const lv_16sc_t* inputVector,
333  unsigned int num_points)
334 {
335  const unsigned int avx512_iters = num_points / 8;
336  unsigned int number = 0;
337  const int16_t* complexVectorPtr = (int16_t*)inputVector;
338  float* outputVectorPtr = (float*)outputVector;
339  __m512 outVal;
340  __m512i outValInt;
341  __m256i cplxValue;
342 
343  for (number = 0; number < avx512_iters; number++) {
344  // Load 16 int16 values (8 complex = 16 floats) - unaligned
345  cplxValue = _mm256_loadu_si256((__m256i*)complexVectorPtr);
346  __VOLK_PREFETCH(complexVectorPtr + 32);
347  complexVectorPtr += 16;
348 
349  // Convert int16 → int32 → float
350  outValInt = _mm512_cvtepi16_epi32(cplxValue);
351  outVal = _mm512_cvtepi32_ps(outValInt);
352  _mm512_storeu_ps((float*)outputVectorPtr, outVal);
353 
354  outputVectorPtr += 16;
355  }
356 
357  number = avx512_iters * 16;
358  for (; number < num_points * 2; number++) {
359  *outputVectorPtr++ = (float)*complexVectorPtr++;
360  }
361 }
362 
363 #endif /* LV_HAVE_AVX512F */
364 
365 #ifdef LV_HAVE_SSE2
366 #include <emmintrin.h>
367 
368 static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
369  const lv_16sc_t* inputVector,
370  unsigned int num_points)
371 {
372  const unsigned int sse_iters = num_points / 2;
373 
374  const lv_16sc_t* _in = inputVector;
375  lv_32fc_t* _out = outputVector;
376  __m128 a;
377  unsigned int number;
378 
379  for (number = 0; number < sse_iters; number++) {
380  a = _mm_set_ps(
381  (float)(lv_cimag(_in[1])),
382  (float)(lv_creal(_in[1])),
383  (float)(lv_cimag(_in[0])),
384  (float)(lv_creal(
385  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
386  _mm_storeu_ps((float*)_out, a);
387  _in += 2;
388  _out += 2;
389  }
390  if (num_points & 1) {
391  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
392  _in++;
393  }
394 }
395 
396 #endif /* LV_HAVE_SSE2 */
397 
398 
399 #ifdef LV_HAVE_AVX
400 #include <immintrin.h>
401 
402 static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
403  const lv_16sc_t* inputVector,
404  unsigned int num_points)
405 {
406  const unsigned int sse_iters = num_points / 4;
407 
408  const lv_16sc_t* _in = inputVector;
409  lv_32fc_t* _out = outputVector;
410  __m256 a;
411  unsigned int i, number;
412 
413  for (number = 0; number < sse_iters; number++) {
414  a = _mm256_set_ps(
415  (float)(lv_cimag(_in[3])),
416  (float)(lv_creal(_in[3])),
417  (float)(lv_cimag(_in[2])),
418  (float)(lv_creal(_in[2])),
419  (float)(lv_cimag(_in[1])),
420  (float)(lv_creal(_in[1])),
421  (float)(lv_cimag(_in[0])),
422  (float)(lv_creal(
423  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
424  _mm256_storeu_ps((float*)_out, a);
425  _in += 4;
426  _out += 4;
427  }
428 
429  for (i = 0; i < (num_points % 4); ++i) {
430  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
431  _in++;
432  }
433 }
434 
435 #endif /* LV_HAVE_AVX */
436 
437 #ifdef LV_HAVE_RVV
438 #include <riscv_vector.h>
439 
/*
 * Convert interleaved 16-bit integer complex samples to 32-bit float complex
 * samples with the RISC-V vector extension.
 *
 * The buffers are treated as flat arrays of 2 * num_points scalars; each loop
 * pass converts as many scalars as __riscv_vsetvl_e16m4 grants.
 *
 * outputVector: destination, receives num_points complex float samples.
 * inputVector:  source, num_points complex int16 samples.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_16ic_convert_32fc_rvv(lv_32fc_t* outputVector,
                                              const lv_16sc_t* inputVector,
                                              unsigned int num_points)
{
    const int16_t* in = (const int16_t*)inputVector;
    float* out = (float*)outputVector;
    /* Widen to size_t before doubling: multiplying in unsigned int would wrap
     * for num_points >= 2^31 and silently drop the end of the buffer. */
    size_t n = (size_t)num_points * 2;
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        vint16m4_t v = __riscv_vle16_v_i16m4(in, vl);
        /* Widening convert int16 -> float32, then store vl floats. */
        __riscv_vse32(out, __riscv_vfwcvt_f(v, vl), vl);
    }
}
453 #endif /*LV_HAVE_RVV*/
454 
455 #endif /* INCLUDED_volk_16ic_convert_32fc_u_H */
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
volk_16ic_convert_32fc_generic
static void volk_16ic_convert_32fc_generic(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:124
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_16ic_convert_32fc_a_sse2
static void volk_16ic_convert_32fc_a_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:141
i
for i
Definition: volk_config_fixed.tmpl.h:13
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
volk_16ic_convert_32fc_a_avx
static void volk_16ic_convert_32fc_a_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:174
volk_16ic_convert_32fc_u_avx
static void volk_16ic_convert_32fc_u_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:402
volk_16ic_convert_32fc_u_sse2
static void volk_16ic_convert_32fc_u_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:368
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
volk_16ic_convert_32fc_neon
static void volk_16ic_convert_32fc_neon(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:213
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96