Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
33 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34 #define INCLUDED_volk_32fc_convert_16ic_a_H
35 
36 #include "volk/volk_complex.h"
37 #include <limits.h>
38 #include <math.h>
39 
40 #ifdef LV_HAVE_AVX2
41 #include <immintrin.h>
42 
43 static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
44  const lv_32fc_t* inputVector,
45  unsigned int num_points)
46 {
47  const unsigned int avx_iters = num_points / 8;
48 
49  float* inputVectorPtr = (float*)inputVector;
50  int16_t* outputVectorPtr = (int16_t*)outputVector;
51  float aux;
52 
53  const float min_val = (float)SHRT_MIN;
54  const float max_val = (float)SHRT_MAX;
55 
56  __m256 inputVal1, inputVal2;
57  __m256i intInputVal1, intInputVal2;
58  __m256 ret1, ret2;
59  const __m256 vmin_val = _mm256_set1_ps(min_val);
60  const __m256 vmax_val = _mm256_set1_ps(max_val);
61  unsigned int i;
62 
63  for (i = 0; i < avx_iters; i++) {
64  inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
65  inputVectorPtr += 8;
66  inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
67  inputVectorPtr += 8;
68  __VOLK_PREFETCH(inputVectorPtr + 16);
69 
70  // Clip
71  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
73 
74  intInputVal1 = _mm256_cvtps_epi32(ret1);
75  intInputVal2 = _mm256_cvtps_epi32(ret2);
76 
77  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
79 
80  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81  outputVectorPtr += 16;
82  }
83 
84  for (i = avx_iters * 16; i < num_points * 2; i++) {
85  aux = *inputVectorPtr++;
86  if (aux > max_val)
87  aux = max_val;
88  else if (aux < min_val)
89  aux = min_val;
90  *outputVectorPtr++ = (int16_t)rintf(aux);
91  }
92 }
93 #endif /* LV_HAVE_AVX2 */
94 
95 #ifdef LV_HAVE_AVX512F
96 #include <immintrin.h>
97 
98 static inline void volk_32fc_convert_16ic_a_avx512(lv_16sc_t* outputVector,
99  const lv_32fc_t* inputVector,
100  unsigned int num_points)
101 {
102  const unsigned int avx512_iters = num_points / 8;
103 
104  float* inputVectorPtr = (float*)inputVector;
105  int16_t* outputVectorPtr = (int16_t*)outputVector;
106  float aux;
107 
108  const float min_val = (float)SHRT_MIN;
109  const float max_val = (float)SHRT_MAX;
110 
111  __m512 inputVal1;
112  __m256i intInputVal;
113  __m512 ret1;
114  const __m512 vmin_val = _mm512_set1_ps(min_val);
115  const __m512 vmax_val = _mm512_set1_ps(max_val);
116  unsigned int i;
117 
118  for (i = 0; i < avx512_iters; i++) {
119  inputVal1 = _mm512_load_ps((float*)inputVectorPtr);
120  inputVectorPtr += 16;
121  __VOLK_PREFETCH(inputVectorPtr + 16);
122 
123  // Clip
124  ret1 = _mm512_max_ps(_mm512_min_ps(inputVal1, vmax_val), vmin_val);
125 
126  // Convert float to int32, then pack to int16 with saturation
127  intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret1));
128 
129  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
130  outputVectorPtr += 16;
131  }
132 
133  for (i = avx512_iters * 16; i < num_points * 2; i++) {
134  aux = *inputVectorPtr++;
135  if (aux > max_val)
136  aux = max_val;
137  else if (aux < min_val)
138  aux = min_val;
139  *outputVectorPtr++ = (int16_t)rintf(aux);
140  }
141 }
142 #endif /* LV_HAVE_AVX512F */
143 
144 #ifdef LV_HAVE_SSE2
145 #include <emmintrin.h>
146 
147 static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
148  const lv_32fc_t* inputVector,
149  unsigned int num_points)
150 {
151  const unsigned int sse_iters = num_points / 4;
152 
153  float* inputVectorPtr = (float*)inputVector;
154  int16_t* outputVectorPtr = (int16_t*)outputVector;
155  float aux;
156 
157  const float min_val = (float)SHRT_MIN;
158  const float max_val = (float)SHRT_MAX;
159 
160  __m128 inputVal1, inputVal2;
161  __m128i intInputVal1, intInputVal2;
162  __m128 ret1, ret2;
163  const __m128 vmin_val = _mm_set_ps1(min_val);
164  const __m128 vmax_val = _mm_set_ps1(max_val);
165  unsigned int i;
166 
167  for (i = 0; i < sse_iters; i++) {
168  inputVal1 = _mm_load_ps((float*)inputVectorPtr);
169  inputVectorPtr += 4;
170  inputVal2 = _mm_load_ps((float*)inputVectorPtr);
171  inputVectorPtr += 4;
172  __VOLK_PREFETCH(inputVectorPtr + 8);
173 
174  // Clip
175  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
176  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
177 
178  intInputVal1 = _mm_cvtps_epi32(ret1);
179  intInputVal2 = _mm_cvtps_epi32(ret2);
180 
181  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
182 
183  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
184  outputVectorPtr += 8;
185  }
186 
187  for (i = sse_iters * 8; i < num_points * 2; i++) {
188  aux = *inputVectorPtr++;
189  if (aux > max_val)
190  aux = max_val;
191  else if (aux < min_val)
192  aux = min_val;
193  *outputVectorPtr++ = (int16_t)rintf(aux);
194  }
195 }
196 #endif /* LV_HAVE_SSE2 */
197 
198 
199 #if LV_HAVE_NEONV7
200 #include <arm_neon.h>
201 
202 static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
203  const lv_32fc_t* inputVector,
204  unsigned int num_points)
205 {
206 
207  const unsigned int neon_iters = num_points / 4;
208 
209  float32_t* inputVectorPtr = (float32_t*)inputVector;
210  int16_t* outputVectorPtr = (int16_t*)outputVector;
211 
212  const float min_val_f = (float)SHRT_MIN;
213  const float max_val_f = (float)SHRT_MAX;
214  float32_t aux;
215  unsigned int i;
216 
217  const float32x4_t min_val = vmovq_n_f32(min_val_f);
218  const float32x4_t max_val = vmovq_n_f32(max_val_f);
219  float32x4_t half = vdupq_n_f32(0.5f);
220  float32x4_t ret1, ret2, a, b, sign, PlusHalf, Round;
221 
222  int32x4_t toint_a = { 0, 0, 0, 0 };
223  int32x4_t toint_b = { 0, 0, 0, 0 };
224  int16x4_t intInputVal1, intInputVal2;
225  int16x8_t res;
226 
227  for (i = 0; i < neon_iters; i++) {
228  a = vld1q_f32((const float32_t*)(inputVectorPtr));
229  inputVectorPtr += 4;
230  b = vld1q_f32((const float32_t*)(inputVectorPtr));
231  inputVectorPtr += 4;
232  __VOLK_PREFETCH(inputVectorPtr + 8);
233 
234  ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
235  ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
236 
237  sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
238  PlusHalf = vaddq_f32(ret1, half);
239  Round = vsubq_f32(PlusHalf, sign);
240  toint_a = vcvtq_s32_f32(Round);
241 
242  sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret2), 31)));
243  PlusHalf = vaddq_f32(ret2, half);
244  Round = vsubq_f32(PlusHalf, sign);
245  toint_b = vcvtq_s32_f32(Round);
246 
247  intInputVal1 = vqmovn_s32(toint_a);
248  intInputVal2 = vqmovn_s32(toint_b);
249 
250  res = vcombine_s16(intInputVal1, intInputVal2);
251  vst1q_s16((int16_t*)outputVectorPtr, res);
252  outputVectorPtr += 8;
253  }
254 
255  for (i = neon_iters * 8; i < num_points * 2; i++) {
256  aux = *inputVectorPtr++;
257  if (aux > max_val_f)
258  aux = max_val_f;
259  else if (aux < min_val_f)
260  aux = min_val_f;
261  *outputVectorPtr++ = (int16_t)rintf(aux);
262  }
263 }
264 
265 #endif /* LV_HAVE_NEONV7 */
266 
267 #if LV_HAVE_NEONV8
268 #include <arm_neon.h>
269 
270 static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
271  const lv_32fc_t* inputVector,
272  unsigned int num_points)
273 {
274  const unsigned int neon_iters = num_points / 4;
275 
276  float32_t* inputVectorPtr = (float32_t*)inputVector;
277  int16_t* outputVectorPtr = (int16_t*)outputVector;
278 
279  const float min_val_f = (float)SHRT_MIN;
280  const float max_val_f = (float)SHRT_MAX;
281  float32_t aux;
282  unsigned int i;
283 
284  const float32x4_t min_val = vmovq_n_f32(min_val_f);
285  const float32x4_t max_val = vmovq_n_f32(max_val_f);
286  float32x4_t ret1, ret2, a, b;
287 
288  int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
289  int16x4_t intInputVal1, intInputVal2;
290  int16x8_t res;
291 
292  for (i = 0; i < neon_iters; i++) {
293  a = vld1q_f32((const float32_t*)(inputVectorPtr));
294  inputVectorPtr += 4;
295  b = vld1q_f32((const float32_t*)(inputVectorPtr));
296  inputVectorPtr += 4;
297  __VOLK_PREFETCH(inputVectorPtr + 8);
298 
299  ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
300  ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
301 
302  // vrndiq takes into account the current rounding mode (as does rintf)
303  toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
304  toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
305 
306  intInputVal1 = vqmovn_s32(toint_a);
307  intInputVal2 = vqmovn_s32(toint_b);
308 
309  res = vcombine_s16(intInputVal1, intInputVal2);
310  vst1q_s16((int16_t*)outputVectorPtr, res);
311  outputVectorPtr += 8;
312  }
313 
314  for (i = neon_iters * 8; i < num_points * 2; i++) {
315  aux = *inputVectorPtr++;
316  if (aux > max_val_f)
317  aux = max_val_f;
318  else if (aux < min_val_f)
319  aux = min_val_f;
320  *outputVectorPtr++ = (int16_t)rintf(aux);
321  }
322 }
323 #endif /* LV_HAVE_NEONV8 */
324 
325 
326 #ifdef LV_HAVE_GENERIC
327 
328 static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
329  const lv_32fc_t* inputVector,
330  unsigned int num_points)
331 {
332  float* inputVectorPtr = (float*)inputVector;
333  int16_t* outputVectorPtr = (int16_t*)outputVector;
334  const float min_val = (float)SHRT_MIN;
335  const float max_val = (float)SHRT_MAX;
336  float aux;
337  unsigned int i;
338  for (i = 0; i < num_points * 2; i++) {
339  aux = *inputVectorPtr++;
340  if (aux > max_val)
341  aux = max_val;
342  else if (aux < min_val)
343  aux = min_val;
344  *outputVectorPtr++ = (int16_t)rintf(aux);
345  }
346 }
347 #endif /* LV_HAVE_GENERIC */
348 
349 #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
350 
351 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
352 #define INCLUDED_volk_32fc_convert_16ic_u_H
353 
354 #include "volk/volk_complex.h"
355 #include <limits.h>
356 #include <math.h>
357 
358 
359 #ifdef LV_HAVE_AVX2
360 #include <immintrin.h>
361 
362 static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
363  const lv_32fc_t* inputVector,
364  unsigned int num_points)
365 {
366  const unsigned int avx_iters = num_points / 8;
367 
368  float* inputVectorPtr = (float*)inputVector;
369  int16_t* outputVectorPtr = (int16_t*)outputVector;
370  float aux;
371 
372  const float min_val = (float)SHRT_MIN;
373  const float max_val = (float)SHRT_MAX;
374 
375  __m256 inputVal1, inputVal2;
376  __m256i intInputVal1, intInputVal2;
377  __m256 ret1, ret2;
378  const __m256 vmin_val = _mm256_set1_ps(min_val);
379  const __m256 vmax_val = _mm256_set1_ps(max_val);
380  unsigned int i;
381 
382  for (i = 0; i < avx_iters; i++) {
383  inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
384  inputVectorPtr += 8;
385  inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
386  inputVectorPtr += 8;
387  __VOLK_PREFETCH(inputVectorPtr + 16);
388 
389  // Clip
390  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
391  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
392 
393  intInputVal1 = _mm256_cvtps_epi32(ret1);
394  intInputVal2 = _mm256_cvtps_epi32(ret2);
395 
396  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
397  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
398 
399  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
400  outputVectorPtr += 16;
401  }
402 
403  for (i = avx_iters * 16; i < num_points * 2; i++) {
404  aux = *inputVectorPtr++;
405  if (aux > max_val)
406  aux = max_val;
407  else if (aux < min_val)
408  aux = min_val;
409  *outputVectorPtr++ = (int16_t)rintf(aux);
410  }
411 }
412 #endif /* LV_HAVE_AVX2 */
413 
414 #ifdef LV_HAVE_AVX512F
415 #include <immintrin.h>
416 
417 static inline void volk_32fc_convert_16ic_u_avx512(lv_16sc_t* outputVector,
418  const lv_32fc_t* inputVector,
419  unsigned int num_points)
420 {
421  const unsigned int avx512_iters = num_points / 8;
422 
423  float* inputVectorPtr = (float*)inputVector;
424  int16_t* outputVectorPtr = (int16_t*)outputVector;
425  float aux;
426 
427  const float min_val = (float)SHRT_MIN;
428  const float max_val = (float)SHRT_MAX;
429 
430  __m512 inputVal1;
431  __m256i intInputVal;
432  __m512 ret1;
433  const __m512 vmin_val = _mm512_set1_ps(min_val);
434  const __m512 vmax_val = _mm512_set1_ps(max_val);
435  unsigned int i;
436 
437  for (i = 0; i < avx512_iters; i++) {
438  inputVal1 = _mm512_loadu_ps((float*)inputVectorPtr);
439  inputVectorPtr += 16;
440  __VOLK_PREFETCH(inputVectorPtr + 16);
441 
442  // Clip
443  ret1 = _mm512_max_ps(_mm512_min_ps(inputVal1, vmax_val), vmin_val);
444 
445  // Convert float to int32, then pack to int16 with saturation
446  intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret1));
447 
448  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
449  outputVectorPtr += 16;
450  }
451 
452  for (i = avx512_iters * 16; i < num_points * 2; i++) {
453  aux = *inputVectorPtr++;
454  if (aux > max_val)
455  aux = max_val;
456  else if (aux < min_val)
457  aux = min_val;
458  *outputVectorPtr++ = (int16_t)rintf(aux);
459  }
460 }
461 #endif /* LV_HAVE_AVX512F */
462 
463 
464 #ifdef LV_HAVE_SSE2
465 #include <emmintrin.h>
466 
467 static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
468  const lv_32fc_t* inputVector,
469  unsigned int num_points)
470 {
471  const unsigned int sse_iters = num_points / 4;
472 
473  float* inputVectorPtr = (float*)inputVector;
474  int16_t* outputVectorPtr = (int16_t*)outputVector;
475  float aux;
476 
477  const float min_val = (float)SHRT_MIN;
478  const float max_val = (float)SHRT_MAX;
479 
480  __m128 inputVal1, inputVal2;
481  __m128i intInputVal1, intInputVal2;
482  __m128 ret1, ret2;
483  const __m128 vmin_val = _mm_set_ps1(min_val);
484  const __m128 vmax_val = _mm_set_ps1(max_val);
485 
486  unsigned int i;
487  for (i = 0; i < sse_iters; i++) {
488  inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
489  inputVectorPtr += 4;
490  inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
491  inputVectorPtr += 4;
492  __VOLK_PREFETCH(inputVectorPtr + 8);
493 
494  // Clip
495  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
496  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
497 
498  intInputVal1 = _mm_cvtps_epi32(ret1);
499  intInputVal2 = _mm_cvtps_epi32(ret2);
500 
501  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
502 
503  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
504  outputVectorPtr += 8;
505  }
506 
507  for (i = sse_iters * 8; i < num_points * 2; i++) {
508  aux = *inputVectorPtr++;
509  if (aux > max_val)
510  aux = max_val;
511  else if (aux < min_val)
512  aux = min_val;
513  *outputVectorPtr++ = (int16_t)rintf(aux);
514  }
515 }
516 #endif /* LV_HAVE_SSE2 */
517 
518 #ifdef LV_HAVE_RVV
519 #include <riscv_vector.h>
520 
521 static inline void volk_32fc_convert_16ic_rvv(lv_16sc_t* outputVector,
522  const lv_32fc_t* inputVector,
523  unsigned int num_points)
524 {
525  int16_t* out = (int16_t*)outputVector;
526  float* in = (float*)inputVector;
527  size_t n = num_points * 2;
528  for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
529  vl = __riscv_vsetvl_e32m8(n);
530  vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
531  __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl);
532  }
533 }
534 #endif /*LV_HAVE_RVV*/
535 
536 #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
volk_32fc_convert_16ic_a_sse2
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:147
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_32fc_convert_16ic_u_sse2
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:467
i
for i
Definition: volk_config_fixed.tmpl.h:13
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
volk_32fc_convert_16ic_generic
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:328
rintf
static float rintf(float x)
Definition: config.h:45