Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
55 #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
56 #define INCLUDED_volk_32f_s32f_convert_16i_u_H
57 
58 #include <inttypes.h>
59 #include <limits.h>
60 #include <stdio.h>
61 
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Multiplies each float by \p scalar, clips the product to
 *        [SHRT_MIN, SHRT_MAX] and stores it as int16_t (AVX2, unaligned access).
 *
 * Sixteen points are handled per vector iteration; points left over after the
 * last full group of sixteen are converted one at a time with rintf().
 *
 * \param outputVector Receives num_points converted int16_t values.
 * \param inputVector  The num_points float values to convert.
 * \param scalar       Multiplier applied to every input before clipping.
 * \param num_points   Number of points to convert.
 */
static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);

    for (; number < sixteenthPoints; number++) {
        const __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
        const __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr + 8);
        inputVectorPtr += 16;

        // Scale and clip
        const __m256 ret1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
        const __m256 ret2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);

        const __m256i intInputVal1 = _mm256_cvtps_epi32(ret1);
        const __m256i intInputVal2 = _mm256_cvtps_epi32(ret2);

        // The 256-bit pack works on each 128-bit half separately, so its 64-bit
        // groups come out as {val1.lo, val2.lo, val1.hi, val2.hi}. The permute
        // selector 0xD8 (2-bit fields 0, 2, 1, 3) puts them back in input order.
        // Hex is used because binary literals are not standard C before C23.
        const __m256i packed = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(intInputVal1, intInputVal2), 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, packed);
        outputVectorPtr += 16;
    }

    // Scalar tail for the last (num_points % 16) points
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val)
            r = max_val;
        else if (r < min_val)
            r = min_val;
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX2 */
121 
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Scales floats by \p scalar, clips them to [SHRT_MIN, SHRT_MAX] and
 *        writes them as int16_t (AVX-512F, unaligned access).
 *
 * Works on sixteen points per vector iteration; the remaining points are
 * converted individually with rintf().
 *
 * \param outputVector Destination buffer for num_points int16_t values.
 * \param inputVector  Source buffer holding num_points floats.
 * \param scalar       Gain applied to each input prior to clipping.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_u_avx512(int16_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const float lowerBound = SHRT_MIN;
    const float upperBound = SHRT_MAX;

    // Number of points covered by full 16-wide vectors
    const unsigned int vectorizedPoints = (num_points / 16) * 16;

    const __m512 gain = _mm512_set1_ps(scalar);
    const __m512 lowerBounds = _mm512_set1_ps(lowerBound);
    const __m512 upperBounds = _mm512_set1_ps(upperBound);

    unsigned int idx = 0;

    for (; idx < vectorizedPoints; idx += 16) {
        // Scale, then clamp from above and from below
        const __m512 scaled = _mm512_mul_ps(_mm512_loadu_ps(inputVector + idx), gain);
        const __m512 clipped =
            _mm512_max_ps(_mm512_min_ps(scaled, upperBounds), lowerBounds);

        // float -> int32, then narrow to int16 with signed saturation
        const __m256i narrowed = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(clipped));

        _mm256_storeu_si256((__m256i*)(outputVector + idx), narrowed);
    }

    // Leftover points that did not fill a whole vector
    for (; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        const float clipped = scaled > upperBound
                                  ? upperBound
                                  : (scaled < lowerBound ? lowerBound : scaled);
        outputVector[idx] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_AVX512F */
174 
175 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Scales floats by \p scalar, clips them to [SHRT_MIN, SHRT_MAX] and
 *        writes them as int16_t (AVX, unaligned access).
 *
 * Eight points are converted per vector iteration; whatever remains is
 * converted point by point with rintf().
 *
 * \param outputVector Destination buffer for num_points int16_t values.
 * \param inputVector  Source buffer holding num_points floats.
 * \param scalar       Gain applied to each input prior to clipping.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float floor16 = SHRT_MIN;
    const float ceil16 = SHRT_MAX;
    const unsigned int blockCount = num_points / 8;

    const __m256 gain = _mm256_set1_ps(scalar);
    const __m256 floorVec = _mm256_set1_ps(floor16);
    const __m256 ceilVec = _mm256_set1_ps(ceil16);

    const float* src = inputVector;
    int16_t* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; ++block, src += 8, dst += 8) {
        // Scale, then clamp from above and from below
        __m256 samples = _mm256_mul_ps(_mm256_loadu_ps(src), gain);
        samples = _mm256_min_ps(samples, ceilVec);
        samples = _mm256_max_ps(samples, floorVec);

        const __m256i asInt32 = _mm256_cvtps_epi32(samples);

        // Split into 128-bit halves and pack both into eight int16 values
        const __m128i lowHalf = _mm256_extractf128_si256(asInt32, 0);
        const __m128i highHalf = _mm256_extractf128_si256(asInt32, 1);

        _mm_storeu_si128((__m128i*)dst, _mm_packs_epi32(lowHalf, highHalf));
    }

    // Leftover points that did not fill a whole vector
    for (unsigned int idx = blockCount * 8; idx < num_points; ++idx) {
        float value = inputVector[idx] * scalar;
        if (value > ceil16) {
            value = ceil16;
        } else if (value < floor16) {
            value = floor16;
        }
        outputVector[idx] = (int16_t)rintf(value);
    }
}
#endif /* LV_HAVE_AVX */
232 
233 
234 #ifdef LV_HAVE_SSE2
235 #include <emmintrin.h>
236 
237 static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
238  const float* inputVector,
239  const float scalar,
240  unsigned int num_points)
241 {
242  unsigned int number = 0;
243 
244  const unsigned int eighthPoints = num_points / 8;
245 
246  const float* inputVectorPtr = (const float*)inputVector;
247  int16_t* outputVectorPtr = outputVector;
248 
249  float min_val = SHRT_MIN;
250  float max_val = SHRT_MAX;
251  float r;
252 
253  __m128 vScalar = _mm_set_ps1(scalar);
254  __m128 inputVal1, inputVal2;
255  __m128i intInputVal1, intInputVal2;
256  __m128 ret1, ret2;
257  __m128 vmin_val = _mm_set_ps1(min_val);
258  __m128 vmax_val = _mm_set_ps1(max_val);
259 
260  for (; number < eighthPoints; number++) {
261  inputVal1 = _mm_loadu_ps(inputVectorPtr);
262  inputVectorPtr += 4;
263  inputVal2 = _mm_loadu_ps(inputVectorPtr);
264  inputVectorPtr += 4;
265 
266  // Scale and clip
267  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
268  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
269 
270  intInputVal1 = _mm_cvtps_epi32(ret1);
271  intInputVal2 = _mm_cvtps_epi32(ret2);
272 
273  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
274 
275  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
276  outputVectorPtr += 8;
277  }
278 
279  number = eighthPoints * 8;
280  for (; number < num_points; number++) {
281  r = inputVector[number] * scalar;
282  if (r > max_val)
283  r = max_val;
284  else if (r < min_val)
285  r = min_val;
286  outputVector[number] = (int16_t)rintf(r);
287  }
288 }
289 #endif /* LV_HAVE_SSE2 */
290 
291 
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Scales floats by \p scalar, clips them to [SHRT_MIN, SHRT_MAX] and
 *        writes them as int16_t (SSE, unaligned input loads).
 *
 * Scaling and clipping are done four floats at a time; the clipped floats are
 * then spilled to an aligned scratch buffer and rounded one by one with
 * rintf(). Points left over after the last full group of four take the fully
 * scalar path.
 *
 * \param outputVector Destination buffer for num_points int16_t values.
 * \param inputVector  Source buffer holding num_points floats.
 * \param scalar       Gain applied to each input prior to clipping.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float lowLimit = SHRT_MIN;
    const float highLimit = SHRT_MAX;
    const unsigned int quadCount = num_points / 4;

    const __m128 gain = _mm_set_ps1(scalar);
    const __m128 lowLimits = _mm_set_ps1(lowLimit);
    const __m128 highLimits = _mm_set_ps1(highLimit);

    // Scratch space for one clipped vector; aligned for _mm_store_ps
    __VOLK_ATTR_ALIGNED(16) float scratch[4];

    for (unsigned int quad = 0; quad < quadCount; ++quad) {
        const unsigned int base = quad * 4;

        // Scale, then clamp from above and from below
        __m128 samples = _mm_mul_ps(_mm_loadu_ps(inputVector + base), gain);
        samples = _mm_max_ps(_mm_min_ps(samples, highLimits), lowLimits);

        _mm_store_ps(scratch, samples);
        for (unsigned int lane = 0; lane < 4; ++lane) {
            outputVector[base + lane] = (int16_t)rintf(scratch[lane]);
        }
    }

    // Leftover points that did not fill a whole vector
    for (unsigned int idx = quadCount * 4; idx < num_points; ++idx) {
        float value = inputVector[idx] * scalar;
        if (value > highLimit) {
            value = highLimit;
        } else if (value < lowLimit) {
            value = lowLimit;
        }
        outputVector[idx] = (int16_t)rintf(value);
    }
}
#endif /* LV_HAVE_SSE */
343 
344 
345 #ifdef LV_HAVE_GENERIC
346 
347 static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
348  const float* inputVector,
349  const float scalar,
350  unsigned int num_points)
351 {
352  int16_t* outputVectorPtr = outputVector;
353  const float* inputVectorPtr = inputVector;
354  unsigned int number = 0;
355  float min_val = SHRT_MIN;
356  float max_val = SHRT_MAX;
357  float r;
358 
359  for (number = 0; number < num_points; number++) {
360  r = *inputVectorPtr++ * scalar;
361  if (r > max_val)
362  r = max_val;
363  else if (r < min_val)
364  r = min_val;
365  *outputVectorPtr++ = (int16_t)rintf(r);
366  }
367 }
368 #endif /* LV_HAVE_GENERIC */
369 
370 
371 #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
372 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
373 #define INCLUDED_volk_32f_s32f_convert_16i_a_H
374 
375 #include <inttypes.h>
376 #include <math.h>
377 #include <stdio.h>
378 #include <volk/volk_common.h>
379 
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Multiplies each float by \p scalar, clips the product to
 *        [SHRT_MIN, SHRT_MAX] and stores it as int16_t (AVX2, aligned access).
 *
 * Sixteen points are handled per vector iteration using aligned 256-bit loads
 * and stores; points left over after the last full group of sixteen are
 * converted one at a time with rintf().
 *
 * \param outputVector Receives num_points converted int16_t values; accessed
 *                     with aligned 256-bit stores.
 * \param inputVector  The num_points float values to convert; accessed with
 *                     aligned 256-bit loads.
 * \param scalar       Multiplier applied to every input before clipping.
 * \param num_points   Number of points to convert.
 */
static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);

    for (; number < sixteenthPoints; number++) {
        const __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
        const __m256 inputVal2 = _mm256_load_ps(inputVectorPtr + 8);
        inputVectorPtr += 16;

        // Scale and clip
        const __m256 ret1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
        const __m256 ret2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);

        const __m256i intInputVal1 = _mm256_cvtps_epi32(ret1);
        const __m256i intInputVal2 = _mm256_cvtps_epi32(ret2);

        // The 256-bit pack works on each 128-bit half separately, so its 64-bit
        // groups come out as {val1.lo, val2.lo, val1.hi, val2.hi}. The permute
        // selector 0xD8 (2-bit fields 0, 2, 1, 3) puts them back in input order.
        // Hex is used because binary literals are not standard C before C23.
        const __m256i packed = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(intInputVal1, intInputVal2), 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, packed);
        outputVectorPtr += 16;
    }

    // Scalar tail for the last (num_points % 16) points
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val)
            r = max_val;
        else if (r < min_val)
            r = min_val;
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX2 */
439 
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Scales floats by \p scalar, clips them to [SHRT_MIN, SHRT_MAX] and
 *        writes them as int16_t (AVX-512F, aligned access).
 *
 * Works on sixteen points per vector iteration with an aligned 512-bit load
 * and an aligned 256-bit store; the remaining points are converted
 * individually with rintf().
 *
 * \param outputVector Destination buffer for num_points int16_t values;
 *                     written with aligned 256-bit stores.
 * \param inputVector  Source buffer holding num_points floats; read with
 *                     aligned 512-bit loads.
 * \param scalar       Gain applied to each input prior to clipping.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_a_avx512(int16_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const float lowerBound = SHRT_MIN;
    const float upperBound = SHRT_MAX;

    // Number of points covered by full 16-wide vectors
    const unsigned int vectorizedPoints = (num_points / 16) * 16;

    const __m512 gain = _mm512_set1_ps(scalar);
    const __m512 lowerBounds = _mm512_set1_ps(lowerBound);
    const __m512 upperBounds = _mm512_set1_ps(upperBound);

    unsigned int idx = 0;

    for (; idx < vectorizedPoints; idx += 16) {
        // Scale, then clamp from above and from below
        const __m512 scaled = _mm512_mul_ps(_mm512_load_ps(inputVector + idx), gain);
        const __m512 clipped =
            _mm512_max_ps(_mm512_min_ps(scaled, upperBounds), lowerBounds);

        // float -> int32, then narrow to int16 with signed saturation
        const __m256i narrowed = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(clipped));

        _mm256_store_si256((__m256i*)(outputVector + idx), narrowed);
    }

    // Leftover points that did not fill a whole vector
    for (; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        const float clipped = scaled > upperBound
                                  ? upperBound
                                  : (scaled < lowerBound ? lowerBound : scaled);
        outputVector[idx] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_AVX512F */
492 
493 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Scales floats by \p scalar, clips them to [SHRT_MIN, SHRT_MAX] and
 *        writes them as int16_t (AVX, aligned access).
 *
 * Eight points are converted per vector iteration with an aligned 256-bit
 * load and an aligned 128-bit store; whatever remains is converted point by
 * point with rintf().
 *
 * \param outputVector Destination buffer for num_points int16_t values;
 *                     written with aligned 128-bit stores.
 * \param inputVector  Source buffer holding num_points floats; read with
 *                     aligned 256-bit loads.
 * \param scalar       Gain applied to each input prior to clipping.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float floor16 = SHRT_MIN;
    const float ceil16 = SHRT_MAX;
    const unsigned int blockCount = num_points / 8;

    const __m256 gain = _mm256_set1_ps(scalar);
    const __m256 floorVec = _mm256_set1_ps(floor16);
    const __m256 ceilVec = _mm256_set1_ps(ceil16);

    const float* src = inputVector;
    int16_t* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; ++block, src += 8, dst += 8) {
        // Scale, then clamp from above and from below
        __m256 samples = _mm256_mul_ps(_mm256_load_ps(src), gain);
        samples = _mm256_min_ps(samples, ceilVec);
        samples = _mm256_max_ps(samples, floorVec);

        const __m256i asInt32 = _mm256_cvtps_epi32(samples);

        // Split into 128-bit halves and pack both into eight int16 values
        const __m128i lowHalf = _mm256_extractf128_si256(asInt32, 0);
        const __m128i highHalf = _mm256_extractf128_si256(asInt32, 1);

        _mm_store_si128((__m128i*)dst, _mm_packs_epi32(lowHalf, highHalf));
    }

    // Leftover points that did not fill a whole vector
    for (unsigned int idx = blockCount * 8; idx < num_points; ++idx) {
        float value = inputVector[idx] * scalar;
        if (value > ceil16) {
            value = ceil16;
        } else if (value < floor16) {
            value = floor16;
        }
        outputVector[idx] = (int16_t)rintf(value);
    }
}
#endif /* LV_HAVE_AVX */
550 
551 #ifdef LV_HAVE_SSE2
552 #include <emmintrin.h>
553 
554 static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
555  const float* inputVector,
556  const float scalar,
557  unsigned int num_points)
558 {
559  unsigned int number = 0;
560 
561  const unsigned int eighthPoints = num_points / 8;
562 
563  const float* inputVectorPtr = (const float*)inputVector;
564  int16_t* outputVectorPtr = outputVector;
565 
566  float min_val = SHRT_MIN;
567  float max_val = SHRT_MAX;
568  float r;
569 
570  __m128 vScalar = _mm_set_ps1(scalar);
571  __m128 inputVal1, inputVal2;
572  __m128i intInputVal1, intInputVal2;
573  __m128 ret1, ret2;
574  __m128 vmin_val = _mm_set_ps1(min_val);
575  __m128 vmax_val = _mm_set_ps1(max_val);
576 
577  for (; number < eighthPoints; number++) {
578  inputVal1 = _mm_load_ps(inputVectorPtr);
579  inputVectorPtr += 4;
580  inputVal2 = _mm_load_ps(inputVectorPtr);
581  inputVectorPtr += 4;
582 
583  // Scale and clip
584  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
585  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
586 
587  intInputVal1 = _mm_cvtps_epi32(ret1);
588  intInputVal2 = _mm_cvtps_epi32(ret2);
589 
590  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
591 
592  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
593  outputVectorPtr += 8;
594  }
595 
596  number = eighthPoints * 8;
597  for (; number < num_points; number++) {
598  r = inputVector[number] * scalar;
599  if (r > max_val)
600  r = max_val;
601  else if (r < min_val)
602  r = min_val;
603  outputVector[number] = (int16_t)rintf(r);
604  }
605 }
606 #endif /* LV_HAVE_SSE2 */
607 
608 
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Scales floats by \p scalar, clips them to [SHRT_MIN, SHRT_MAX] and
 *        writes them as int16_t (SSE, aligned input loads).
 *
 * Scaling and clipping are done four floats at a time; the clipped floats are
 * then spilled to an aligned scratch buffer and rounded one by one with
 * rintf(). Points left over after the last full group of four take the fully
 * scalar path.
 *
 * \param outputVector Destination buffer for num_points int16_t values.
 * \param inputVector  Source buffer holding num_points floats; read with
 *                     aligned 128-bit loads.
 * \param scalar       Gain applied to each input prior to clipping.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float lowLimit = SHRT_MIN;
    const float highLimit = SHRT_MAX;
    const unsigned int quadCount = num_points / 4;

    const __m128 gain = _mm_set_ps1(scalar);
    const __m128 lowLimits = _mm_set_ps1(lowLimit);
    const __m128 highLimits = _mm_set_ps1(highLimit);

    // Scratch space for one clipped vector; aligned for _mm_store_ps
    __VOLK_ATTR_ALIGNED(16) float scratch[4];

    for (unsigned int quad = 0; quad < quadCount; ++quad) {
        const unsigned int base = quad * 4;

        // Scale, then clamp from above and from below
        __m128 samples = _mm_mul_ps(_mm_load_ps(inputVector + base), gain);
        samples = _mm_max_ps(_mm_min_ps(samples, highLimits), lowLimits);

        _mm_store_ps(scratch, samples);
        for (unsigned int lane = 0; lane < 4; ++lane) {
            outputVector[base + lane] = (int16_t)rintf(scratch[lane]);
        }
    }

    // Leftover points that did not fill a whole vector
    for (unsigned int idx = quadCount * 4; idx < num_points; ++idx) {
        float value = inputVector[idx] * scalar;
        if (value > highLimit) {
            value = highLimit;
        } else if (value < lowLimit) {
            value = lowLimit;
        }
        outputVector[idx] = (int16_t)rintf(value);
    }
}
#endif /* LV_HAVE_SSE */
660 
661 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Multiplies each float by \p scalar, clips the product to
 *        [SHRT_MIN, SHRT_MAX] and stores it as int16_t (NEON).
 *
 * Eight points are handled per vector iteration; points left over after the
 * last full group of eight are converted one at a time with rintf(). The
 * vector path rounds halfway cases to the nearest even integer so that it
 * agrees with rintf() in the scalar tail (assuming the default rounding mode).
 *
 * \param outputVector Receives num_points converted int16_t values.
 * \param inputVector  The num_points float values to convert.
 * \param scalar       Multiplier applied to every input before clipping.
 * \param num_points   Number of points to convert.
 */
static inline void volk_32f_s32f_convert_16i_neon(int16_t* outputVector,
                                                  const float* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const float32x4_t vScalar = vdupq_n_f32(scalar);
    const float32x4_t vmin_val = vdupq_n_f32(min_val);
    const float32x4_t vmax_val = vdupq_n_f32(max_val);

    // 1.5 * 2^23. After clipping, |x| <= 32768, so x + vRound lies in
    // [2^23, 2^24) where consecutive floats are exactly 1 apart: the addition
    // itself rounds x to an integer (ties to even) and subtracting vRound
    // again recovers that integer exactly.
    // The earlier "add +/-0.5, then truncate" scheme rounded ties away from
    // zero (2.5 -> 3) and could round values just below .5 up, so vector and
    // tail results disagreed.
    // NOTE(review): relies on round-to-nearest float addition and on the
    // compiler not reassociating (x + C) - C, i.e. no -ffast-math.
    const float32x4_t vRound = vdupq_n_f32(12582912.0f);

    for (; number < eighthPoints; number++) {
        const float32x4_t inputVal1 = vld1q_f32(inputVectorPtr);
        const float32x4_t inputVal2 = vld1q_f32(inputVectorPtr + 4);
        inputVectorPtr += 8;

        // Scale and clip
        float32x4_t ret1 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal1, vScalar), vmax_val), vmin_val);
        float32x4_t ret2 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal2, vScalar), vmax_val), vmin_val);

        // Round to nearest integer, ties to even (see vRound above)
        ret1 = vsubq_f32(vaddq_f32(ret1, vRound), vRound);
        ret2 = vsubq_f32(vaddq_f32(ret2, vRound), vRound);

        // Values are integral already, so this conversion is exact
        const int32x4_t intVal1 = vcvtq_s32_f32(ret1);
        const int32x4_t intVal2 = vcvtq_s32_f32(ret2);

        // Narrow to int16 with saturation
        const int16x8_t result = vcombine_s16(vqmovn_s32(intVal1), vqmovn_s32(intVal2));

        vst1q_s16(outputVectorPtr, result);
        outputVectorPtr += 8;
    }

    // Scalar tail for the last (num_points % 8) points
    for (number = eighthPoints * 8; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val)
            r = max_val;
        else if (r < min_val)
            r = min_val;
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_NEON */
728 
729 
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*!
 * \brief Scales floats by \p scalar, clips them to [SHRT_MIN, SHRT_MAX] and
 *        writes them as int16_t (ARMv8 NEON).
 *
 * Sixteen points (four 4-float vectors) are converted per iteration using the
 * vcvtnq_s32_f32 float-to-int32 conversion; the remaining points are
 * converted individually with rintf().
 *
 * \param outputVector Destination buffer for num_points int16_t values.
 * \param inputVector  Source buffer holding num_points floats.
 * \param scalar       Gain applied to each input prior to clipping.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_neonv8(int16_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const float lowLimit = SHRT_MIN;
    const float highLimit = SHRT_MAX;

    // Number of points covered by full groups of sixteen
    const unsigned int vectorizedPoints = (num_points / 16) * 16;

    const float32x4_t gain = vdupq_n_f32(scalar);
    const float32x4_t lowLimits = vdupq_n_f32(lowLimit);
    const float32x4_t highLimits = vdupq_n_f32(highLimit);

    unsigned int idx = 0;

    for (; idx < vectorizedPoints; idx += 16) {
        const float* src = inputVector + idx;
        int16_t* dst = outputVector + idx;

        float32x4_t quadA = vld1q_f32(src);
        float32x4_t quadB = vld1q_f32(src + 4);
        float32x4_t quadC = vld1q_f32(src + 8);
        float32x4_t quadD = vld1q_f32(src + 12);
        // Hint the start of the next group of sixteen
        __VOLK_PREFETCH(src + 16);

        // Scale, then clamp from above and from below
        quadA = vmaxq_f32(vminq_f32(vmulq_f32(quadA, gain), highLimits), lowLimits);
        quadB = vmaxq_f32(vminq_f32(vmulq_f32(quadB, gain), highLimits), lowLimits);
        quadC = vmaxq_f32(vminq_f32(vmulq_f32(quadC, gain), highLimits), lowLimits);
        quadD = vmaxq_f32(vminq_f32(vmulq_f32(quadD, gain), highLimits), lowLimits);

        // Round-to-nearest float -> int32, then saturating narrow to int16
        const int16x4_t narrowA = vqmovn_s32(vcvtnq_s32_f32(quadA));
        const int16x4_t narrowB = vqmovn_s32(vcvtnq_s32_f32(quadB));
        const int16x4_t narrowC = vqmovn_s32(vcvtnq_s32_f32(quadC));
        const int16x4_t narrowD = vqmovn_s32(vcvtnq_s32_f32(quadD));

        vst1q_s16(dst, vcombine_s16(narrowA, narrowB));
        vst1q_s16(dst + 8, vcombine_s16(narrowC, narrowD));
    }

    // Leftover points that did not fill a whole group
    for (; idx < num_points; idx++) {
        float value = inputVector[idx] * scalar;
        if (value > highLimit) {
            value = highLimit;
        } else if (value < lowLimit) {
            value = lowLimit;
        }
        outputVector[idx] = (int16_t)rintf(value);
    }
}
#endif /* LV_HAVE_NEONV8 */
800 
801 
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*!
 * \brief Scales floats by \p scalar and writes them as int16_t using the
 *        RISC-V vector extension.
 *
 * Each pass processes as many points as __riscv_vsetvl_e32m8 grants for the
 * remaining count. No explicit clipping step is performed here.
 * NOTE(review): presumably the narrowing convert itself saturates to the
 * int16_t range -- confirm against the RVV specification.
 *
 * \param outputVector Destination buffer for num_points int16_t values.
 * \param inputVector  Source buffer holding num_points floats.
 * \param scalar       Gain applied to each input before conversion.
 * \param num_points   Number of points to process.
 */
static inline void volk_32f_s32f_convert_16i_rvv(int16_t* outputVector,
                                                 const float* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        // Number of elements handled in this pass
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        vfloat32m8_t scaled = __riscv_vle32_v_f32m8(inputVector, vl);
        scaled = __riscv_vfmul(scaled, scalar, vl);

        // Narrowing float -> integer convert, stored as 16-bit elements
        __riscv_vse16(outputVector, __riscv_vfncvt_x(scaled, vl), vl);

        remaining -= vl;
        inputVector += vl;
        outputVector += vl;
    }
}
#endif /*LV_HAVE_RVV*/
819 
820 #endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
volk_32f_s32f_convert_16i_a_sse2
static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:554
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_s32f_convert_16i_u_sse
static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:295
volk_common.h
volk_32f_s32f_convert_16i_a_sse
static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:612
volk_32f_s32f_convert_16i_a_avx
static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:497
volk_32f_s32f_convert_16i_neon
static void volk_32f_s32f_convert_16i_neon(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:665
volk_32f_s32f_convert_16i_u_avx
static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:179
volk_32f_s32f_convert_16i_generic
static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:347
rintf
static float rintf(float x)
Definition: config.h:45
volk_32f_s32f_convert_16i_u_sse2
static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:237