Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar (AVX2).
 *
 * Unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 8) * 8;
    const __m256 reciprocal = _mm256_set1_ps(1.0 / scalar);

    // Vector body: eight samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 8) {
        const __m128i packed16 =
            _mm_loadu_si128((const __m128i*)(inputVector + idx));

        // Sign-extend int16 -> int32, then convert to float and scale.
        const __m256i widened32 = _mm256_cvtepi16_epi32(packed16);
        const __m256 scaled =
            _mm256_mul_ps(_mm256_cvtepi32_ps(widened32), reciprocal);

        _mm256_storeu_ps(outputVector + idx, scaled);
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
88 #endif /* LV_HAVE_AVX2 */
89 
90 #ifdef LV_HAVE_AVX512F
91 #include <immintrin.h>
92 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (AVX-512F).
 *
 * Unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_avx512(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 16) * 16;
    const __m512 reciprocal = _mm512_set1_ps(1.0 / scalar);

    // Vector body: sixteen samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 16) {
        const __m256i packed16 =
            _mm256_loadu_si256((const __m256i*)(inputVector + idx));

        // Sign-extend int16 -> int32, then convert to float and scale.
        const __m512i widened32 = _mm512_cvtepi16_epi32(packed16);
        const __m512 scaled =
            _mm512_mul_ps(_mm512_cvtepi32_ps(widened32), reciprocal);

        _mm512_storeu_ps(outputVector + idx, scaled);
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
129 #endif /* LV_HAVE_AVX512F */
130 
131 #ifdef LV_HAVE_AVX
132 #include <immintrin.h>
133 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar (AVX).
 *
 * The integer widening is done on two 128-bit halves; the two float halves
 * are then combined into one 256-bit register for a single unaligned store.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 8) * 8;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    // Vector body: eight samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 8) {
        const __m128i packed16 =
            _mm_loadu_si128((const __m128i*)(inputVector + idx));

        // Samples 0..3 sit in the low 8 bytes; an 8-byte right shift brings
        // samples 4..7 down so the same widening instruction handles them.
        const __m128i low32 = _mm_cvtepi16_epi32(packed16);
        const __m128i high32 = _mm_cvtepi16_epi32(_mm_srli_si128(packed16, 8));

        const __m128 lowScaled = _mm_mul_ps(_mm_cvtepi32_ps(low32), reciprocal);
        const __m128 highScaled = _mm_mul_ps(_mm_cvtepi32_ps(high32), reciprocal);

        // Assemble both halves into one 256-bit value.
        __m256 joined = _mm256_insertf128_ps(_mm256_setzero_ps(), lowScaled, 0);
        joined = _mm256_insertf128_ps(joined, highScaled, 1);

        _mm256_storeu_ps(outputVector + idx, joined);
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
183 #endif /* LV_HAVE_AVX */
184 
185 #ifdef LV_HAVE_SSE4_1
186 #include <smmintrin.h>
187 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (SSE4.1).
 *
 * Unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 8) * 8;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    // Vector body: eight samples per pass, written as two groups of four.
    for (unsigned int idx = 0; idx < simdEnd; idx += 8) {
        const __m128i packed16 =
            _mm_loadu_si128((const __m128i*)(inputVector + idx));

        // Samples 0..3 sit in the low 8 bytes; an 8-byte right shift brings
        // samples 4..7 down so the same widening instruction handles them.
        const __m128i low32 = _mm_cvtepi16_epi32(packed16);
        const __m128i high32 = _mm_cvtepi16_epi32(_mm_srli_si128(packed16, 8));

        _mm_storeu_ps(outputVector + idx,
                      _mm_mul_ps(_mm_cvtepi32_ps(low32), reciprocal));
        _mm_storeu_ps(outputVector + idx + 4,
                      _mm_mul_ps(_mm_cvtepi32_ps(high32), reciprocal));
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
234 #endif /* LV_HAVE_SSE4_1 */
235 
236 #ifdef LV_HAVE_SSE
237 #include <xmmintrin.h>
238 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar (SSE).
 *
 * The int16 -> float conversion is done with scalar casts; only the scaling
 * and the (unaligned) store are vectorized.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 4) * 4;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    // Vector body: four samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 4) {
        const int16_t* chunk = inputVector + idx;

        // _mm_set_ps takes its arguments highest lane first, so chunk[0]
        // ends up in lane 0 and memory order is preserved on store.
        const __m128 asFloat = _mm_set_ps((float)(chunk[3]),
                                          (float)(chunk[2]),
                                          (float)(chunk[1]),
                                          (float)(chunk[0]));

        _mm_storeu_ps(outputVector + idx, _mm_mul_ps(asFloat, reciprocal));
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) / scalar;
    }
}
270 #endif /* LV_HAVE_SSE */
271 
272 #ifdef LV_HAVE_GENERIC
273 
/*
 * Portable reference implementation: convert each 16-bit signed sample to
 * float and divide it by scalar.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
                                                     const int16_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
287 #endif /* LV_HAVE_GENERIC */
288 
289 #ifdef LV_HAVE_NEON
290 #include <arm_neon.h>
291 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar (NEON).
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
                                                  const int16_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 8) * 8;
    const float32x4_t reciprocal = vdupq_n_f32(1.0 / scalar);

    // Vector body: eight samples per pass. vld2 splits them into even- and
    // odd-indexed lanes and vst2q interleaves them again on the way out, so
    // the output keeps the input's element order.
    for (unsigned int idx = 0; idx < simdEnd; idx += 8) {
        const int16x4x2_t split16 = vld2_s16(inputVector + idx);
        float32x4x2_t scaled;

        // Widen int16 -> int32, convert to float, multiply by reciprocal.
        scaled.val[0] =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(split16.val[0])), reciprocal);
        scaled.val[1] =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(split16.val[1])), reciprocal);

        vst2q_f32(outputVector + idx, scaled);
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
333 #endif /* LV_HAVE_NEON */
334 
335 
336 #ifdef LV_HAVE_NEONV8
337 #include <arm_neon.h>
338 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (NEON, ARMv8 variant).
 *
 * Work is done in three stages: groups of eight, at most one group of four,
 * then a scalar loop for whatever is left.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_neonv8(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const float32x4_t reciprocal = vdupq_n_f32(1.0f / scalar);
    unsigned int idx = 0;

    // Eight samples per pass, fetched as two 64-bit loads.
    for (; num_points - idx >= 8; idx += 8) {
        const int16x4_t firstFour = vld1_s16(inputVector + idx);
        const int16x4_t secondFour = vld1_s16(inputVector + idx + 4);
        __VOLK_PREFETCH(inputVector + idx + 16);

        // Widen int16 -> int32, convert to float, multiply by reciprocal.
        vst1q_f32(outputVector + idx,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(firstFour)), reciprocal));
        vst1q_f32(outputVector + idx + 4,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(secondFour)), reciprocal));
    }

    // At most one further group of four.
    if (num_points - idx >= 4) {
        const int16x4_t four = vld1_s16(inputVector + idx);
        vst1q_f32(outputVector + idx,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(four)), reciprocal));
        idx += 4;
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
383 
384 #endif /* LV_HAVE_NEONV8 */
385 
386 
387 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
388 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
389 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
390 
391 #include <inttypes.h>
392 #include <stdio.h>
393 
394 #ifdef LV_HAVE_AVX2
395 #include <immintrin.h>
396 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (AVX2, aligned variant).
 *
 * The vector body uses _mm_load_si128 and _mm256_store_ps, so inputVector
 * must be 16-byte aligned and outputVector 32-byte aligned.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 8) * 8;
    const __m256 reciprocal = _mm256_set1_ps(1.0 / scalar);

    // Vector body: eight samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 8) {
        const __m128i packed16 =
            _mm_load_si128((const __m128i*)(inputVector + idx));

        // Sign-extend int16 -> int32, then convert to float and scale.
        const __m256i widened32 = _mm256_cvtepi16_epi32(packed16);
        const __m256 scaled =
            _mm256_mul_ps(_mm256_cvtepi32_ps(widened32), reciprocal);

        _mm256_store_ps(outputVector + idx, scaled);
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
435 #endif /* LV_HAVE_AVX2 */
436 
437 #ifdef LV_HAVE_AVX512F
438 #include <immintrin.h>
439 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (AVX-512F, aligned variant).
 *
 * The vector body uses _mm256_load_si256 and _mm512_store_ps, so
 * inputVector must be 32-byte aligned and outputVector 64-byte aligned.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_avx512(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 16) * 16;
    const __m512 reciprocal = _mm512_set1_ps(1.0 / scalar);

    // Vector body: sixteen samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 16) {
        const __m256i packed16 =
            _mm256_load_si256((const __m256i*)(inputVector + idx));

        // Sign-extend int16 -> int32, then convert to float and scale.
        const __m512i widened32 = _mm512_cvtepi16_epi32(packed16);
        const __m512 scaled =
            _mm512_mul_ps(_mm512_cvtepi32_ps(widened32), reciprocal);

        _mm512_store_ps(outputVector + idx, scaled);
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
476 #endif /* LV_HAVE_AVX512F */
477 
478 #ifdef LV_HAVE_AVX
479 #include <immintrin.h>
480 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (AVX, aligned variant).
 *
 * The vector body uses _mm_load_si128 and _mm256_store_ps, so inputVector
 * must be 16-byte aligned and outputVector 32-byte aligned. The integer
 * widening is done on two 128-bit halves that are combined before the store.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 8) * 8;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    // Vector body: eight samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 8) {
        const __m128i packed16 =
            _mm_load_si128((const __m128i*)(inputVector + idx));

        // Samples 0..3 sit in the low 8 bytes; an 8-byte right shift brings
        // samples 4..7 down so the same widening instruction handles them.
        const __m128i low32 = _mm_cvtepi16_epi32(packed16);
        const __m128i high32 = _mm_cvtepi16_epi32(_mm_srli_si128(packed16, 8));

        const __m128 lowScaled = _mm_mul_ps(_mm_cvtepi32_ps(low32), reciprocal);
        const __m128 highScaled = _mm_mul_ps(_mm_cvtepi32_ps(high32), reciprocal);

        // Assemble both halves into one 256-bit value.
        __m256 joined = _mm256_insertf128_ps(_mm256_setzero_ps(), lowScaled, 0);
        joined = _mm256_insertf128_ps(joined, highScaled, 1);

        _mm256_store_ps(outputVector + idx, joined);
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
530 #endif /* LV_HAVE_AVX */
531 
532 #ifdef LV_HAVE_SSE4_1
533 #include <smmintrin.h>
534 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (SSE4.1, "aligned" variant).
 *
 * NOTE: despite the _a_ name, this body uses the unaligned intrinsics
 * _mm_loadu_si128 and _mm_storeu_ps, so it does not itself depend on buffer
 * alignment.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 8) * 8;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    // Vector body: eight samples per pass, written as two groups of four.
    for (unsigned int idx = 0; idx < simdEnd; idx += 8) {
        const __m128i packed16 =
            _mm_loadu_si128((const __m128i*)(inputVector + idx));

        // Samples 0..3 sit in the low 8 bytes; an 8-byte right shift brings
        // samples 4..7 down so the same widening instruction handles them.
        const __m128i low32 = _mm_cvtepi16_epi32(packed16);
        const __m128i high32 = _mm_cvtepi16_epi32(_mm_srli_si128(packed16, 8));

        _mm_storeu_ps(outputVector + idx,
                      _mm_mul_ps(_mm_cvtepi32_ps(low32), reciprocal));
        _mm_storeu_ps(outputVector + idx + 4,
                      _mm_mul_ps(_mm_cvtepi32_ps(high32), reciprocal));
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
581 #endif /* LV_HAVE_SSE4_1 */
582 
583 #ifdef LV_HAVE_SSE
584 #include <xmmintrin.h>
585 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (SSE, "aligned" variant).
 *
 * NOTE: despite the _a_ name, this body reads the input with scalar
 * accesses and writes with the unaligned intrinsic _mm_storeu_ps, so it does
 * not itself depend on buffer alignment.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int simdEnd = (num_points / 4) * 4;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    // Vector body: four samples per pass, multiplied by the reciprocal.
    for (unsigned int idx = 0; idx < simdEnd; idx += 4) {
        const int16_t* chunk = inputVector + idx;

        // _mm_set_ps takes its arguments highest lane first, so chunk[0]
        // ends up in lane 0 and memory order is preserved on store.
        const __m128 asFloat = _mm_set_ps((float)(chunk[3]),
                                          (float)(chunk[2]),
                                          (float)(chunk[1]),
                                          (float)(chunk[0]));

        _mm_storeu_ps(outputVector + idx, _mm_mul_ps(asFloat, reciprocal));
    }

    // Scalar tail: leftover samples are divided by scalar directly.
    for (unsigned int idx = simdEnd; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) / scalar;
    }
}
617 #endif /* LV_HAVE_SSE */
618 
619 #ifdef LV_HAVE_RVV
620 #include <riscv_vector.h>
621 
/*
 * Convert 16-bit signed samples to float and scale them by 1/scalar
 * (RISC-V vector extension).
 *
 * Each pass asks the hardware how many elements it will take (vl) and then
 * advances by that amount, so there is no separate scalar tail.
 *
 * outputVector  destination for num_points floats
 * inputVector   num_points int16_t samples to convert
 * scalar        divisor applied to every converted sample
 * num_points    number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_rvv(float* outputVector,
                                                 const int16_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const int16_t* src = inputVector;
    float* dst = outputVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        // Number of 16-bit elements handled in this pass.
        const size_t vl = __riscv_vsetvl_e16m4(remaining);

        // Load int16 lanes and widen-convert them to 32-bit floats.
        vint16m4_t raw16 = __riscv_vle16_v_i16m4(src, vl);
        vfloat32m8_t asFloat = __riscv_vfwcvt_f(raw16, vl);

        // Scale by the reciprocal and write the results out.
        __riscv_vse32(dst, __riscv_vfmul(asFloat, 1.0f / scalar, vl), vl);

        src += vl;
        dst += vl;
        remaining -= vl;
    }
}
634 #endif /*LV_HAVE_RVV*/
635 
636 #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
volk_16i_s32f_convert_32f_u_sse
static void volk_16i_s32f_convert_32f_u_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:239
volk_16i_s32f_convert_32f_a_sse
static void volk_16i_s32f_convert_32f_a_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:586
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_16i_s32f_convert_32f_u_avx
static void volk_16i_s32f_convert_32f_u_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:134
volk_16i_s32f_convert_32f_generic
static void volk_16i_s32f_convert_32f_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:274
volk_16i_s32f_convert_32f_a_avx
static void volk_16i_s32f_convert_32f_a_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:481
volk_16i_s32f_convert_32f_neon
static void volk_16i_s32f_convert_32f_neon(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:292