Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_8i_s32f_convert_32f_u_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (AVX2 implementation using unaligned loads and stores).
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const float iScalar = 1.0 / scalar;
    const __m256 invScalar = _mm256_set1_ps(iScalar);

    const int8_t* inputVectorPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* The load only reads, so the const qualifier is kept on the cast. */
        const __m128i lowerBytes = _mm_loadu_si128((const __m128i*)inputVectorPtr);
        /* Shift bytes 8..15 down so the sign-extending convert can reach them. */
        const __m128i upperBytes = _mm_srli_si128(lowerBytes, 8);

        const __m256 lowerFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(lowerBytes));
        const __m256 upperFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(upperBytes));

        _mm256_storeu_ps(outputVectorPtr, _mm256_mul_ps(lowerFloats, invScalar));
        _mm256_storeu_ps(outputVectorPtr + 8, _mm256_mul_ps(upperFloats, invScalar));

        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole 16-element group. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
90 #endif /* LV_HAVE_AVX2 */
91 
92 #ifdef LV_HAVE_AVX512F
93 #include <immintrin.h>
94 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (AVX-512F implementation using unaligned loads and stores).
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_u_avx512(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const float iScalar = 1.0 / scalar;
    const __m512 invScalar = _mm512_set1_ps(iScalar);

    const int8_t* inputVectorPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* The load only reads, so the const qualifier is kept on the cast. */
        const __m128i inputVal128 = _mm_loadu_si128((const __m128i*)inputVectorPtr);

        /* All 16 bytes are sign-extended to 32-bit lanes in one step. */
        const __m512 converted = _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(inputVal128));
        _mm512_storeu_ps(outputVectorPtr, _mm512_mul_ps(converted, invScalar));

        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole 16-element group. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
128 #endif /* LV_HAVE_AVX512F */
129 
130 
131 #ifdef LV_HAVE_SSE4_1
132 #include <smmintrin.h>
133 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (SSE4.1 implementation using unaligned loads and stores).
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const float iScalar = 1.0 / scalar;
    const __m128 invScalar = _mm_set_ps1(iScalar);

    const int8_t* inputVectorPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* The load only reads, so the const qualifier is kept on the cast. */
        __m128i inputVal = _mm_loadu_si128((const __m128i*)inputVectorPtr);
        __m128 ret;

        /* Samples 0..3: the convert sign-extends the four lowest bytes. */
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(ret, invScalar));

        /* Samples 4..7: shift the next four bytes into the low positions. */
        inputVal = _mm_srli_si128(inputVal, 4);
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_storeu_ps(outputVectorPtr + 4, _mm_mul_ps(ret, invScalar));

        /* Samples 8..11. */
        inputVal = _mm_srli_si128(inputVal, 4);
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_storeu_ps(outputVectorPtr + 8, _mm_mul_ps(ret, invScalar));

        /* Samples 12..15. */
        inputVal = _mm_srli_si128(inputVal, 4);
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_storeu_ps(outputVectorPtr + 12, _mm_mul_ps(ret, invScalar));

        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole 16-element group. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
188 #endif /* LV_HAVE_SSE4_1 */
189 
190 #ifdef LV_HAVE_GENERIC
191 
/*
 * Portable reference implementation: converts each signed 8-bit sample to a
 * float and multiplies it by the reciprocal of `scalar`.
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* One division up front; every sample then costs a single multiply. */
    const float reciprocal = 1.0 / scalar;

    for (unsigned int idx = 0; idx < num_points; idx++) {
        outputVector[idx] = (float)inputVector[idx] * reciprocal;
    }
}
206 #endif /* LV_HAVE_GENERIC */
207 
208 
209 #endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
210 
211 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
212 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
213 
214 #include <inttypes.h>
215 #include <stdio.h>
216 
217 #ifdef LV_HAVE_AVX2
218 #include <immintrin.h>
219 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (AVX2 implementation using aligned loads and stores).
 *
 * The vector loop uses _mm_load_si128 and _mm256_store_ps, so inputVector must
 * be 16-byte aligned and outputVector 32-byte aligned.
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const float iScalar = 1.0 / scalar;
    const __m256 invScalar = _mm256_set1_ps(iScalar);

    const int8_t* inputVectorPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* The load only reads, so the const qualifier is kept on the cast. */
        const __m128i lowerBytes = _mm_load_si128((const __m128i*)inputVectorPtr);
        /* Shift bytes 8..15 down so the sign-extending convert can reach them. */
        const __m128i upperBytes = _mm_srli_si128(lowerBytes, 8);

        const __m256 lowerFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(lowerBytes));
        const __m256 upperFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(upperBytes));

        _mm256_store_ps(outputVectorPtr, _mm256_mul_ps(lowerFloats, invScalar));
        _mm256_store_ps(outputVectorPtr + 8, _mm256_mul_ps(upperFloats, invScalar));

        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole 16-element group. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
260 #endif /* LV_HAVE_AVX2 */
261 
262 #ifdef LV_HAVE_AVX512F
263 #include <immintrin.h>
264 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (AVX-512F implementation using aligned loads and stores).
 *
 * The vector loop uses _mm_load_si128 and _mm512_store_ps, so inputVector must
 * be 16-byte aligned and outputVector 64-byte aligned.
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_avx512(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const float iScalar = 1.0 / scalar;
    const __m512 invScalar = _mm512_set1_ps(iScalar);

    const int8_t* inputVectorPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* The load only reads, so the const qualifier is kept on the cast. */
        const __m128i inputVal128 = _mm_load_si128((const __m128i*)inputVectorPtr);

        /* All 16 bytes are sign-extended to 32-bit lanes in one step. */
        const __m512 converted = _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(inputVal128));
        _mm512_store_ps(outputVectorPtr, _mm512_mul_ps(converted, invScalar));

        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole 16-element group. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
298 #endif /* LV_HAVE_AVX512F */
299 
300 #ifdef LV_HAVE_SSE4_1
301 #include <smmintrin.h>
302 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (SSE4.1 implementation using aligned loads and stores).
 *
 * The vector loop uses _mm_load_si128 and _mm_store_ps, so both inputVector
 * and outputVector must be 16-byte aligned.
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const float iScalar = 1.0 / scalar;
    const __m128 invScalar = _mm_set_ps1(iScalar);

    const int8_t* inputVectorPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* The load only reads, so the const qualifier is kept on the cast. */
        __m128i inputVal = _mm_load_si128((const __m128i*)inputVectorPtr);
        __m128 ret;

        /* Samples 0..3: the convert sign-extends the four lowest bytes. */
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_store_ps(outputVectorPtr, _mm_mul_ps(ret, invScalar));

        /* Samples 4..7: shift the next four bytes into the low positions. */
        inputVal = _mm_srli_si128(inputVal, 4);
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_store_ps(outputVectorPtr + 4, _mm_mul_ps(ret, invScalar));

        /* Samples 8..11. */
        inputVal = _mm_srli_si128(inputVal, 4);
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_store_ps(outputVectorPtr + 8, _mm_mul_ps(ret, invScalar));

        /* Samples 12..15. */
        inputVal = _mm_srli_si128(inputVal, 4);
        ret = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(inputVal));
        _mm_store_ps(outputVectorPtr + 12, _mm_mul_ps(ret, invScalar));

        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a whole 16-element group. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
357 #endif /* LV_HAVE_SSE4_1 */
358 
359 #ifdef LV_HAVE_NEON
360 #include <arm_neon.h>
361 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (NEON implementation, 16 samples per iteration).
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);
    const unsigned int sixteenthPoints = num_points / 16;

    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int group = 0; group < sixteenthPoints; group++) {
        const int8x16_t bytes = vld1q_s8(src);

        /* Widen in two steps: int8 -> int16, then int16 -> int32. */
        const int16x8_t lower = vmovl_s8(vget_low_s8(bytes));
        const int16x8_t higher = vmovl_s8(vget_high_s8(bytes));

        const int32x4_t quad0 = vmovl_s16(vget_low_s16(lower));
        const int32x4_t quad1 = vmovl_s16(vget_high_s16(lower));
        const int32x4_t quad2 = vmovl_s16(vget_low_s16(higher));
        const int32x4_t quad3 = vmovl_s16(vget_high_s16(higher));

        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(quad0), qiScalar));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(quad1), qiScalar));
        vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(quad2), qiScalar));
        vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(quad3), qiScalar));

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a whole 16-element group. */
    for (unsigned int idx = sixteenthPoints * 16; idx < num_points; idx++) {
        outputVector[idx] = (float)inputVector[idx] * iScalar;
    }
}
410 
411 #endif /* LV_HAVE_NEON */
412 
413 #ifdef LV_HAVE_NEONV8
414 #include <arm_neon.h>
415 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (NEONv8 implementation, 32 samples per iteration).
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_neonv8(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float iScalar = 1.0f / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);
    const unsigned int thirtysecondPoints = num_points / 32;

    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int group = 0; group < thirtysecondPoints; group++) {
        const int8x16_t bytes0 = vld1q_s8(src);
        const int8x16_t bytes1 = vld1q_s8(src + 16);
        __VOLK_PREFETCH(src + 64);

        /* First widening step: int8 -> int16. */
        const int16x8_t lo0 = vmovl_s8(vget_low_s8(bytes0));
        const int16x8_t hi0 = vmovl_s8(vget_high_s8(bytes0));
        const int16x8_t lo1 = vmovl_s8(vget_low_s8(bytes1));
        const int16x8_t hi1 = vmovl_s8(vget_high_s8(bytes1));

        /* Second widening step: int16 -> int32, four lanes at a time. */
        const int32x4_t quad0 = vmovl_s16(vget_low_s16(lo0));
        const int32x4_t quad1 = vmovl_s16(vget_high_s16(lo0));
        const int32x4_t quad2 = vmovl_s16(vget_low_s16(hi0));
        const int32x4_t quad3 = vmovl_s16(vget_high_s16(hi0));
        const int32x4_t quad4 = vmovl_s16(vget_low_s16(lo1));
        const int32x4_t quad5 = vmovl_s16(vget_high_s16(lo1));
        const int32x4_t quad6 = vmovl_s16(vget_low_s16(hi1));
        const int32x4_t quad7 = vmovl_s16(vget_high_s16(hi1));

        /* Convert to float, scale, and store in sample order. */
        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(quad0), qiScalar));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(quad1), qiScalar));
        vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(quad2), qiScalar));
        vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(quad3), qiScalar));
        vst1q_f32(dst + 16, vmulq_f32(vcvtq_f32_s32(quad4), qiScalar));
        vst1q_f32(dst + 20, vmulq_f32(vcvtq_f32_s32(quad5), qiScalar));
        vst1q_f32(dst + 24, vmulq_f32(vcvtq_f32_s32(quad6), qiScalar));
        vst1q_f32(dst + 28, vmulq_f32(vcvtq_f32_s32(quad7), qiScalar));

        src += 32;
        dst += 32;
    }

    /* Scalar tail for the samples that do not fill a whole 32-element group. */
    for (unsigned int idx = thirtysecondPoints * 32; idx < num_points; idx++) {
        outputVector[idx] = (float)inputVector[idx] * iScalar;
    }
}
463 #endif /* LV_HAVE_NEONV8 */
464 
465 #ifdef LV_HAVE_ORC
/* ORC kernel implemented outside this header; the wrapper below passes it the
 * reciprocal of the divisor as its `scalar` argument. */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                int num_points);

/*
 * ORC entry point: computes the reciprocal of `scalar` and forwards all
 * arguments to volk_8i_s32f_convert_32f_a_orc_impl.
 *
 * outputVector: destination buffer handed to the ORC kernel.
 * inputVector:  source buffer handed to the ORC kernel.
 * scalar:       divisor; its reciprocal is what the kernel receives.
 * num_points:   number of samples, forwarded to the kernel's int parameter.
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const float reciprocal = 1.0 / scalar;
    volk_8i_s32f_convert_32f_a_orc_impl(
        outputVector, inputVector, reciprocal, num_points);
}
479 #endif /* LV_HAVE_ORC */
480 
481 #ifdef LV_HAVE_RVV
482 #include <riscv_vector.h>
483 
/*
 * Converts signed 8-bit samples to 32-bit floats and divides each by `scalar`
 * (RISC-V Vector implementation).
 *
 * outputVector: destination; receives num_points floats.
 * inputVector:  source; num_points int8_t samples.
 * scalar:       divisor, applied as a multiplication by its reciprocal.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_rvv(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* vsetvl reports how many elements this pass handles. */
        size_t vl = __riscv_vsetvl_e8m2(remaining);

        /* Widen int8 -> int16, then a widening convert produces float32. */
        vint16m4_t widened =
            __riscv_vsext_vf2(__riscv_vle8_v_i8m2(inputVector, vl), vl);
        __riscv_vse32(outputVector,
                      __riscv_vfmul(__riscv_vfwcvt_f(widened, vl), 1.0f / scalar, vl),
                      vl);

        remaining -= vl;
        inputVector += vl;
        outputVector += vl;
    }
}
497 #endif /*LV_HAVE_RVV*/
498 
499 #endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
volk_8i_s32f_convert_32f_neon
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:362
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_8i_s32f_convert_32f_generic
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:192
bit128::f
float f[4]
Definition: volk_common.h:120