Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
58 #define INCLUDED_volk_32f_s32f_convert_32i_u_H
59 
60 #include <inttypes.h>
61 #include <limits.h>
62 #include <stdio.h>
63 
64 #ifdef LV_HAVE_AVX
65 #include <immintrin.h>
66 
67 static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
68  const float* inputVector,
69  const float scalar,
70  unsigned int num_points)
71 {
72  unsigned int number = 0;
73 
74  const unsigned int eighthPoints = num_points / 8;
75 
76  const float* inputVectorPtr = (const float*)inputVector;
77  int32_t* outputVectorPtr = outputVector;
78 
79  float min_val = INT_MIN;
80  float max_val = (uint32_t)INT_MAX + 1;
81  float r;
82 
83  __m256 vScalar = _mm256_set1_ps(scalar);
84  __m256 inputVal1;
85  __m256i intInputVal1;
86  __m256 vmin_val = _mm256_set1_ps(min_val);
87  __m256 vmax_val = _mm256_set1_ps(max_val);
88 
89  for (; number < eighthPoints; number++) {
90  inputVal1 = _mm256_loadu_ps(inputVectorPtr);
91  inputVectorPtr += 8;
92 
93  inputVal1 = _mm256_max_ps(
94  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
95  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
96 
97  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
98  outputVectorPtr += 8;
99  }
100 
101  number = eighthPoints * 8;
102  for (; number < num_points; number++) {
103  r = inputVector[number] * scalar;
104  if (r > max_val)
105  r = max_val;
106  else if (r < min_val)
107  r = min_val;
108  outputVector[number] = (int32_t)rintf(r);
109  }
110 }
111 
112 #endif /* LV_HAVE_AVX */
113 
114 #ifdef LV_HAVE_SSE2
115 #include <emmintrin.h>
116 
117 static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
118  const float* inputVector,
119  const float scalar,
120  unsigned int num_points)
121 {
122  unsigned int number = 0;
123 
124  const unsigned int quarterPoints = num_points / 4;
125 
126  const float* inputVectorPtr = (const float*)inputVector;
127  int32_t* outputVectorPtr = outputVector;
128 
129  float min_val = INT_MIN;
130  float max_val = (uint32_t)INT_MAX + 1;
131  float r;
132 
133  __m128 vScalar = _mm_set_ps1(scalar);
134  __m128 inputVal1;
135  __m128i intInputVal1;
136  __m128 vmin_val = _mm_set_ps1(min_val);
137  __m128 vmax_val = _mm_set_ps1(max_val);
138 
139  for (; number < quarterPoints; number++) {
140  inputVal1 = _mm_loadu_ps(inputVectorPtr);
141  inputVectorPtr += 4;
142 
143  inputVal1 =
144  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
145  intInputVal1 = _mm_cvtps_epi32(inputVal1);
146 
147  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
148  outputVectorPtr += 4;
149  }
150 
151  number = quarterPoints * 4;
152  for (; number < num_points; number++) {
153  r = inputVector[number] * scalar;
154  if (r > max_val)
155  r = max_val;
156  else if (r < min_val)
157  r = min_val;
158  outputVector[number] = (int32_t)rintf(r);
159  }
160 }
161 
162 #endif /* LV_HAVE_SSE2 */
163 
164 
165 #ifdef LV_HAVE_SSE
166 #include <xmmintrin.h>
167 
168 static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
169  const float* inputVector,
170  const float scalar,
171  unsigned int num_points)
172 {
173  unsigned int number = 0;
174 
175  const unsigned int quarterPoints = num_points / 4;
176 
177  const float* inputVectorPtr = (const float*)inputVector;
178  int32_t* outputVectorPtr = outputVector;
179 
180  float min_val = INT_MIN;
181  float max_val = (uint32_t)INT_MAX + 1;
182  float r;
183 
184  __m128 vScalar = _mm_set_ps1(scalar);
185  __m128 ret;
186  __m128 vmin_val = _mm_set_ps1(min_val);
187  __m128 vmax_val = _mm_set_ps1(max_val);
188 
189  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
190 
191  for (; number < quarterPoints; number++) {
192  ret = _mm_loadu_ps(inputVectorPtr);
193  inputVectorPtr += 4;
194 
195  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
196 
197  _mm_store_ps(outputFloatBuffer, ret);
198  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
199  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
200  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
201  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
202  }
203 
204  number = quarterPoints * 4;
205  for (; number < num_points; number++) {
206  r = inputVector[number] * scalar;
207  if (r > max_val)
208  r = max_val;
209  else if (r < min_val)
210  r = min_val;
211  outputVector[number] = (int32_t)rintf(r);
212  }
213 }
214 
215 #endif /* LV_HAVE_SSE */
216 
217 
218 #ifdef LV_HAVE_GENERIC
219 
220 static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
221  const float* inputVector,
222  const float scalar,
223  unsigned int num_points)
224 {
225  int32_t* outputVectorPtr = outputVector;
226  const float* inputVectorPtr = inputVector;
227  const float min_val = (float)INT_MIN;
228  const float max_val = (float)((uint32_t)INT_MAX + 1);
229 
230  for (unsigned int number = 0; number < num_points; number++) {
231  const float r = *inputVectorPtr++ * scalar;
232  int s;
233  if (r >= max_val)
234  s = INT_MAX;
235  else if (r < min_val)
236  s = INT_MIN;
237  else
238  s = (int32_t)rintf(r);
239  *outputVectorPtr++ = s;
240  }
241 }
242 
243 #endif /* LV_HAVE_GENERIC */
244 
245 
246 #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
247 #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
248 #define INCLUDED_volk_32f_s32f_convert_32i_a_H
249 
250 #include <inttypes.h>
251 #include <stdio.h>
252 #include <volk/volk_common.h>
253 
254 #ifdef LV_HAVE_AVX
255 #include <immintrin.h>
256 
257 static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
258  const float* inputVector,
259  const float scalar,
260  unsigned int num_points)
261 {
262  unsigned int number = 0;
263 
264  const unsigned int eighthPoints = num_points / 8;
265 
266  const float* inputVectorPtr = (const float*)inputVector;
267  int32_t* outputVectorPtr = outputVector;
268 
269  float min_val = INT_MIN;
270  float max_val = (uint32_t)INT_MAX + 1;
271  float r;
272 
273  __m256 vScalar = _mm256_set1_ps(scalar);
274  __m256 inputVal1;
275  __m256i intInputVal1;
276  __m256 vmin_val = _mm256_set1_ps(min_val);
277  __m256 vmax_val = _mm256_set1_ps(max_val);
278 
279  for (; number < eighthPoints; number++) {
280  inputVal1 = _mm256_load_ps(inputVectorPtr);
281  inputVectorPtr += 8;
282 
283  inputVal1 = _mm256_max_ps(
284  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
285  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
286 
287  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
288  outputVectorPtr += 8;
289  }
290 
291  number = eighthPoints * 8;
292  for (; number < num_points; number++) {
293  r = inputVector[number] * scalar;
294  if (r > max_val)
295  r = max_val;
296  else if (r < min_val)
297  r = min_val;
298  outputVector[number] = (int32_t)rintf(r);
299  }
300 }
301 
302 #endif /* LV_HAVE_AVX */
303 
304 
305 #ifdef LV_HAVE_SSE2
306 #include <emmintrin.h>
307 
308 static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
309  const float* inputVector,
310  const float scalar,
311  unsigned int num_points)
312 {
313  unsigned int number = 0;
314 
315  const unsigned int quarterPoints = num_points / 4;
316 
317  const float* inputVectorPtr = (const float*)inputVector;
318  int32_t* outputVectorPtr = outputVector;
319 
320  float min_val = INT_MIN;
321  float max_val = (uint32_t)INT_MAX + 1;
322  float r;
323 
324  __m128 vScalar = _mm_set_ps1(scalar);
325  __m128 inputVal1;
326  __m128i intInputVal1;
327  __m128 vmin_val = _mm_set_ps1(min_val);
328  __m128 vmax_val = _mm_set_ps1(max_val);
329 
330  for (; number < quarterPoints; number++) {
331  inputVal1 = _mm_load_ps(inputVectorPtr);
332  inputVectorPtr += 4;
333 
334  inputVal1 =
335  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
336  intInputVal1 = _mm_cvtps_epi32(inputVal1);
337 
338  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
339  outputVectorPtr += 4;
340  }
341 
342  number = quarterPoints * 4;
343  for (; number < num_points; number++) {
344  r = inputVector[number] * scalar;
345  if (r > max_val)
346  r = max_val;
347  else if (r < min_val)
348  r = min_val;
349  outputVector[number] = (int32_t)rintf(r);
350  }
351 }
352 
353 #endif /* LV_HAVE_SSE2 */
354 
355 
356 #ifdef LV_HAVE_SSE
357 #include <xmmintrin.h>
358 
359 static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
360  const float* inputVector,
361  const float scalar,
362  unsigned int num_points)
363 {
364  unsigned int number = 0;
365 
366  const unsigned int quarterPoints = num_points / 4;
367 
368  const float* inputVectorPtr = (const float*)inputVector;
369  int32_t* outputVectorPtr = outputVector;
370 
371  float min_val = INT_MIN;
372  float max_val = (uint32_t)INT_MAX + 1;
373  float r;
374 
375  __m128 vScalar = _mm_set_ps1(scalar);
376  __m128 ret;
377  __m128 vmin_val = _mm_set_ps1(min_val);
378  __m128 vmax_val = _mm_set_ps1(max_val);
379 
380  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
381 
382  for (; number < quarterPoints; number++) {
383  ret = _mm_load_ps(inputVectorPtr);
384  inputVectorPtr += 4;
385 
386  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
387 
388  _mm_store_ps(outputFloatBuffer, ret);
389  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
390  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
391  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
392  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
393  }
394 
395  number = quarterPoints * 4;
396  for (; number < num_points; number++) {
397  r = inputVector[number] * scalar;
398  if (r > max_val)
399  r = max_val;
400  else if (r < min_val)
401  r = min_val;
402  outputVector[number] = (int32_t)rintf(r);
403  }
404 }
405 
406 #endif /* LV_HAVE_SSE */
407 
408 #ifdef LV_HAVE_NEON
409 #include <arm_neon.h>
410 
/*
 * Multiplies every input float by `scalar`, rounds it to the nearest integer
 * (ties to even) and stores it as int32_t (ARM NEON).
 *
 * outputVector: receives num_points converted values.
 * inputVector:  num_points floats to convert.
 * scalar:       factor applied to each input before conversion.
 * num_points:   number of elements to process.
 *
 * The vector loop rounds ties to even so that it agrees with the rintf()-based
 * scalar tail under the default rounding mode. The earlier "add +/-0.5 and
 * truncate" approach rounded ties away from zero and could be off by one for
 * values just below .5 and for odd integers of magnitude 2^23 or more.
 */
static inline void volk_32f_s32f_convert_32i_neon(int32_t* outputVector,
                                                  const float* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;

    const float* inputPtr = inputVector;
    int32_t* outputPtr = outputVector;

    const float min_val = (float)INT_MIN;
    const float max_val = (float)((uint32_t)INT_MAX + 1);

    const float32x4_t vScalar = vdupq_n_f32(scalar);
    const float32x4_t vmin_val = vdupq_n_f32(min_val);
    const float32x4_t vmax_val = vdupq_n_f32(max_val);
    /* 2^23: from this magnitude on every float is already an integer. */
    const float32x4_t vTwo23 = vdupq_n_f32(8388608.0f);
    const uint32x4_t vSignMask = vdupq_n_u32(0x80000000u);

    for (unsigned int number = 0; number < quarter_points; number++) {
        float32x4_t inputVal = vld1q_f32(inputPtr);
        inputVal = vmulq_f32(inputVal, vScalar);
        inputVal = vmaxq_f32(vminq_f32(inputVal, vmax_val), vmin_val);

        /* Round |x| to nearest-even: adding 2^23 pushes the fraction bits out
         * of the mantissa (the add itself rounds to nearest), and subtracting
         * it again leaves the rounded integer. Only valid for |x| < 2^23;
         * larger magnitudes are passed through unchanged. */
        const uint32x4_t signBit = vandq_u32(vreinterpretq_u32_f32(inputVal), vSignMask);
        const float32x4_t absVal = vabsq_f32(inputVal);
        const float32x4_t roundedAbs = vsubq_f32(vaddq_f32(absVal, vTwo23), vTwo23);
        const uint32x4_t hasFraction = vcltq_f32(absVal, vTwo23);
        const float32x4_t absResult = vbslq_f32(hasFraction, roundedAbs, absVal);
        /* Re-attach the original sign. */
        inputVal = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(absResult), signBit));

        /* The value is integral now, so the truncating conversion is exact. */
        const int32x4_t intVal = vcvtq_s32_f32(inputVal);
        vst1q_s32(outputPtr, intVal);
        inputPtr += 4;
        outputPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) elements. */
    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
        const float r = *inputPtr++ * scalar;
        if (r >= max_val)
            *outputPtr++ = INT_MAX;
        else if (r < min_val)
            *outputPtr++ = INT_MIN;
        else
            *outputPtr++ = (int32_t)rintf(r);
    }
}
456 #endif /* LV_HAVE_NEON */
457 
458 #ifdef LV_HAVE_NEONV8
459 #include <arm_neon.h>
460 
/*
 * Multiplies every input float by `scalar`, converts it with
 * vcvtnq_s32_f32 and stores it as int32_t (ARMv8 NEON).
 *
 * Eight elements (two 128-bit vectors) are handled per iteration; leftovers go
 * through a scalar loop that uses rintf() with explicit INT_MAX / INT_MIN
 * limits.
 *
 * outputVector: receives num_points converted values.
 * inputVector:  num_points floats to convert.
 * scalar:       factor applied to each input before conversion.
 * num_points:   number of elements to process.
 */
static inline void volk_32f_s32f_convert_32i_neonv8(int32_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int block_count = num_points / 8;
    const unsigned int vectorised = block_count * 8;

    const float lower_bound = (float)INT_MIN;
    const float upper_bound = (float)((uint32_t)INT_MAX + 1);

    const float32x4_t scale = vdupq_n_f32(scalar);
    const float32x4_t lo = vdupq_n_f32(lower_bound);
    const float32x4_t hi = vdupq_n_f32(upper_bound);

    const float* src = inputVector;
    int32_t* dst = outputVector;

    for (unsigned int block = 0; block < block_count; block++) {
        const float32x4_t raw_a = vld1q_f32(src);
        const float32x4_t raw_b = vld1q_f32(src + 4);
        /* Hint the next block of input while this one is being processed. */
        __VOLK_PREFETCH(src + 8);

        const float32x4_t scaled_a = vmulq_f32(raw_a, scale);
        const float32x4_t scaled_b = vmulq_f32(raw_b, scale);

        const float32x4_t clamped_a = vmaxq_f32(vminq_f32(scaled_a, hi), lo);
        const float32x4_t clamped_b = vmaxq_f32(vminq_f32(scaled_b, hi), lo);

        vst1q_s32(dst, vcvtnq_s32_f32(clamped_a));
        vst1q_s32(dst + 4, vcvtnq_s32_f32(clamped_b));

        src += 8;
        dst += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    for (unsigned int idx = vectorised; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        if (scaled >= upper_bound) {
            outputVector[idx] = INT_MAX;
        } else if (scaled < lower_bound) {
            outputVector[idx] = INT_MIN;
        } else {
            outputVector[idx] = (int32_t)rintf(scaled);
        }
    }
}
509 #endif /* LV_HAVE_NEONV8 */
510 
511 #ifdef LV_HAVE_RVV
512 #include <riscv_vector.h>
513 
/*
 * Multiplies every input float by `scalar` and converts it to int32_t using
 * the RISC-V vector extension (LMUL = 8).
 *
 * Each pass asks the hardware how many 32-bit elements it will handle (vl),
 * processes exactly that many, and repeats until nothing is left, so no
 * separate scalar tail is needed.
 *
 * outputVector: receives num_points converted values.
 * inputVector:  num_points floats to convert.
 * scalar:       factor applied to each input before conversion.
 * num_points:   number of elements to process.
 */
static inline void volk_32f_s32f_convert_32i_rvv(int32_t* outputVector,
                                                 const float* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const float* src = inputVector;
    int32_t* dst = outputVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        const vfloat32m8_t loaded = __riscv_vle32_v_f32m8(src, vl);
        const vfloat32m8_t scaled = __riscv_vfmul_vf_f32m8(loaded, scalar, vl);
        const vint32m8_t converted = __riscv_vfcvt_x_f_v_i32m8(scaled, vl);
        __riscv_vse32_v_i32m8(dst, converted, vl);

        src += vl;
        dst += vl;
        remaining -= vl;
    }
}
527 #endif /*LV_HAVE_RVV*/
528 
529 #endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
volk_32f_s32f_convert_32i_a_sse
static void volk_32f_s32f_convert_32i_a_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:359
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_common.h
volk_32f_s32f_convert_32i_neon
static void volk_32f_s32f_convert_32i_neon(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:411
volk_32f_s32f_convert_32i_u_avx
static void volk_32f_s32f_convert_32i_u_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:67
volk_32f_s32f_convert_32i_u_sse
static void volk_32f_s32f_convert_32i_u_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:168
volk_32f_s32f_convert_32i_generic
static void volk_32f_s32f_convert_32i_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:220
volk_32f_s32f_convert_32i_a_sse2
static void volk_32f_s32f_convert_32i_a_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:308
volk_32f_s32f_convert_32i_a_avx
static void volk_32f_s32f_convert_32i_a_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:257
rintf
static float rintf(float x)
Definition: config.h:45
volk_32f_s32f_convert_32i_u_sse2
static void volk_32f_s32f_convert_32i_u_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:117