Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_expfast_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #include <inttypes.h>
53 #include <math.h>
54 #include <stdio.h>
55 
56 #define Mln2 0.6931471805f /* ln(2); the kernels below use A / Mln2 as the multiplier applied to each input */
57 #define A 8388608.0f /* 2^23; the kernels compute (A / Mln2) * x + (B - C), convert it to int32 and reinterpret the bits as float */
58 #define B 1065353216.0f /* 127 * 2^23, i.e. the integer value of the bit pattern of 1.0f */
59 #define C 60801.0f /* subtracted from B to form the additive term; NOTE(review): presumably an empirical error-tuning constant - confirm */
60 
61 
62 #ifndef INCLUDED_volk_32f_expfast_32f_a_H
63 #define INCLUDED_volk_32f_expfast_32f_a_H
64 
65 #if LV_HAVE_AVX && LV_HAVE_FMA
66 
67 #include <immintrin.h>
68 
69 static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
70  const float* aVector,
71  unsigned int num_points)
72 {
73  float* bPtr = bVector;
74  const float* aPtr = aVector;
75 
76  unsigned int number = 0;
77  const unsigned int eighthPoints = num_points / 8;
78 
79  __m256 aVal, bVal, a, b;
80  __m256i exp;
81  a = _mm256_set1_ps(A / Mln2);
82  b = _mm256_set1_ps(B - C);
83 
84  for (; number < eighthPoints; number++) {
85  aVal = _mm256_load_ps(aPtr);
86  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
87  bVal = _mm256_castsi256_ps(exp);
88 
89  _mm256_store_ps(bPtr, bVal);
90  aPtr += 8;
91  bPtr += 8;
92  }
93 
94  number = eighthPoints * 8;
95  for (; number < num_points; number++) {
96  *bPtr++ = expf(*aPtr++);
97  }
98 }
99 
100 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
101 
102 #ifdef LV_HAVE_AVX
103 
104 #include <immintrin.h>
105 
106 static inline void
107 volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
108 {
109  float* bPtr = bVector;
110  const float* aPtr = aVector;
111 
112  unsigned int number = 0;
113  const unsigned int eighthPoints = num_points / 8;
114 
115  __m256 aVal, bVal, a, b;
116  __m256i exp;
117  a = _mm256_set1_ps(A / Mln2);
118  b = _mm256_set1_ps(B - C);
119 
120  for (; number < eighthPoints; number++) {
121  aVal = _mm256_load_ps(aPtr);
122  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123  bVal = _mm256_castsi256_ps(exp);
124 
125  _mm256_store_ps(bPtr, bVal);
126  aPtr += 8;
127  bPtr += 8;
128  }
129 
130  number = eighthPoints * 8;
131  for (; number < num_points; number++) {
132  *bPtr++ = expf(*aPtr++);
133  }
134 }
135 
136 #endif /* LV_HAVE_AVX for aligned */
137 
138 #ifdef LV_HAVE_SSE4_1
139 #include <smmintrin.h>
140 
141 static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
142  const float* aVector,
143  unsigned int num_points)
144 {
145  float* bPtr = bVector;
146  const float* aPtr = aVector;
147 
148  unsigned int number = 0;
149  const unsigned int quarterPoints = num_points / 4;
150 
151  __m128 aVal, bVal, a, b;
152  __m128i exp;
153  a = _mm_set1_ps(A / Mln2);
154  b = _mm_set1_ps(B - C);
155 
156  for (; number < quarterPoints; number++) {
157  aVal = _mm_load_ps(aPtr);
158  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
159  bVal = _mm_castsi128_ps(exp);
160 
161  _mm_store_ps(bPtr, bVal);
162  aPtr += 4;
163  bPtr += 4;
164  }
165 
166  number = quarterPoints * 4;
167  for (; number < num_points; number++) {
168  *bPtr++ = expf(*aPtr++);
169  }
170 }
171 
172 #endif /* LV_HAVE_SSE4_1 for aligned */
173 
174 #endif /* INCLUDED_volk_32f_expfast_32f_a_H */
175 
176 #ifndef INCLUDED_volk_32f_expfast_32f_u_H
177 #define INCLUDED_volk_32f_expfast_32f_u_H
178 
179 #if LV_HAVE_AVX && LV_HAVE_FMA
180 #include <immintrin.h>
181 
182 static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
183  const float* aVector,
184  unsigned int num_points)
185 {
186  float* bPtr = bVector;
187  const float* aPtr = aVector;
188 
189  unsigned int number = 0;
190  const unsigned int eighthPoints = num_points / 8;
191 
192  __m256 aVal, bVal, a, b;
193  __m256i exp;
194  a = _mm256_set1_ps(A / Mln2);
195  b = _mm256_set1_ps(B - C);
196 
197  for (; number < eighthPoints; number++) {
198  aVal = _mm256_loadu_ps(aPtr);
199  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200  bVal = _mm256_castsi256_ps(exp);
201 
202  _mm256_storeu_ps(bPtr, bVal);
203  aPtr += 8;
204  bPtr += 8;
205  }
206 
207  number = eighthPoints * 8;
208  for (; number < num_points; number++) {
209  *bPtr++ = expf(*aPtr++);
210  }
211 }
212 
213 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
214 
215 #ifdef LV_HAVE_AVX
216 #include <immintrin.h>
217 
218 static inline void
219 volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
220 {
221  float* bPtr = bVector;
222  const float* aPtr = aVector;
223 
224  unsigned int number = 0;
225  const unsigned int eighthPoints = num_points / 8;
226 
227  __m256 aVal, bVal, a, b;
228  __m256i exp;
229  a = _mm256_set1_ps(A / Mln2);
230  b = _mm256_set1_ps(B - C);
231 
232  for (; number < eighthPoints; number++) {
233  aVal = _mm256_loadu_ps(aPtr);
234  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
235  bVal = _mm256_castsi256_ps(exp);
236 
237  _mm256_storeu_ps(bPtr, bVal);
238  aPtr += 8;
239  bPtr += 8;
240  }
241 
242  number = eighthPoints * 8;
243  for (; number < num_points; number++) {
244  *bPtr++ = expf(*aPtr++);
245  }
246 }
247 
248 #endif /* LV_HAVE_AVX for unaligned */
249 
250 
251 #ifdef LV_HAVE_SSE4_1
252 #include <smmintrin.h>
253 
254 static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
255  const float* aVector,
256  unsigned int num_points)
257 {
258  float* bPtr = bVector;
259  const float* aPtr = aVector;
260 
261  unsigned int number = 0;
262  const unsigned int quarterPoints = num_points / 4;
263 
264  __m128 aVal, bVal, a, b;
265  __m128i exp;
266  a = _mm_set1_ps(A / Mln2);
267  b = _mm_set1_ps(B - C);
268 
269  for (; number < quarterPoints; number++) {
270  aVal = _mm_loadu_ps(aPtr);
271  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
272  bVal = _mm_castsi128_ps(exp);
273 
274  _mm_storeu_ps(bPtr, bVal);
275  aPtr += 4;
276  bPtr += 4;
277  }
278 
279  number = quarterPoints * 4;
280  for (; number < num_points; number++) {
281  *bPtr++ = expf(*aPtr++);
282  }
283 }
284 
285 #endif /* LV_HAVE_SSE4_1 for unaligned */
286 
287 
288 #ifdef LV_HAVE_GENERIC
289 
290 static inline void volk_32f_expfast_32f_generic(float* bVector,
291  const float* aVector,
292  unsigned int num_points)
293 {
294  float* bPtr = bVector;
295  const float* aPtr = aVector;
296  unsigned int number = 0;
297 
298  for (number = 0; number < num_points; number++) {
299  *bPtr++ = expf(*aPtr++);
300  }
301 }
302 #endif /* LV_HAVE_GENERIC */
303 
304 #ifdef LV_HAVE_NEON
305 #include <arm_neon.h>
306 
307 static inline void
308 volk_32f_expfast_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
309 {
310  float* bPtr = bVector;
311  const float* aPtr = aVector;
312 
313  unsigned int number = 0;
314  const unsigned int quarterPoints = num_points / 4;
315 
316  float32x4_t a = vdupq_n_f32(A / Mln2);
317  float32x4_t b = vdupq_n_f32(B - C);
318 
319  for (; number < quarterPoints; number++) {
320  float32x4_t aVal = vld1q_f32(aPtr);
321  int32x4_t exp = vcvtq_s32_f32(vmlaq_f32(b, a, aVal));
322  float32x4_t bVal = vreinterpretq_f32_s32(exp);
323  vst1q_f32(bPtr, bVal);
324 
325  aPtr += 4;
326  bPtr += 4;
327  }
328 
329  number = quarterPoints * 4;
330  for (; number < num_points; number++) {
331  *bPtr++ = expf(*aPtr++);
332  }
333 }
334 
335 #endif /* LV_HAVE_NEON */
336 
337 #ifdef LV_HAVE_NEONV8
338 #include <arm_neon.h>
339 
340 static inline void
341 volk_32f_expfast_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
342 {
343  float* bPtr = bVector;
344  const float* aPtr = aVector;
345 
346  unsigned int number = 0;
347  const unsigned int eighthPoints = num_points / 8;
348 
349  float32x4_t a = vdupq_n_f32(A / Mln2);
350  float32x4_t b = vdupq_n_f32(B - C);
351 
352  for (; number < eighthPoints; number++) {
353  __VOLK_PREFETCH(aPtr + 16);
354 
355  float32x4_t aVal0 = vld1q_f32(aPtr);
356  float32x4_t aVal1 = vld1q_f32(aPtr + 4);
357 
358  int32x4_t exp0 = vcvtq_s32_f32(vfmaq_f32(b, a, aVal0));
359  int32x4_t exp1 = vcvtq_s32_f32(vfmaq_f32(b, a, aVal1));
360 
361  float32x4_t bVal0 = vreinterpretq_f32_s32(exp0);
362  float32x4_t bVal1 = vreinterpretq_f32_s32(exp1);
363 
364  vst1q_f32(bPtr, bVal0);
365  vst1q_f32(bPtr + 4, bVal1);
366 
367  aPtr += 8;
368  bPtr += 8;
369  }
370 
371  number = eighthPoints * 8;
372  for (; number < num_points; number++) {
373  *bPtr++ = expf(*aPtr++);
374  }
375 }
376 
377 #endif /* LV_HAVE_NEONV8 */
378 
379 #ifdef LV_HAVE_RVV
380 #include <riscv_vector.h>
381 
382 static inline void
383 volk_32f_expfast_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
384 {
385  size_t vlmax = __riscv_vsetvlmax_e32m8();
386  const vfloat32m8_t ca = __riscv_vfmv_v_f_f32m8(A / Mln2, vlmax);
387  const vfloat32m8_t cb = __riscv_vfmv_v_f_f32m8(B - C, vlmax);
388 
389  size_t n = num_points;
390  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
391  vl = __riscv_vsetvl_e32m8(n);
392  vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
393  v = __riscv_vfmadd(v, ca, cb, vl);
394  v = __riscv_vreinterpret_f32m8(__riscv_vfcvt_x(v, vl));
395  __riscv_vse32(bVector, v, vl);
396  }
397 }
398 #endif /*LV_HAVE_RVV*/
399 
400 #endif /* INCLUDED_volk_32f_expfast_32f_u_H */
volk_32f_expfast_32f_neon
static void volk_32f_expfast_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:308
volk_32f_expfast_32f_generic
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:290
B
#define B
Definition: volk_32f_expfast_32f.h:58
C
#define C
Definition: volk_32f_expfast_32f.h:59
volk_32f_expfast_32f_u_avx
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:219
volk_32f_expfast_32f_a_avx
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:107
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
Mln2
#define Mln2
Definition: volk_32f_expfast_32f.h:56
A
#define A
Definition: volk_32f_expfast_32f.h:57