Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_exp_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /* SIMD (SSE4) implementation of exp
11  Inspired by Intel Approximate Math library, and based on the
12  corresponding algorithms of the cephes math library
13 */
14 
15 /* Copyright (C) 2007 Julien Pommier
16 
17  This software is provided 'as-is', without any express or implied
18  warranty. In no event will the authors be held liable for any damages
19  arising from the use of this software.
20 
21  Permission is granted to anyone to use this software for any purpose,
22  including commercial applications, and to alter it and redistribute it
23  freely, subject to the following restrictions:
24 
25  1. The origin of this software must not be misrepresented; you must not
26  claim that you wrote the original software. If you use this software
27  in a product, an acknowledgment in the product documentation would be
28  appreciated but is not required.
29  2. Altered source versions must be plainly marked as such, and must not be
30  misrepresented as being the original software.
31  3. This notice may not be removed or altered from any source distribution.
32 
33  (this is the zlib license)
34 */
35 
82 #include <inttypes.h>
83 #include <math.h>
84 #include <stdio.h>
85 
86 #ifndef INCLUDED_volk_32f_exp_32f_a_H
87 #define INCLUDED_volk_32f_exp_32f_a_H
88 
89 #ifdef LV_HAVE_SSE2
90 #include <emmintrin.h>
91 
92 static inline void
93 volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
94 {
95  float* bPtr = bVector;
96  const float* aPtr = aVector;
97 
98  unsigned int number = 0;
99  unsigned int quarterPoints = num_points / 4;
100 
101  // Declare variables and constants
102  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
103  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
104  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
105  __m128i emm0, pi32_0x7f;
106 
107  one = _mm_set1_ps(1.0);
108  exp_hi = _mm_set1_ps(88.3762626647949);
109  exp_lo = _mm_set1_ps(-88.3762626647949);
110  log2EF = _mm_set1_ps(1.44269504088896341);
111  half = _mm_set1_ps(0.5);
112  exp_C1 = _mm_set1_ps(0.693359375);
113  exp_C2 = _mm_set1_ps(-2.12194440e-4);
114  pi32_0x7f = _mm_set1_epi32(0x7f);
115 
116  exp_p0 = _mm_set1_ps(1.9875691500e-4);
117  exp_p1 = _mm_set1_ps(1.3981999507e-3);
118  exp_p2 = _mm_set1_ps(8.3334519073e-3);
119  exp_p3 = _mm_set1_ps(4.1665795894e-2);
120  exp_p4 = _mm_set1_ps(1.6666665459e-1);
121  exp_p5 = _mm_set1_ps(5.0000001201e-1);
122 
123  for (; number < quarterPoints; number++) {
124  aVal = _mm_load_ps(aPtr);
125  tmp = _mm_setzero_ps();
126 
127  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
128 
129  /* express exp(x) as exp(g + n*log(2)) */
130  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
131 
132  emm0 = _mm_cvttps_epi32(fx);
133  tmp = _mm_cvtepi32_ps(emm0);
134 
135  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
136  fx = _mm_sub_ps(tmp, mask);
137 
138  tmp = _mm_mul_ps(fx, exp_C1);
139  z = _mm_mul_ps(fx, exp_C2);
140  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
141  z = _mm_mul_ps(aVal, aVal);
142 
143  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
144  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
145  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
146  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
147  y = _mm_add_ps(y, one);
148 
149  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
150 
151  pow2n = _mm_castsi128_ps(emm0);
152  bVal = _mm_mul_ps(y, pow2n);
153 
154  _mm_store_ps(bPtr, bVal);
155  aPtr += 4;
156  bPtr += 4;
157  }
158 
159  number = quarterPoints * 4;
160  for (; number < num_points; number++) {
161  *bPtr++ = expf(*aPtr++);
162  }
163 }
164 
165 #endif /* LV_HAVE_SSE2 for aligned */
166 
167 
168 #endif /* INCLUDED_volk_32f_exp_32f_a_H */
169 
170 #ifndef INCLUDED_volk_32f_exp_32f_u_H
171 #define INCLUDED_volk_32f_exp_32f_u_H
172 
173 #ifdef LV_HAVE_SSE2
174 #include <emmintrin.h>
175 
176 static inline void
177 volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
178 {
179  float* bPtr = bVector;
180  const float* aPtr = aVector;
181 
182  unsigned int number = 0;
183  unsigned int quarterPoints = num_points / 4;
184 
185  // Declare variables and constants
186  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
187  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
188  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
189  __m128i emm0, pi32_0x7f;
190 
191  one = _mm_set1_ps(1.0);
192  exp_hi = _mm_set1_ps(88.3762626647949);
193  exp_lo = _mm_set1_ps(-88.3762626647949);
194  log2EF = _mm_set1_ps(1.44269504088896341);
195  half = _mm_set1_ps(0.5);
196  exp_C1 = _mm_set1_ps(0.693359375);
197  exp_C2 = _mm_set1_ps(-2.12194440e-4);
198  pi32_0x7f = _mm_set1_epi32(0x7f);
199 
200  exp_p0 = _mm_set1_ps(1.9875691500e-4);
201  exp_p1 = _mm_set1_ps(1.3981999507e-3);
202  exp_p2 = _mm_set1_ps(8.3334519073e-3);
203  exp_p3 = _mm_set1_ps(4.1665795894e-2);
204  exp_p4 = _mm_set1_ps(1.6666665459e-1);
205  exp_p5 = _mm_set1_ps(5.0000001201e-1);
206 
207 
208  for (; number < quarterPoints; number++) {
209  aVal = _mm_loadu_ps(aPtr);
210  tmp = _mm_setzero_ps();
211 
212  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
213 
214  /* express exp(x) as exp(g + n*log(2)) */
215  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
216 
217  emm0 = _mm_cvttps_epi32(fx);
218  tmp = _mm_cvtepi32_ps(emm0);
219 
220  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
221  fx = _mm_sub_ps(tmp, mask);
222 
223  tmp = _mm_mul_ps(fx, exp_C1);
224  z = _mm_mul_ps(fx, exp_C2);
225  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
226  z = _mm_mul_ps(aVal, aVal);
227 
228  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
229  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
230  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
231  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
232  y = _mm_add_ps(y, one);
233 
234  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
235 
236  pow2n = _mm_castsi128_ps(emm0);
237  bVal = _mm_mul_ps(y, pow2n);
238 
239  _mm_storeu_ps(bPtr, bVal);
240  aPtr += 4;
241  bPtr += 4;
242  }
243 
244  number = quarterPoints * 4;
245  for (; number < num_points; number++) {
246  *bPtr++ = expf(*aPtr++);
247  }
248 }
249 
250 #endif /* LV_HAVE_SSE2 for unaligned */
251 
252 
253 #ifdef LV_HAVE_GENERIC
254 
255 static inline void
256 volk_32f_exp_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
257 {
258  float* bPtr = bVector;
259  const float* aPtr = aVector;
260  unsigned int number = 0;
261 
262  for (number = 0; number < num_points; number++) {
263  *bPtr++ = expf(*aPtr++);
264  }
265 }
266 
267 #endif /* LV_HAVE_GENERIC */
268 
269 #ifdef LV_HAVE_NEON
270 #include <arm_neon.h>
271 
272 static inline void
273 volk_32f_exp_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
274 {
275  float* bPtr = bVector;
276  const float* aPtr = aVector;
277 
278  unsigned int number = 0;
279  unsigned int quarterPoints = num_points / 4;
280 
281  // Constants
282  float32x4_t one = vdupq_n_f32(1.0f);
283  float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
284  float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
285  float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
286  float32x4_t half = vdupq_n_f32(0.5f);
287  float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
288  float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
289  int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
290 
291  float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
292  float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
293  float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
294  float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
295  float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
296  float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
297 
298  for (; number < quarterPoints; number++) {
299  float32x4_t aVal = vld1q_f32(aPtr);
300 
301  // Clamp to valid range
302  aVal = vmaxq_f32(vminq_f32(aVal, exp_hi), exp_lo);
303 
304  // express exp(x) as exp(g + n*log(2))
305  float32x4_t fx = vmlaq_f32(half, aVal, log2EF);
306 
307  // Floor function
308  int32x4_t emm0 = vcvtq_s32_f32(fx);
309  float32x4_t tmp = vcvtq_f32_s32(emm0);
310 
311  // If tmp > fx, subtract 1 (floor correction)
312  uint32x4_t mask = vcgtq_f32(tmp, fx);
313  float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
314  fx = vsubq_f32(tmp, mask_one);
315 
316  // Reduce x
317  tmp = vmulq_f32(fx, exp_C1);
318  float32x4_t z = vmulq_f32(fx, exp_C2);
319  aVal = vsubq_f32(vsubq_f32(aVal, tmp), z);
320  z = vmulq_f32(aVal, aVal);
321 
322  // Polynomial approximation
323  float32x4_t y = vmlaq_f32(exp_p1, exp_p0, aVal);
324  y = vmulq_f32(y, aVal);
325  y = vaddq_f32(y, exp_p2);
326  y = vmulq_f32(y, aVal);
327  y = vaddq_f32(y, exp_p3);
328  y = vmlaq_f32(exp_p4, y, aVal);
329  y = vmulq_f32(y, aVal);
330  y = vaddq_f32(y, exp_p5);
331  y = vmlaq_f32(aVal, y, z);
332  y = vaddq_f32(y, one);
333 
334  // Build 2^n
335  emm0 = vcvtq_s32_f32(fx);
336  emm0 = vaddq_s32(emm0, pi32_0x7f);
337  emm0 = vshlq_n_s32(emm0, 23);
338  float32x4_t pow2n = vreinterpretq_f32_s32(emm0);
339 
340  float32x4_t bVal = vmulq_f32(y, pow2n);
341  vst1q_f32(bPtr, bVal);
342 
343  aPtr += 4;
344  bPtr += 4;
345  }
346 
347  number = quarterPoints * 4;
348  for (; number < num_points; number++) {
349  *bPtr++ = expf(*aPtr++);
350  }
351 }
352 
353 #endif /* LV_HAVE_NEON */
354 
355 #ifdef LV_HAVE_NEONV8
356 #include <arm_neon.h>
357 
358 static inline void
359 volk_32f_exp_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
360 {
361  float* bPtr = bVector;
362  const float* aPtr = aVector;
363 
364  unsigned int number = 0;
365  unsigned int eighthPoints = num_points / 8;
366 
367  // Constants
368  float32x4_t one = vdupq_n_f32(1.0f);
369  float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
370  float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
371  float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
372  float32x4_t half = vdupq_n_f32(0.5f);
373  float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
374  float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
375  int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
376 
377  float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
378  float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
379  float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
380  float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
381  float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
382  float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
383 
384  for (; number < eighthPoints; number++) {
385  __VOLK_PREFETCH(aPtr + 16);
386 
387  float32x4_t aVal0 = vld1q_f32(aPtr);
388  float32x4_t aVal1 = vld1q_f32(aPtr + 4);
389 
390  // Clamp to valid range
391  aVal0 = vmaxq_f32(vminq_f32(aVal0, exp_hi), exp_lo);
392  aVal1 = vmaxq_f32(vminq_f32(aVal1, exp_hi), exp_lo);
393 
394  // express exp(x) as exp(g + n*log(2))
395  float32x4_t fx0 = vfmaq_f32(half, aVal0, log2EF);
396  float32x4_t fx1 = vfmaq_f32(half, aVal1, log2EF);
397 
398  // Floor function
399  int32x4_t emm0_0 = vcvtq_s32_f32(fx0);
400  int32x4_t emm0_1 = vcvtq_s32_f32(fx1);
401  float32x4_t tmp0 = vcvtq_f32_s32(emm0_0);
402  float32x4_t tmp1 = vcvtq_f32_s32(emm0_1);
403 
404  // If tmp > fx, subtract 1 (floor correction)
405  uint32x4_t mask0 = vcgtq_f32(tmp0, fx0);
406  uint32x4_t mask1 = vcgtq_f32(tmp1, fx1);
407  float32x4_t mask_one0 = vbslq_f32(mask0, one, vdupq_n_f32(0.0f));
408  float32x4_t mask_one1 = vbslq_f32(mask1, one, vdupq_n_f32(0.0f));
409  fx0 = vsubq_f32(tmp0, mask_one0);
410  fx1 = vsubq_f32(tmp1, mask_one1);
411 
412  // Reduce x
413  tmp0 = vmulq_f32(fx0, exp_C1);
414  tmp1 = vmulq_f32(fx1, exp_C1);
415  float32x4_t z0 = vmulq_f32(fx0, exp_C2);
416  float32x4_t z1 = vmulq_f32(fx1, exp_C2);
417  aVal0 = vsubq_f32(vsubq_f32(aVal0, tmp0), z0);
418  aVal1 = vsubq_f32(vsubq_f32(aVal1, tmp1), z1);
419  z0 = vmulq_f32(aVal0, aVal0);
420  z1 = vmulq_f32(aVal1, aVal1);
421 
422  // Polynomial approximation using FMA
423  float32x4_t y0 = vfmaq_f32(exp_p1, exp_p0, aVal0);
424  float32x4_t y1 = vfmaq_f32(exp_p1, exp_p0, aVal1);
425  y0 = vmulq_f32(y0, aVal0);
426  y1 = vmulq_f32(y1, aVal1);
427  y0 = vaddq_f32(y0, exp_p2);
428  y1 = vaddq_f32(y1, exp_p2);
429  y0 = vmulq_f32(y0, aVal0);
430  y1 = vmulq_f32(y1, aVal1);
431  y0 = vaddq_f32(y0, exp_p3);
432  y1 = vaddq_f32(y1, exp_p3);
433  y0 = vfmaq_f32(exp_p4, y0, aVal0);
434  y1 = vfmaq_f32(exp_p4, y1, aVal1);
435  y0 = vmulq_f32(y0, aVal0);
436  y1 = vmulq_f32(y1, aVal1);
437  y0 = vaddq_f32(y0, exp_p5);
438  y1 = vaddq_f32(y1, exp_p5);
439  y0 = vfmaq_f32(aVal0, y0, z0);
440  y1 = vfmaq_f32(aVal1, y1, z1);
441  y0 = vaddq_f32(y0, one);
442  y1 = vaddq_f32(y1, one);
443 
444  // Build 2^n
445  emm0_0 = vcvtq_s32_f32(fx0);
446  emm0_1 = vcvtq_s32_f32(fx1);
447  emm0_0 = vaddq_s32(emm0_0, pi32_0x7f);
448  emm0_1 = vaddq_s32(emm0_1, pi32_0x7f);
449  emm0_0 = vshlq_n_s32(emm0_0, 23);
450  emm0_1 = vshlq_n_s32(emm0_1, 23);
451  float32x4_t pow2n0 = vreinterpretq_f32_s32(emm0_0);
452  float32x4_t pow2n1 = vreinterpretq_f32_s32(emm0_1);
453 
454  float32x4_t bVal0 = vmulq_f32(y0, pow2n0);
455  float32x4_t bVal1 = vmulq_f32(y1, pow2n1);
456  vst1q_f32(bPtr, bVal0);
457  vst1q_f32(bPtr + 4, bVal1);
458 
459  aPtr += 8;
460  bPtr += 8;
461  }
462 
463  number = eighthPoints * 8;
464  for (; number < num_points; number++) {
465  *bPtr++ = expf(*aPtr++);
466  }
467 }
468 
469 #endif /* LV_HAVE_NEONV8 */
470 
471 #ifdef LV_HAVE_RVV
472 #include <riscv_vector.h>
473 
474 static inline void
475 volk_32f_exp_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
476 {
477  size_t vlmax = __riscv_vsetvlmax_e32m2();
478 
479  const vfloat32m2_t exp_hi = __riscv_vfmv_v_f_f32m2(88.376259f, vlmax);
480  const vfloat32m2_t exp_lo = __riscv_vfmv_v_f_f32m2(-88.376259f, vlmax);
481  const vfloat32m2_t log2EF = __riscv_vfmv_v_f_f32m2(1.442695f, vlmax);
482  const vfloat32m2_t exp_C1 = __riscv_vfmv_v_f_f32m2(-0.6933594f, vlmax);
483  const vfloat32m2_t exp_C2 = __riscv_vfmv_v_f_f32m2(0.000212194f, vlmax);
484  const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
485  const vfloat32m2_t cf1o2 = __riscv_vfmv_v_f_f32m2(0.5f, vlmax);
486 
487  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(1.9875691500e-4, vlmax);
488  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(1.3981999507e-3, vlmax);
489  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(8.3334519073e-3, vlmax);
490  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(4.1665795894e-2, vlmax);
491  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(1.6666665459e-1, vlmax);
492  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.0000001201e-1, vlmax);
493 
494  size_t n = num_points;
495  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
496  vl = __riscv_vsetvl_e32m2(n);
497  vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
498  v = __riscv_vfmin(v, exp_hi, vl);
499  v = __riscv_vfmax(v, exp_lo, vl);
500  vfloat32m2_t fx = __riscv_vfmadd(v, log2EF, cf1o2, vl);
501 
502  vfloat32m2_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
503  fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
504  v = __riscv_vfmacc(v, fx, exp_C1, vl);
505  v = __riscv_vfmacc(v, fx, exp_C2, vl);
506  vfloat32m2_t vv = __riscv_vfmul(v, v, vl);
507 
508  vfloat32m2_t y = c0;
509  y = __riscv_vfmadd(y, v, c1, vl);
510  y = __riscv_vfmadd(y, v, c2, vl);
511  y = __riscv_vfmadd(y, v, c3, vl);
512  y = __riscv_vfmadd(y, v, c4, vl);
513  y = __riscv_vfmadd(y, v, c5, vl);
514  y = __riscv_vfmadd(y, vv, v, vl);
515  y = __riscv_vfadd(y, cf1, vl);
516 
517  vfloat32m2_t pow2n = __riscv_vreinterpret_f32m2(
518  __riscv_vsll(__riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), 0x7f, vl), 23, vl));
519 
520  __riscv_vse32(bVector, __riscv_vfmul(y, pow2n, vl), vl);
521  }
522 }
523 #endif /*LV_HAVE_RVV*/
524 
525 #endif /* INCLUDED_volk_32f_exp_32f_u_H */
volk_32f_exp_32f_neon
static void volk_32f_exp_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:273
volk_32f_exp_32f_generic
static void volk_32f_exp_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:256
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_exp_32f_a_sse2
static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:93
volk_32f_exp_32f_u_sse2
static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:177