Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_tanh_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
55 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
56 #define INCLUDED_volk_32f_tanh_32f_a_H
57 
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 #include <string.h>
62 
63 
64 #ifdef LV_HAVE_GENERIC
65 
66 static inline void
67 volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
68 {
69  unsigned int number = 0;
70  float* cPtr = cVector;
71  const float* aPtr = aVector;
72  for (; number < num_points; number++) {
73  *cPtr++ = tanhf(*aPtr++);
74  }
75 }
76 
77 #endif /* LV_HAVE_GENERIC */
78 
79 
80 #ifdef LV_HAVE_GENERIC
81 
82 static inline void
83 volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
84 {
85  float* cPtr = cVector;
86  const float* aPtr = aVector;
87  for (unsigned int number = 0; number < num_points; number++) {
88  if (*aPtr > 4.97)
89  *cPtr++ = 1;
90  else if (*aPtr <= -4.97)
91  *cPtr++ = -1;
92  else {
93  float x2 = (*aPtr) * (*aPtr);
94  float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
95  float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
96  *cPtr++ = a / b;
97  aPtr++;
98  }
99  }
100 }
101 
102 #endif /* LV_HAVE_GENERIC */
103 
104 
105 #ifdef LV_HAVE_SSE
106 #include <xmmintrin.h>
107 
108 static inline void
109 volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
110 {
111  unsigned int number = 0;
112  const unsigned int quarterPoints = num_points / 4;
113 
114  float* cPtr = cVector;
115  const float* aPtr = aVector;
116 
117  __m128 aVal, cVal, x2, a, b;
118  __m128 const1, const2, const3, const4, const5, const6;
119  const1 = _mm_set_ps1(135135.0f);
120  const2 = _mm_set_ps1(17325.0f);
121  const3 = _mm_set_ps1(378.0f);
122  const4 = _mm_set_ps1(62370.0f);
123  const5 = _mm_set_ps1(3150.0f);
124  const6 = _mm_set_ps1(28.0f);
125  for (; number < quarterPoints; number++) {
126 
127  aVal = _mm_load_ps(aPtr);
128  x2 = _mm_mul_ps(aVal, aVal);
129  a = _mm_mul_ps(
130  aVal,
131  _mm_add_ps(
132  const1,
133  _mm_mul_ps(x2,
134  _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
135  b = _mm_add_ps(
136  const1,
137  _mm_mul_ps(
138  x2,
139  _mm_add_ps(const4,
140  _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
141 
142  cVal = _mm_div_ps(a, b);
143 
144  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
145 
146  aPtr += 4;
147  cPtr += 4;
148  }
149 
150  number = quarterPoints * 4;
151  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
152 }
153 #endif /* LV_HAVE_SSE */
154 
155 
156 #ifdef LV_HAVE_AVX
157 #include <immintrin.h>
158 
159 static inline void
160 volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
161 {
162  unsigned int number = 0;
163  const unsigned int eighthPoints = num_points / 8;
164 
165  float* cPtr = cVector;
166  const float* aPtr = aVector;
167 
168  __m256 aVal, cVal, x2, a, b;
169  __m256 const1, const2, const3, const4, const5, const6;
170  const1 = _mm256_set1_ps(135135.0f);
171  const2 = _mm256_set1_ps(17325.0f);
172  const3 = _mm256_set1_ps(378.0f);
173  const4 = _mm256_set1_ps(62370.0f);
174  const5 = _mm256_set1_ps(3150.0f);
175  const6 = _mm256_set1_ps(28.0f);
176  for (; number < eighthPoints; number++) {
177 
178  aVal = _mm256_load_ps(aPtr);
179  x2 = _mm256_mul_ps(aVal, aVal);
180  a = _mm256_mul_ps(
181  aVal,
182  _mm256_add_ps(
183  const1,
184  _mm256_mul_ps(
185  x2,
186  _mm256_add_ps(const2,
187  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
188  b = _mm256_add_ps(
189  const1,
190  _mm256_mul_ps(
191  x2,
192  _mm256_add_ps(
193  const4,
194  _mm256_mul_ps(x2,
195  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
196 
197  cVal = _mm256_div_ps(a, b);
198 
199  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
200 
201  aPtr += 8;
202  cPtr += 8;
203  }
204 
205  number = eighthPoints * 8;
206  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
207 }
208 #endif /* LV_HAVE_AVX */
209 
210 #if LV_HAVE_AVX && LV_HAVE_FMA
211 #include <immintrin.h>
212 
213 static inline void
214 volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
215 {
216  unsigned int number = 0;
217  const unsigned int eighthPoints = num_points / 8;
218 
219  float* cPtr = cVector;
220  const float* aPtr = aVector;
221 
222  __m256 aVal, cVal, x2, a, b;
223  __m256 const1, const2, const3, const4, const5, const6;
224  const1 = _mm256_set1_ps(135135.0f);
225  const2 = _mm256_set1_ps(17325.0f);
226  const3 = _mm256_set1_ps(378.0f);
227  const4 = _mm256_set1_ps(62370.0f);
228  const5 = _mm256_set1_ps(3150.0f);
229  const6 = _mm256_set1_ps(28.0f);
230  for (; number < eighthPoints; number++) {
231 
232  aVal = _mm256_load_ps(aPtr);
233  x2 = _mm256_mul_ps(aVal, aVal);
234  a = _mm256_mul_ps(
235  aVal,
236  _mm256_fmadd_ps(
237  x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
238  b = _mm256_fmadd_ps(
239  x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
240 
241  cVal = _mm256_div_ps(a, b);
242 
243  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
244 
245  aPtr += 8;
246  cPtr += 8;
247  }
248 
249  number = eighthPoints * 8;
250  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
251 }
252 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
253 
254 #ifdef LV_HAVE_NEON
255 #include <arm_neon.h>
256 
257 static inline void
258 volk_32f_tanh_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
259 {
260  unsigned int number = 0;
261  const unsigned int quarterPoints = num_points / 4;
262 
263  float* cPtr = cVector;
264  const float* aPtr = aVector;
265 
266  // Polynomial coefficients for tanh approximation
267  const float32x4_t const1 = vdupq_n_f32(135135.0f);
268  const float32x4_t const2 = vdupq_n_f32(17325.0f);
269  const float32x4_t const3 = vdupq_n_f32(378.0f);
270  const float32x4_t const4 = vdupq_n_f32(62370.0f);
271  const float32x4_t const5 = vdupq_n_f32(3150.0f);
272  const float32x4_t const6 = vdupq_n_f32(28.0f);
273 
274  for (; number < quarterPoints; number++) {
275  float32x4_t aVal = vld1q_f32(aPtr);
276  float32x4_t x2 = vmulq_f32(aVal, aVal);
277 
278  // a = x * (135135 + x2 * (17325 + x2 * (378 + x2)))
279  float32x4_t inner_a = vaddq_f32(const3, x2);
280  inner_a = vmlaq_f32(const2, x2, inner_a);
281  inner_a = vmlaq_f32(const1, x2, inner_a);
282  float32x4_t a = vmulq_f32(aVal, inner_a);
283 
284  // b = 135135 + x2 * (62370 + x2 * (3150 + x2 * 28))
285  float32x4_t inner_b = vmlaq_f32(const5, x2, const6);
286  inner_b = vmlaq_f32(const4, x2, inner_b);
287  float32x4_t b = vmlaq_f32(const1, x2, inner_b);
288 
289  // c = a / b (use reciprocal approximation)
290  float32x4_t b_recip = vrecpeq_f32(b);
291  b_recip = vmulq_f32(b_recip, vrecpsq_f32(b, b_recip));
292  b_recip = vmulq_f32(b_recip, vrecpsq_f32(b, b_recip));
293  float32x4_t cVal = vmulq_f32(a, b_recip);
294 
295  vst1q_f32(cPtr, cVal);
296  aPtr += 4;
297  cPtr += 4;
298  }
299 
300  number = quarterPoints * 4;
301  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
302 }
303 
304 #endif /* LV_HAVE_NEON */
305 
306 #ifdef LV_HAVE_NEONV8
307 #include <arm_neon.h>
308 
309 static inline void
310 volk_32f_tanh_32f_neonv8(float* cVector, const float* aVector, unsigned int num_points)
311 {
312  unsigned int number = 0;
313  const unsigned int eighthPoints = num_points / 8;
314 
315  float* cPtr = cVector;
316  const float* aPtr = aVector;
317 
318  // Polynomial coefficients for tanh approximation
319  const float32x4_t const1 = vdupq_n_f32(135135.0f);
320  const float32x4_t const2 = vdupq_n_f32(17325.0f);
321  const float32x4_t const3 = vdupq_n_f32(378.0f);
322  const float32x4_t const4 = vdupq_n_f32(62370.0f);
323  const float32x4_t const5 = vdupq_n_f32(3150.0f);
324  const float32x4_t const6 = vdupq_n_f32(28.0f);
325 
326  for (; number < eighthPoints; number++) {
327  __VOLK_PREFETCH(aPtr + 16);
328 
329  float32x4_t aVal0 = vld1q_f32(aPtr);
330  float32x4_t aVal1 = vld1q_f32(aPtr + 4);
331  float32x4_t x2_0 = vmulq_f32(aVal0, aVal0);
332  float32x4_t x2_1 = vmulq_f32(aVal1, aVal1);
333 
334  // a = x * (135135 + x2 * (17325 + x2 * (378 + x2))) using FMA
335  float32x4_t inner_a0 = vaddq_f32(const3, x2_0);
336  float32x4_t inner_a1 = vaddq_f32(const3, x2_1);
337  inner_a0 = vfmaq_f32(const2, x2_0, inner_a0);
338  inner_a1 = vfmaq_f32(const2, x2_1, inner_a1);
339  inner_a0 = vfmaq_f32(const1, x2_0, inner_a0);
340  inner_a1 = vfmaq_f32(const1, x2_1, inner_a1);
341  float32x4_t a0 = vmulq_f32(aVal0, inner_a0);
342  float32x4_t a1 = vmulq_f32(aVal1, inner_a1);
343 
344  // b = 135135 + x2 * (62370 + x2 * (3150 + x2 * 28)) using FMA
345  float32x4_t inner_b0 = vfmaq_f32(const5, x2_0, const6);
346  float32x4_t inner_b1 = vfmaq_f32(const5, x2_1, const6);
347  inner_b0 = vfmaq_f32(const4, x2_0, inner_b0);
348  inner_b1 = vfmaq_f32(const4, x2_1, inner_b1);
349  float32x4_t b0 = vfmaq_f32(const1, x2_0, inner_b0);
350  float32x4_t b1 = vfmaq_f32(const1, x2_1, inner_b1);
351 
352  // c = a / b using native division
353  float32x4_t cVal0 = vdivq_f32(a0, b0);
354  float32x4_t cVal1 = vdivq_f32(a1, b1);
355 
356  vst1q_f32(cPtr, cVal0);
357  vst1q_f32(cPtr + 4, cVal1);
358  aPtr += 8;
359  cPtr += 8;
360  }
361 
362  number = eighthPoints * 8;
363  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
364 }
365 
366 #endif /* LV_HAVE_NEONV8 */
367 
368 #endif /* INCLUDED_volk_32f_tanh_32f_a_H */
369 
370 
371 #ifndef INCLUDED_volk_32f_tanh_32f_u_H
372 #define INCLUDED_volk_32f_tanh_32f_u_H
373 
374 #include <inttypes.h>
375 #include <math.h>
376 #include <stdio.h>
377 #include <string.h>
378 
379 
380 #ifdef LV_HAVE_SSE
381 #include <xmmintrin.h>
382 
383 static inline void
384 volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
385 {
386  unsigned int number = 0;
387  const unsigned int quarterPoints = num_points / 4;
388 
389  float* cPtr = cVector;
390  const float* aPtr = aVector;
391 
392  __m128 aVal, cVal, x2, a, b;
393  __m128 const1, const2, const3, const4, const5, const6;
394  const1 = _mm_set_ps1(135135.0f);
395  const2 = _mm_set_ps1(17325.0f);
396  const3 = _mm_set_ps1(378.0f);
397  const4 = _mm_set_ps1(62370.0f);
398  const5 = _mm_set_ps1(3150.0f);
399  const6 = _mm_set_ps1(28.0f);
400  for (; number < quarterPoints; number++) {
401 
402  aVal = _mm_loadu_ps(aPtr);
403  x2 = _mm_mul_ps(aVal, aVal);
404  a = _mm_mul_ps(
405  aVal,
406  _mm_add_ps(
407  const1,
408  _mm_mul_ps(x2,
409  _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
410  b = _mm_add_ps(
411  const1,
412  _mm_mul_ps(
413  x2,
414  _mm_add_ps(const4,
415  _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
416 
417  cVal = _mm_div_ps(a, b);
418 
419  _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
420 
421  aPtr += 4;
422  cPtr += 4;
423  }
424 
425  number = quarterPoints * 4;
426  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
427 }
428 #endif /* LV_HAVE_SSE */
429 
430 
431 #ifdef LV_HAVE_AVX
432 #include <immintrin.h>
433 
434 static inline void
435 volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
436 {
437  unsigned int number = 0;
438  const unsigned int eighthPoints = num_points / 8;
439 
440  float* cPtr = cVector;
441  const float* aPtr = aVector;
442 
443  __m256 aVal, cVal, x2, a, b;
444  __m256 const1, const2, const3, const4, const5, const6;
445  const1 = _mm256_set1_ps(135135.0f);
446  const2 = _mm256_set1_ps(17325.0f);
447  const3 = _mm256_set1_ps(378.0f);
448  const4 = _mm256_set1_ps(62370.0f);
449  const5 = _mm256_set1_ps(3150.0f);
450  const6 = _mm256_set1_ps(28.0f);
451  for (; number < eighthPoints; number++) {
452 
453  aVal = _mm256_loadu_ps(aPtr);
454  x2 = _mm256_mul_ps(aVal, aVal);
455  a = _mm256_mul_ps(
456  aVal,
457  _mm256_add_ps(
458  const1,
459  _mm256_mul_ps(
460  x2,
461  _mm256_add_ps(const2,
462  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
463  b = _mm256_add_ps(
464  const1,
465  _mm256_mul_ps(
466  x2,
467  _mm256_add_ps(
468  const4,
469  _mm256_mul_ps(x2,
470  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
471 
472  cVal = _mm256_div_ps(a, b);
473 
474  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
475 
476  aPtr += 8;
477  cPtr += 8;
478  }
479 
480  number = eighthPoints * 8;
481  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
482 }
483 #endif /* LV_HAVE_AVX */
484 
485 #if LV_HAVE_AVX && LV_HAVE_FMA
486 #include <immintrin.h>
487 
488 static inline void
489 volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
490 {
491  unsigned int number = 0;
492  const unsigned int eighthPoints = num_points / 8;
493 
494  float* cPtr = cVector;
495  const float* aPtr = aVector;
496 
497  __m256 aVal, cVal, x2, a, b;
498  __m256 const1, const2, const3, const4, const5, const6;
499  const1 = _mm256_set1_ps(135135.0f);
500  const2 = _mm256_set1_ps(17325.0f);
501  const3 = _mm256_set1_ps(378.0f);
502  const4 = _mm256_set1_ps(62370.0f);
503  const5 = _mm256_set1_ps(3150.0f);
504  const6 = _mm256_set1_ps(28.0f);
505  for (; number < eighthPoints; number++) {
506 
507  aVal = _mm256_loadu_ps(aPtr);
508  x2 = _mm256_mul_ps(aVal, aVal);
509  a = _mm256_mul_ps(
510  aVal,
511  _mm256_fmadd_ps(
512  x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
513  b = _mm256_fmadd_ps(
514  x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
515 
516  cVal = _mm256_div_ps(a, b);
517 
518  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
519 
520  aPtr += 8;
521  cPtr += 8;
522  }
523 
524  number = eighthPoints * 8;
525  volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
526 }
527 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
528 
529 #ifdef LV_HAVE_RVV
530 #include <riscv_vector.h>
531 
532 static inline void
533 volk_32f_tanh_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
534 {
535  size_t vlmax = __riscv_vsetvlmax_e32m2();
536 
537  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(135135.0f, vlmax);
538  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(17325.0f, vlmax);
539  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(378.0f, vlmax);
540  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(62370.0f, vlmax);
541  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3150.0f, vlmax);
542  const vfloat32m2_t c6 = __riscv_vfmv_v_f_f32m2(28.0f, vlmax);
543 
544  size_t n = num_points;
545  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
546  vl = __riscv_vsetvl_e32m2(n);
547  vfloat32m2_t x = __riscv_vle32_v_f32m2(aVector, vl);
548  vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
549  vfloat32m2_t a, b;
550  a = __riscv_vfadd(xx, c3, vl);
551  a = __riscv_vfmadd(a, xx, c2, vl);
552  a = __riscv_vfmadd(a, xx, c1, vl);
553  a = __riscv_vfmul(a, x, vl);
554  b = c6;
555  b = __riscv_vfmadd(b, xx, c5, vl);
556  b = __riscv_vfmadd(b, xx, c4, vl);
557  b = __riscv_vfmadd(b, xx, c1, vl);
558  __riscv_vse32(bVector, __riscv_vfdiv(a, b, vl), vl);
559  }
560 }
561 #endif /*LV_HAVE_RVV*/
562 
563 #endif /* INCLUDED_volk_32f_tanh_32f_u_H */
volk_32f_tanh_32f_a_sse
static void volk_32f_tanh_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:109
volk_32f_tanh_32f_a_avx
static void volk_32f_tanh_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:160
volk_32f_tanh_32f_u_avx
static void volk_32f_tanh_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:435
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_tanh_32f_neon
static void volk_32f_tanh_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:258
volk_32f_tanh_32f_series
static void volk_32f_tanh_32f_series(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:83
volk_32f_tanh_32f_u_sse
static void volk_32f_tanh_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:384
volk_32f_tanh_32f_generic
static void volk_32f_tanh_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:67