Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_x2_pow_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
61 #define INCLUDED_volk_32f_x2_pow_32f_a_H
62 
63 #include <inttypes.h>
64 #include <math.h>
65 #include <stdio.h>
66 #include <stdlib.h>
67 
68 #define POW_POLY_DEGREE 3
69 
70 #if LV_HAVE_AVX2 && LV_HAVE_FMA
71 #include <immintrin.h>
72 
73 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
74 #define POLY1_AVX2_FMA(x, c0, c1) \
75  _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
76 #define POLY2_AVX2_FMA(x, c0, c1, c2) \
77  _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
78 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
79  _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
80 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
81  _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
82 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
83  _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
84 
85 static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
86  const float* bVector,
87  const float* aVector,
88  unsigned int num_points)
89 {
90  float* cPtr = cVector;
91  const float* bPtr = bVector;
92  const float* aPtr = aVector;
93 
94  unsigned int number = 0;
95  const unsigned int eighthPoints = num_points / 8;
96 
97  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
98  __m256 tmp, fx, mask, pow2n, z, y;
99  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
100  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
101  __m256i bias, exp, emm0, pi32_0x7f;
102 
103  one = _mm256_set1_ps(1.0);
104  exp_hi = _mm256_set1_ps(88.3762626647949);
105  exp_lo = _mm256_set1_ps(-88.3762626647949);
106  ln2 = _mm256_set1_ps(0.6931471805);
107  log2EF = _mm256_set1_ps(1.44269504088896341);
108  half = _mm256_set1_ps(0.5);
109  exp_C1 = _mm256_set1_ps(0.693359375);
110  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
111  pi32_0x7f = _mm256_set1_epi32(0x7f);
112 
113  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
114  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
115  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
116  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
117  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
118  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
119 
120  for (; number < eighthPoints; number++) {
121  // First compute the logarithm
122  aVal = _mm256_load_ps(aPtr);
123  bias = _mm256_set1_epi32(127);
124  leadingOne = _mm256_set1_ps(1.0f);
125  exp = _mm256_sub_epi32(
126  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
127  _mm256_set1_epi32(0x7f800000)),
128  23),
129  bias);
130  logarithm = _mm256_cvtepi32_ps(exp);
131 
132  frac = _mm256_or_ps(
133  leadingOne,
134  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
135 
136 #if POW_POLY_DEGREE == 6
137  mantissa = POLY5_AVX2_FMA(frac,
138  3.1157899f,
139  -3.3241990f,
140  2.5988452f,
141  -1.2315303f,
142  3.1821337e-1f,
143  -3.4436006e-2f);
144 #elif POW_POLY_DEGREE == 5
145  mantissa = POLY4_AVX2_FMA(frac,
146  2.8882704548164776201f,
147  -2.52074962577807006663f,
148  1.48116647521213171641f,
149  -0.465725644288844778798f,
150  0.0596515482674574969533f);
151 #elif POW_POLY_DEGREE == 4
152  mantissa = POLY3_AVX2_FMA(frac,
153  2.61761038894603480148f,
154  -1.75647175389045657003f,
155  0.688243882994381274313f,
156  -0.107254423828329604454f);
157 #elif POW_POLY_DEGREE == 3
158  mantissa = POLY2_AVX2_FMA(frac,
159  2.28330284476918490682f,
160  -1.04913055217340124191f,
161  0.204446009836232697516f);
162 #else
163 #error
164 #endif
165 
166  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
167  logarithm = _mm256_mul_ps(logarithm, ln2);
168 
169  // Now calculate b*lna
170  bVal = _mm256_load_ps(bPtr);
171  bVal = _mm256_mul_ps(bVal, logarithm);
172 
173  // Now compute exp(b*lna)
174  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
175 
176  fx = _mm256_fmadd_ps(bVal, log2EF, half);
177 
178  emm0 = _mm256_cvttps_epi32(fx);
179  tmp = _mm256_cvtepi32_ps(emm0);
180 
181  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
182  fx = _mm256_sub_ps(tmp, mask);
183 
184  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
185  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
186  z = _mm256_mul_ps(bVal, bVal);
187 
188  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
189  y = _mm256_fmadd_ps(y, bVal, exp_p2);
190  y = _mm256_fmadd_ps(y, bVal, exp_p3);
191  y = _mm256_fmadd_ps(y, bVal, exp_p4);
192  y = _mm256_fmadd_ps(y, bVal, exp_p5);
193  y = _mm256_fmadd_ps(y, z, bVal);
194  y = _mm256_add_ps(y, one);
195 
196  emm0 =
197  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
198 
199  pow2n = _mm256_castsi256_ps(emm0);
200  cVal = _mm256_mul_ps(y, pow2n);
201 
202  _mm256_store_ps(cPtr, cVal);
203 
204  aPtr += 8;
205  bPtr += 8;
206  cPtr += 8;
207  }
208 
209  number = eighthPoints * 8;
210  for (; number < num_points; number++) {
211  *cPtr++ = pow(*aPtr++, *bPtr++);
212  }
213 }
214 
215 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
216 
217 #ifdef LV_HAVE_AVX2
218 #include <immintrin.h>
219 
220 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
221 #define POLY1_AVX2(x, c0, c1) \
222  _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
223 #define POLY2_AVX2(x, c0, c1, c2) \
224  _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
225 #define POLY3_AVX2(x, c0, c1, c2, c3) \
226  _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
227 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
228  _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
229 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
230  _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
231 
232 static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
233  const float* bVector,
234  const float* aVector,
235  unsigned int num_points)
236 {
237  float* cPtr = cVector;
238  const float* bPtr = bVector;
239  const float* aPtr = aVector;
240 
241  unsigned int number = 0;
242  const unsigned int eighthPoints = num_points / 8;
243 
244  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
245  __m256 tmp, fx, mask, pow2n, z, y;
246  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
247  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
248  __m256i bias, exp, emm0, pi32_0x7f;
249 
250  one = _mm256_set1_ps(1.0);
251  exp_hi = _mm256_set1_ps(88.3762626647949);
252  exp_lo = _mm256_set1_ps(-88.3762626647949);
253  ln2 = _mm256_set1_ps(0.6931471805);
254  log2EF = _mm256_set1_ps(1.44269504088896341);
255  half = _mm256_set1_ps(0.5);
256  exp_C1 = _mm256_set1_ps(0.693359375);
257  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
258  pi32_0x7f = _mm256_set1_epi32(0x7f);
259 
260  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
261  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
262  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
263  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
264  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
265  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
266 
267  for (; number < eighthPoints; number++) {
268  // First compute the logarithm
269  aVal = _mm256_load_ps(aPtr);
270  bias = _mm256_set1_epi32(127);
271  leadingOne = _mm256_set1_ps(1.0f);
272  exp = _mm256_sub_epi32(
273  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
274  _mm256_set1_epi32(0x7f800000)),
275  23),
276  bias);
277  logarithm = _mm256_cvtepi32_ps(exp);
278 
279  frac = _mm256_or_ps(
280  leadingOne,
281  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
282 
283 #if POW_POLY_DEGREE == 6
284  mantissa = POLY5_AVX2(frac,
285  3.1157899f,
286  -3.3241990f,
287  2.5988452f,
288  -1.2315303f,
289  3.1821337e-1f,
290  -3.4436006e-2f);
291 #elif POW_POLY_DEGREE == 5
292  mantissa = POLY4_AVX2(frac,
293  2.8882704548164776201f,
294  -2.52074962577807006663f,
295  1.48116647521213171641f,
296  -0.465725644288844778798f,
297  0.0596515482674574969533f);
298 #elif POW_POLY_DEGREE == 4
299  mantissa = POLY3_AVX2(frac,
300  2.61761038894603480148f,
301  -1.75647175389045657003f,
302  0.688243882994381274313f,
303  -0.107254423828329604454f);
304 #elif POW_POLY_DEGREE == 3
305  mantissa = POLY2_AVX2(frac,
306  2.28330284476918490682f,
307  -1.04913055217340124191f,
308  0.204446009836232697516f);
309 #else
310 #error
311 #endif
312 
313  logarithm = _mm256_add_ps(
314  _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
315  logarithm = _mm256_mul_ps(logarithm, ln2);
316 
317  // Now calculate b*lna
318  bVal = _mm256_load_ps(bPtr);
319  bVal = _mm256_mul_ps(bVal, logarithm);
320 
321  // Now compute exp(b*lna)
322  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
323 
324  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
325 
326  emm0 = _mm256_cvttps_epi32(fx);
327  tmp = _mm256_cvtepi32_ps(emm0);
328 
329  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
330  fx = _mm256_sub_ps(tmp, mask);
331 
332  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
333  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
334  z = _mm256_mul_ps(bVal, bVal);
335 
336  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
337  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
338  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
339  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
340  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
341  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
342  y = _mm256_add_ps(y, one);
343 
344  emm0 =
345  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
346 
347  pow2n = _mm256_castsi256_ps(emm0);
348  cVal = _mm256_mul_ps(y, pow2n);
349 
350  _mm256_store_ps(cPtr, cVal);
351 
352  aPtr += 8;
353  bPtr += 8;
354  cPtr += 8;
355  }
356 
357  number = eighthPoints * 8;
358  for (; number < num_points; number++) {
359  *cPtr++ = pow(*aPtr++, *bPtr++);
360  }
361 }
362 
363 #endif /* LV_HAVE_AVX2 for aligned */
364 
365 
366 #ifdef LV_HAVE_SSE4_1
367 #include <smmintrin.h>
368 
369 #define POLY0(x, c0) _mm_set1_ps(c0)
370 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
371 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
372 #define POLY3(x, c0, c1, c2, c3) \
373  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
374 #define POLY4(x, c0, c1, c2, c3, c4) \
375  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
376 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
377  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
378 
379 static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
380  const float* bVector,
381  const float* aVector,
382  unsigned int num_points)
383 {
384  float* cPtr = cVector;
385  const float* bPtr = bVector;
386  const float* aPtr = aVector;
387 
388  unsigned int number = 0;
389  const unsigned int quarterPoints = num_points / 4;
390 
391  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
392  __m128 tmp, fx, mask, pow2n, z, y;
393  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
394  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
395  __m128i bias, exp, emm0, pi32_0x7f;
396 
397  one = _mm_set1_ps(1.0);
398  exp_hi = _mm_set1_ps(88.3762626647949);
399  exp_lo = _mm_set1_ps(-88.3762626647949);
400  ln2 = _mm_set1_ps(0.6931471805);
401  log2EF = _mm_set1_ps(1.44269504088896341);
402  half = _mm_set1_ps(0.5);
403  exp_C1 = _mm_set1_ps(0.693359375);
404  exp_C2 = _mm_set1_ps(-2.12194440e-4);
405  pi32_0x7f = _mm_set1_epi32(0x7f);
406 
407  exp_p0 = _mm_set1_ps(1.9875691500e-4);
408  exp_p1 = _mm_set1_ps(1.3981999507e-3);
409  exp_p2 = _mm_set1_ps(8.3334519073e-3);
410  exp_p3 = _mm_set1_ps(4.1665795894e-2);
411  exp_p4 = _mm_set1_ps(1.6666665459e-1);
412  exp_p5 = _mm_set1_ps(5.0000001201e-1);
413 
414  for (; number < quarterPoints; number++) {
415  // First compute the logarithm
416  aVal = _mm_load_ps(aPtr);
417  bias = _mm_set1_epi32(127);
418  leadingOne = _mm_set1_ps(1.0f);
419  exp = _mm_sub_epi32(
420  _mm_srli_epi32(
421  _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
422  bias);
423  logarithm = _mm_cvtepi32_ps(exp);
424 
425  frac = _mm_or_ps(leadingOne,
426  _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
427 
428 #if POW_POLY_DEGREE == 6
429  mantissa = POLY5(frac,
430  3.1157899f,
431  -3.3241990f,
432  2.5988452f,
433  -1.2315303f,
434  3.1821337e-1f,
435  -3.4436006e-2f);
436 #elif POW_POLY_DEGREE == 5
437  mantissa = POLY4(frac,
438  2.8882704548164776201f,
439  -2.52074962577807006663f,
440  1.48116647521213171641f,
441  -0.465725644288844778798f,
442  0.0596515482674574969533f);
443 #elif POW_POLY_DEGREE == 4
444  mantissa = POLY3(frac,
445  2.61761038894603480148f,
446  -1.75647175389045657003f,
447  0.688243882994381274313f,
448  -0.107254423828329604454f);
449 #elif POW_POLY_DEGREE == 3
450  mantissa = POLY2(frac,
451  2.28330284476918490682f,
452  -1.04913055217340124191f,
453  0.204446009836232697516f);
454 #else
455 #error
456 #endif
457 
458  logarithm =
459  _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
460  logarithm = _mm_mul_ps(logarithm, ln2);
461 
462 
463  // Now calculate b*lna
464  bVal = _mm_load_ps(bPtr);
465  bVal = _mm_mul_ps(bVal, logarithm);
466 
467  // Now compute exp(b*lna)
468  bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
469 
470  fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
471 
472  emm0 = _mm_cvttps_epi32(fx);
473  tmp = _mm_cvtepi32_ps(emm0);
474 
475  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
476  fx = _mm_sub_ps(tmp, mask);
477 
478  tmp = _mm_mul_ps(fx, exp_C1);
479  z = _mm_mul_ps(fx, exp_C2);
480  bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
481  z = _mm_mul_ps(bVal, bVal);
482 
483  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
484  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
485  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
486  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
487  y = _mm_add_ps(y, one);
488 
489  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
490 
491  pow2n = _mm_castsi128_ps(emm0);
492  cVal = _mm_mul_ps(y, pow2n);
493 
494  _mm_store_ps(cPtr, cVal);
495 
496  aPtr += 4;
497  bPtr += 4;
498  cPtr += 4;
499  }
500 
501  number = quarterPoints * 4;
502  for (; number < num_points; number++) {
503  *cPtr++ = powf(*aPtr++, *bPtr++);
504  }
505 }
506 
507 #endif /* LV_HAVE_SSE4_1 for aligned */
508 
509 #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
510 
511 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
512 #define INCLUDED_volk_32f_x2_pow_32f_u_H
513 
514 #include <inttypes.h>
515 #include <math.h>
516 #include <stdio.h>
517 #include <stdlib.h>
518 
519 #define POW_POLY_DEGREE 3
520 
521 #ifdef LV_HAVE_GENERIC
522 
523 static inline void volk_32f_x2_pow_32f_generic(float* cVector,
524  const float* bVector,
525  const float* aVector,
526  unsigned int num_points)
527 {
528  float* cPtr = cVector;
529  const float* bPtr = bVector;
530  const float* aPtr = aVector;
531  unsigned int number = 0;
532 
533  for (number = 0; number < num_points; number++) {
534  *cPtr++ = powf(*aPtr++, *bPtr++);
535  }
536 }
537 #endif /* LV_HAVE_GENERIC */
538 
539 #ifdef LV_HAVE_NEON
540 #include <arm_neon.h>
541 
542 static inline void volk_32f_x2_pow_32f_neon(float* cVector,
543  const float* bVector,
544  const float* aVector,
545  unsigned int num_points)
546 {
547  float* cPtr = cVector;
548  const float* bPtr = bVector;
549  const float* aPtr = aVector;
550 
551  unsigned int number = 0;
552  const unsigned int quarterPoints = num_points / 4;
553 
554  // Constants
555  float32x4_t one = vdupq_n_f32(1.0f);
556  float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
557  float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
558  float32x4_t ln2 = vdupq_n_f32(0.6931471805f);
559  float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
560  float32x4_t half = vdupq_n_f32(0.5f);
561  float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
562  float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
563  int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
564  int32x4_t bias = vdupq_n_s32(127);
565  int32x4_t mantMask = vdupq_n_s32(0x7fffff);
566  int32x4_t expMask = vdupq_n_s32(0x7f800000);
567 
568  // Polynomial coefficients for log
569  float32x4_t log_c0 = vdupq_n_f32(2.28330284476918490682f);
570  float32x4_t log_c1 = vdupq_n_f32(-1.04913055217340124191f);
571  float32x4_t log_c2 = vdupq_n_f32(0.204446009836232697516f);
572 
573  // Polynomial coefficients for exp
574  float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
575  float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
576  float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
577  float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
578  float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
579  float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
580 
581  for (; number < quarterPoints; number++) {
582  float32x4_t aVal = vld1q_f32(aPtr);
583 
584  // First compute log(a)
585  int32x4_t aInt = vreinterpretq_s32_f32(aVal);
586  int32x4_t expPart = vsubq_s32(vshrq_n_s32(vandq_s32(aInt, expMask), 23), bias);
587  float32x4_t logarithm = vcvtq_f32_s32(expPart);
588 
589  int32x4_t mantPart =
590  vorrq_s32(vandq_s32(aInt, mantMask), vreinterpretq_s32_f32(one));
591  float32x4_t frac = vreinterpretq_f32_s32(mantPart);
592 
593  // Polynomial for log mantissa (degree 3)
594  float32x4_t mantissa = vmlaq_f32(log_c1, log_c2, frac);
595  mantissa = vmlaq_f32(log_c0, mantissa, frac);
596 
597  float32x4_t fracMinusOne = vsubq_f32(frac, one);
598  logarithm = vmlaq_f32(logarithm, mantissa, fracMinusOne);
599  logarithm = vmulq_f32(logarithm, ln2);
600 
601  // Now calculate b*log(a)
602  float32x4_t bVal = vld1q_f32(bPtr);
603  bVal = vmulq_f32(bVal, logarithm);
604 
605  // Now compute exp(b*log(a))
606  bVal = vmaxq_f32(vminq_f32(bVal, exp_hi), exp_lo);
607 
608  float32x4_t fx = vmlaq_f32(half, bVal, log2EF);
609 
610  int32x4_t emm0 = vcvtq_s32_f32(fx);
611  float32x4_t tmp = vcvtq_f32_s32(emm0);
612 
613  uint32x4_t mask = vcgtq_f32(tmp, fx);
614  float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
615  fx = vsubq_f32(tmp, mask_one);
616 
617  tmp = vmulq_f32(fx, exp_C1);
618  float32x4_t z = vmulq_f32(fx, exp_C2);
619  bVal = vsubq_f32(vsubq_f32(bVal, tmp), z);
620  z = vmulq_f32(bVal, bVal);
621 
622  float32x4_t y = vmlaq_f32(exp_p1, exp_p0, bVal);
623  y = vmulq_f32(y, bVal);
624  y = vaddq_f32(y, exp_p2);
625  y = vmulq_f32(y, bVal);
626  y = vaddq_f32(y, exp_p3);
627  y = vmlaq_f32(exp_p4, y, bVal);
628  y = vmulq_f32(y, bVal);
629  y = vaddq_f32(y, exp_p5);
630  y = vmlaq_f32(bVal, y, z);
631  y = vaddq_f32(y, one);
632 
633  emm0 = vcvtq_s32_f32(fx);
634  emm0 = vaddq_s32(emm0, pi32_0x7f);
635  emm0 = vshlq_n_s32(emm0, 23);
636  float32x4_t pow2n = vreinterpretq_f32_s32(emm0);
637 
638  float32x4_t cVal = vmulq_f32(y, pow2n);
639  vst1q_f32(cPtr, cVal);
640 
641  aPtr += 4;
642  bPtr += 4;
643  cPtr += 4;
644  }
645 
646  number = quarterPoints * 4;
647  for (; number < num_points; number++) {
648  *cPtr++ = powf(*aPtr++, *bPtr++);
649  }
650 }
651 
652 #endif /* LV_HAVE_NEON */
653 
654 #ifdef LV_HAVE_NEONV8
655 #include <arm_neon.h>
656 
657 static inline void volk_32f_x2_pow_32f_neonv8(float* cVector,
658  const float* bVector,
659  const float* aVector,
660  unsigned int num_points)
661 {
662  float* cPtr = cVector;
663  const float* bPtr = bVector;
664  const float* aPtr = aVector;
665 
666  unsigned int number = 0;
667  const unsigned int quarterPoints = num_points / 4;
668 
669  // Constants
670  float32x4_t one = vdupq_n_f32(1.0f);
671  float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
672  float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
673  float32x4_t ln2 = vdupq_n_f32(0.6931471805f);
674  float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
675  float32x4_t half = vdupq_n_f32(0.5f);
676  float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
677  float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
678  int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
679  int32x4_t bias = vdupq_n_s32(127);
680  int32x4_t mantMask = vdupq_n_s32(0x7fffff);
681  int32x4_t expMask = vdupq_n_s32(0x7f800000);
682 
683  // Polynomial coefficients for log
684  float32x4_t log_c0 = vdupq_n_f32(2.28330284476918490682f);
685  float32x4_t log_c1 = vdupq_n_f32(-1.04913055217340124191f);
686  float32x4_t log_c2 = vdupq_n_f32(0.204446009836232697516f);
687 
688  // Polynomial coefficients for exp
689  float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
690  float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
691  float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
692  float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
693  float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
694  float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
695 
696  for (; number < quarterPoints; number++) {
697  __VOLK_PREFETCH(aPtr + 8);
698  __VOLK_PREFETCH(bPtr + 8);
699 
700  float32x4_t aVal = vld1q_f32(aPtr);
701 
702  // First compute log(a)
703  int32x4_t aInt = vreinterpretq_s32_f32(aVal);
704  int32x4_t expPart = vsubq_s32(vshrq_n_s32(vandq_s32(aInt, expMask), 23), bias);
705  float32x4_t logarithm = vcvtq_f32_s32(expPart);
706 
707  int32x4_t mantPart =
708  vorrq_s32(vandq_s32(aInt, mantMask), vreinterpretq_s32_f32(one));
709  float32x4_t frac = vreinterpretq_f32_s32(mantPart);
710 
711  // Polynomial for log mantissa (degree 3)
712  float32x4_t mantissa = vfmaq_f32(log_c1, log_c2, frac);
713  mantissa = vfmaq_f32(log_c0, mantissa, frac);
714 
715  float32x4_t fracMinusOne = vsubq_f32(frac, one);
716  logarithm = vfmaq_f32(logarithm, mantissa, fracMinusOne);
717  logarithm = vmulq_f32(logarithm, ln2);
718 
719  // Now calculate b*log(a)
720  float32x4_t bVal = vld1q_f32(bPtr);
721  bVal = vmulq_f32(bVal, logarithm);
722 
723  // Now compute exp(b*log(a))
724  bVal = vmaxq_f32(vminq_f32(bVal, exp_hi), exp_lo);
725 
726  float32x4_t fx = vfmaq_f32(half, bVal, log2EF);
727 
728  int32x4_t emm0 = vcvtq_s32_f32(fx);
729  float32x4_t tmp = vcvtq_f32_s32(emm0);
730 
731  uint32x4_t mask = vcgtq_f32(tmp, fx);
732  float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
733  fx = vsubq_f32(tmp, mask_one);
734 
735  tmp = vmulq_f32(fx, exp_C1);
736  float32x4_t z = vmulq_f32(fx, exp_C2);
737  bVal = vsubq_f32(vsubq_f32(bVal, tmp), z);
738  z = vmulq_f32(bVal, bVal);
739 
740  float32x4_t y = vfmaq_f32(exp_p1, exp_p0, bVal);
741  y = vmulq_f32(y, bVal);
742  y = vaddq_f32(y, exp_p2);
743  y = vmulq_f32(y, bVal);
744  y = vaddq_f32(y, exp_p3);
745  y = vfmaq_f32(exp_p4, y, bVal);
746  y = vmulq_f32(y, bVal);
747  y = vaddq_f32(y, exp_p5);
748  y = vfmaq_f32(bVal, y, z);
749  y = vaddq_f32(y, one);
750 
751  emm0 = vcvtq_s32_f32(fx);
752  emm0 = vaddq_s32(emm0, pi32_0x7f);
753  emm0 = vshlq_n_s32(emm0, 23);
754  float32x4_t pow2n = vreinterpretq_f32_s32(emm0);
755 
756  float32x4_t cVal = vmulq_f32(y, pow2n);
757  vst1q_f32(cPtr, cVal);
758 
759  aPtr += 4;
760  bPtr += 4;
761  cPtr += 4;
762  }
763 
764  number = quarterPoints * 4;
765  for (; number < num_points; number++) {
766  *cPtr++ = powf(*aPtr++, *bPtr++);
767  }
768 }
769 
770 #endif /* LV_HAVE_NEONV8 */
771 
772 #ifdef LV_HAVE_SSE4_1
773 #include <smmintrin.h>
774 
775 #define POLY0(x, c0) _mm_set1_ps(c0)
776 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
777 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
778 #define POLY3(x, c0, c1, c2, c3) \
779  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
780 #define POLY4(x, c0, c1, c2, c3, c4) \
781  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
782 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
783  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
784 
785 static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
786  const float* bVector,
787  const float* aVector,
788  unsigned int num_points)
789 {
790  float* cPtr = cVector;
791  const float* bPtr = bVector;
792  const float* aPtr = aVector;
793 
794  unsigned int number = 0;
795  const unsigned int quarterPoints = num_points / 4;
796 
797  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
798  __m128 tmp, fx, mask, pow2n, z, y;
799  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
800  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
801  __m128i bias, exp, emm0, pi32_0x7f;
802 
803  one = _mm_set1_ps(1.0);
804  exp_hi = _mm_set1_ps(88.3762626647949);
805  exp_lo = _mm_set1_ps(-88.3762626647949);
806  ln2 = _mm_set1_ps(0.6931471805);
807  log2EF = _mm_set1_ps(1.44269504088896341);
808  half = _mm_set1_ps(0.5);
809  exp_C1 = _mm_set1_ps(0.693359375);
810  exp_C2 = _mm_set1_ps(-2.12194440e-4);
811  pi32_0x7f = _mm_set1_epi32(0x7f);
812 
813  exp_p0 = _mm_set1_ps(1.9875691500e-4);
814  exp_p1 = _mm_set1_ps(1.3981999507e-3);
815  exp_p2 = _mm_set1_ps(8.3334519073e-3);
816  exp_p3 = _mm_set1_ps(4.1665795894e-2);
817  exp_p4 = _mm_set1_ps(1.6666665459e-1);
818  exp_p5 = _mm_set1_ps(5.0000001201e-1);
819 
820  for (; number < quarterPoints; number++) {
821  // First compute the logarithm
822  aVal = _mm_loadu_ps(aPtr);
823  bias = _mm_set1_epi32(127);
824  leadingOne = _mm_set1_ps(1.0f);
825  exp = _mm_sub_epi32(
826  _mm_srli_epi32(
827  _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
828  bias);
829  logarithm = _mm_cvtepi32_ps(exp);
830 
831  frac = _mm_or_ps(leadingOne,
832  _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
833 
834 #if POW_POLY_DEGREE == 6
835  mantissa = POLY5(frac,
836  3.1157899f,
837  -3.3241990f,
838  2.5988452f,
839  -1.2315303f,
840  3.1821337e-1f,
841  -3.4436006e-2f);
842 #elif POW_POLY_DEGREE == 5
843  mantissa = POLY4(frac,
844  2.8882704548164776201f,
845  -2.52074962577807006663f,
846  1.48116647521213171641f,
847  -0.465725644288844778798f,
848  0.0596515482674574969533f);
849 #elif POW_POLY_DEGREE == 4
850  mantissa = POLY3(frac,
851  2.61761038894603480148f,
852  -1.75647175389045657003f,
853  0.688243882994381274313f,
854  -0.107254423828329604454f);
855 #elif POW_POLY_DEGREE == 3
856  mantissa = POLY2(frac,
857  2.28330284476918490682f,
858  -1.04913055217340124191f,
859  0.204446009836232697516f);
860 #else
861 #error
862 #endif
863 
864  logarithm =
865  _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
866  logarithm = _mm_mul_ps(logarithm, ln2);
867 
868 
869  // Now calculate b*lna
870  bVal = _mm_loadu_ps(bPtr);
871  bVal = _mm_mul_ps(bVal, logarithm);
872 
873  // Now compute exp(b*lna)
874  bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
875 
876  fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
877 
878  emm0 = _mm_cvttps_epi32(fx);
879  tmp = _mm_cvtepi32_ps(emm0);
880 
881  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
882  fx = _mm_sub_ps(tmp, mask);
883 
884  tmp = _mm_mul_ps(fx, exp_C1);
885  z = _mm_mul_ps(fx, exp_C2);
886  bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
887  z = _mm_mul_ps(bVal, bVal);
888 
889  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
890  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
891  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
892  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
893  y = _mm_add_ps(y, one);
894 
895  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
896 
897  pow2n = _mm_castsi128_ps(emm0);
898  cVal = _mm_mul_ps(y, pow2n);
899 
900  _mm_storeu_ps(cPtr, cVal);
901 
902  aPtr += 4;
903  bPtr += 4;
904  cPtr += 4;
905  }
906 
907  number = quarterPoints * 4;
908  for (; number < num_points; number++) {
909  *cPtr++ = powf(*aPtr++, *bPtr++);
910  }
911 }
912 
913 #endif /* LV_HAVE_SSE4_1 for unaligned */
914 
915 #if LV_HAVE_AVX2 && LV_HAVE_FMA
916 #include <immintrin.h>
917 
918 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
919 #define POLY1_AVX2_FMA(x, c0, c1) \
920  _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
921 #define POLY2_AVX2_FMA(x, c0, c1, c2) \
922  _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
923 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
924  _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
925 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
926  _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
927 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
928  _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
929 
930 static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
931  const float* bVector,
932  const float* aVector,
933  unsigned int num_points)
934 {
935  float* cPtr = cVector;
936  const float* bPtr = bVector;
937  const float* aPtr = aVector;
938 
939  unsigned int number = 0;
940  const unsigned int eighthPoints = num_points / 8;
941 
942  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
943  __m256 tmp, fx, mask, pow2n, z, y;
944  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
945  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
946  __m256i bias, exp, emm0, pi32_0x7f;
947 
948  one = _mm256_set1_ps(1.0);
949  exp_hi = _mm256_set1_ps(88.3762626647949);
950  exp_lo = _mm256_set1_ps(-88.3762626647949);
951  ln2 = _mm256_set1_ps(0.6931471805);
952  log2EF = _mm256_set1_ps(1.44269504088896341);
953  half = _mm256_set1_ps(0.5);
954  exp_C1 = _mm256_set1_ps(0.693359375);
955  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
956  pi32_0x7f = _mm256_set1_epi32(0x7f);
957 
958  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
959  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
960  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
961  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
962  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
963  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
964 
965  for (; number < eighthPoints; number++) {
966  // First compute the logarithm
967  aVal = _mm256_loadu_ps(aPtr);
968  bias = _mm256_set1_epi32(127);
969  leadingOne = _mm256_set1_ps(1.0f);
970  exp = _mm256_sub_epi32(
971  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
972  _mm256_set1_epi32(0x7f800000)),
973  23),
974  bias);
975  logarithm = _mm256_cvtepi32_ps(exp);
976 
977  frac = _mm256_or_ps(
978  leadingOne,
979  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
980 
981 #if POW_POLY_DEGREE == 6
982  mantissa = POLY5_AVX2_FMA(frac,
983  3.1157899f,
984  -3.3241990f,
985  2.5988452f,
986  -1.2315303f,
987  3.1821337e-1f,
988  -3.4436006e-2f);
989 #elif POW_POLY_DEGREE == 5
990  mantissa = POLY4_AVX2_FMA(frac,
991  2.8882704548164776201f,
992  -2.52074962577807006663f,
993  1.48116647521213171641f,
994  -0.465725644288844778798f,
995  0.0596515482674574969533f);
996 #elif POW_POLY_DEGREE == 4
997  mantissa = POLY3_AVX2_FMA(frac,
998  2.61761038894603480148f,
999  -1.75647175389045657003f,
1000  0.688243882994381274313f,
1001  -0.107254423828329604454f);
1002 #elif POW_POLY_DEGREE == 3
1003  mantissa = POLY2_AVX2_FMA(frac,
1004  2.28330284476918490682f,
1005  -1.04913055217340124191f,
1006  0.204446009836232697516f);
1007 #else
1008 #error
1009 #endif
1010 
1011  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
1012  logarithm = _mm256_mul_ps(logarithm, ln2);
1013 
1014 
1015  // Now calculate b*lna
1016  bVal = _mm256_loadu_ps(bPtr);
1017  bVal = _mm256_mul_ps(bVal, logarithm);
1018 
1019  // Now compute exp(b*lna)
1020  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
1021 
1022  fx = _mm256_fmadd_ps(bVal, log2EF, half);
1023 
1024  emm0 = _mm256_cvttps_epi32(fx);
1025  tmp = _mm256_cvtepi32_ps(emm0);
1026 
1027  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
1028  fx = _mm256_sub_ps(tmp, mask);
1029 
1030  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
1031  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
1032  z = _mm256_mul_ps(bVal, bVal);
1033 
1034  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
1035  y = _mm256_fmadd_ps(y, bVal, exp_p2);
1036  y = _mm256_fmadd_ps(y, bVal, exp_p3);
1037  y = _mm256_fmadd_ps(y, bVal, exp_p4);
1038  y = _mm256_fmadd_ps(y, bVal, exp_p5);
1039  y = _mm256_fmadd_ps(y, z, bVal);
1040  y = _mm256_add_ps(y, one);
1041 
1042  emm0 =
1043  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
1044 
1045  pow2n = _mm256_castsi256_ps(emm0);
1046  cVal = _mm256_mul_ps(y, pow2n);
1047 
1048  _mm256_storeu_ps(cPtr, cVal);
1049 
1050  aPtr += 8;
1051  bPtr += 8;
1052  cPtr += 8;
1053  }
1054 
1055  number = eighthPoints * 8;
1056  for (; number < num_points; number++) {
1057  *cPtr++ = pow(*aPtr++, *bPtr++);
1058  }
1059 }
1060 
1061 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
1062 
1063 #ifdef LV_HAVE_AVX2
1064 #include <immintrin.h>
1065 
1066 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
1067 #define POLY1_AVX2(x, c0, c1) \
1068  _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
1069 #define POLY2_AVX2(x, c0, c1, c2) \
1070  _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
1071 #define POLY3_AVX2(x, c0, c1, c2, c3) \
1072  _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
1073 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
1074  _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
1075 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
1076  _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
1077 
1078 static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
1079  const float* bVector,
1080  const float* aVector,
1081  unsigned int num_points)
1082 {
1083  float* cPtr = cVector;
1084  const float* bPtr = bVector;
1085  const float* aPtr = aVector;
1086 
1087  unsigned int number = 0;
1088  const unsigned int eighthPoints = num_points / 8;
1089 
1090  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
1091  __m256 tmp, fx, mask, pow2n, z, y;
1092  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
1093  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
1094  __m256i bias, exp, emm0, pi32_0x7f;
1095 
1096  one = _mm256_set1_ps(1.0);
1097  exp_hi = _mm256_set1_ps(88.3762626647949);
1098  exp_lo = _mm256_set1_ps(-88.3762626647949);
1099  ln2 = _mm256_set1_ps(0.6931471805);
1100  log2EF = _mm256_set1_ps(1.44269504088896341);
1101  half = _mm256_set1_ps(0.5);
1102  exp_C1 = _mm256_set1_ps(0.693359375);
1103  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
1104  pi32_0x7f = _mm256_set1_epi32(0x7f);
1105 
1106  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
1107  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
1108  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
1109  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
1110  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
1111  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
1112 
1113  for (; number < eighthPoints; number++) {
1114  // First compute the logarithm
1115  aVal = _mm256_loadu_ps(aPtr);
1116  bias = _mm256_set1_epi32(127);
1117  leadingOne = _mm256_set1_ps(1.0f);
1118  exp = _mm256_sub_epi32(
1119  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
1120  _mm256_set1_epi32(0x7f800000)),
1121  23),
1122  bias);
1123  logarithm = _mm256_cvtepi32_ps(exp);
1124 
1125  frac = _mm256_or_ps(
1126  leadingOne,
1127  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
1128 
1129 #if POW_POLY_DEGREE == 6
1130  mantissa = POLY5_AVX2(frac,
1131  3.1157899f,
1132  -3.3241990f,
1133  2.5988452f,
1134  -1.2315303f,
1135  3.1821337e-1f,
1136  -3.4436006e-2f);
1137 #elif POW_POLY_DEGREE == 5
1138  mantissa = POLY4_AVX2(frac,
1139  2.8882704548164776201f,
1140  -2.52074962577807006663f,
1141  1.48116647521213171641f,
1142  -0.465725644288844778798f,
1143  0.0596515482674574969533f);
1144 #elif POW_POLY_DEGREE == 4
1145  mantissa = POLY3_AVX2(frac,
1146  2.61761038894603480148f,
1147  -1.75647175389045657003f,
1148  0.688243882994381274313f,
1149  -0.107254423828329604454f);
1150 #elif POW_POLY_DEGREE == 3
1151  mantissa = POLY2_AVX2(frac,
1152  2.28330284476918490682f,
1153  -1.04913055217340124191f,
1154  0.204446009836232697516f);
1155 #else
1156 #error
1157 #endif
1158 
1159  logarithm = _mm256_add_ps(
1160  _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
1161  logarithm = _mm256_mul_ps(logarithm, ln2);
1162 
1163  // Now calculate b*lna
1164  bVal = _mm256_loadu_ps(bPtr);
1165  bVal = _mm256_mul_ps(bVal, logarithm);
1166 
1167  // Now compute exp(b*lna)
1168  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
1169 
1170  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
1171 
1172  emm0 = _mm256_cvttps_epi32(fx);
1173  tmp = _mm256_cvtepi32_ps(emm0);
1174 
1175  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
1176  fx = _mm256_sub_ps(tmp, mask);
1177 
1178  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
1179  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
1180  z = _mm256_mul_ps(bVal, bVal);
1181 
1182  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
1183  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
1184  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
1185  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
1186  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
1187  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
1188  y = _mm256_add_ps(y, one);
1189 
1190  emm0 =
1191  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
1192 
1193  pow2n = _mm256_castsi256_ps(emm0);
1194  cVal = _mm256_mul_ps(y, pow2n);
1195 
1196  _mm256_storeu_ps(cPtr, cVal);
1197 
1198  aPtr += 8;
1199  bPtr += 8;
1200  cPtr += 8;
1201  }
1202 
1203  number = eighthPoints * 8;
1204  for (; number < num_points; number++) {
1205  *cPtr++ = pow(*aPtr++, *bPtr++);
1206  }
1207 }
1208 
1209 #endif /* LV_HAVE_AVX2 for unaligned */
1210 
1211 #ifdef LV_HAVE_RVV
1212 #include <riscv_vector.h>
1213 
1214 static inline void volk_32f_x2_pow_32f_rvv(float* cVector,
1215  const float* bVector,
1216  const float* aVector,
1217  unsigned int num_points)
1218 {
1219  size_t vlmax = __riscv_vsetvlmax_e32m1();
1220 
1221 #if POW_POLY_DEGREE == 6
1222  const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax);
1223  const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax);
1224  const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax);
1225  const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax);
1226  const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax);
1227  const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax);
1228 #elif POW_POLY_DEGREE == 5
1229  const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax);
1230  const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax);
1231  const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax);
1232  const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax);
1233  const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax);
1234 #elif POW_POLY_DEGREE == 4
1235  const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax);
1236  const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax);
1237  const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax);
1238  const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax);
1239 #elif POW_POLY_DEGREE == 3
1240  const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax);
1241  const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax);
1242  const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax);
1243 #else
1244 #error
1245 #endif
1246 
1247  const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax);
1248  const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax);
1249  const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax);
1250  const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax);
1251  const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax);
1252  const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax);
1253  const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax);
1254  const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax);
1255 
1256  const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax);
1257  const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax);
1258  const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax);
1259  const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax);
1260  const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax);
1261  const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax);
1262 
1263  const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1);
1264  const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax);
1265  const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax);
1266 
1267  size_t n = num_points;
1268  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
1269  vl = __riscv_vsetvl_e32m1(n);
1270  vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl);
1271  vfloat32m1_t log;
1272 
1273  { /* log(a) */
1274  vfloat32m1_t a = __riscv_vfabs(va, vl);
1275  vfloat32m1_t exp = __riscv_vfcvt_f(
1276  __riscv_vsub(
1277  __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl),
1278  vl);
1279  vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor(
1280  __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl));
1281 
1282  vfloat32m1_t mant = cl0;
1283  mant = __riscv_vfmadd(mant, frac, cl1, vl);
1284  mant = __riscv_vfmadd(mant, frac, cl2, vl);
1285 #if POW_POLY_DEGREE >= 4
1286  mant = __riscv_vfmadd(mant, frac, cl3, vl);
1287 #if POW_POLY_DEGREE >= 5
1288  mant = __riscv_vfmadd(mant, frac, cl4, vl);
1289 #if POW_POLY_DEGREE >= 6
1290  mant = __riscv_vfmadd(mant, frac, cl5, vl);
1291 #endif
1292 #endif
1293 #endif
1294  log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
1295  log = __riscv_vfmul(log, ln2, vl);
1296  }
1297 
1298  vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl);
1299  vb = __riscv_vfmul(vb, log, vl); /* b*log(a) */
1300  vfloat32m1_t exp;
1301 
1302  { /* exp(b*log(a)) */
1303  vb = __riscv_vfmin(vb, exp_hi, vl);
1304  vb = __riscv_vfmax(vb, exp_lo, vl);
1305  vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl);
1306 
1307  vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
1308  fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
1309  vb = __riscv_vfmacc(vb, exp_C1, fx, vl);
1310  vb = __riscv_vfmacc(vb, exp_C2, fx, vl);
1311  vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl);
1312 
1313  vfloat32m1_t y = ce0;
1314  y = __riscv_vfmadd(y, vb, ce1, vl);
1315  y = __riscv_vfmadd(y, vb, ce2, vl);
1316  y = __riscv_vfmadd(y, vb, ce3, vl);
1317  y = __riscv_vfmadd(y, vb, ce4, vl);
1318  y = __riscv_vfmadd(y, vb, ce5, vl);
1319  y = __riscv_vfmadd(y, vv, vb, vl);
1320  y = __riscv_vfadd(y, cf1, vl);
1321 
1322  vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll(
1323  __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl));
1324 
1325  exp = __riscv_vfmul(y, pow2n, vl);
1326  }
1327 
1328  __riscv_vse32(cVector, exp, vl);
1329  }
1330 }
1331 
1332 #endif /*LV_HAVE_RVV*/
1333 
1334 #endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_x2_pow_32f_generic
static void volk_32f_x2_pow_32f_generic(float *cVector, const float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_x2_pow_32f.h:523
volk_32f_x2_pow_32f_neon
static void volk_32f_x2_pow_32f_neon(float *cVector, const float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_x2_pow_32f.h:542