Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  * Copyright 2025, 2026 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
11 /*
12  * Copyright (c) 2016-2019 ARM Limited.
13  *
14  * SPDX-License-Identifier: MIT
15  *
16  * Permission is hereby granted, free of charge, to any person obtaining a copy
17  * of this software and associated documentation files (the "Software"), to
18  * deal in the Software without restriction, including without limitation the
19  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
20  * sell copies of the Software, and to permit persons to whom the Software is
21  * furnished to do so, subject to the following conditions:
22  *
23  * The above copyright notice and this permission notice shall be included in all
24  * copies or substantial portions of the Software.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
31  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  *
34  * _vtaylor_polyq_f32
35  * _vlogq_f32
36  *
37  */
38 
39 /* Copyright (C) 2011 Julien Pommier
40 
41  This software is provided 'as-is', without any express or implied
42  warranty. In no event will the authors be held liable for any damages
43  arising from the use of this software.
44 
45  Permission is granted to anyone to use this software for any purpose,
46  including commercial applications, and to alter it and redistribute it
47  freely, subject to the following restrictions:
48 
49  1. The origin of this software must not be misrepresented; you must not
50  claim that you wrote the original software. If you use this software
51  in a product, an acknowledgment in the product documentation would be
52  appreciated but is not required.
53  2. Altered source versions must be plainly marked as such, and must not be
54  misrepresented as being the original software.
55  3. This notice may not be removed or altered from any source distribution.
56 
57  (this is the zlib license)
58 
59  _vsincosq_f32
60 
61 */
62 
63 /*
64  * This file is intended to hold NEON intrinsics of intrinsics.
65  * They should be used in VOLK kernels to avoid copy-pasta.
66  */
67 
68 #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
69 #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
70 #include <arm_neon.h>
71 
72 
73 /* Magnitude squared for float32x4x2_t */
74 static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
75 {
76  float32x4_t iValue, qValue, result;
77  iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
78  qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
79  result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
80  return result;
81 }
82 
83 /* Inverse square root for float32x4_t
84  * Handles edge cases: +0 → +Inf, +Inf → 0 */
85 static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
86 {
87  float32x4_t x0 = vrsqrteq_f32(x); // +Inf for +0, 0 for +Inf
88 
89  // Newton-Raphson refinement using vrsqrtsq_f32
90  float32x4_t x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x0), x0), x0);
91  x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x1), x1), x1);
92 
93  // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
94  // Blend: use x0 where x == +0 or x == +Inf, else use x1
95  uint32x4_t x_bits = vreinterpretq_u32_f32(x);
96  uint32x4_t zero_mask = vceqq_u32(x_bits, vdupq_n_u32(0x00000000));
97  uint32x4_t inf_mask = vceqq_u32(x_bits, vdupq_n_u32(0x7F800000));
98  uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);
99  return vbslq_f32(special_mask, x0, x1);
100 }
101 
102 /* Square root for ARMv7 NEON (no vsqrtq_f32)
103  * Uses sqrt(x) = x * rsqrt(x) with refined rsqrt
104  * Handles x=0 case to avoid NaN from 0 * inf */
105 static inline float32x4_t _vsqrtq_f32(float32x4_t x)
106 {
107  const float32x4_t zero = vdupq_n_f32(0.0f);
108  uint32x4_t zero_mask = vceqq_f32(x, zero);
109  float32x4_t result = vmulq_f32(x, _vinvsqrtq_f32(x));
110  return vbslq_f32(zero_mask, zero, result);
111 }
112 
113 /* Inverse */
114 static inline float32x4_t _vinvq_f32(float32x4_t x)
115 {
116  // Newton's method
117  float32x4_t recip = vrecpeq_f32(x);
118  recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
119  recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
120  return recip;
121 }
122 
123 /*
124  * Approximate arcsin(x) via polynomial expansion
125  * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
126  *
127  * Maximum relative error ~1.5e-6
128  * Polynomial evaluated via Horner's method
129  */
130 static inline float32x4_t _varcsinq_f32(float32x4_t x)
131 {
132  const float32x4_t c0 = vdupq_n_f32(0x1.ffffcep-1f);
133  const float32x4_t c1 = vdupq_n_f32(0x1.55b648p-3f);
134  const float32x4_t c2 = vdupq_n_f32(0x1.24d192p-4f);
135  const float32x4_t c3 = vdupq_n_f32(0x1.0a788p-4f);
136 
137  const float32x4_t u = vmulq_f32(x, x);
138  float32x4_t p = c3;
139  p = vmlaq_f32(c2, u, p);
140  p = vmlaq_f32(c1, u, p);
141  p = vmlaq_f32(c0, u, p);
142 
143  return vmulq_f32(x, p);
144 }
145 
146 #ifdef LV_HAVE_NEONV8
147 /*
148  * Approximate arcsin(x) via polynomial expansion (NEONv8 with FMA)
149  * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
150  *
151  * Maximum relative error ~1.5e-6
152  * Polynomial evaluated via Horner's method
153  */
154 static inline float32x4_t _varcsinq_f32_neonv8(float32x4_t x)
155 {
156  const float32x4_t c0 = vdupq_n_f32(0x1.ffffcep-1f);
157  const float32x4_t c1 = vdupq_n_f32(0x1.55b648p-3f);
158  const float32x4_t c2 = vdupq_n_f32(0x1.24d192p-4f);
159  const float32x4_t c3 = vdupq_n_f32(0x1.0a788p-4f);
160 
161  const float32x4_t u = vmulq_f32(x, x);
162  float32x4_t p = c3;
163  p = vfmaq_f32(c2, u, p);
164  p = vfmaq_f32(c1, u, p);
165  p = vfmaq_f32(c0, u, p);
166 
167  return vmulq_f32(x, p);
168 }
169 #endif /* LV_HAVE_NEONV8 */
170 
171 /* Complex multiplication for float32x4x2_t */
172 static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
173  float32x4x2_t b_val)
174 {
175  float32x4x2_t tmp_real;
176  float32x4x2_t tmp_imag;
177  float32x4x2_t c_val;
178 
179  // multiply the real*real and imag*imag to get real result
180  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
181  tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
182  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
183  tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
184  // Multiply cross terms to get the imaginary result
185  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
186  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
187  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
188  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
189  // combine the products
190  c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
191  c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
192  return c_val;
193 }
194 
195 /* From ARM Compute Library, MIT license */
196 static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
197 {
198  float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
199  float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
200  float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
201  float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
202  float32x4_t x2 = vmulq_f32(x, x);
203  float32x4_t x4 = vmulq_f32(x2, x2);
204  float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
205  return res;
206 }
207 
208 /* Natural logarithm.
209  * From ARM Compute Library, MIT license */
210 static inline float32x4_t _vlogq_f32(float32x4_t x)
211 {
212  const float32x4_t log_tab[8] = {
213  vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
214  vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
215  vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f),
216  vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f),
217  };
218 
219  const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
220  const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
221 
222  // Extract exponent
223  int32x4_t m = vsubq_s32(
224  vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
225  float32x4_t val =
226  vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
227 
228  // Polynomial Approximation
229  float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
230 
231  // Reconstruct
232  poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
233 
234  return poly;
235 }
236 
237 /* Evaluation of 4 sines & cosines at once.
238  * Optimized from here (zlib license)
239  * http://gruntthepeon.free.fr/ssemath/ */
240 static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
241 {
242  const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
243  const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
244  const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
245  const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
246  const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
247  const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
248  const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
249  const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
250  const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
251  const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
252 
253  const float32x4_t CONST_1 = vdupq_n_f32(1.f);
254  const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
255  const float32x4_t CONST_0 = vdupq_n_f32(0.f);
256  const uint32x4_t CONST_2 = vdupq_n_u32(2);
257  const uint32x4_t CONST_4 = vdupq_n_u32(4);
258 
259  uint32x4_t emm2;
260 
261  uint32x4_t sign_mask_sin, sign_mask_cos;
262  sign_mask_sin = vcltq_f32(x, CONST_0);
263  x = vabsq_f32(x);
264  // scale by 4/pi
265  float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
266 
267  // store the integer part of y in mm0
268  emm2 = vcvtq_u32_f32(y);
269  /* j=(j+1) & (~1) (see the cephes sources) */
270  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
271  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
272  y = vcvtq_f32_u32(emm2);
273 
274  /* get the polynom selection mask
275  there is one polynom for 0 <= x <= Pi/4
276  and another one for Pi/4<x<=Pi/2
277  Both branches will be computed. */
278  const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
279 
280  // The magic pass: "Extended precision modular arithmetic"
281  x = vmlaq_f32(x, y, c_minus_cephes_DP1);
282  x = vmlaq_f32(x, y, c_minus_cephes_DP2);
283  x = vmlaq_f32(x, y, c_minus_cephes_DP3);
284 
285  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
286  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
287 
288  /* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
289  and the second polynom (Pi/4 <= x <= 0) in y2 */
290  float32x4_t y1, y2;
291  float32x4_t z = vmulq_f32(x, x);
292 
293  y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
294  y1 = vmlaq_f32(c_coscof_p2, z, y1);
295  y1 = vmulq_f32(y1, z);
296  y1 = vmulq_f32(y1, z);
297  y1 = vmlsq_f32(y1, z, CONST_1_2);
298  y1 = vaddq_f32(y1, CONST_1);
299 
300  y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
301  y2 = vmlaq_f32(c_sincof_p2, z, y2);
302  y2 = vmulq_f32(y2, z);
303  y2 = vmlaq_f32(x, x, y2);
304 
305  /* select the correct result from the two polynoms */
306  const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
307  const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
308 
309  float32x4x2_t sincos;
310  sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
311  sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
312 
313  return sincos;
314 }
315 
316 static inline float32x4_t _vtanq_f32(float32x4_t x)
317 {
318  const float32x4x2_t sincos = _vsincosq_f32(x);
319  return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
320 }
321 
322 /*
323  * Approximate arctan(x) via polynomial expansion
324  * on the interval [-1, 1]
325  *
326  * Maximum relative error ~6.5e-7
327  * Polynomial evaluated via Horner's method
328  */
329 static inline float32x4_t _varctan_poly_f32(float32x4_t x)
330 {
331  const float32x4_t a1 = vdupq_n_f32(+0x1.ffffeap-1f);
332  const float32x4_t a3 = vdupq_n_f32(-0x1.55437p-2f);
333  const float32x4_t a5 = vdupq_n_f32(+0x1.972be6p-3f);
334  const float32x4_t a7 = vdupq_n_f32(-0x1.1436ap-3f);
335  const float32x4_t a9 = vdupq_n_f32(+0x1.5785aap-4f);
336  const float32x4_t a11 = vdupq_n_f32(-0x1.2f3004p-5f);
337  const float32x4_t a13 = vdupq_n_f32(+0x1.01a37cp-7f);
338 
339  const float32x4_t x_sq = vmulq_f32(x, x);
340  float32x4_t result;
341  result = a13;
342  result = vmlaq_f32(a11, x_sq, result);
343  result = vmlaq_f32(a9, x_sq, result);
344  result = vmlaq_f32(a7, x_sq, result);
345  result = vmlaq_f32(a5, x_sq, result);
346  result = vmlaq_f32(a3, x_sq, result);
347  result = vmlaq_f32(a1, x_sq, result);
348  result = vmulq_f32(x, result);
349 
350  return result;
351 }
352 
353 static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
354  float32x4_t acc,
355  float32x4_t val,
356  float32x4_t rec,
357  float32x4_t aux)
358 {
359  aux = vmulq_f32(aux, val);
360  aux = vsubq_f32(aux, acc);
361  aux = vmulq_f32(aux, aux);
362 #ifdef LV_HAVE_NEONV8
363  return vfmaq_f32(sq_acc, aux, rec);
364 #else
365  aux = vmulq_f32(aux, rec);
366  return vaddq_f32(sq_acc, aux);
367 #endif
368 }
369 
370 /*
371  * Minimax polynomial for sin(x) on [-pi/4, pi/4]
372  * Coefficients via Remez algorithm (Sollya)
373  * Max |error| < 7.3e-9
374  * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
375  */
376 static inline float32x4_t _vsin_poly_f32(float32x4_t x)
377 {
378  const float32x4_t s1 = vdupq_n_f32(-0x1.555552p-3f);
379  const float32x4_t s2 = vdupq_n_f32(+0x1.110be2p-7f);
380  const float32x4_t s3 = vdupq_n_f32(-0x1.9ab22ap-13f);
381 
382  const float32x4_t x2 = vmulq_f32(x, x);
383  const float32x4_t x3 = vmulq_f32(x2, x);
384 
385  float32x4_t poly = vmlaq_f32(s2, x2, s3);
386  poly = vmlaq_f32(s1, x2, poly);
387  return vmlaq_f32(x, x3, poly);
388 }
389 
390 /*
391  * Minimax polynomial for cos(x) on [-pi/4, pi/4]
392  * Coefficients via Remez algorithm (Sollya)
393  * Max |error| < 1.1e-7
394  * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
395  */
396 static inline float32x4_t _vcos_poly_f32(float32x4_t x)
397 {
398  const float32x4_t c1 = vdupq_n_f32(-0x1.fffff4p-2f);
399  const float32x4_t c2 = vdupq_n_f32(+0x1.554a46p-5f);
400  const float32x4_t c3 = vdupq_n_f32(-0x1.661be2p-10f);
401  const float32x4_t one = vdupq_n_f32(1.0f);
402 
403  const float32x4_t x2 = vmulq_f32(x, x);
404 
405  float32x4_t poly = vmlaq_f32(c2, x2, c3);
406  poly = vmlaq_f32(c1, x2, poly);
407  return vmlaq_f32(one, x2, poly);
408 }
409 
410 /*
411  * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
412  * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
413  * Max error: ~1.55e-6
414  *
415  * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
416  * Polynomial evaluated via Horner's method
417  */
418 static inline float32x4_t _vlog2_poly_f32(float32x4_t x)
419 {
420  const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f);
421  const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
422  const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
423  const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
424  const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
425  const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
426  const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f);
427 
428  // Horner's method: c0 + x*(c1 + x*(c2 + ...))
429  float32x4_t poly = c6;
430  poly = vmlaq_f32(c5, poly, x);
431  poly = vmlaq_f32(c4, poly, x);
432  poly = vmlaq_f32(c3, poly, x);
433  poly = vmlaq_f32(c2, poly, x);
434  poly = vmlaq_f32(c1, poly, x);
435  poly = vmlaq_f32(c0, poly, x);
436  return poly;
437 }
438 
439 #ifdef LV_HAVE_NEONV8
440 /* ARMv8 NEON FMA-based arctan polynomial for better accuracy and throughput */
441 static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
442 {
443  const float32x4_t a1 = vdupq_n_f32(+0x1.ffffeap-1f);
444  const float32x4_t a3 = vdupq_n_f32(-0x1.55437p-2f);
445  const float32x4_t a5 = vdupq_n_f32(+0x1.972be6p-3f);
446  const float32x4_t a7 = vdupq_n_f32(-0x1.1436ap-3f);
447  const float32x4_t a9 = vdupq_n_f32(+0x1.5785aap-4f);
448  const float32x4_t a11 = vdupq_n_f32(-0x1.2f3004p-5f);
449  const float32x4_t a13 = vdupq_n_f32(+0x1.01a37cp-7f);
450 
451  const float32x4_t x_sq = vmulq_f32(x, x);
452  float32x4_t result = a13;
453  result = vfmaq_f32(a11, x_sq, result);
454  result = vfmaq_f32(a9, x_sq, result);
455  result = vfmaq_f32(a7, x_sq, result);
456  result = vfmaq_f32(a5, x_sq, result);
457  result = vfmaq_f32(a3, x_sq, result);
458  result = vfmaq_f32(a1, x_sq, result);
459  result = vmulq_f32(x, result);
460 
461  return result;
462 }
463 
464 /* NEONv8 FMA sin polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya) */
465 static inline float32x4_t _vsin_poly_neonv8(float32x4_t x)
466 {
467  const float32x4_t s1 = vdupq_n_f32(-0x1.555552p-3f);
468  const float32x4_t s2 = vdupq_n_f32(+0x1.110be2p-7f);
469  const float32x4_t s3 = vdupq_n_f32(-0x1.9ab22ap-13f);
470 
471  const float32x4_t x2 = vmulq_f32(x, x);
472  const float32x4_t x3 = vmulq_f32(x2, x);
473 
474  float32x4_t poly = vfmaq_f32(s2, x2, s3);
475  poly = vfmaq_f32(s1, x2, poly);
476  return vfmaq_f32(x, x3, poly);
477 }
478 
479 /* NEONv8 FMA cos polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya) */
480 static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
481 {
482  const float32x4_t c1 = vdupq_n_f32(-0x1.fffff4p-2f);
483  const float32x4_t c2 = vdupq_n_f32(+0x1.554a46p-5f);
484  const float32x4_t c3 = vdupq_n_f32(-0x1.661be2p-10f);
485  const float32x4_t one = vdupq_n_f32(1.0f);
486 
487  const float32x4_t x2 = vmulq_f32(x, x);
488 
489  float32x4_t poly = vfmaq_f32(c2, x2, c3);
490  poly = vfmaq_f32(c1, x2, poly);
491  return vfmaq_f32(one, x2, poly);
492 }
493 
494 /*
495  * NEONv8 FMA log2 polynomial on [1, 2]
496  * log2(x) ≈ poly(x) * (x - 1)
497  * Max error: ~1.55e-6
498  */
499 static inline float32x4_t _vlog2_poly_neonv8(float32x4_t x)
500 {
501  const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f);
502  const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
503  const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
504  const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
505  const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
506  const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
507  const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f);
508 
509  // Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
510  float32x4_t poly = c6;
511  poly = vfmaq_f32(c5, poly, x);
512  poly = vfmaq_f32(c4, poly, x);
513  poly = vfmaq_f32(c3, poly, x);
514  poly = vfmaq_f32(c2, poly, x);
515  poly = vfmaq_f32(c1, poly, x);
516  poly = vfmaq_f32(c0, poly, x);
517  return poly;
518 }
519 #endif /* LV_HAVE_NEONV8 */
520 
521 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
_varcsinq_f32
static float32x4_t _varcsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:130
_vmagnitudesquaredq_f32
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:74
_vsincosq_f32
static float32x4x2_t _vsincosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:240
volk_arch_defs.val
val
Definition: volk_arch_defs.py:57
_vtaylor_polyq_f32
static float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
Definition: volk_neon_intrinsics.h:196
_vcos_poly_f32
static float32x4_t _vcos_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:396
_vlog2_poly_f32
static float32x4_t _vlog2_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:418
_vlogq_f32
static float32x4_t _vlogq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:210
_varctan_poly_f32
static float32x4_t _varctan_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:329
_vinvsqrtq_f32
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:85
_vsqrtq_f32
static float32x4_t _vsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:105
_neon_accumulate_square_sum_f32
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:353
_vtanq_f32
static float32x4_t _vtanq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:316
_vmultiply_complexq_f32
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:172
_vinvq_f32
static float32x4_t _vinvq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:114
_vsin_poly_f32
static float32x4_t _vsin_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:376