Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_avx2_fma_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2023 - 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /*
11  * This file is intended to hold AVX2 FMA intrinsics.
12  * They should be used in VOLK kernels to avoid copy-paste.
13  */
14 
15 #ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
17 #include <immintrin.h>
18 
19 /*
20  * Approximate arctan(x) via polynomial expansion
21  * on the interval [-1, 1]
22  *
23  * Maximum relative error ~6.5e-7
24  * Polynomial evaluated via Horner's method
25  */
26 static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
27 {
28  const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
29  const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
30  const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
31  const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
32  const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
33  const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
34  const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
35 
36  const __m256 x_times_x = _mm256_mul_ps(x, x);
37  __m256 arctan;
38  arctan = a13;
39  arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
40  arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
41  arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
42  arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
43  arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
44  arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
45  arctan = _mm256_mul_ps(x, arctan);
46 
47  return arctan;
48 }
49 
50 /*
51  * Approximate arcsin(x) via polynomial expansion
52  * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
53  *
54  * Maximum relative error ~1.5e-6
55  * Polynomial evaluated via Horner's method
56  */
57 static inline __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
58 {
59  const __m256 c0 = _mm256_set1_ps(0x1.ffffcep-1f);
60  const __m256 c1 = _mm256_set1_ps(0x1.55b648p-3f);
61  const __m256 c2 = _mm256_set1_ps(0x1.24d192p-4f);
62  const __m256 c3 = _mm256_set1_ps(0x1.0a788p-4f);
63 
64  const __m256 u = _mm256_mul_ps(x, x);
65  __m256 p = c3;
66  p = _mm256_fmadd_ps(u, p, c2);
67  p = _mm256_fmadd_ps(u, p, c1);
68  p = _mm256_fmadd_ps(u, p, c0);
69 
70  return _mm256_mul_ps(x, p);
71 }
72 
73 /*
74  * Minimax polynomial for sin(x) on [-pi/4, pi/4]
75  * Coefficients via Remez algorithm (Sollya)
76  * Max |error| < 7.3e-9
77  * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
78  */
79 static inline __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
80 {
81  const __m256 s1 = _mm256_set1_ps(-0x1.555552p-3f);
82  const __m256 s2 = _mm256_set1_ps(+0x1.110be2p-7f);
83  const __m256 s3 = _mm256_set1_ps(-0x1.9ab22ap-13f);
84 
85  const __m256 x2 = _mm256_mul_ps(x, x);
86  const __m256 x3 = _mm256_mul_ps(x2, x);
87 
88  __m256 poly = _mm256_fmadd_ps(x2, s3, s2);
89  poly = _mm256_fmadd_ps(x2, poly, s1);
90  return _mm256_fmadd_ps(x3, poly, x);
91 }
92 
93 /*
94  * Minimax polynomial for cos(x) on [-pi/4, pi/4]
95  * Coefficients via Remez algorithm (Sollya)
96  * Max |error| < 1.1e-7
97  * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
98  */
99 static inline __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
100 {
101  const __m256 c1 = _mm256_set1_ps(-0x1.fffff4p-2f);
102  const __m256 c2 = _mm256_set1_ps(+0x1.554a46p-5f);
103  const __m256 c3 = _mm256_set1_ps(-0x1.661be2p-10f);
104  const __m256 one = _mm256_set1_ps(1.0f);
105 
106  const __m256 x2 = _mm256_mul_ps(x, x);
107 
108  __m256 poly = _mm256_fmadd_ps(x2, c3, c2);
109  poly = _mm256_fmadd_ps(x2, poly, c1);
110  return _mm256_fmadd_ps(x2, poly, one);
111 }
112 
113 /*
114  * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
115  * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
116  * Max error: ~1.55e-6
117  *
118  * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
119  * Polynomial evaluated via Horner's method with FMA
120  */
121 static inline __m256 _mm256_log2_poly_avx2_fma(const __m256 x)
122 {
123  const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
124  const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
125  const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
126  const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
127  const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
128  const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
129  const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);
130 
131  // Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
132  __m256 poly = c6;
133  poly = _mm256_fmadd_ps(poly, x, c5);
134  poly = _mm256_fmadd_ps(poly, x, c4);
135  poly = _mm256_fmadd_ps(poly, x, c3);
136  poly = _mm256_fmadd_ps(poly, x, c2);
137  poly = _mm256_fmadd_ps(poly, x, c1);
138  poly = _mm256_fmadd_ps(poly, x, c0);
139  return poly;
140 }
141 
142 #endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
_mm256_log2_poly_avx2_fma
static __m256 _mm256_log2_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:121
_mm256_arctan_poly_avx2_fma
static __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:26
_mm256_sin_poly_avx2_fma
static __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:79
_mm256_cos_poly_avx2_fma
static __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:99
_mm256_arcsin_poly_avx2_fma
static __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:57