Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_sse_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  * Copyright 2023-2026 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
11 /*
12  * This file is intended to hold SSE intrinsics of intrinsics.
13  * They should be used in VOLK kernels to avoid copy-pasta.
14  */
15 
16 #ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
18 #include <emmintrin.h>
19 #include <xmmintrin.h>
20 
21 /*
22  * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
23  * One iteration doubles precision from ~12-bit to ~24-bit
24  * x1 = x0 * (1.5 - 0.5 * a * x0^2)
25  * Handles edge cases: +0 → +Inf, +Inf → 0
26  */
27 static inline __m128 _mm_rsqrt_nr_ps(const __m128 a)
28 {
29  const __m128 HALF = _mm_set1_ps(0.5f);
30  const __m128 THREE_HALFS = _mm_set1_ps(1.5f);
31 
32  const __m128 x0 = _mm_rsqrt_ps(a); // +Inf for +0, 0 for +Inf
33 
34  // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
35  __m128 x1 = _mm_mul_ps(
36  x0, _mm_sub_ps(THREE_HALFS, _mm_mul_ps(HALF, _mm_mul_ps(_mm_mul_ps(x0, x0), a))));
37 
38  // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
39  // Blend: use x0 where a == +0 or a == +Inf, else use x1
40  __m128i a_si = _mm_castps_si128(a);
41  __m128i zero_mask = _mm_cmpeq_epi32(a_si, _mm_setzero_si128());
42  __m128i inf_mask = _mm_cmpeq_epi32(a_si, _mm_set1_epi32(0x7F800000));
43  __m128 special_mask = _mm_castsi128_ps(_mm_or_si128(zero_mask, inf_mask));
44  // SSE2-compatible blend: (x0 & mask) | (x1 & ~mask)
45  return _mm_or_ps(_mm_and_ps(special_mask, x0), _mm_andnot_ps(special_mask, x1));
46 }
47 
48 /*
49  * Approximate arctan(x) via polynomial expansion
50  * on the interval [-1, 1]
51  *
52  * Maximum relative error ~6.5e-7
53  * Polynomial evaluated via Horner's method
54  */
55 static inline __m128 _mm_arctan_poly_sse(const __m128 x)
56 {
57  const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
58  const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
59  const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
60  const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
61  const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
62  const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
63  const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
64 
65  const __m128 x_times_x = _mm_mul_ps(x, x);
66  __m128 arctan;
67  arctan = a13;
68  arctan = _mm_mul_ps(x_times_x, arctan);
69  arctan = _mm_add_ps(arctan, a11);
70  arctan = _mm_mul_ps(x_times_x, arctan);
71  arctan = _mm_add_ps(arctan, a9);
72  arctan = _mm_mul_ps(x_times_x, arctan);
73  arctan = _mm_add_ps(arctan, a7);
74  arctan = _mm_mul_ps(x_times_x, arctan);
75  arctan = _mm_add_ps(arctan, a5);
76  arctan = _mm_mul_ps(x_times_x, arctan);
77  arctan = _mm_add_ps(arctan, a3);
78  arctan = _mm_mul_ps(x_times_x, arctan);
79  arctan = _mm_add_ps(arctan, a1);
80  arctan = _mm_mul_ps(x, arctan);
81 
82  return arctan;
83 }
84 
85 /*
86  * Approximate arcsin(x) via polynomial expansion
87  * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
88  *
89  * Maximum relative error ~1.5e-6
90  * Polynomial evaluated via Horner's method
91  */
92 static inline __m128 _mm_arcsin_poly_sse(const __m128 x)
93 {
94  const __m128 c0 = _mm_set1_ps(0x1.ffffcep-1f);
95  const __m128 c1 = _mm_set1_ps(0x1.55b648p-3f);
96  const __m128 c2 = _mm_set1_ps(0x1.24d192p-4f);
97  const __m128 c3 = _mm_set1_ps(0x1.0a788p-4f);
98 
99  const __m128 u = _mm_mul_ps(x, x);
100  __m128 p = c3;
101  p = _mm_mul_ps(u, p);
102  p = _mm_add_ps(p, c2);
103  p = _mm_mul_ps(u, p);
104  p = _mm_add_ps(p, c1);
105  p = _mm_mul_ps(u, p);
106  p = _mm_add_ps(p, c0);
107 
108  return _mm_mul_ps(x, p);
109 }
110 
111 static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
112 {
113  __m128 iValue, qValue;
114  // Arrange in i1i2i3i4 format
115  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
116  // Arrange in q1q2q3q4 format
117  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
118  iValue = _mm_mul_ps(iValue, iValue); // Square the I values
119  qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
120  return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
121 }
122 
123 static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
124 {
125  return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
126 }
127 
128 static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
129  const __m128 symbols1,
130  const __m128 points0,
131  const __m128 points1,
132  const __m128 scalar)
133 {
134  // calculate scalar * |x - y|^2
135  const __m128 diff0 = _mm_sub_ps(symbols0, points0);
136  const __m128 diff1 = _mm_sub_ps(symbols1, points1);
137  const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
138  return _mm_mul_ps(norms, scalar);
139 }
140 
141 static inline __m128 _mm_accumulate_square_sum_ps(
142  __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
143 {
144  aux = _mm_mul_ps(aux, val);
145  aux = _mm_sub_ps(aux, acc);
146  aux = _mm_mul_ps(aux, aux);
147  aux = _mm_mul_ps(aux, rec);
148  return _mm_add_ps(sq_acc, aux);
149 }
150 
151 /*
152  * Minimax polynomial for sin(x) on [-pi/4, pi/4]
153  * Coefficients via Remez algorithm (Sollya)
154  * Max |error| < 7.3e-9
155  * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
156  */
157 static inline __m128 _mm_sin_poly_sse(const __m128 x)
158 {
159  const __m128 s1 = _mm_set1_ps(-0x1.555552p-3f);
160  const __m128 s2 = _mm_set1_ps(+0x1.110be2p-7f);
161  const __m128 s3 = _mm_set1_ps(-0x1.9ab22ap-13f);
162 
163  const __m128 x2 = _mm_mul_ps(x, x);
164  const __m128 x3 = _mm_mul_ps(x2, x);
165 
166  __m128 poly = _mm_add_ps(_mm_mul_ps(x2, s3), s2);
167  poly = _mm_add_ps(_mm_mul_ps(x2, poly), s1);
168  return _mm_add_ps(_mm_mul_ps(x3, poly), x);
169 }
170 
171 /*
172  * Minimax polynomial for cos(x) on [-pi/4, pi/4]
173  * Coefficients via Remez algorithm (Sollya)
174  * Max |error| < 1.1e-7
175  * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
176  */
177 static inline __m128 _mm_cos_poly_sse(const __m128 x)
178 {
179  const __m128 c1 = _mm_set1_ps(-0x1.fffff4p-2f);
180  const __m128 c2 = _mm_set1_ps(+0x1.554a46p-5f);
181  const __m128 c3 = _mm_set1_ps(-0x1.661be2p-10f);
182  const __m128 one = _mm_set1_ps(1.0f);
183 
184  const __m128 x2 = _mm_mul_ps(x, x);
185 
186  __m128 poly = _mm_add_ps(_mm_mul_ps(x2, c3), c2);
187  poly = _mm_add_ps(_mm_mul_ps(x2, poly), c1);
188  return _mm_add_ps(_mm_mul_ps(x2, poly), one);
189 }
190 
191 /*
192  * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
193  * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
194  * Max error: ~1.55e-6
195  *
196  * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
197  * Polynomial evaluated via Horner's method
198  */
199 static inline __m128 _mm_log2_poly_sse(const __m128 x)
200 {
201  const __m128 c0 = _mm_set1_ps(+0x1.a8a726p+1f);
202  const __m128 c1 = _mm_set1_ps(-0x1.0b7f7ep+2f);
203  const __m128 c2 = _mm_set1_ps(+0x1.05d9ccp+2f);
204  const __m128 c3 = _mm_set1_ps(-0x1.4d476cp+1f);
205  const __m128 c4 = _mm_set1_ps(+0x1.04fc3ap+0f);
206  const __m128 c5 = _mm_set1_ps(-0x1.c97982p-3f);
207  const __m128 c6 = _mm_set1_ps(+0x1.57aa42p-6f);
208 
209  // Horner's method: c0 + x*(c1 + x*(c2 + ...))
210  __m128 poly = c6;
211  poly = _mm_add_ps(_mm_mul_ps(poly, x), c5);
212  poly = _mm_add_ps(_mm_mul_ps(poly, x), c4);
213  poly = _mm_add_ps(_mm_mul_ps(poly, x), c3);
214  poly = _mm_add_ps(_mm_mul_ps(poly, x), c2);
215  poly = _mm_add_ps(_mm_mul_ps(poly, x), c1);
216  poly = _mm_add_ps(_mm_mul_ps(poly, x), c0);
217  return poly;
218 }
219 
220 #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
_mm_accumulate_square_sum_ps
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:141
_mm_arcsin_poly_sse
static __m128 _mm_arcsin_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:92
_mm_magnitude_ps
static __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:123
volk_arch_defs.val
val
Definition: volk_arch_defs.py:57
_mm_scaled_norm_dist_ps_sse
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse_intrinsics.h:128
_mm_sin_poly_sse
static __m128 _mm_sin_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:157
_mm_magnitudesquared_ps
static __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:111
_mm_rsqrt_nr_ps
static __m128 _mm_rsqrt_nr_ps(const __m128 a)
Definition: volk_sse_intrinsics.h:27
_mm_arctan_poly_sse
static __m128 _mm_arctan_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:55
_mm_log2_poly_sse
static __m128 _mm_log2_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:199
_mm_cos_poly_sse
static __m128 _mm_cos_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:177