Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_common.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
4  * Copyright 2023 - 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
11 #ifndef INCLUDED_LIBVOLK_COMMON_H
12 #define INCLUDED_LIBVOLK_COMMON_H
13 
15 // Cross-platform attribute macros
17 #if _MSC_VER
18 #define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
19 #define __VOLK_ATTR_UNUSED
20 #define __VOLK_ATTR_INLINE __forceinline
21 #define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
22 #define __VOLK_ATTR_EXPORT __declspec(dllexport)
23 #define __VOLK_ATTR_IMPORT __declspec(dllimport)
24 #define __VOLK_PREFETCH(addr)
25 #define __VOLK_ASM __asm
26 #elif defined(__clang__)
27 // AppleClang also defines __GNUC__, so do this check first. These
28 // will probably be the same as for __GNUC__, but let's keep them
29 // separate just to be safe.
30 #define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
31 #define __VOLK_ATTR_UNUSED __attribute__((unused))
32 #define __VOLK_ATTR_INLINE __attribute__((always_inline))
33 #define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
34 #define __VOLK_ASM __asm__
35 #define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
36 #define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
37 #define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
38 #elif defined __GNUC__
39 #define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
40 #define __VOLK_ATTR_UNUSED __attribute__((unused))
41 #define __VOLK_ATTR_INLINE __attribute__((always_inline))
42 #define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
43 #define __VOLK_ASM __asm__
44 #if __GNUC__ >= 4
45 #define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
46 #define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
47 #else
48 #define __VOLK_ATTR_EXPORT
49 #define __VOLK_ATTR_IMPORT
50 #endif
51 #define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
52 #elif _MSC_VER
53 #define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
54 #define __VOLK_ATTR_UNUSED
55 #define __VOLK_ATTR_INLINE __forceinline
56 #define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
57 #define __VOLK_ATTR_EXPORT __declspec(dllexport)
58 #define __VOLK_ATTR_IMPORT __declspec(dllimport)
59 #define __VOLK_PREFETCH(addr)
60 #define __VOLK_ASM __asm
61 #else
62 #define __VOLK_ATTR_ALIGNED(x)
63 #define __VOLK_ATTR_UNUSED
64 #define __VOLK_ATTR_INLINE
65 #define __VOLK_ATTR_DEPRECATED
66 #define __VOLK_ATTR_EXPORT
67 #define __VOLK_ATTR_IMPORT
68 #define __VOLK_PREFETCH(addr)
69 #define __VOLK_ASM __asm__
70 #endif
71 
73 // Ignore annoying warnings in MSVC
75 #if defined(_MSC_VER)
76 #pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2',
77  // possible loss of data
78 #pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
79 #endif
80 
82 // C-linkage declaration macros
83 // FIXME: due to the usage of complex.h, require gcc for c-linkage
85 #if defined(__cplusplus) && (__GNUC__)
86 #define __VOLK_DECL_BEGIN extern "C" {
87 #define __VOLK_DECL_END }
88 #else
89 #define __VOLK_DECL_BEGIN
90 #define __VOLK_DECL_END
91 #endif
92 
94 // Define VOLK_API for library symbols
95 // https://gcc.gnu.org/wiki/Visibility
97 #ifdef volk_EXPORTS
98 #define VOLK_API __VOLK_ATTR_EXPORT
99 #else
100 #define VOLK_API __VOLK_ATTR_IMPORT
101 #endif
102 
104 // The bit128 union used by some
106 #include <stdint.h>
107 
108 #ifdef LV_HAVE_SSE
109 #ifdef _WIN32
110 #include <intrin.h>
111 #else
112 #include <x86intrin.h>
113 #endif
114 #endif
115 
116 union bit128 {
117  uint8_t i8[16];
118  uint16_t i16[8];
119  uint32_t i[4];
120  float f[4];
121  double d[2];
122 
123 #ifdef LV_HAVE_SSE
124  __m128 float_vec;
125 #endif
126 
127 #ifdef LV_HAVE_SSE2
128  __m128i int_vec;
129  __m128d double_vec;
130 #endif
131 };
132 
133 union bit256 {
134  uint8_t i8[32];
135  uint16_t i16[16];
136  uint32_t i[8];
137  float f[8];
138  double d[4];
139 
140 #ifdef LV_HAVE_AVX
141  __m256 float_vec;
142  __m256i int_vec;
143  __m256d double_vec;
144 #endif
145 };
146 
147 #define bit128_p(x) ((union bit128*)(x))
148 #define bit256_p(x) ((union bit256*)(x))
149 
151 // log2f
153 #include <math.h>
154 // +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
155 // NaN -> NaN (preserved for consistency)
156 static inline float log2f_non_ieee(float f)
157 {
158  float const result = log2f(f);
159  // Return NaN for NaN inputs or negative values (preserves IEEE behavior for invalid
160  // inputs)
161  if (isnan(result))
162  return result;
163  // Map ±Inf to ±127.0f to match SIMD kernel behavior
164  return isinf(result) ? copysignf(127.0f, result) : result;
165 }
166 
168 // Constant used to do log10 calculations as faster log2
170 // precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
171 #define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120
172 
174 // arctan(x) polynomial expansion
176 static inline float volk_arctan_poly(const float x)
177 {
178  /*
179  * arctan(x) polynomial expansion on the interval [-1, 1]
180  * Maximum relative error < 6.6e-7
181  */
182  const float a1 = +0x1.ffffeap-1f;
183  const float a3 = -0x1.55437p-2f;
184  const float a5 = +0x1.972be6p-3f;
185  const float a7 = -0x1.1436ap-3f;
186  const float a9 = +0x1.5785aap-4f;
187  const float a11 = -0x1.2f3004p-5f;
188  const float a13 = +0x1.01a37cp-7f;
189 
190  const float x_times_x = x * x;
191  float arctan = a13;
192  arctan = fmaf(x_times_x, arctan, a11);
193  arctan = fmaf(x_times_x, arctan, a9);
194  arctan = fmaf(x_times_x, arctan, a7);
195  arctan = fmaf(x_times_x, arctan, a5);
196  arctan = fmaf(x_times_x, arctan, a3);
197  arctan = fmaf(x_times_x, arctan, a1);
198  arctan *= x;
199 
200  return arctan;
201 }
203 // sin(x) polynomial expansion
205 static inline float volk_sin_poly(const float x)
206 {
207  /*
208  * Minimax polynomial for sin(x) on [-pi/4, pi/4]
209  * Coefficients via Remez algorithm (Sollya)
210  * Max |error| < 7.3e-9
211  * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
212  */
213  const float s1 = -0x1.555552p-3f;
214  const float s2 = +0x1.110be2p-7f;
215  const float s3 = -0x1.9ab22ap-13f;
216 
217  const float x2 = x * x;
218  const float x3 = x2 * x;
219 
220  float poly = fmaf(x2, s3, s2);
221  poly = fmaf(x2, poly, s1);
222  return fmaf(x3, poly, x);
223 }
225 // cos(x) polynomial expansion
227 static inline float volk_cos_poly(const float x)
228 {
229  /*
230  * Minimax polynomial for cos(x) on [-pi/4, pi/4]
231  * Coefficients via Remez algorithm (Sollya)
232  * Max |error| < 1.1e-7
233  * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
234  */
235  const float c1 = -0x1.fffff4p-2f;
236  const float c2 = +0x1.554a46p-5f;
237  const float c3 = -0x1.661be2p-10f;
238 
239  const float x2 = x * x;
240 
241  float poly = fmaf(x2, c3, c2);
242  poly = fmaf(x2, poly, c1);
243  return fmaf(x2, poly, 1.0f);
244 }
246 // sin(x) with Cody-Waite argument reduction
248 static inline float volk_sin(const float x)
249 {
250  /*
251  * Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
252  * Then use sin/cos polynomials based on quadrant
253  */
254  const float two_over_pi = 0x1.45f306p-1f;
255  const float pi_over_2_hi = 0x1.921fb6p+0f;
256  const float pi_over_2_lo = -0x1.777a5cp-25f;
257 
258  float n_f = rintf(x * two_over_pi);
259  int n = (int)n_f;
260 
261  float r = fmaf(-n_f, pi_over_2_hi, x);
262  r = fmaf(-n_f, pi_over_2_lo, r);
263 
264  float sin_r = volk_sin_poly(r);
265  float cos_r = volk_cos_poly(r);
266 
267  // Quadrant selection: n&1 swaps sin/cos, n&2 negates
268  float result = (n & 1) ? cos_r : sin_r;
269  return (n & 2) ? -result : result;
270 }
272 // cos(x) with Cody-Waite argument reduction
274 static inline float volk_cos(const float x)
275 {
276  /*
277  * Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
278  * Then use sin/cos polynomials based on quadrant
279  */
280  const float two_over_pi = 0x1.45f306p-1f;
281  const float pi_over_2_hi = 0x1.921fb6p+0f;
282  const float pi_over_2_lo = -0x1.777a5cp-25f;
283 
284  float n_f = rintf(x * two_over_pi);
285  int n = (int)n_f;
286 
287  float r = fmaf(-n_f, pi_over_2_hi, x);
288  r = fmaf(-n_f, pi_over_2_lo, r);
289 
290  float sin_r = volk_sin_poly(r);
291  float cos_r = volk_cos_poly(r);
292 
293  // Quadrant selection: n&1 swaps sin/cos, (n+1)&2 negates
294  float result = (n & 1) ? sin_r : cos_r;
295  return ((n + 1) & 2) ? -result : result;
296 }
298 // arctan(x)
300 static inline float volk_arctan(const float x)
301 {
302  /*
303  * arctan(x) + arctan(1 / x) == sign(x) * pi / 2
304  */
305  const float pi_2 = 0x1.921fb6p0f;
306 
307  // Propagate NaN
308  if (isnan(x)) {
309  return x;
310  }
311 
312  // arctan(±∞) = ±π/2
313  if (isinf(x)) {
314  return copysignf(pi_2, x);
315  }
316 
317  if (fabs(x) < 1.f) {
318  return volk_arctan_poly(x);
319  } else {
320  return copysignf(pi_2, x) - volk_arctan_poly(1.f / x);
321  }
322 }
324 // arctan2(y, x)
326 static inline float volk_atan2(const float y, const float x)
327 {
328  /*
329  * / arctan(y / x) if x > 0
330  * | arctan(y / x) + PI if x < 0 and y >= 0
331  * atan2(y, x) = | arctan(y / x) - PI if x < 0 and y < 0
332  * | sign(y) * PI / 2 if x = 0
333  * \ undefined if x = 0 and y = 0
334  * atan2f(0.f, 0.f) shall return 0.f
335  * atan2f(0.f, -0.f) shall return -0.f
336  */
337  const float pi = 0x1.921fb6p1f;
338  const float pi_2 = 0x1.921fb6p0f;
339 
340  // Propagate NaN from inputs
341  if (isnan(x) || isnan(y)) {
342  return x + y;
343  }
344 
345  // Handle infinity cases per IEEE 754
346  if (isinf(y)) {
347  if (isinf(x)) {
348  // Both infinite: atan2(±∞, ±∞) = ±π/4 or ±3π/4
349  const float angle = (x > 0.f) ? (pi_2 / 2.f) : (3.f * pi_2 / 2.f);
350  return copysignf(angle, y);
351  } else {
352  // y infinite, x finite: atan2(±∞, x) = ±π/2
353  return copysignf(pi_2, y);
354  }
355  }
356  if (isinf(x)) {
357  // x infinite, y finite: atan2(y, +∞) = ±0, atan2(y, -∞) = ±π
358  return (x > 0.f) ? copysignf(0.f, y) : copysignf(pi, y);
359  }
360 
361  if (fabs(x) == 0.f) {
362  return (fabs(y) == 0.f) ? copysignf(0.f, y) : copysignf(pi_2, y);
363  }
364  const int swap = fabs(x) < fabs(y);
365  const float numerator = swap ? x : y;
366  const float denominator = swap ? y : x;
367  float input = numerator / denominator;
368 
369  if (isnan(input)) {
370  input = numerator;
371  }
372 
373  float result = volk_arctan_poly(input);
374  result = swap ? (input >= 0.f ? pi_2 : -pi_2) - result : result;
375  if (x < 0.f) {
376  result += copysignf(pi, y);
377  }
378  return result;
379 }
380 
382 // arcsin(x) polynomial expansion
383 // P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
384 // Maximum relative error ~1.5e-6
386 static inline float volk_arcsin_poly(const float x)
387 {
388  const float c0 = 0x1.ffffcep-1f;
389  const float c1 = 0x1.55b648p-3f;
390  const float c2 = 0x1.24d192p-4f;
391  const float c3 = 0x1.0a788p-4f;
392 
393  const float u = x * x;
394  float p = c3;
395  p = fmaf(u, p, c2);
396  p = fmaf(u, p, c1);
397  p = fmaf(u, p, c0);
398 
399  return x * p;
400 }
402 // arcsin(x) using two-range algorithm
404 static inline float volk_arcsin(const float x)
405 {
406  const float pi_2 = 0x1.921fb6p0f;
407 
408  const float ax = fabsf(x);
409  if (ax <= 0.5f) {
410  // Small argument: direct polynomial
411  return volk_arcsin_poly(x);
412  } else {
413  // Large argument: use identity asin(x) = pi/2 - 2*asin(sqrt((1-|x|)/2))
414  const float t = (1.0f - ax) * 0.5f;
415  const float s = sqrtf(t);
416  const float inner = volk_arcsin_poly(s);
417  const float result = pi_2 - 2.0f * inner;
418  return copysignf(result, x);
419  }
420 }
422 // arccos(x) = pi/2 - arcsin(x)
424 static inline float volk_arccos(const float x)
425 {
426  const float pi_2 = 0x1.921fb6p0f;
427  return pi_2 - volk_arcsin(x);
428 }
429 
430 #endif /*INCLUDED_LIBVOLK_COMMON_H*/
volk_arctan_poly
static float volk_arctan_poly(const float x)
Definition: volk_common.h:176
bit256::int_vec
__m256i int_vec
Definition: volk_common.h:142
bit256::i16
uint16_t i16[16]
Definition: volk_common.h:135
log2f_non_ieee
static float log2f_non_ieee(float f)
Definition: volk_common.h:156
bit128::i8
uint8_t i8[16]
Definition: volk_common.h:117
bit256
Definition: volk_common.h:133
volk_arcsin
static float volk_arcsin(const float x)
Definition: volk_common.h:404
bit256::f
float f[8]
Definition: volk_common.h:137
volk_arccos
static float volk_arccos(const float x)
Definition: volk_common.h:424
volk_cos
static float volk_cos(const float x)
Definition: volk_common.h:274
volk_sin_poly
static float volk_sin_poly(const float x)
Definition: volk_common.h:205
bit256::i
uint32_t i[8]
Definition: volk_common.h:136
volk_arctan
static float volk_arctan(const float x)
Definition: volk_common.h:300
bit128::int_vec
__m128i int_vec
Definition: volk_common.h:128
volk_sin
static float volk_sin(const float x)
Definition: volk_common.h:248
bit256::i8
uint8_t i8[32]
Definition: volk_common.h:134
bit128::double_vec
__m128d double_vec
Definition: volk_common.h:129
bit128::d
double d[2]
Definition: volk_common.h:121
bit128::i16
uint16_t i16[8]
Definition: volk_common.h:118
volk_cos_poly
static float volk_cos_poly(const float x)
Definition: volk_common.h:227
volk_arcsin_poly
static float volk_arcsin_poly(const float x)
Definition: volk_common.h:386
bit128::i
uint32_t i[4]
Definition: volk_common.h:119
bit128::f
float f[4]
Definition: volk_common.h:120
bit256::float_vec
__m256 float_vec
Definition: volk_common.h:141
bit256::d
double d[4]
Definition: volk_common.h:138
bit128
Definition: volk_common.h:116
rintf
static float rintf(float x)
Definition: config.h:45
bit256::double_vec
__m256d double_vec
Definition: volk_common.h:143
bit128::float_vec
__m128 float_vec
Definition: volk_common.h:124
volk_atan2
static float volk_atan2(const float y, const float x)
Definition: volk_common.h:326