Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_avx2_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
11 /*
12  * This file is intended to hold AVX2 intrinsics of intrinsics.
13  * They should be used in VOLK kernels to avoid copy-paste.
14  */
15 
16 #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
17 #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
19 #include <immintrin.h>
20 
21 /*
22  * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
23  * AVX2 version with native 256-bit integer operations for edge case handling
24  * Handles edge cases: +0 → +Inf, +Inf → 0
25  */
26 static inline __m256 _mm256_rsqrt_nr_avx2(const __m256 a)
27 {
28  const __m256 HALF = _mm256_set1_ps(0.5f);
29  const __m256 THREE_HALFS = _mm256_set1_ps(1.5f);
30 
31  const __m256 x0 = _mm256_rsqrt_ps(a); // +Inf for +0, 0 for +Inf
32 
33  // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
34  __m256 x1 = _mm256_mul_ps(
35  x0,
36  _mm256_sub_ps(THREE_HALFS,
37  _mm256_mul_ps(HALF, _mm256_mul_ps(_mm256_mul_ps(x0, x0), a))));
38 
39  // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
40  // AVX2: native 256-bit integer compare
41  __m256i a_si = _mm256_castps_si256(a);
42  __m256i zero_mask = _mm256_cmpeq_epi32(a_si, _mm256_setzero_si256());
43  __m256i inf_mask = _mm256_cmpeq_epi32(a_si, _mm256_set1_epi32(0x7F800000));
44  __m256 special_mask = _mm256_castsi256_ps(_mm256_or_si256(zero_mask, inf_mask));
45  return _mm256_blendv_ps(x1, x0, special_mask);
46 }
47 
48 static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
49 {
50  const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
51  __m256 r = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(2, 0, 2, 0));
52  return _mm256_permutevar8x32_ps(r, permute_mask);
53 }
54 
55 static inline __m256 _mm256_imag(const __m256 z1, const __m256 z2)
56 {
57  const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
58  __m256 i = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(3, 1, 3, 1));
59  return _mm256_permutevar8x32_ps(i, permute_mask);
60 }
61 
62 static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
63 {
64  const __m128i zeros = _mm_set1_epi8(0x00);
65  const __m128i sign_extract = _mm_set1_epi8(0x80);
66  const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
67  0xff,
68  0xff,
69  0x00,
70  0xff,
71  0xff,
72  0xff,
73  0x01,
74  0xff,
75  0xff,
76  0xff,
77  0x02,
78  0xff,
79  0xff,
80  0xff,
81  0x03,
82  0xff,
83  0xff,
84  0xff,
85  0x04,
86  0xff,
87  0xff,
88  0xff,
89  0x05,
90  0xff,
91  0xff,
92  0xff,
93  0x06,
94  0xff,
95  0xff,
96  0xff,
97  0x07);
98  __m256i sign_bits = _mm256_setzero_si256();
99 
100  fbits = _mm_cmpgt_epi8(fbits, zeros);
101  fbits = _mm_and_si128(fbits, sign_extract);
102  sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
103  sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
104  sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
105 
106  return _mm256_castsi256_ps(sign_bits);
107 }
108 
109 static inline __m256
110 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
111 {
112  // prepare sign mask for correct +-
113  __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
114 
115  __m256 llr0, llr1;
116  _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
117 
118  // calculate result
119  llr0 = _mm256_xor_ps(llr0, sign_mask);
120  __m256 dst = _mm256_add_ps(llr0, llr1);
121  return dst;
122 }
123 
124 static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
125  const __m256 cplxValue1)
126 {
127  const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
128  const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
129  const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
130  const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
131  return _mm256_permutevar8x32_ps(complex_result, idx);
132 }
133 
134 static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
135  const __m256 symbols1,
136  const __m256 points0,
137  const __m256 points1,
138  const __m256 scalar)
139 {
140  /*
141  * Calculate: |y - x|^2 * SNR_lin
142  * Consider 'symbolsX' and 'pointsX' to be complex float
143  * 'symbolsX' are 'y' and 'pointsX' are 'x'
144  */
145  const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
146  const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
147  const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
148  return _mm256_mul_ps(norms, scalar);
149 }
150 
151 /*
152  * The function below vectorizes the inner loop of the following code:
153  *
154  * float max_values[8] = {0.f};
155  * unsigned max_indices[8] = {0};
156  * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
157  * for (unsigned i = 0; i < num_points / 8; ++i) {
158  * for (unsigned j = 0; j < 8; ++j) {
159  * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src1)
160  * bool compare = abs_squared > max_values[j];
161  * max_values[j] = compare ? abs_squared : max_values[j];
162  * max_indices[j] = compare ? current_indices[j] : max_indices[j]
163  * current_indices[j] += 8; // update for next outer loop iteration
164  * ++src0;
165  * }
166  * }
167  */
168 static inline void vector_32fc_index_max_variant0(__m256 in0,
169  __m256 in1,
170  __m256* max_values,
171  __m256i* max_indices,
172  __m256i* current_indices,
173  __m256i indices_increment)
174 {
175  in0 = _mm256_mul_ps(in0, in0);
176  in1 = _mm256_mul_ps(in1, in1);
177 
178  /*
179  * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
180  * hadd_ps(a, b) computes
181  * (b_7 + b_6,
182  * b_5 + b_4,
183  * ---------
184  * a_7 + b_6,
185  * a_5 + a_4,
186  * ---------
187  * b_3 + b_2,
188  * b_1 + b_0,
189  * ---------
190  * a_3 + a_2,
191  * a_1 + a_0).
192  * The result is the squared absolute value of complex numbers at index
193  * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
194  * current_indices!
195  */
196  __m256 abs_squared = _mm256_hadd_ps(in0, in1);
197 
198  /*
199  * Compare the recently computed squared absolute values with the
200  * previously determined maximum values. cmp_ps(a, b) determines
201  * a > b ? 0xFFFFFFFF for each element in the vectors =>
202  * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
203  *
204  * If either operand is NaN, 0 is returned as an “ordered” comparision is
205  * used => the blend operation will select the value from *max_values.
206  */
207  __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
208 
209  /* Select maximum by blending. This is the only line which differs from variant1 */
210  *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
211 
212  /*
213  * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
214  * each element in the vectors =>
215  * max_indices = compare_mask ? current_indices : max_indices
216  *
217  * Note: The casting of data types is required to make the compiler happy
218  * and does not change values.
219  */
220  *max_indices =
221  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
222  _mm256_castsi256_ps(*current_indices),
223  compare_mask));
224 
225  /* compute indices of complex numbers which will be loaded in the next iteration */
226  *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
227 }
228 
229 /* See _variant0 for details */
230 static inline void vector_32fc_index_max_variant1(__m256 in0,
231  __m256 in1,
232  __m256* max_values,
233  __m256i* max_indices,
234  __m256i* current_indices,
235  __m256i indices_increment)
236 {
237  in0 = _mm256_mul_ps(in0, in0);
238  in1 = _mm256_mul_ps(in1, in1);
239 
240  __m256 abs_squared = _mm256_hadd_ps(in0, in1);
241  __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
242 
243  /*
244  * This is the only line which differs from variant0. Using maxps instead of
245  * blendvps is faster on Intel CPUs (on the ones tested with).
246  *
247  * Note: The order of arguments matters if a NaN is encountered in which
248  * case the value of the second argument is selected. This is consistent
249  * with the “ordered” comparision and the blend operation: The comparision
250  * returns false if a NaN is encountered and the blend operation
251  * consequently selects the value from max_indices.
252  */
253  *max_values = _mm256_max_ps(abs_squared, *max_values);
254 
255  *max_indices =
256  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
257  _mm256_castsi256_ps(*current_indices),
258  compare_mask));
259 
260  *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
261 }
262 
263 /*
264  * The function below vectorizes the inner loop of the following code:
265  *
266  * float min_values[8] = {FLT_MAX};
267  * unsigned min_indices[8] = {0};
268  * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
269  * for (unsigned i = 0; i < num_points / 8; ++i) {
270  * for (unsigned j = 0; j < 8; ++j) {
271  * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src1)
272  * bool compare = abs_squared < min_values[j];
273  * min_values[j] = compare ? abs_squared : min_values[j];
274  * min_indices[j] = compare ? current_indices[j] : min_indices[j]
275  * current_indices[j] += 8; // update for next outer loop iteration
276  * ++src0;
277  * }
278  * }
279  */
280 static inline void vector_32fc_index_min_variant0(__m256 in0,
281  __m256 in1,
282  __m256* min_values,
283  __m256i* min_indices,
284  __m256i* current_indices,
285  __m256i indices_increment)
286 {
287  in0 = _mm256_mul_ps(in0, in0);
288  in1 = _mm256_mul_ps(in1, in1);
289 
290  /*
291  * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
292  * hadd_ps(a, b) computes
293  * (b_7 + b_6,
294  * b_5 + b_4,
295  * ---------
296  * a_7 + b_6,
297  * a_5 + a_4,
298  * ---------
299  * b_3 + b_2,
300  * b_1 + b_0,
301  * ---------
302  * a_3 + a_2,
303  * a_1 + a_0).
304  * The result is the squared absolute value of complex numbers at index
305  * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
306  * current_indices!
307  */
308  __m256 abs_squared = _mm256_hadd_ps(in0, in1);
309 
310  /*
311  * Compare the recently computed squared absolute values with the
312  * previously determined minimum values. cmp_ps(a, b) determines
313  * a < b ? 0xFFFFFFFF for each element in the vectors =>
314  * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
315  *
316  * If either operand is NaN, 0 is returned as an “ordered” comparision is
317  * used => the blend operation will select the value from *min_values.
318  */
319  __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
320 
321  /* Select minimum by blending. This is the only line which differs from variant1 */
322  *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
323 
324  /*
325  * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
326  * each element in the vectors =>
327  * min_indices = compare_mask ? current_indices : min_indices
328  *
329  * Note: The casting of data types is required to make the compiler happy
330  * and does not change values.
331  */
332  *min_indices =
333  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
334  _mm256_castsi256_ps(*current_indices),
335  compare_mask));
336 
337  /* compute indices of complex numbers which will be loaded in the next iteration */
338  *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
339 }
340 
341 /* See _variant0 for details */
342 static inline void vector_32fc_index_min_variant1(__m256 in0,
343  __m256 in1,
344  __m256* min_values,
345  __m256i* min_indices,
346  __m256i* current_indices,
347  __m256i indices_increment)
348 {
349  in0 = _mm256_mul_ps(in0, in0);
350  in1 = _mm256_mul_ps(in1, in1);
351 
352  __m256 abs_squared = _mm256_hadd_ps(in0, in1);
353  __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
354 
355  /*
356  * This is the only line which differs from variant0. Using maxps instead of
357  * blendvps is faster on Intel CPUs (on the ones tested with).
358  *
359  * Note: The order of arguments matters if a NaN is encountered in which
360  * case the value of the second argument is selected. This is consistent
361  * with the “ordered” comparision and the blend operation: The comparision
362  * returns false if a NaN is encountered and the blend operation
363  * consequently selects the value from min_indices.
364  */
365  *min_values = _mm256_min_ps(abs_squared, *min_values);
366 
367  *min_indices =
368  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
369  _mm256_castsi256_ps(*current_indices),
370  compare_mask));
371 
372  *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
373 }
374 
375 /*
376  * Approximate sin(x) via polynomial expansion
377  * on the interval [-pi/4, pi/4]
378  *
379  * Maximum absolute error ~7.3e-9
380  * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
381  */
382 static inline __m256 _mm256_sin_poly_avx2(const __m256 x)
383 {
384  const __m256 s1 = _mm256_set1_ps(-0x1.555552p-3f);
385  const __m256 s2 = _mm256_set1_ps(+0x1.110be2p-7f);
386  const __m256 s3 = _mm256_set1_ps(-0x1.9ab22ap-13f);
387 
388  const __m256 x2 = _mm256_mul_ps(x, x);
389  const __m256 x3 = _mm256_mul_ps(x2, x);
390 
391  __m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, s3), s2);
392  poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), s1);
393  return _mm256_add_ps(_mm256_mul_ps(x3, poly), x);
394 }
395 
396 /*
397  * Approximate cos(x) via polynomial expansion
398  * on the interval [-pi/4, pi/4]
399  *
400  * Maximum absolute error ~1.1e-7
401  * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
402  */
403 static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
404 {
405  const __m256 c1 = _mm256_set1_ps(-0x1.fffff4p-2f);
406  const __m256 c2 = _mm256_set1_ps(+0x1.554a46p-5f);
407  const __m256 c3 = _mm256_set1_ps(-0x1.661be2p-10f);
408  const __m256 one = _mm256_set1_ps(1.0f);
409 
410  const __m256 x2 = _mm256_mul_ps(x, x);
411 
412  __m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, c3), c2);
413  poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c1);
414  return _mm256_add_ps(_mm256_mul_ps(x2, poly), one);
415 }
416 
417 /*
418  * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
419  * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
420  * Max error: ~1.55e-6
421  *
422  * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
423  * Polynomial evaluated via Horner's method
424  */
425 static inline __m256 _mm256_log2_poly_avx2(const __m256 x)
426 {
427  const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
428  const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
429  const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
430  const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
431  const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
432  const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
433  const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);
434 
435  // Horner's method: c0 + x*(c1 + x*(c2 + ...))
436  __m256 poly = c6;
437  poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
438  poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
439  poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
440  poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
441  poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
442  poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);
443  return poly;
444 }
445 
446 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
_mm256_magnitudesquared_ps_avx2
static __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1)
Definition: volk_avx2_intrinsics.h:124
_mm256_scaled_norm_dist_ps_avx2
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:134
vector_32fc_index_max_variant1
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:230
vector_32fc_index_max_variant0
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:168
_mm256_polar_deinterleave
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:246
_mm256_log2_poly_avx2
static __m256 _mm256_log2_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:425
i
for i
Definition: volk_config_fixed.tmpl.h:13
vector_32fc_index_min_variant0
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:280
_mm256_rsqrt_nr_avx2
static __m256 _mm256_rsqrt_nr_avx2(const __m256 a)
Definition: volk_avx2_intrinsics.h:26
_mm256_sin_poly_avx2
static __m256 _mm256_sin_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:382
vector_32fc_index_min_variant1
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:342
_mm256_polar_fsign_add_llrs_avx2
static __m256 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx2_intrinsics.h:110
_mm256_cos_poly_avx2
static __m256 _mm256_cos_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:403
volk_avx_intrinsics.h
_mm256_real
static __m256 _mm256_real(const __m256 z1, const __m256 z2)
Definition: volk_avx2_intrinsics.h:48
_mm256_polar_sign_mask_avx2
static __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
Definition: volk_avx2_intrinsics.h:62
_mm256_imag
static __m256 _mm256_imag(const __m256 z1, const __m256 z2)
Definition: volk_avx2_intrinsics.h:55