Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_atan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  * Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
57 #include <math.h>
58 
59 #ifndef INCLUDED_volk_32f_atan_32f_a_H
60 #define INCLUDED_volk_32f_atan_32f_a_H
61 
62 #ifdef LV_HAVE_GENERIC
63 static inline void
64 volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
65 {
66  for (unsigned int number = 0; number < num_points; number++) {
67  *out++ = atanf(*in++);
68  }
69 }
70 #endif /* LV_HAVE_GENERIC */
71 
72 #ifdef LV_HAVE_GENERIC
73 static inline void
74 volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
75 {
76  for (unsigned int number = 0; number < num_points; number++) {
77  *out++ = volk_arctan(*in++);
78  }
79 }
80 #endif /* LV_HAVE_GENERIC */
81 
82 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
83 #include <immintrin.h>
85 static inline void
86 volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points)
87 {
88  const __m512 one = _mm512_set1_ps(1.f);
89  const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
90  const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
91  const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
92 
93  const unsigned int sixteenth_points = num_points / 16;
94 
95  for (unsigned int number = 0; number < sixteenth_points; number++) {
96  __m512 x = _mm512_load_ps(in);
97  in += 16;
98  __mmask16 swap_mask =
99  _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
100  __m512 x_star = _mm512_mask_blend_ps(swap_mask, x, _mm512_rcp14_ps(x));
101  __m512 result = _mm512_arctan_poly_avx512(x_star);
102  __m512 term = _mm512_and_ps(x_star, sign_mask);
103  term = _mm512_or_ps(pi_over_2, term);
104  term = _mm512_sub_ps(term, result);
105  result = _mm512_mask_blend_ps(swap_mask, result, term);
106  _mm512_store_ps(out, result);
107  out += 16;
108  }
109 
110  for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
111  *out++ = volk_arctan(*in++);
112  }
113 }
114 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
115 
116 #if LV_HAVE_AVX2 && LV_HAVE_FMA
117 #include <immintrin.h>
119 static inline void
120 volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
121 {
122  const __m256 one = _mm256_set1_ps(1.f);
123  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
124  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
125  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
126 
127  const unsigned int eighth_points = num_points / 8;
128 
129  for (unsigned int number = 0; number < eighth_points; number++) {
130  __m256 x = _mm256_load_ps(in);
131  in += 8;
132  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
133  __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
134  __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
135  __m256 term = _mm256_and_ps(x_star, sign_mask);
136  term = _mm256_or_ps(pi_over_2, term);
137  term = _mm256_sub_ps(term, result);
138  result = _mm256_blendv_ps(result, term, swap_mask);
139  _mm256_store_ps(out, result);
140  out += 8;
141  }
142 
143  for (unsigned int number = eighth_points * 8; number < num_points; number++) {
144  *out++ = volk_arctan(*in++);
145  }
146 }
147 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
148 
149 #if LV_HAVE_AVX
150 #include <immintrin.h>
152 static inline void
153 volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
154 {
155  const __m256 one = _mm256_set1_ps(1.f);
156  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
157  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
158  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
159 
160  const unsigned int eighth_points = num_points / 8;
161 
162  for (unsigned int number = 0; number < eighth_points; number++) {
163  __m256 x = _mm256_load_ps(in);
164  in += 8;
165  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
166  __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
167  __m256 result = _mm256_arctan_poly_avx(x_star);
168  __m256 term = _mm256_and_ps(x_star, sign_mask);
169  term = _mm256_or_ps(pi_over_2, term);
170  term = _mm256_sub_ps(term, result);
171  result = _mm256_blendv_ps(result, term, swap_mask);
172  _mm256_store_ps(out, result);
173  out += 8;
174  }
175 
176  for (unsigned int number = eighth_points * 8; number < num_points; number++) {
177  *out++ = volk_arctan(*in++);
178  }
179 }
180 #endif /* LV_HAVE_AVX for aligned */
181 
182 #ifdef LV_HAVE_SSE4_1
183 #include <smmintrin.h>
185 static inline void
186 volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
187 {
188  const __m128 one = _mm_set1_ps(1.f);
189  const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
190  const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
191  const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
192 
193  const unsigned int quarter_points = num_points / 4;
194 
195  for (unsigned int number = 0; number < quarter_points; number++) {
196  __m128 x = _mm_load_ps(in);
197  in += 4;
198  __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
199  __m128 x_star = _mm_blendv_ps(x, _mm_rcp_ps(x), swap_mask);
200  __m128 result = _mm_arctan_poly_sse(x_star);
201  __m128 term = _mm_and_ps(x_star, sign_mask);
202  term = _mm_or_ps(pi_over_2, term);
203  term = _mm_sub_ps(term, result);
204  result = _mm_blendv_ps(result, term, swap_mask);
205  _mm_store_ps(out, result);
206  out += 4;
207  }
208 
209  for (unsigned int number = quarter_points * 4; number < num_points; number++) {
210  *out++ = volk_arctan(*in++);
211  }
212 }
213 #endif /* LV_HAVE_SSE4_1 for aligned */
214 
215 #ifdef LV_HAVE_NEON
216 #include <arm_neon.h>
218 static inline void
219 volk_32f_atan_32f_neon(float* out, const float* in, unsigned int num_points)
220 {
221  const float32x4_t one = vdupq_n_f32(1.f);
222  const float32x4_t pi_over_2 = vdupq_n_f32(0x1.921fb6p0f);
223  const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
224 
225  const unsigned int quarter_points = num_points / 4;
226 
227  for (unsigned int number = 0; number < quarter_points; number++) {
228  float32x4_t x = vld1q_f32(in);
229  in += 4;
230 
231  // swap_mask = |x| > 1
232  float32x4_t abs_x = vabsq_f32(x);
233  uint32x4_t swap_mask = vcgtq_f32(abs_x, one);
234 
235  // x_star = swap_mask ? 1/x : x
236  float32x4_t x_star = vbslq_f32(swap_mask, _vinvq_f32(x), x);
237 
238  // Compute arctan polynomial
239  float32x4_t result = _varctan_poly_f32(x_star);
240 
241  // If swapped: result = sign(x_star)*pi/2 - result
242  uint32x4_t x_star_sign = vandq_u32(vreinterpretq_u32_f32(x_star), sign_mask);
243  float32x4_t term = vreinterpretq_f32_u32(
244  vorrq_u32(vreinterpretq_u32_f32(pi_over_2), x_star_sign));
245  term = vsubq_f32(term, result);
246  result = vbslq_f32(swap_mask, term, result);
247 
248  vst1q_f32(out, result);
249  out += 4;
250  }
251 
252  for (unsigned int number = quarter_points * 4; number < num_points; number++) {
253  *out++ = volk_arctan(*in++);
254  }
255 }
256 #endif /* LV_HAVE_NEON */
257 
258 #ifdef LV_HAVE_NEONV8
259 #include <arm_neon.h>
261 /* ARMv8 NEON with FMA and 2x unrolling for better ILP */
262 static inline void
263 volk_32f_atan_32f_neonv8(float* out, const float* in, unsigned int num_points)
264 {
265  const float32x4_t one = vdupq_n_f32(1.f);
266  const float32x4_t pi_over_2 = vdupq_n_f32(0x1.921fb6p0f);
267  const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
268 
269  const unsigned int eighth_points = num_points / 8;
270 
271  for (unsigned int number = 0; number < eighth_points; number++) {
272  /* Load 8 floats */
273  float32x4_t x0 = vld1q_f32(in);
274  float32x4_t x1 = vld1q_f32(in + 4);
275  in += 8;
276 
277  /* Process first 4 */
278  float32x4_t abs_x0 = vabsq_f32(x0);
279  uint32x4_t swap_mask0 = vcgtq_f32(abs_x0, one);
280  float32x4_t x_star0 = vbslq_f32(swap_mask0, _vinvq_f32(x0), x0);
281  float32x4_t result0 = _varctan_poly_neonv8(x_star0);
282  uint32x4_t x_star_sign0 = vandq_u32(vreinterpretq_u32_f32(x_star0), sign_mask);
283  float32x4_t term0 = vreinterpretq_f32_u32(
284  vorrq_u32(vreinterpretq_u32_f32(pi_over_2), x_star_sign0));
285  term0 = vsubq_f32(term0, result0);
286  result0 = vbslq_f32(swap_mask0, term0, result0);
287 
288  /* Process second 4 */
289  float32x4_t abs_x1 = vabsq_f32(x1);
290  uint32x4_t swap_mask1 = vcgtq_f32(abs_x1, one);
291  float32x4_t x_star1 = vbslq_f32(swap_mask1, _vinvq_f32(x1), x1);
292  float32x4_t result1 = _varctan_poly_neonv8(x_star1);
293  uint32x4_t x_star_sign1 = vandq_u32(vreinterpretq_u32_f32(x_star1), sign_mask);
294  float32x4_t term1 = vreinterpretq_f32_u32(
295  vorrq_u32(vreinterpretq_u32_f32(pi_over_2), x_star_sign1));
296  term1 = vsubq_f32(term1, result1);
297  result1 = vbslq_f32(swap_mask1, term1, result1);
298 
299  vst1q_f32(out, result0);
300  vst1q_f32(out + 4, result1);
301  out += 8;
302  }
303 
304  for (unsigned int number = eighth_points * 8; number < num_points; number++) {
305  *out++ = volk_arctan(*in++);
306  }
307 }
308 #endif /* LV_HAVE_NEONV8 */
309 
310 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
311 
312 #ifndef INCLUDED_volk_32f_atan_32f_u_H
313 #define INCLUDED_volk_32f_atan_32f_u_H
314 
315 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
316 #include <immintrin.h>
318 static inline void
319 volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points)
320 {
321  const __m512 one = _mm512_set1_ps(1.f);
322  const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
323  const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
324  const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
325 
326  const unsigned int sixteenth_points = num_points / 16;
327 
328  for (unsigned int number = 0; number < sixteenth_points; number++) {
329  __m512 x = _mm512_loadu_ps(in);
330  in += 16;
331  __mmask16 swap_mask =
332  _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
333  __m512 x_star = _mm512_mask_blend_ps(swap_mask, x, _mm512_rcp14_ps(x));
334  __m512 result = _mm512_arctan_poly_avx512(x_star);
335  __m512 term = _mm512_and_ps(x_star, sign_mask);
336  term = _mm512_or_ps(pi_over_2, term);
337  term = _mm512_sub_ps(term, result);
338  result = _mm512_mask_blend_ps(swap_mask, result, term);
339  _mm512_storeu_ps(out, result);
340  out += 16;
341  }
342 
343  for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
344  *out++ = volk_arctan(*in++);
345  }
346 }
347 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
348 
349 #if LV_HAVE_AVX2 && LV_HAVE_FMA
350 #include <immintrin.h>
351 static inline void
352 volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
353 {
354  const __m256 one = _mm256_set1_ps(1.f);
355  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
356  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
357  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
358 
359  const unsigned int eighth_points = num_points / 8;
360 
361  for (unsigned int number = 0; number < eighth_points; number++) {
362  __m256 x = _mm256_loadu_ps(in);
363  in += 8;
364  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
365  __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
366  __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
367  __m256 term = _mm256_and_ps(x_star, sign_mask);
368  term = _mm256_or_ps(pi_over_2, term);
369  term = _mm256_sub_ps(term, result);
370  result = _mm256_blendv_ps(result, term, swap_mask);
371  _mm256_storeu_ps(out, result);
372  out += 8;
373  }
374 
375  for (unsigned int number = eighth_points * 8; number < num_points; number++) {
376  *out++ = volk_arctan(*in++);
377  }
378 }
379 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
380 
381 #if LV_HAVE_AVX
382 #include <immintrin.h>
383 static inline void
384 volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
385 {
386  const __m256 one = _mm256_set1_ps(1.f);
387  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
388  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
389  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
390 
391  const unsigned int eighth_points = num_points / 8;
392 
393  for (unsigned int number = 0; number < eighth_points; number++) {
394  __m256 x = _mm256_loadu_ps(in);
395  in += 8;
396  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
397  __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
398  __m256 result = _mm256_arctan_poly_avx(x_star);
399  __m256 term = _mm256_and_ps(x_star, sign_mask);
400  term = _mm256_or_ps(pi_over_2, term);
401  term = _mm256_sub_ps(term, result);
402  result = _mm256_blendv_ps(result, term, swap_mask);
403  _mm256_storeu_ps(out, result);
404  out += 8;
405  }
406 
407  for (unsigned int number = eighth_points * 8; number < num_points; number++) {
408  *out++ = volk_arctan(*in++);
409  }
410 }
411 #endif /* LV_HAVE_AVX for unaligned */
412 
413 #ifdef LV_HAVE_SSE4_1
414 #include <smmintrin.h>
416 static inline void
417 volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
418 {
419  const __m128 one = _mm_set1_ps(1.f);
420  const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
421  const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
422  const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
423 
424  const unsigned int quarter_points = num_points / 4;
425 
426  for (unsigned int number = 0; number < quarter_points; number++) {
427  __m128 x = _mm_loadu_ps(in);
428  in += 4;
429  __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
430  __m128 x_star = _mm_blendv_ps(x, _mm_rcp_ps(x), swap_mask);
431  __m128 result = _mm_arctan_poly_sse(x_star);
432  __m128 term = _mm_and_ps(x_star, sign_mask);
433  term = _mm_or_ps(pi_over_2, term);
434  term = _mm_sub_ps(term, result);
435  result = _mm_blendv_ps(result, term, swap_mask);
436  _mm_storeu_ps(out, result);
437  out += 4;
438  }
439 
440  for (unsigned int number = quarter_points * 4; number < num_points; number++) {
441  *out++ = volk_arctan(*in++);
442  }
443 }
444 #endif /* LV_HAVE_SSE4_1 for unaligned */
445 
446 #ifdef LV_HAVE_RVV
447 #include <riscv_vector.h>
448 
449 static inline void
450 volk_32f_atan_32f_rvv(float* out, const float* in, unsigned int num_points)
451 {
452  size_t vlmax = __riscv_vsetvlmax_e32m2();
453 
454  const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
455  const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
456  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
457  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
458  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
459  const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
460  const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
461  const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
462  const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
463 
464  size_t n = num_points;
465  for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
466  vl = __riscv_vsetvl_e32m2(n);
467  vfloat32m2_t v = __riscv_vle32_v_f32m2(in, vl);
468  vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(v, vl), cf1, vl);
469  vfloat32m2_t x = __riscv_vfdiv_mu(mswap, v, cf1, v, vl);
470  vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
471  vfloat32m2_t p = c13;
472  p = __riscv_vfmadd(p, xx, c11, vl);
473  p = __riscv_vfmadd(p, xx, c9, vl);
474  p = __riscv_vfmadd(p, xx, c7, vl);
475  p = __riscv_vfmadd(p, xx, c5, vl);
476  p = __riscv_vfmadd(p, xx, c3, vl);
477  p = __riscv_vfmadd(p, xx, c1, vl);
478  p = __riscv_vfmul(p, x, vl);
479 
480  vfloat32m2_t t = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
481  p = __riscv_vmerge(p, t, mswap, vl);
482 
483  __riscv_vse32(out, p, vl);
484  }
485 }
486 #endif /*LV_HAVE_RVV*/
487 
488 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
_mm256_arctan_poly_avx2_fma
static __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:26
volk_32f_atan_32f_u_avx2
static void volk_32f_atan_32f_u_avx2(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_atan_32f.h:384
volk_avx512_intrinsics.h
volk_32f_atan_32f_a_avx2
static void volk_32f_atan_32f_a_avx2(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_atan_32f.h:153
_mm256_arctan_poly_avx
static __m256 _mm256_arctan_poly_avx(const __m256 x)
Definition: volk_avx_intrinsics.h:64
volk_32f_atan_32f_polynomial
static void volk_32f_atan_32f_polynomial(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_atan_32f.h:74
volk_arctan
static float volk_arctan(const float x)
Definition: volk_common.h:300
volk_sse_intrinsics.h
_mm512_arctan_poly_avx512
static __m512 _mm512_arctan_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:73
volk_avx2_fma_intrinsics.h
_varctan_poly_f32
static float32x4_t _varctan_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:329
volk_32f_atan_32f_neon
static void volk_32f_atan_32f_neon(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_atan_32f.h:219
_mm_arctan_poly_sse
static __m128 _mm_arctan_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:55
volk_neon_intrinsics.h
volk_32f_atan_32f_generic
static void volk_32f_atan_32f_generic(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_atan_32f.h:64
volk_avx_intrinsics.h
_vinvq_f32
static float32x4_t _vinvq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:114