Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_atan2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  * Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
61 #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
62 #define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
63 
64 #include <math.h>
65 
66 #ifdef LV_HAVE_GENERIC
67 static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
68  const lv_32fc_t* inputVector,
69  const float normalizeFactor,
70  unsigned int num_points)
71 {
72  float* outPtr = outputVector;
73  const float* inPtr = (float*)inputVector;
74  const float invNormalizeFactor = 1.f / normalizeFactor;
75 
76  for (unsigned int number = 0; number < num_points; number++) {
77  const float real = *inPtr++;
78  const float imag = *inPtr++;
79  *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
80  }
81 }
82 #endif /* LV_HAVE_GENERIC */
83 
84 #ifdef LV_HAVE_GENERIC
85 #include <volk/volk_common.h>
86 static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
87  const lv_32fc_t* inputVector,
88  const float normalizeFactor,
89  unsigned int num_points)
90 {
91  float* outPtr = outputVector;
92  const float* inPtr = (float*)inputVector;
93  const float invNormalizeFactor = 1.f / normalizeFactor;
94 
95  for (unsigned int number = 0; number < num_points; number++) {
96  const float x = *inPtr++;
97  const float y = *inPtr++;
98  *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
99  }
100 }
101 #endif /* LV_HAVE_GENERIC */
102 
103 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
104 #include <immintrin.h>
106 static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector,
107  const lv_32fc_t* complexVector,
108  const float normalizeFactor,
109  unsigned int num_points)
110 {
111  const float* in = (float*)complexVector;
112  float* out = (float*)outputVector;
113 
114  const float invNormalizeFactor = 1.f / normalizeFactor;
115  const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
116  const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
117  const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
118  const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
119  const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
120 
121  unsigned int number = 0;
122  const unsigned int sixteenth_points = num_points / 16;
123  for (; number < sixteenth_points; number++) {
124  __m512 z1 = _mm512_load_ps(in);
125  in += 16;
126  __m512 z2 = _mm512_load_ps(in);
127  in += 16;
128 
129  __m512 x = _mm512_real(z1, z2);
130  __m512 y = _mm512_imag(z1, z2);
131 
132  // Detect NaN in original inputs before division
133  __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) |
134  _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q);
135 
136  // Handle infinity cases per IEEE 754
137  const __m512 zero = _mm512_setzero_ps();
138  const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f); // π/4
139  const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f); // 3π/4
140 
141  __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18); // ±inf
142  __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18); // ±inf
143  __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS);
144 
145  // Build infinity result
146  __m512 inf_result = zero;
147  // Both infinite: ±π/4 or ±3π/4
148  __mmask16 both_inf = y_inf_mask & x_inf_mask;
149  __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4);
150  both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask));
151  inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result);
152 
153  // y infinite, x finite: ±π/2
154  __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask;
155  __m512 y_inf_result = _mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask));
156  inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result);
157 
158  // x infinite, y finite: 0 or ±π
159  __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask;
160  __m512 x_inf_result =
161  _mm512_mask_blend_ps(x_pos_mask,
162  _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)),
163  _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask)));
164  inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result);
165 
166  __mmask16 any_inf_mask = y_inf_mask | x_inf_mask;
167 
168  __mmask16 swap_mask = _mm512_cmp_ps_mask(
169  _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
170  __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x);
171  __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y);
172  __m512 input = _mm512_div_ps(numerator, denominator);
173 
174  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
175  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
176  __mmask16 div_nan_mask =
177  _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask;
178  input = _mm512_mask_blend_ps(div_nan_mask, input, numerator);
179  __m512 result = _mm512_arctan_poly_avx512(input);
180 
181  input =
182  _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
183  result = _mm512_mask_blend_ps(swap_mask, result, input);
184 
185  __m512 x_sign_mask =
186  _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
187 
188  result = _mm512_add_ps(
189  _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
190  result);
191 
192  // Select infinity result or normal result
193  result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result);
194 
195  result = _mm512_mul_ps(result, vinvNormalizeFactor);
196 
197  _mm512_store_ps(out, result);
198  out += 16;
199  }
200 
201  number = sixteenth_points * 16;
203  out, complexVector + number, normalizeFactor, num_points - number);
204 }
205 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
206 
207 #if LV_HAVE_AVX2 && LV_HAVE_FMA
208 #include <immintrin.h>
210 static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
211  const lv_32fc_t* complexVector,
212  const float normalizeFactor,
213  unsigned int num_points)
214 {
215  const float* in = (float*)complexVector;
216  float* out = (float*)outputVector;
217 
218  const float invNormalizeFactor = 1.f / normalizeFactor;
219  const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
220  const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
221  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
222  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
223  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
224 
225  unsigned int number = 0;
226  const unsigned int eighth_points = num_points / 8;
227  for (; number < eighth_points; number++) {
228  __m256 z1 = _mm256_load_ps(in);
229  in += 8;
230  __m256 z2 = _mm256_load_ps(in);
231  in += 8;
232 
233  __m256 x = _mm256_real(z1, z2);
234  __m256 y = _mm256_imag(z1, z2);
235 
236  // Detect NaN in original inputs before division
237  __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
238  _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
239 
240  // Handle infinity cases per IEEE 754
241  const __m256 zero = _mm256_setzero_ps();
242  const __m256 inf = _mm256_set1_ps(HUGE_VALF);
243  const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4
244  const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4
245 
246  __m256 y_abs = _mm256_and_ps(y, abs_mask);
247  __m256 x_abs = _mm256_and_ps(x, abs_mask);
248  __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
249  __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
250  __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
251 
252  // Build infinity result
253  __m256 inf_result = zero;
254  // Both infinite: ±π/4 or ±3π/4
255  __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
256  __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
257  both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
258  inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
259 
260  // y infinite, x finite: ±π/2
261  __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
262  __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
263  inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
264 
265  // x infinite, y finite: 0 or ±π
266  __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
267  __m256 x_inf_result =
268  _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
269  _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
270  x_pos_mask);
271  inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
272 
273  __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
274 
275  __m256 swap_mask = _mm256_cmp_ps(
276  _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
277  __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
278  __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
279  __m256 input = _mm256_div_ps(numerator, denominator);
280 
281  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
282  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
283  __m256 div_nan_mask =
284  _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
285  input = _mm256_blendv_ps(input, numerator, div_nan_mask);
286  __m256 result = _mm256_arctan_poly_avx2_fma(input);
287 
288  input =
289  _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
290  result = _mm256_blendv_ps(result, input, swap_mask);
291 
292  __m256 x_sign_mask =
293  _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
294 
295  result = _mm256_add_ps(
296  _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
297  result);
298 
299  // Select infinity result or normal result
300  result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
301 
302  result = _mm256_mul_ps(result, vinvNormalizeFactor);
303 
304  _mm256_store_ps(out, result);
305  out += 8;
306  }
307 
308  number = eighth_points * 8;
310  out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
311 }
312 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
313 
314 #if LV_HAVE_AVX2
315 #include <immintrin.h>
317 static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
318  const lv_32fc_t* complexVector,
319  const float normalizeFactor,
320  unsigned int num_points)
321 {
322  const float* in = (float*)complexVector;
323  float* out = (float*)outputVector;
324 
325  const float invNormalizeFactor = 1.f / normalizeFactor;
326  const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
327  const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
328  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
329  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
330  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
331 
332  unsigned int number = 0;
333  const unsigned int eighth_points = num_points / 8;
334  for (; number < eighth_points; number++) {
335  __m256 z1 = _mm256_load_ps(in);
336  in += 8;
337  __m256 z2 = _mm256_load_ps(in);
338  in += 8;
339 
340  __m256 x = _mm256_real(z1, z2);
341  __m256 y = _mm256_imag(z1, z2);
342 
343  // Detect NaN in original inputs before division
344  __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
345  _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
346 
347  // Handle infinity cases per IEEE 754
348  const __m256 zero = _mm256_setzero_ps();
349  const __m256 inf = _mm256_set1_ps(HUGE_VALF);
350  const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4
351  const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4
352 
353  __m256 y_abs = _mm256_and_ps(y, abs_mask);
354  __m256 x_abs = _mm256_and_ps(x, abs_mask);
355  __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
356  __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
357  __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
358 
359  // Build infinity result
360  __m256 inf_result = zero;
361  // Both infinite: ±π/4 or ±3π/4
362  __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
363  __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
364  both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
365  inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
366 
367  // y infinite, x finite: ±π/2
368  __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
369  __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
370  inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
371 
372  // x infinite, y finite: 0 or ±π
373  __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
374  __m256 x_inf_result =
375  _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
376  _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
377  x_pos_mask);
378  inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
379 
380  __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
381 
382  __m256 swap_mask = _mm256_cmp_ps(
383  _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
384  __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
385  __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
386  __m256 input = _mm256_div_ps(numerator, denominator);
387 
388  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
389  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
390  __m256 div_nan_mask =
391  _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
392  input = _mm256_blendv_ps(input, numerator, div_nan_mask);
393  __m256 result = _mm256_arctan_poly_avx(input);
394 
395  input =
396  _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
397  result = _mm256_blendv_ps(result, input, swap_mask);
398 
399  __m256 x_sign_mask =
400  _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
401 
402  result = _mm256_add_ps(
403  _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
404  result);
405 
406  // Select infinity result or normal result
407  result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
408 
409  result = _mm256_mul_ps(result, vinvNormalizeFactor);
410 
411  _mm256_store_ps(out, result);
412  out += 8;
413  }
414 
415  number = eighth_points * 8;
417  out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
418 }
419 #endif /* LV_HAVE_AVX2 for aligned */
420 
421 #ifdef LV_HAVE_NEON
422 #include <arm_neon.h>
424 static inline void volk_32fc_s32f_atan2_32f_neon(float* outputVector,
425  const lv_32fc_t* complexVector,
426  const float normalizeFactor,
427  unsigned int num_points)
428 {
429  const float* in = (float*)complexVector;
430  float* out = outputVector;
431 
432  const float invNormalizeFactor = 1.f / normalizeFactor;
433  const float32x4_t vInvNorm = vdupq_n_f32(invNormalizeFactor);
434  const float32x4_t pi = vdupq_n_f32(0x1.921fb6p1f);
435  const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
436  const float32x4_t pi_4 = vdupq_n_f32(0x1.921fb6p-1f);
437  const float32x4_t three_pi_4 = vdupq_n_f32(0x1.2d97c8p1f);
438  const float32x4_t zero = vdupq_n_f32(0.f);
439  const float32x4_t inf = vdupq_n_f32(__builtin_huge_valf());
440  const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
441 
442  const unsigned int quarter_points = num_points / 4;
443 
444  for (unsigned int number = 0; number < quarter_points; number++) {
445  float32x4x2_t z = vld2q_f32(in);
446  in += 8;
447 
448  float32x4_t x = z.val[0];
449  float32x4_t y = z.val[1];
450 
451  float32x4_t abs_x = vabsq_f32(x);
452  float32x4_t abs_y = vabsq_f32(y);
453 
454  /* Detect input NaN */
455  uint32x4_t input_nan =
456  vorrq_u32(vmvnq_u32(vceqq_f32(x, x)), vmvnq_u32(vceqq_f32(y, y)));
457 
458  /* Infinity handling */
459  uint32x4_t y_inf = vceqq_f32(abs_y, inf);
460  uint32x4_t x_inf = vceqq_f32(abs_x, inf);
461  uint32x4_t x_pos = vcgtq_f32(x, zero);
462  uint32x4_t both_inf = vandq_u32(y_inf, x_inf);
463  uint32x4_t y_inf_only = vbicq_u32(y_inf, x_inf);
464  uint32x4_t x_inf_only = vbicq_u32(x_inf, y_inf);
465  uint32x4_t any_inf = vorrq_u32(y_inf, x_inf);
466 
467  float32x4_t inf_result = zero;
468  /* Both infinite: ±π/4 or ±3π/4 */
469  float32x4_t both_inf_r = vbslq_f32(x_pos, pi_4, three_pi_4);
470  both_inf_r = vreinterpretq_f32_u32(
471  vorrq_u32(vreinterpretq_u32_f32(both_inf_r),
472  vandq_u32(vreinterpretq_u32_f32(y), sign_mask)));
473  inf_result = vbslq_f32(both_inf, both_inf_r, inf_result);
474  /* y infinite, x finite: ±π/2 */
475  float32x4_t y_inf_r = vreinterpretq_f32_u32(vorrq_u32(
476  vreinterpretq_u32_f32(pi_2), vandq_u32(vreinterpretq_u32_f32(y), sign_mask)));
477  inf_result = vbslq_f32(y_inf_only, y_inf_r, inf_result);
478  /* x infinite, y finite: 0 or ±π */
479  float32x4_t x_inf_r = vbslq_f32(
480  x_pos,
481  vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y), sign_mask)),
482  vreinterpretq_f32_u32(
483  vorrq_u32(vreinterpretq_u32_f32(pi),
484  vandq_u32(vreinterpretq_u32_f32(y), sign_mask))));
485  inf_result = vbslq_f32(x_inf_only, x_inf_r, inf_result);
486 
487  /* Normal computation */
488  uint32x4_t swap_mask = vcgtq_f32(abs_y, abs_x);
489  float32x4_t num = vbslq_f32(swap_mask, x, y);
490  float32x4_t den = vbslq_f32(swap_mask, y, x);
491  float32x4_t input = vmulq_f32(num, _vinvq_f32(den));
492 
493  /* Handle division NaN (0/0), but not input NaN */
494  uint32x4_t div_nan = vbicq_u32(vmvnq_u32(vceqq_f32(input, input)), input_nan);
495  input = vbslq_f32(div_nan, num, input);
496 
497  float32x4_t result = _varctan_poly_f32(input);
498  uint32x4_t input_sign = vandq_u32(vreinterpretq_u32_f32(input), sign_mask);
499  float32x4_t term =
500  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign));
501  term = vsubq_f32(term, result);
502  result = vbslq_f32(swap_mask, term, result);
503 
504  uint32x4_t x_neg = vcltq_f32(x, zero);
505  uint32x4_t y_sign = vandq_u32(vreinterpretq_u32_f32(y), sign_mask);
506  float32x4_t pi_adj =
507  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign));
508  result = vbslq_f32(x_neg, vaddq_f32(result, pi_adj), result);
509 
510  /* Select infinity result or normal result */
511  result = vbslq_f32(any_inf, inf_result, result);
512  result = vmulq_f32(result, vInvNorm);
513 
514  vst1q_f32(out, result);
515  out += 4;
516  }
517 
518  unsigned int number = quarter_points * 4;
520  out, complexVector + number, normalizeFactor, num_points - number);
521 }
522 #endif /* LV_HAVE_NEON */
523 
524 #ifdef LV_HAVE_NEONV8
525 #include <arm_neon.h>
527 /* ARMv8 NEON with FMA, proper infinity/NaN handling, and 2x unrolling */
528 static inline void volk_32fc_s32f_atan2_32f_neonv8(float* outputVector,
529  const lv_32fc_t* complexVector,
530  const float normalizeFactor,
531  unsigned int num_points)
532 {
533  const float* in = (float*)complexVector;
534  float* out = outputVector;
535 
536  const float invNormalizeFactor = 1.f / normalizeFactor;
537  const float32x4_t vInvNorm = vdupq_n_f32(invNormalizeFactor);
538  const float32x4_t pi = vdupq_n_f32(0x1.921fb6p1f);
539  const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
540  const float32x4_t pi_4 = vdupq_n_f32(0x1.921fb6p-1f);
541  const float32x4_t three_pi_4 = vdupq_n_f32(0x1.2d97c8p1f);
542  const float32x4_t zero = vdupq_n_f32(0.f);
543  const float32x4_t inf = vdupq_n_f32(__builtin_huge_valf());
544  const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
545 
546  const unsigned int eighth_points = num_points / 8;
547 
548  for (unsigned int number = 0; number < eighth_points; number++) {
549  /* Load 8 complex numbers and deinterleave */
550  float32x4x2_t z0 = vld2q_f32(in);
551  float32x4x2_t z1 = vld2q_f32(in + 8);
552  in += 16;
553 
554  float32x4_t x0 = z0.val[0], y0 = z0.val[1];
555  float32x4_t x1 = z1.val[0], y1 = z1.val[1];
556 
557  /* --- Process first 4 --- */
558  float32x4_t abs_x0 = vabsq_f32(x0);
559  float32x4_t abs_y0 = vabsq_f32(y0);
560 
561  /* Detect input NaN */
562  uint32x4_t input_nan0 =
563  vorrq_u32(vmvnq_u32(vceqq_f32(x0, x0)), vmvnq_u32(vceqq_f32(y0, y0)));
564 
565  /* Infinity handling */
566  uint32x4_t y_inf0 = vceqq_f32(abs_y0, inf);
567  uint32x4_t x_inf0 = vceqq_f32(abs_x0, inf);
568  uint32x4_t x_pos0 = vcgtq_f32(x0, zero);
569  uint32x4_t both_inf0 = vandq_u32(y_inf0, x_inf0);
570  uint32x4_t y_inf_only0 = vbicq_u32(y_inf0, x_inf0);
571  uint32x4_t x_inf_only0 = vbicq_u32(x_inf0, y_inf0);
572  uint32x4_t any_inf0 = vorrq_u32(y_inf0, x_inf0);
573 
574  float32x4_t inf_result0 = zero;
575  float32x4_t both_inf_r0 = vbslq_f32(x_pos0, pi_4, three_pi_4);
576  both_inf_r0 = vreinterpretq_f32_u32(
577  vorrq_u32(vreinterpretq_u32_f32(both_inf_r0),
578  vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)));
579  inf_result0 = vbslq_f32(both_inf0, both_inf_r0, inf_result0);
580  float32x4_t y_inf_r0 = vreinterpretq_f32_u32(
581  vorrq_u32(vreinterpretq_u32_f32(pi_2),
582  vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)));
583  inf_result0 = vbslq_f32(y_inf_only0, y_inf_r0, inf_result0);
584  float32x4_t x_inf_r0 = vbslq_f32(
585  x_pos0,
586  vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)),
587  vreinterpretq_f32_u32(
588  vorrq_u32(vreinterpretq_u32_f32(pi),
589  vandq_u32(vreinterpretq_u32_f32(y0), sign_mask))));
590  inf_result0 = vbslq_f32(x_inf_only0, x_inf_r0, inf_result0);
591 
592  /* Normal computation */
593  uint32x4_t swap_mask0 = vcgtq_f32(abs_y0, abs_x0);
594  float32x4_t num0 = vbslq_f32(swap_mask0, x0, y0);
595  float32x4_t den0 = vbslq_f32(swap_mask0, y0, x0);
596  float32x4_t input0 = vmulq_f32(num0, _vinvq_f32(den0));
597 
598  uint32x4_t div_nan0 = vbicq_u32(vmvnq_u32(vceqq_f32(input0, input0)), input_nan0);
599  input0 = vbslq_f32(div_nan0, num0, input0);
600 
601  float32x4_t result0 = _varctan_poly_neonv8(input0);
602  uint32x4_t input_sign0 = vandq_u32(vreinterpretq_u32_f32(input0), sign_mask);
603  float32x4_t term0 =
604  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign0));
605  term0 = vsubq_f32(term0, result0);
606  result0 = vbslq_f32(swap_mask0, term0, result0);
607 
608  uint32x4_t x_neg0 = vcltq_f32(x0, zero);
609  uint32x4_t y_sign0 = vandq_u32(vreinterpretq_u32_f32(y0), sign_mask);
610  float32x4_t pi_adj0 =
611  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign0));
612  result0 = vbslq_f32(x_neg0, vaddq_f32(result0, pi_adj0), result0);
613 
614  result0 = vbslq_f32(any_inf0, inf_result0, result0);
615  result0 = vmulq_f32(result0, vInvNorm);
616 
617  /* --- Process second 4 --- */
618  float32x4_t abs_x1 = vabsq_f32(x1);
619  float32x4_t abs_y1 = vabsq_f32(y1);
620 
621  uint32x4_t input_nan1 =
622  vorrq_u32(vmvnq_u32(vceqq_f32(x1, x1)), vmvnq_u32(vceqq_f32(y1, y1)));
623 
624  uint32x4_t y_inf1 = vceqq_f32(abs_y1, inf);
625  uint32x4_t x_inf1 = vceqq_f32(abs_x1, inf);
626  uint32x4_t x_pos1 = vcgtq_f32(x1, zero);
627  uint32x4_t both_inf1 = vandq_u32(y_inf1, x_inf1);
628  uint32x4_t y_inf_only1 = vbicq_u32(y_inf1, x_inf1);
629  uint32x4_t x_inf_only1 = vbicq_u32(x_inf1, y_inf1);
630  uint32x4_t any_inf1 = vorrq_u32(y_inf1, x_inf1);
631 
632  float32x4_t inf_result1 = zero;
633  float32x4_t both_inf_r1 = vbslq_f32(x_pos1, pi_4, three_pi_4);
634  both_inf_r1 = vreinterpretq_f32_u32(
635  vorrq_u32(vreinterpretq_u32_f32(both_inf_r1),
636  vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)));
637  inf_result1 = vbslq_f32(both_inf1, both_inf_r1, inf_result1);
638  float32x4_t y_inf_r1 = vreinterpretq_f32_u32(
639  vorrq_u32(vreinterpretq_u32_f32(pi_2),
640  vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)));
641  inf_result1 = vbslq_f32(y_inf_only1, y_inf_r1, inf_result1);
642  float32x4_t x_inf_r1 = vbslq_f32(
643  x_pos1,
644  vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)),
645  vreinterpretq_f32_u32(
646  vorrq_u32(vreinterpretq_u32_f32(pi),
647  vandq_u32(vreinterpretq_u32_f32(y1), sign_mask))));
648  inf_result1 = vbslq_f32(x_inf_only1, x_inf_r1, inf_result1);
649 
650  uint32x4_t swap_mask1 = vcgtq_f32(abs_y1, abs_x1);
651  float32x4_t num1 = vbslq_f32(swap_mask1, x1, y1);
652  float32x4_t den1 = vbslq_f32(swap_mask1, y1, x1);
653  float32x4_t input1 = vmulq_f32(num1, _vinvq_f32(den1));
654 
655  uint32x4_t div_nan1 = vbicq_u32(vmvnq_u32(vceqq_f32(input1, input1)), input_nan1);
656  input1 = vbslq_f32(div_nan1, num1, input1);
657 
658  float32x4_t result1 = _varctan_poly_neonv8(input1);
659  uint32x4_t input_sign1 = vandq_u32(vreinterpretq_u32_f32(input1), sign_mask);
660  float32x4_t term1 =
661  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign1));
662  term1 = vsubq_f32(term1, result1);
663  result1 = vbslq_f32(swap_mask1, term1, result1);
664 
665  uint32x4_t x_neg1 = vcltq_f32(x1, zero);
666  uint32x4_t y_sign1 = vandq_u32(vreinterpretq_u32_f32(y1), sign_mask);
667  float32x4_t pi_adj1 =
668  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign1));
669  result1 = vbslq_f32(x_neg1, vaddq_f32(result1, pi_adj1), result1);
670 
671  result1 = vbslq_f32(any_inf1, inf_result1, result1);
672  result1 = vmulq_f32(result1, vInvNorm);
673 
674  vst1q_f32(out, result0);
675  vst1q_f32(out + 4, result1);
676  out += 8;
677  }
678 
679  unsigned int number = eighth_points * 8;
681  out, complexVector + number, normalizeFactor, num_points - number);
682 }
683 #endif /* LV_HAVE_NEONV8 */
684 
685 #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
686 
687 #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
688 #define INCLUDED_volk_32fc_s32f_atan2_32f_u_H
689 
690 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
691 #include <immintrin.h>
693 static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector,
694  const lv_32fc_t* complexVector,
695  const float normalizeFactor,
696  unsigned int num_points)
697 {
698  const float* in = (float*)complexVector;
699  float* out = (float*)outputVector;
700 
701  const float invNormalizeFactor = 1.f / normalizeFactor;
702  const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
703  const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
704  const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
705  const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
706  const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
707 
708  const unsigned int sixteenth_points = num_points / 16;
709 
710  for (unsigned int number = 0; number < sixteenth_points; number++) {
711  __m512 z1 = _mm512_loadu_ps(in);
712  in += 16;
713  __m512 z2 = _mm512_loadu_ps(in);
714  in += 16;
715 
716  __m512 x = _mm512_real(z1, z2);
717  __m512 y = _mm512_imag(z1, z2);
718 
719  // Detect NaN in original inputs before division
720  __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) |
721  _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q);
722 
723  // Handle infinity cases per IEEE 754
724  const __m512 zero = _mm512_setzero_ps();
725  const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f); // π/4
726  const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f); // 3π/4
727 
728  __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18); // ±inf
729  __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18); // ±inf
730  __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS);
731 
732  // Build infinity result
733  __m512 inf_result = zero;
734  // Both infinite: ±π/4 or ±3π/4
735  __mmask16 both_inf = y_inf_mask & x_inf_mask;
736  __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4);
737  both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask));
738  inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result);
739 
740  // y infinite, x finite: ±π/2
741  __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask;
742  __m512 y_inf_result = _mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask));
743  inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result);
744 
745  // x infinite, y finite: 0 or ±π
746  __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask;
747  __m512 x_inf_result =
748  _mm512_mask_blend_ps(x_pos_mask,
749  _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)),
750  _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask)));
751  inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result);
752 
753  __mmask16 any_inf_mask = y_inf_mask | x_inf_mask;
754 
755  __mmask16 swap_mask = _mm512_cmp_ps_mask(
756  _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
757  __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x);
758  __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y);
759  __m512 input = _mm512_div_ps(numerator, denominator);
760 
761  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
762  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
763  __mmask16 div_nan_mask =
764  _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask;
765  input = _mm512_mask_blend_ps(div_nan_mask, input, numerator);
766  __m512 result = _mm512_arctan_poly_avx512(input);
767 
768  input =
769  _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
770  result = _mm512_mask_blend_ps(swap_mask, result, input);
771 
772  __m512 x_sign_mask =
773  _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
774 
775  result = _mm512_add_ps(
776  _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
777  result);
778 
779  // Select infinity result or normal result
780  result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result);
781 
782  result = _mm512_mul_ps(result, vinvNormalizeFactor);
783 
784  _mm512_storeu_ps(out, result);
785  out += 16;
786  }
787 
788  unsigned int number = sixteenth_points * 16;
790  out, complexVector + number, normalizeFactor, num_points - number);
791 }
792 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
793 
794 #if LV_HAVE_AVX2 && LV_HAVE_FMA
795 #include <immintrin.h>
797 static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
798  const lv_32fc_t* complexVector,
799  const float normalizeFactor,
800  unsigned int num_points)
801 {
802  const float* in = (float*)complexVector;
803  float* out = (float*)outputVector;
804 
805  const float invNormalizeFactor = 1.f / normalizeFactor;
806  const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
807  const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
808  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
809  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
810  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
811 
812  unsigned int number = 0;
813  const unsigned int eighth_points = num_points / 8;
814  for (; number < eighth_points; number++) {
815  __m256 z1 = _mm256_loadu_ps(in);
816  in += 8;
817  __m256 z2 = _mm256_loadu_ps(in);
818  in += 8;
819 
820  __m256 x = _mm256_real(z1, z2);
821  __m256 y = _mm256_imag(z1, z2);
822 
823  // Detect NaN in original inputs before division
824  __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
825  _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
826 
827  // Handle infinity cases per IEEE 754
828  const __m256 zero = _mm256_setzero_ps();
829  const __m256 inf = _mm256_set1_ps(HUGE_VALF);
830  const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4
831  const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4
832 
833  __m256 y_abs = _mm256_and_ps(y, abs_mask);
834  __m256 x_abs = _mm256_and_ps(x, abs_mask);
835  __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
836  __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
837  __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
838 
839  // Build infinity result
840  __m256 inf_result = zero;
841  // Both infinite: ±π/4 or ±3π/4
842  __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
843  __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
844  both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
845  inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
846 
847  // y infinite, x finite: ±π/2
848  __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
849  __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
850  inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
851 
852  // x infinite, y finite: 0 or ±π
853  __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
854  __m256 x_inf_result =
855  _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
856  _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
857  x_pos_mask);
858  inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
859 
860  __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
861 
862  __m256 swap_mask = _mm256_cmp_ps(
863  _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
864  __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
865  __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
866  __m256 input = _mm256_div_ps(numerator, denominator);
867 
868  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
869  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
870  __m256 div_nan_mask =
871  _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
872  input = _mm256_blendv_ps(input, numerator, div_nan_mask);
873  __m256 result = _mm256_arctan_poly_avx2_fma(input);
874 
875  input =
876  _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
877  result = _mm256_blendv_ps(result, input, swap_mask);
878 
879  __m256 x_sign_mask =
880  _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
881 
882  result = _mm256_add_ps(
883  _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
884  result);
885 
886  // Select infinity result or normal result
887  result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
888 
889  result = _mm256_mul_ps(result, vinvNormalizeFactor);
890 
891  _mm256_storeu_ps(out, result);
892  out += 8;
893  }
894 
895  number = eighth_points * 8;
897  out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
898 }
899 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
900 
901 #if LV_HAVE_AVX2
902 #include <immintrin.h>
904 static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
905  const lv_32fc_t* complexVector,
906  const float normalizeFactor,
907  unsigned int num_points)
908 {
909  const float* in = (float*)complexVector;
910  float* out = (float*)outputVector;
911 
912  const float invNormalizeFactor = 1.f / normalizeFactor;
913  const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
914  const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
915  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
916  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
917  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
918 
919  unsigned int number = 0;
920  const unsigned int eighth_points = num_points / 8;
921  for (; number < eighth_points; number++) {
922  __m256 z1 = _mm256_loadu_ps(in);
923  in += 8;
924  __m256 z2 = _mm256_loadu_ps(in);
925  in += 8;
926 
927  __m256 x = _mm256_real(z1, z2);
928  __m256 y = _mm256_imag(z1, z2);
929 
930  // Detect NaN in original inputs before division
931  __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
932  _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
933 
934  // Handle infinity cases per IEEE 754
935  const __m256 zero = _mm256_setzero_ps();
936  const __m256 inf = _mm256_set1_ps(HUGE_VALF);
937  const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4
938  const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4
939 
940  __m256 y_abs = _mm256_and_ps(y, abs_mask);
941  __m256 x_abs = _mm256_and_ps(x, abs_mask);
942  __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
943  __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
944  __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
945 
946  // Build infinity result
947  __m256 inf_result = zero;
948  // Both infinite: ±π/4 or ±3π/4
949  __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
950  __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
951  both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
952  inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
953 
954  // y infinite, x finite: ±π/2
955  __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
956  __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
957  inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
958 
959  // x infinite, y finite: 0 or ±π
960  __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
961  __m256 x_inf_result =
962  _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
963  _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
964  x_pos_mask);
965  inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
966 
967  __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
968 
969  __m256 swap_mask = _mm256_cmp_ps(
970  _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
971  __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
972  __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
973  __m256 input = _mm256_div_ps(numerator, denominator);
974 
975  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
976  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
977  __m256 div_nan_mask =
978  _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
979  input = _mm256_blendv_ps(input, numerator, div_nan_mask);
980  __m256 result = _mm256_arctan_poly_avx(input);
981 
982  input =
983  _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
984  result = _mm256_blendv_ps(result, input, swap_mask);
985 
986  __m256 x_sign_mask =
987  _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
988 
989  result = _mm256_add_ps(
990  _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
991  result);
992 
993  // Select infinity result or normal result
994  result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
995 
996  result = _mm256_mul_ps(result, vinvNormalizeFactor);
997 
998  _mm256_storeu_ps(out, result);
999  out += 8;
1000  }
1001 
1002  number = eighth_points * 8;
1004  out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
1005 }
1006 #endif /* LV_HAVE_AVX2 for unaligned */
1007 
1008 #ifdef LV_HAVE_RVV
1009 #include <riscv_vector.h>
1010 #include <volk/volk_rvv_intrinsics.h>
1011 
1012 static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector,
1013  const lv_32fc_t* inputVector,
1014  const float normalizeFactor,
1015  unsigned int num_points)
1016 {
1017  size_t vlmax = __riscv_vsetvlmax_e32m2();
1018 
1019  const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
1020  const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
1021  const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
1022  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
1023  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
1024  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
1025  const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
1026  const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
1027  const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
1028  const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
1029 
1030  const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
1031  const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax);
1032  const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax); // π/4
1033  const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4
1034 
1035  size_t n = num_points;
1036  for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
1037  vl = __riscv_vsetvl_e32m2(n);
1038  vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl);
1039  vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl));
1040  vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl));
1041 
1042  // Detect NaN in original inputs before division
1043  vbool16_t input_nan_mask =
1044  __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl);
1045 
1046  // Handle infinity cases per IEEE 754
1047  vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl);
1048  vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl);
1049  vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl); // |vr| == inf
1050  vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl); // |vi| == inf
1051  vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl);
1052 
1053  // Build infinity result
1054  vfloat32m2_t inf_result = zero;
1055  // Both infinite: ±π/4 or ±3π/4
1056  vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl);
1057  vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl);
1058  both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl); // Copy sign from vi
1059  inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl);
1060 
1061  // vi infinite, vr finite: ±π/2
1062  vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl);
1063  vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl); // π/2 with sign of vi
1064  inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl);
1065 
1066  // vr infinite, vi finite: 0 or ±π
1067  vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl);
1068  vfloat32m2_t vr_inf_result =
1069  __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl), // π with sign of vi
1070  __riscv_vfsgnj(zero, vi, vl), // 0 with sign of vi
1071  vr_pos_mask,
1072  vl);
1073  inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl);
1074 
1075  vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl);
1076 
1077  vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl);
1078  vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl);
1079  vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl);
1080  vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl);
1081 
1082  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
1083  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
1084  vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl);
1085  // div_nan_mask = x_nan_mask & ~input_nan_mask (vmandn computes vs2 & ~vs1)
1086  vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl);
1087  x = __riscv_vmerge(x, numerator, div_nan_mask, vl);
1088 
1089  vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
1090  vfloat32m2_t p = c13;
1091  p = __riscv_vfmadd(p, xx, c11, vl);
1092  p = __riscv_vfmadd(p, xx, c9, vl);
1093  p = __riscv_vfmadd(p, xx, c7, vl);
1094  p = __riscv_vfmadd(p, xx, c5, vl);
1095  p = __riscv_vfmadd(p, xx, c3, vl);
1096  p = __riscv_vfmadd(p, xx, c1, vl);
1097  p = __riscv_vfmul(p, x, vl);
1098 
1099  x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
1100  p = __riscv_vmerge(p, x, mswap, vl);
1101  p = __riscv_vfadd_mu(
1102  RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);
1103 
1104  // Select infinity result or normal result
1105  p = __riscv_vmerge(p, inf_result, any_inf_mask, vl);
1106 
1107  __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
1108  }
1109 }
1110 #endif /*LV_HAVE_RVV*/
1111 
1112 #ifdef LV_HAVE_RVVSEG
1113 #include <riscv_vector.h>
1114 #include <volk/volk_rvv_intrinsics.h>
1115 
1116 static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector,
1117  const lv_32fc_t* inputVector,
1118  const float normalizeFactor,
1119  unsigned int num_points)
1120 {
1121  size_t vlmax = __riscv_vsetvlmax_e32m2();
1122 
1123  const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
1124  const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
1125  const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
1126  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
1127  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
1128  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
1129  const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
1130  const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
1131  const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
1132  const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
1133 
1134  const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
1135  const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax);
1136  const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax); // π/4
1137  const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4
1138 
1139  size_t n = num_points;
1140  for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
1141  vl = __riscv_vsetvl_e32m2(n);
1142  vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl);
1143  vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1);
1144 
1145  // Detect NaN in original inputs before division
1146  vbool16_t input_nan_mask =
1147  __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl);
1148 
1149  // Handle infinity cases per IEEE 754
1150  vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl);
1151  vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl);
1152  vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl); // |vr| == inf
1153  vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl); // |vi| == inf
1154  vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl);
1155 
1156  // Build infinity result
1157  vfloat32m2_t inf_result = zero;
1158  // Both infinite: ±π/4 or ±3π/4
1159  vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl);
1160  vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl);
1161  both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl); // Copy sign from vi
1162  inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl);
1163 
1164  // vi infinite, vr finite: ±π/2
1165  vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl);
1166  vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl); // π/2 with sign of vi
1167  inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl);
1168 
1169  // vr infinite, vi finite: 0 or ±π
1170  vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl);
1171  vfloat32m2_t vr_inf_result =
1172  __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl), // π with sign of vi
1173  __riscv_vfsgnj(zero, vi, vl), // 0 with sign of vi
1174  vr_pos_mask,
1175  vl);
1176  inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl);
1177 
1178  vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl);
1179 
1180  vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl);
1181  vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl);
1182  vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl);
1183  vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl);
1184 
1185  // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
1186  // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
1187  vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl);
1188  // div_nan_mask = x_nan_mask & ~input_nan_mask (vmandn computes vs2 & ~vs1)
1189  vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl);
1190  x = __riscv_vmerge(x, numerator, div_nan_mask, vl);
1191 
1192  vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
1193  vfloat32m2_t p = c13;
1194  p = __riscv_vfmadd(p, xx, c11, vl);
1195  p = __riscv_vfmadd(p, xx, c9, vl);
1196  p = __riscv_vfmadd(p, xx, c7, vl);
1197  p = __riscv_vfmadd(p, xx, c5, vl);
1198  p = __riscv_vfmadd(p, xx, c3, vl);
1199  p = __riscv_vfmadd(p, xx, c1, vl);
1200  p = __riscv_vfmul(p, x, vl);
1201 
1202  x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
1203  p = __riscv_vmerge(p, x, mswap, vl);
1204  p = __riscv_vfadd_mu(
1205  RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);
1206 
1207  // Select infinity result or normal result
1208  p = __riscv_vmerge(p, inf_result, any_inf_mask, vl);
1209 
1210  __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
1211  }
1212 }
1213 #endif /*LV_HAVE_RVVSEG*/
1214 
1215 #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */
volk_32fc_s32f_atan2_32f_neon
static void volk_32fc_s32f_atan2_32f_neon(float *outputVector, const lv_32fc_t *complexVector, const float normalizeFactor, unsigned int num_points)
Definition: volk_32fc_s32f_atan2_32f.h:424
_mm256_arctan_poly_avx2_fma
static __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:26
volk_avx512_intrinsics.h
_mm256_arctan_poly_avx
static __m256 _mm256_arctan_poly_avx(const __m256 x)
Definition: volk_avx_intrinsics.h:64
RISCV_VMFLTZ
#define RISCV_VMFLTZ(T, v, vl)
Definition: volk_rvv_intrinsics.h:75
volk_common.h
_mm512_arctan_poly_avx512
static __m512 _mm512_arctan_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:73
volk_avx2_fma_intrinsics.h
_varctan_poly_f32
static float32x4_t _varctan_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:329
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
_mm512_imag
static __m512 _mm512_imag(const __m512 z1, const __m512 z2)
Definition: volk_avx512_intrinsics.h:60
volk_rvv_intrinsics.h
bit128::f
float f[4]
Definition: volk_common.h:120
volk_neon_intrinsics.h
_mm512_real
static __m512 _mm512_real(const __m512 z1, const __m512 z2)
Definition: volk_avx512_intrinsics.h:49
volk_avx_intrinsics.h
_mm256_real
static __m256 _mm256_real(const __m256 z1, const __m256 z2)
Definition: volk_avx2_intrinsics.h:48
volk_32fc_s32f_atan2_32f_generic
static void volk_32fc_s32f_atan2_32f_generic(float *outputVector, const lv_32fc_t *inputVector, const float normalizeFactor, unsigned int num_points)
Definition: volk_32fc_s32f_atan2_32f.h:67
volk_32fc_s32f_atan2_32f_polynomial
static void volk_32fc_s32f_atan2_32f_polynomial(float *outputVector, const lv_32fc_t *inputVector, const float normalizeFactor, unsigned int num_points)
Definition: volk_32fc_s32f_atan2_32f.h:86
_mm256_imag
static __m256 _mm256_imag(const __m256 z1, const __m256 z2)
Definition: volk_avx2_intrinsics.h:55
_vinvq_f32
static float32x4_t _vinvq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:114
volk_atan2
static float volk_atan2(const float y, const float x)
Definition: volk_common.h:326