Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_sincos_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 
62 #ifndef INCLUDED_volk_32f_sincos_32f_x2_a_H
63 #define INCLUDED_volk_32f_sincos_32f_x2_a_H
64 
65 #ifdef LV_HAVE_GENERIC
66 
67 static inline void volk_32f_sincos_32f_x2_generic(float* sinVector,
68  float* cosVector,
69  const float* inVector,
70  unsigned int num_points)
71 {
72  for (unsigned int i = 0; i < num_points; i++) {
73  sinVector[i] = sinf(inVector[i]);
74  cosVector[i] = cosf(inVector[i]);
75  }
76 }
77 
78 #endif /* LV_HAVE_GENERIC */
79 
80 #ifdef LV_HAVE_GENERIC
81 #include <volk/volk_common.h>
82 
83 static inline void volk_32f_sincos_32f_x2_polynomial(float* sinVector,
84  float* cosVector,
85  const float* inVector,
86  unsigned int num_points)
87 {
88  /*
89  * Cody-Waite argument reduction with shared sin/cos polynomial evaluation
90  */
91  const float two_over_pi = 0x1.45f306p-1f;
92  const float pi_over_2_hi = 0x1.921fb6p+0f;
93  const float pi_over_2_lo = -0x1.777a5cp-25f;
94 
95  for (unsigned int i = 0; i < num_points; i++) {
96  float x = inVector[i];
97 
98  float n_f = rintf(x * two_over_pi);
99  int n = (int)n_f;
100 
101  float r = fmaf(-n_f, pi_over_2_hi, x);
102  r = fmaf(-n_f, pi_over_2_lo, r);
103 
104  float sin_r = volk_sin_poly(r);
105  float cos_r = volk_cos_poly(r);
106 
107  // Sin: n&1 swaps, n&2 negates
108  float sin_result = (n & 1) ? cos_r : sin_r;
109  sinVector[i] = (n & 2) ? -sin_result : sin_result;
110 
111  // Cos: n&1 swaps, (n+1)&2 negates
112  float cos_result = (n & 1) ? sin_r : cos_r;
113  cosVector[i] = ((n + 1) & 2) ? -cos_result : cos_result;
114  }
115 }
116 #endif /* LV_HAVE_GENERIC */
117 
118 #ifdef LV_HAVE_AVX512F
119 #include <immintrin.h>
121 
122 static inline void volk_32f_sincos_32f_x2_a_avx512f(float* sinVector,
123  float* cosVector,
124  const float* inVector,
125  unsigned int num_points)
126 {
127  float* sinPtr = sinVector;
128  float* cosPtr = cosVector;
129  const float* inPtr = inVector;
130 
131  unsigned int number = 0;
132  unsigned int sixteenPoints = num_points / 16;
133 
134  // Constants for Cody-Waite argument reduction
135  const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
136  const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
137  const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
138 
139  const __m512i ones = _mm512_set1_epi32(1);
140  const __m512i twos = _mm512_set1_epi32(2);
141  const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
142 
143  for (; number < sixteenPoints; number++) {
144  __m512 x = _mm512_load_ps(inPtr);
145 
146  // Argument reduction: n = round(x * 2/pi)
147  __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
148  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
149  __m512i n = _mm512_cvtps_epi32(n_f);
150 
151  // r = x - n * (pi/2)
152  __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
153  r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
154 
155  // Evaluate both sin and cos polynomials
156  __m512 sin_r = _mm512_sin_poly_avx512(r);
157  __m512 cos_r = _mm512_cos_poly_avx512(r);
158 
159  // Quadrant selection
160  __m512i n_and_1 = _mm512_and_si512(n, ones);
161  __m512i n_and_2 = _mm512_and_si512(n, twos);
162  __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
163 
164  // For sin: swap when n&1, negate when n&2
165  __mmask16 sin_swap = _mm512_cmpeq_epi32_mask(n_and_1, ones);
166  __m512 sin_result = _mm512_mask_blend_ps(sin_swap, sin_r, cos_r);
167  __mmask16 sin_neg = _mm512_cmpeq_epi32_mask(n_and_2, twos);
168  sin_result =
169  _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(sin_result),
170  sin_neg,
171  _mm512_castps_si512(sin_result),
172  sign_bit));
173 
174  // For cos: swap when n&1, negate when (n+1)&2
175  __mmask16 cos_swap = sin_swap;
176  __m512 cos_result = _mm512_mask_blend_ps(cos_swap, cos_r, sin_r);
177  __mmask16 cos_neg = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
178  cos_result =
179  _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(cos_result),
180  cos_neg,
181  _mm512_castps_si512(cos_result),
182  sign_bit));
183 
184  _mm512_store_ps(sinPtr, sin_result);
185  _mm512_store_ps(cosPtr, cos_result);
186  inPtr += 16;
187  sinPtr += 16;
188  cosPtr += 16;
189  }
190 
191  number = sixteenPoints * 16;
192  for (; number < num_points; number++) {
193  *sinPtr++ = sinf(*inPtr);
194  *cosPtr++ = cosf(*inPtr++);
195  }
196 }
197 
198 #endif /* LV_HAVE_AVX512F for aligned */
199 
200 #if LV_HAVE_AVX2 && LV_HAVE_FMA
201 #include <immintrin.h>
203 
204 static inline void volk_32f_sincos_32f_x2_a_avx2_fma(float* sinVector,
205  float* cosVector,
206  const float* inVector,
207  unsigned int num_points)
208 {
209  float* sinPtr = sinVector;
210  float* cosPtr = cosVector;
211  const float* inPtr = inVector;
212 
213  unsigned int number = 0;
214  unsigned int eighthPoints = num_points / 8;
215 
216  // Constants for Cody-Waite argument reduction
217  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
218  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
219  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
220 
221  const __m256i ones = _mm256_set1_epi32(1);
222  const __m256i twos = _mm256_set1_epi32(2);
223  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
224 
225  for (; number < eighthPoints; number++) {
226  __m256 x = _mm256_load_ps(inPtr);
227 
228  // Argument reduction: n = round(x * 2/pi)
229  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
230  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
231  __m256i n = _mm256_cvtps_epi32(n_f);
232 
233  // r = x - n * (pi/2)
234  __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
235  r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
236 
237  // Evaluate both sin and cos polynomials
238  __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
239  __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
240 
241  // Quadrant selection
242  __m256i n_and_1 = _mm256_and_si256(n, ones);
243  __m256i n_and_2 = _mm256_and_si256(n, twos);
244  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
245 
246  // For sin: swap when n&1, negate when n&2
247  __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
248  __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
249  __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
250  sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
251 
252  // For cos: swap when n&1, negate when (n+1)&2
253  __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
254  __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
255  cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
256 
257  _mm256_store_ps(sinPtr, sin_result);
258  _mm256_store_ps(cosPtr, cos_result);
259  inPtr += 8;
260  sinPtr += 8;
261  cosPtr += 8;
262  }
263 
264  number = eighthPoints * 8;
265  for (; number < num_points; number++) {
266  *sinPtr++ = sinf(*inPtr);
267  *cosPtr++ = cosf(*inPtr++);
268  }
269 }
270 
271 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
272 
273 #ifdef LV_HAVE_AVX2
274 #include <immintrin.h>
276 
277 static inline void volk_32f_sincos_32f_x2_a_avx2(float* sinVector,
278  float* cosVector,
279  const float* inVector,
280  unsigned int num_points)
281 {
282  float* sinPtr = sinVector;
283  float* cosPtr = cosVector;
284  const float* inPtr = inVector;
285 
286  unsigned int number = 0;
287  unsigned int eighthPoints = num_points / 8;
288 
289  // Constants for Cody-Waite argument reduction
290  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
291  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
292  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
293 
294  const __m256i ones = _mm256_set1_epi32(1);
295  const __m256i twos = _mm256_set1_epi32(2);
296  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
297 
298  for (; number < eighthPoints; number++) {
299  __m256 x = _mm256_load_ps(inPtr);
300 
301  // Argument reduction: n = round(x * 2/pi)
302  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
303  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
304  __m256i n = _mm256_cvtps_epi32(n_f);
305 
306  // r = x - n * (pi/2)
307  __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
308  r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
309 
310  // Evaluate both sin and cos polynomials
311  __m256 sin_r = _mm256_sin_poly_avx2(r);
312  __m256 cos_r = _mm256_cos_poly_avx2(r);
313 
314  // Quadrant selection
315  __m256i n_and_1 = _mm256_and_si256(n, ones);
316  __m256i n_and_2 = _mm256_and_si256(n, twos);
317  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
318 
319  // For sin: swap when n&1, negate when n&2
320  __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
321  __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
322  __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
323  sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
324 
325  // For cos: swap when n&1, negate when (n+1)&2
326  __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
327  __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
328  cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
329 
330  _mm256_store_ps(sinPtr, sin_result);
331  _mm256_store_ps(cosPtr, cos_result);
332  inPtr += 8;
333  sinPtr += 8;
334  cosPtr += 8;
335  }
336 
337  number = eighthPoints * 8;
338  for (; number < num_points; number++) {
339  *sinPtr++ = sinf(*inPtr);
340  *cosPtr++ = cosf(*inPtr++);
341  }
342 }
343 
344 #endif /* LV_HAVE_AVX2 for aligned */
345 
346 #ifdef LV_HAVE_SSE4_1
347 #include <smmintrin.h>
349 
350 static inline void volk_32f_sincos_32f_x2_a_sse4_1(float* sinVector,
351  float* cosVector,
352  const float* inVector,
353  unsigned int num_points)
354 {
355  float* sinPtr = sinVector;
356  float* cosPtr = cosVector;
357  const float* inPtr = inVector;
358 
359  unsigned int number = 0;
360  unsigned int quarterPoints = num_points / 4;
361 
362  // Constants for Cody-Waite argument reduction
363  const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
364  const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
365  const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
366 
367  const __m128i ones = _mm_set1_epi32(1);
368  const __m128i twos = _mm_set1_epi32(2);
369  const __m128 sign_bit = _mm_set1_ps(-0.0f);
370 
371  for (; number < quarterPoints; number++) {
372  __m128 x = _mm_load_ps(inPtr);
373 
374  // Argument reduction: n = round(x * 2/pi)
375  __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
376  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
377  __m128i n = _mm_cvtps_epi32(n_f);
378 
379  // r = x - n * (pi/2)
380  __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
381  r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
382 
383  // Evaluate both sin and cos polynomials
384  __m128 sin_r = _mm_sin_poly_sse(r);
385  __m128 cos_r = _mm_cos_poly_sse(r);
386 
387  // Quadrant selection
388  __m128i n_and_1 = _mm_and_si128(n, ones);
389  __m128i n_and_2 = _mm_and_si128(n, twos);
390  __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
391 
392  // For sin: swap when n&1, negate when n&2
393  __m128 sin_swap = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
394  __m128 sin_result = _mm_blendv_ps(sin_r, cos_r, sin_swap);
395  __m128 sin_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
396  sin_result = _mm_xor_ps(sin_result, _mm_and_ps(sin_neg, sign_bit));
397 
398  // For cos: swap when n&1, negate when (n+1)&2
399  __m128 cos_result = _mm_blendv_ps(cos_r, sin_r, sin_swap);
400  __m128 cos_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
401  cos_result = _mm_xor_ps(cos_result, _mm_and_ps(cos_neg, sign_bit));
402 
403  _mm_store_ps(sinPtr, sin_result);
404  _mm_store_ps(cosPtr, cos_result);
405  inPtr += 4;
406  sinPtr += 4;
407  cosPtr += 4;
408  }
409 
410  number = quarterPoints * 4;
411  for (; number < num_points; number++) {
412  *sinPtr++ = sinf(*inPtr);
413  *cosPtr++ = cosf(*inPtr++);
414  }
415 }
416 
417 #endif /* LV_HAVE_SSE4_1 for aligned */
418 
419 #endif /* INCLUDED_volk_32f_sincos_32f_x2_a_H */
420 
421 
422 #ifndef INCLUDED_volk_32f_sincos_32f_x2_u_H
423 #define INCLUDED_volk_32f_sincos_32f_x2_u_H
424 
425 #ifdef LV_HAVE_AVX512F
426 #include <immintrin.h>
428 
429 static inline void volk_32f_sincos_32f_x2_u_avx512f(float* sinVector,
430  float* cosVector,
431  const float* inVector,
432  unsigned int num_points)
433 {
434  float* sinPtr = sinVector;
435  float* cosPtr = cosVector;
436  const float* inPtr = inVector;
437 
438  unsigned int number = 0;
439  unsigned int sixteenPoints = num_points / 16;
440 
441  // Constants for Cody-Waite argument reduction
442  const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
443  const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
444  const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
445 
446  const __m512i ones = _mm512_set1_epi32(1);
447  const __m512i twos = _mm512_set1_epi32(2);
448  const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
449 
450  for (; number < sixteenPoints; number++) {
451  __m512 x = _mm512_loadu_ps(inPtr);
452 
453  // Argument reduction: n = round(x * 2/pi)
454  __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
455  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
456  __m512i n = _mm512_cvtps_epi32(n_f);
457 
458  // r = x - n * (pi/2)
459  __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
460  r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
461 
462  // Evaluate both sin and cos polynomials
463  __m512 sin_r = _mm512_sin_poly_avx512(r);
464  __m512 cos_r = _mm512_cos_poly_avx512(r);
465 
466  // Quadrant selection
467  __m512i n_and_1 = _mm512_and_si512(n, ones);
468  __m512i n_and_2 = _mm512_and_si512(n, twos);
469  __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
470 
471  // For sin: swap when n&1, negate when n&2
472  __mmask16 sin_swap = _mm512_cmpeq_epi32_mask(n_and_1, ones);
473  __m512 sin_result = _mm512_mask_blend_ps(sin_swap, sin_r, cos_r);
474  __mmask16 sin_neg = _mm512_cmpeq_epi32_mask(n_and_2, twos);
475  sin_result =
476  _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(sin_result),
477  sin_neg,
478  _mm512_castps_si512(sin_result),
479  sign_bit));
480 
481  // For cos: swap when n&1, negate when (n+1)&2
482  __mmask16 cos_swap = sin_swap;
483  __m512 cos_result = _mm512_mask_blend_ps(cos_swap, cos_r, sin_r);
484  __mmask16 cos_neg = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
485  cos_result =
486  _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(cos_result),
487  cos_neg,
488  _mm512_castps_si512(cos_result),
489  sign_bit));
490 
491  _mm512_storeu_ps(sinPtr, sin_result);
492  _mm512_storeu_ps(cosPtr, cos_result);
493  inPtr += 16;
494  sinPtr += 16;
495  cosPtr += 16;
496  }
497 
498  number = sixteenPoints * 16;
499  for (; number < num_points; number++) {
500  *sinPtr++ = sinf(*inPtr);
501  *cosPtr++ = cosf(*inPtr++);
502  }
503 }
504 
505 #endif /* LV_HAVE_AVX512F for unaligned */
506 
507 #if LV_HAVE_AVX2 && LV_HAVE_FMA
508 #include <immintrin.h>
510 
511 static inline void volk_32f_sincos_32f_x2_u_avx2_fma(float* sinVector,
512  float* cosVector,
513  const float* inVector,
514  unsigned int num_points)
515 {
516  float* sinPtr = sinVector;
517  float* cosPtr = cosVector;
518  const float* inPtr = inVector;
519 
520  unsigned int number = 0;
521  unsigned int eighthPoints = num_points / 8;
522 
523  // Constants for Cody-Waite argument reduction
524  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
525  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
526  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
527 
528  const __m256i ones = _mm256_set1_epi32(1);
529  const __m256i twos = _mm256_set1_epi32(2);
530  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
531 
532  for (; number < eighthPoints; number++) {
533  __m256 x = _mm256_loadu_ps(inPtr);
534 
535  // Argument reduction: n = round(x * 2/pi)
536  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
537  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
538  __m256i n = _mm256_cvtps_epi32(n_f);
539 
540  // r = x - n * (pi/2)
541  __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
542  r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
543 
544  // Evaluate both sin and cos polynomials
545  __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
546  __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
547 
548  // Quadrant selection
549  __m256i n_and_1 = _mm256_and_si256(n, ones);
550  __m256i n_and_2 = _mm256_and_si256(n, twos);
551  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
552 
553  // For sin: swap when n&1, negate when n&2
554  __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
555  __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
556  __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
557  sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
558 
559  // For cos: swap when n&1, negate when (n+1)&2
560  __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
561  __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
562  cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
563 
564  _mm256_storeu_ps(sinPtr, sin_result);
565  _mm256_storeu_ps(cosPtr, cos_result);
566  inPtr += 8;
567  sinPtr += 8;
568  cosPtr += 8;
569  }
570 
571  number = eighthPoints * 8;
572  for (; number < num_points; number++) {
573  *sinPtr++ = sinf(*inPtr);
574  *cosPtr++ = cosf(*inPtr++);
575  }
576 }
577 
578 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
579 
580 #ifdef LV_HAVE_AVX2
581 #include <immintrin.h>
583 
584 static inline void volk_32f_sincos_32f_x2_u_avx2(float* sinVector,
585  float* cosVector,
586  const float* inVector,
587  unsigned int num_points)
588 {
589  float* sinPtr = sinVector;
590  float* cosPtr = cosVector;
591  const float* inPtr = inVector;
592 
593  unsigned int number = 0;
594  unsigned int eighthPoints = num_points / 8;
595 
596  // Constants for Cody-Waite argument reduction
597  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
598  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
599  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
600 
601  const __m256i ones = _mm256_set1_epi32(1);
602  const __m256i twos = _mm256_set1_epi32(2);
603  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
604 
605  for (; number < eighthPoints; number++) {
606  __m256 x = _mm256_loadu_ps(inPtr);
607 
608  // Argument reduction: n = round(x * 2/pi)
609  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
610  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
611  __m256i n = _mm256_cvtps_epi32(n_f);
612 
613  // r = x - n * (pi/2)
614  __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
615  r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
616 
617  // Evaluate both sin and cos polynomials
618  __m256 sin_r = _mm256_sin_poly_avx2(r);
619  __m256 cos_r = _mm256_cos_poly_avx2(r);
620 
621  // Quadrant selection
622  __m256i n_and_1 = _mm256_and_si256(n, ones);
623  __m256i n_and_2 = _mm256_and_si256(n, twos);
624  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
625 
626  // For sin: swap when n&1, negate when n&2
627  __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
628  __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
629  __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
630  sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
631 
632  // For cos: swap when n&1, negate when (n+1)&2
633  __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
634  __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
635  cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
636 
637  _mm256_storeu_ps(sinPtr, sin_result);
638  _mm256_storeu_ps(cosPtr, cos_result);
639  inPtr += 8;
640  sinPtr += 8;
641  cosPtr += 8;
642  }
643 
644  number = eighthPoints * 8;
645  for (; number < num_points; number++) {
646  *sinPtr++ = sinf(*inPtr);
647  *cosPtr++ = cosf(*inPtr++);
648  }
649 }
650 
651 #endif /* LV_HAVE_AVX2 for unaligned */
652 
653 #ifdef LV_HAVE_SSE4_1
654 #include <smmintrin.h>
656 
657 static inline void volk_32f_sincos_32f_x2_u_sse4_1(float* sinVector,
658  float* cosVector,
659  const float* inVector,
660  unsigned int num_points)
661 {
662  float* sinPtr = sinVector;
663  float* cosPtr = cosVector;
664  const float* inPtr = inVector;
665 
666  unsigned int number = 0;
667  unsigned int quarterPoints = num_points / 4;
668 
669  // Constants for Cody-Waite argument reduction
670  const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
671  const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
672  const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
673 
674  const __m128i ones = _mm_set1_epi32(1);
675  const __m128i twos = _mm_set1_epi32(2);
676  const __m128 sign_bit = _mm_set1_ps(-0.0f);
677 
678  for (; number < quarterPoints; number++) {
679  __m128 x = _mm_loadu_ps(inPtr);
680 
681  // Argument reduction: n = round(x * 2/pi)
682  __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
683  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
684  __m128i n = _mm_cvtps_epi32(n_f);
685 
686  // r = x - n * (pi/2)
687  __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
688  r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
689 
690  // Evaluate both sin and cos polynomials
691  __m128 sin_r = _mm_sin_poly_sse(r);
692  __m128 cos_r = _mm_cos_poly_sse(r);
693 
694  // Quadrant selection
695  __m128i n_and_1 = _mm_and_si128(n, ones);
696  __m128i n_and_2 = _mm_and_si128(n, twos);
697  __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
698 
699  // For sin: swap when n&1, negate when n&2
700  __m128 sin_swap = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
701  __m128 sin_result = _mm_blendv_ps(sin_r, cos_r, sin_swap);
702  __m128 sin_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
703  sin_result = _mm_xor_ps(sin_result, _mm_and_ps(sin_neg, sign_bit));
704 
705  // For cos: swap when n&1, negate when (n+1)&2
706  __m128 cos_result = _mm_blendv_ps(cos_r, sin_r, sin_swap);
707  __m128 cos_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
708  cos_result = _mm_xor_ps(cos_result, _mm_and_ps(cos_neg, sign_bit));
709 
710  _mm_storeu_ps(sinPtr, sin_result);
711  _mm_storeu_ps(cosPtr, cos_result);
712  inPtr += 4;
713  sinPtr += 4;
714  cosPtr += 4;
715  }
716 
717  number = quarterPoints * 4;
718  for (; number < num_points; number++) {
719  *sinPtr++ = sinf(*inPtr);
720  *cosPtr++ = cosf(*inPtr++);
721  }
722 }
723 
724 #endif /* LV_HAVE_SSE4_1 for unaligned */
725 
726 #ifdef LV_HAVE_NEON
727 #include <arm_neon.h>
729 
730 /* NEON polynomial-based sincos with shared argument reduction */
731 static inline void volk_32f_sincos_32f_x2_neon(float* sinVector,
732  float* cosVector,
733  const float* inVector,
734  unsigned int num_points)
735 {
736  // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
737  const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
738  const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
739  const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
740 
741  const int32x4_t ones = vdupq_n_s32(1);
742  const int32x4_t twos = vdupq_n_s32(2);
743  const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
744  const float32x4_t half = vdupq_n_f32(0.5f);
745  const float32x4_t neg_half = vdupq_n_f32(-0.5f);
746  const float32x4_t fzeroes = vdupq_n_f32(0.0f);
747 
748  unsigned int number = 0;
749  const unsigned int quarterPoints = num_points / 4;
750 
751  for (; number < quarterPoints; number++) {
752  float32x4_t x = vld1q_f32(inVector);
753  inVector += 4;
754 
755  // n = round(x * 2/pi) - emulate round-to-nearest for ARMv7
756  float32x4_t scaled = vmulq_f32(x, two_over_pi);
757  uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
758  float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
759  float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
760  int32x4_t n = vcvtq_s32_f32(n_f);
761 
762  // r = x - n * (pi/2) using extended precision
763  float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
764  r = vmlsq_f32(r, n_f, pi_over_2_lo);
765 
766  // Evaluate sin and cos polynomials (shared for both outputs)
767  float32x4_t sin_r = _vsin_poly_f32(r);
768  float32x4_t cos_r = _vcos_poly_f32(r);
769 
770  // Quadrant selection
771  int32x4_t n_and_1 = vandq_s32(n, ones);
772  int32x4_t n_and_2 = vandq_s32(n, twos);
773  int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
774 
775  uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
776 
777  // Sin result: use cos when n&1, negate when n&2
778  float32x4_t sin_result = vbslq_f32(swap_mask, cos_r, sin_r);
779  uint32x4_t sin_neg = vceqq_s32(n_and_2, twos);
780  sin_result = vreinterpretq_f32_u32(
781  veorq_u32(vreinterpretq_u32_f32(sin_result),
782  vandq_u32(sin_neg, vreinterpretq_u32_f32(sign_bit))));
783 
784  // Cos result: use sin when n&1, negate when (n+1)&2
785  float32x4_t cos_result = vbslq_f32(swap_mask, sin_r, cos_r);
786  uint32x4_t cos_neg = vceqq_s32(n_plus_1_and_2, twos);
787  cos_result = vreinterpretq_f32_u32(
788  veorq_u32(vreinterpretq_u32_f32(cos_result),
789  vandq_u32(cos_neg, vreinterpretq_u32_f32(sign_bit))));
790 
791  vst1q_f32(sinVector, sin_result);
792  vst1q_f32(cosVector, cos_result);
793  sinVector += 4;
794  cosVector += 4;
795  }
796 
797  for (number = quarterPoints * 4; number < num_points; number++) {
798  *sinVector++ = sinf(*inVector);
799  *cosVector++ = cosf(*inVector++);
800  }
801 }
802 #endif /* LV_HAVE_NEON */
803 
804 #ifdef LV_HAVE_NEONV8
805 #include <arm_neon.h>
807 
808 /* NEONv8 polynomial-based sincos with FMA and shared argument reduction */
809 static inline void volk_32f_sincos_32f_x2_neonv8(float* sinVector,
810  float* cosVector,
811  const float* inVector,
812  unsigned int num_points)
813 {
814  // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
815  const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
816  const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
817  const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
818 
819  const int32x4_t ones = vdupq_n_s32(1);
820  const int32x4_t twos = vdupq_n_s32(2);
821  const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
822 
823  unsigned int number = 0;
824  const unsigned int quarterPoints = num_points / 4;
825 
826  for (; number < quarterPoints; number++) {
827  float32x4_t x = vld1q_f32(inVector);
828  inVector += 4;
829 
830  // n = round(x * 2/pi) using ARMv8 vrndnq_f32
831  float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
832  int32x4_t n = vcvtq_s32_f32(n_f);
833 
834  // r = x - n * (pi/2) using FMA for extended precision
835  float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
836  r = vfmsq_f32(r, n_f, pi_over_2_lo);
837 
838  // Evaluate sin and cos polynomials using FMA (shared for both outputs)
839  float32x4_t sin_r = _vsin_poly_neonv8(r);
840  float32x4_t cos_r = _vcos_poly_neonv8(r);
841 
842  // Quadrant selection
843  int32x4_t n_and_1 = vandq_s32(n, ones);
844  int32x4_t n_and_2 = vandq_s32(n, twos);
845  int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
846 
847  uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
848 
849  // Sin result: use cos when n&1, negate when n&2
850  float32x4_t sin_result = vbslq_f32(swap_mask, cos_r, sin_r);
851  uint32x4_t sin_neg = vceqq_s32(n_and_2, twos);
852  sin_result = vreinterpretq_f32_u32(
853  veorq_u32(vreinterpretq_u32_f32(sin_result),
854  vandq_u32(sin_neg, vreinterpretq_u32_f32(sign_bit))));
855 
856  // Cos result: use sin when n&1, negate when (n+1)&2
857  float32x4_t cos_result = vbslq_f32(swap_mask, sin_r, cos_r);
858  uint32x4_t cos_neg = vceqq_s32(n_plus_1_and_2, twos);
859  cos_result = vreinterpretq_f32_u32(
860  veorq_u32(vreinterpretq_u32_f32(cos_result),
861  vandq_u32(cos_neg, vreinterpretq_u32_f32(sign_bit))));
862 
863  vst1q_f32(sinVector, sin_result);
864  vst1q_f32(cosVector, cos_result);
865  sinVector += 4;
866  cosVector += 4;
867  }
868 
869  for (number = quarterPoints * 4; number < num_points; number++) {
870  *sinVector++ = sinf(*inVector);
871  *cosVector++ = cosf(*inVector++);
872  }
873 }
874 #endif /* LV_HAVE_NEONV8 */
875 
876 #endif /* INCLUDED_volk_32f_sincos_32f_x2_u_H */
volk_avx512_intrinsics.h
_mm256_sin_poly_avx2_fma
static __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:79
_mm_sin_poly_sse
static __m128 _mm_sin_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:157
_mm256_cos_poly_avx2_fma
static __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:99
_mm512_sin_poly_avx512
static __m512 _mm512_sin_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:206
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_sin_poly
static float volk_sin_poly(const float x)
Definition: volk_common.h:205
volk_common.h
_mm512_cos_poly_avx512
static __m512 _mm512_cos_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:227
_vcos_poly_f32
static float32x4_t _vcos_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:396
volk_sse_intrinsics.h
_mm256_sin_poly_avx2
static __m256 _mm256_sin_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:382
volk_32f_sincos_32f_x2_polynomial
static void volk_32f_sincos_32f_x2_polynomial(float *sinVector, float *cosVector, const float *inVector, unsigned int num_points)
Definition: volk_32f_sincos_32f_x2.h:83
volk_avx2_fma_intrinsics.h
volk_cos_poly
static float volk_cos_poly(const float x)
Definition: volk_common.h:227
volk_32f_sincos_32f_x2_generic
static void volk_32f_sincos_32f_x2_generic(float *sinVector, float *cosVector, const float *inVector, unsigned int num_points)
Definition: volk_32f_sincos_32f_x2.h:67
_mm256_cos_poly_avx2
static __m256 _mm256_cos_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:403
volk_neon_intrinsics.h
volk_32f_sincos_32f_x2_neon
static void volk_32f_sincos_32f_x2_neon(float *sinVector, float *cosVector, const float *inVector, unsigned int num_points)
Definition: volk_32f_sincos_32f_x2.h:731
_mm_cos_poly_sse
static __m128 _mm_cos_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:177
volk_avx2_intrinsics.h
rintf
static float rintf(float x)
Definition: config.h:45
_vsin_poly_f32
static float32x4_t _vsin_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:376