Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
60 #include <inttypes.h>
61 #include <math.h>
62 #include <stdio.h>
63 
64 #ifndef INCLUDED_volk_32f_sin_32f_a_H
65 #define INCLUDED_volk_32f_sin_32f_a_H
66 
67 #ifdef LV_HAVE_GENERIC
68 #include <volk/volk_common.h>
69 
70 static inline void
71 volk_32f_sin_32f_polynomial(float* bVector, const float* aVector, unsigned int num_points)
72 {
73  for (unsigned int number = 0; number < num_points; number++) {
74  *bVector++ = volk_sin(*aVector++);
75  }
76 }
77 #endif /* LV_HAVE_GENERIC */
78 
79 #ifdef LV_HAVE_AVX512F
80 #include <immintrin.h>
82 
83 static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
84  const float* inVector,
85  unsigned int num_points)
86 {
87  float* sinPtr = sinVector;
88  const float* inPtr = inVector;
89 
90  unsigned int number = 0;
91  unsigned int sixteenPoints = num_points / 16;
92 
93  // Constants for Cody-Waite argument reduction
94  // n = round(x * 2/pi), then r = x - n * pi/2
95  const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
96  const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
97  const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
98 
99  const __m512i ones = _mm512_set1_epi32(1);
100  const __m512i twos = _mm512_set1_epi32(2);
101  const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
102 
103  for (; number < sixteenPoints; number++) {
104  __m512 x = _mm512_load_ps(inPtr);
105 
106  // Argument reduction: n = round(x * 2/pi)
107  __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
108  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
109  __m512i n = _mm512_cvtps_epi32(n_f);
110 
111  // r = x - n * (pi/2), using extended precision
112  __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
113  r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
114 
115  // Evaluate both sin and cos polynomials
116  __m512 sin_r = _mm512_sin_poly_avx512(r);
117  __m512 cos_r = _mm512_cos_poly_avx512(r);
118 
119  // Reconstruct sin(x) based on quadrant (n mod 4):
120  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
121  // n&2 == 0: positive, n&2 == 2: negative
122  __m512i n_and_1 = _mm512_and_si512(n, ones);
123  __m512i n_and_2 = _mm512_and_si512(n, twos);
124 
125  // swap_mask: where n&1 != 0, we use cos instead of sin
126  __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
127  __m512 result = _mm512_mask_blend_ps(swap_mask, sin_r, cos_r);
128 
129  // neg_mask: where n&2 != 0, we negate the result (use integer xor for AVX512F)
130  __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_and_2, twos);
131  result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
132  neg_mask,
133  _mm512_castps_si512(result),
134  sign_bit));
135 
136  _mm512_store_ps(sinPtr, result);
137  inPtr += 16;
138  sinPtr += 16;
139  }
140 
141  number = sixteenPoints * 16;
142  for (; number < num_points; number++) {
143  *sinPtr++ = sinf(*inPtr++);
144  }
145 }
146 #endif /* LV_HAVE_AVX512F */
147 
148 #if LV_HAVE_AVX2 && LV_HAVE_FMA
149 #include <immintrin.h>
151 
152 static inline void
153 volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
154 {
155  float* bPtr = bVector;
156  const float* aPtr = aVector;
157 
158  unsigned int number = 0;
159  unsigned int eighthPoints = num_points / 8;
160 
161  // Constants for Cody-Waite argument reduction
162  // n = round(x * 2/pi), then r = x - n * pi/2
163  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
164  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
165  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
166 
167  const __m256i ones = _mm256_set1_epi32(1);
168  const __m256i twos = _mm256_set1_epi32(2);
169  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
170 
171  for (; number < eighthPoints; number++) {
172  __m256 x = _mm256_load_ps(aPtr);
173 
174  // Argument reduction: n = round(x * 2/pi)
175  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
176  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
177  __m256i n = _mm256_cvtps_epi32(n_f);
178 
179  // r = x - n * (pi/2), using extended precision
180  __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
181  r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
182 
183  // Evaluate both sin and cos polynomials
184  __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
185  __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
186 
187  // Reconstruct sin(x) based on quadrant (n mod 4):
188  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
189  // n&2 == 0: positive, n&2 == 2: negative
190  __m256i n_and_1 = _mm256_and_si256(n, ones);
191  __m256i n_and_2 = _mm256_and_si256(n, twos);
192 
193  // swap_mask: where n&1 != 0, we use cos instead of sin
194  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
195  __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
196 
197  // neg_mask: where n&2 != 0, we negate the result
198  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
199  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
200 
201  _mm256_store_ps(bPtr, result);
202  aPtr += 8;
203  bPtr += 8;
204  }
205 
206  number = eighthPoints * 8;
207  for (; number < num_points; number++) {
208  *bPtr++ = sinf(*aPtr++);
209  }
210 }
211 
212 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
213 
214 #ifdef LV_HAVE_AVX2
215 #include <immintrin.h>
217 
218 static inline void
219 volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
220 {
221  float* bPtr = bVector;
222  const float* aPtr = aVector;
223 
224  unsigned int number = 0;
225  unsigned int eighthPoints = num_points / 8;
226 
227  // Constants for Cody-Waite argument reduction
228  // n = round(x * 2/pi), then r = x - n * pi/2
229  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
230  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
231  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
232 
233  const __m256i ones = _mm256_set1_epi32(1);
234  const __m256i twos = _mm256_set1_epi32(2);
235  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
236 
237  for (; number < eighthPoints; number++) {
238  __m256 x = _mm256_load_ps(aPtr);
239 
240  // Argument reduction: n = round(x * 2/pi)
241  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
242  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
243  __m256i n = _mm256_cvtps_epi32(n_f);
244 
245  // r = x - n * (pi/2), using extended precision
246  __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
247  r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
248 
249  // Evaluate both sin and cos polynomials
250  __m256 sin_r = _mm256_sin_poly_avx2(r);
251  __m256 cos_r = _mm256_cos_poly_avx2(r);
252 
253  // Reconstruct sin(x) based on quadrant (n mod 4):
254  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
255  // n&2 == 0: positive, n&2 == 2: negative
256  __m256i n_and_1 = _mm256_and_si256(n, ones);
257  __m256i n_and_2 = _mm256_and_si256(n, twos);
258 
259  // swap_mask: where n&1 != 0, we use cos instead of sin
260  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
261  __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
262 
263  // neg_mask: where n&2 != 0, we negate the result
264  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
265  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
266 
267  _mm256_store_ps(bPtr, result);
268  aPtr += 8;
269  bPtr += 8;
270  }
271 
272  number = eighthPoints * 8;
273  for (; number < num_points; number++) {
274  *bPtr++ = sinf(*aPtr++);
275  }
276 }
277 
278 #endif /* LV_HAVE_AVX2 */
279 
280 #ifdef LV_HAVE_SSE4_1
281 #include <smmintrin.h>
283 
284 static inline void
285 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
286 {
287  float* bPtr = bVector;
288  const float* aPtr = aVector;
289 
290  unsigned int number = 0;
291  unsigned int quarterPoints = num_points / 4;
292 
293  // Constants for Cody-Waite argument reduction
294  // n = round(x * 2/pi), then r = x - n * pi/2
295  const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
296  const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
297  const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
298 
299  const __m128i ones = _mm_set1_epi32(1);
300  const __m128i twos = _mm_set1_epi32(2);
301  const __m128 sign_bit = _mm_set1_ps(-0.0f);
302 
303  for (; number < quarterPoints; number++) {
304  __m128 x = _mm_load_ps(aPtr);
305 
306  // Argument reduction: n = round(x * 2/pi)
307  __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
308  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
309  __m128i n = _mm_cvtps_epi32(n_f);
310 
311  // r = x - n * (pi/2), using extended precision
312  __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
313  r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
314 
315  // Evaluate both sin and cos polynomials
316  __m128 sin_r = _mm_sin_poly_sse(r);
317  __m128 cos_r = _mm_cos_poly_sse(r);
318 
319  // Reconstruct sin(x) based on quadrant (n mod 4):
320  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
321  // n&2 == 0: positive, n&2 == 2: negative
322  __m128i n_and_1 = _mm_and_si128(n, ones);
323  __m128i n_and_2 = _mm_and_si128(n, twos);
324 
325  // swap_mask: where n&1 != 0, we use cos instead of sin
326  __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
327  __m128 result = _mm_blendv_ps(sin_r, cos_r, swap_mask);
328 
329  // neg_mask: where n&2 != 0, we negate the result
330  __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
331  result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
332 
333  _mm_store_ps(bPtr, result);
334  aPtr += 4;
335  bPtr += 4;
336  }
337 
338  number = quarterPoints * 4;
339  for (; number < num_points; number++) {
340  *bPtr++ = sinf(*aPtr++);
341  }
342 }
343 
344 #endif /* LV_HAVE_SSE4_1 */
345 
346 
347 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
348 
349 #ifndef INCLUDED_volk_32f_sin_32f_u_H
350 #define INCLUDED_volk_32f_sin_32f_u_H
351 
352 #ifdef LV_HAVE_AVX512F
353 #include <immintrin.h>
355 
356 static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
357  const float* inVector,
358  unsigned int num_points)
359 {
360  float* sinPtr = sinVector;
361  const float* inPtr = inVector;
362 
363  unsigned int number = 0;
364  unsigned int sixteenPoints = num_points / 16;
365 
366  // Constants for Cody-Waite argument reduction
367  // n = round(x * 2/pi), then r = x - n * pi/2
368  const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
369  const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
370  const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
371 
372  const __m512i ones = _mm512_set1_epi32(1);
373  const __m512i twos = _mm512_set1_epi32(2);
374  const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
375 
376  for (; number < sixteenPoints; number++) {
377  __m512 x = _mm512_loadu_ps(inPtr);
378 
379  // Argument reduction: n = round(x * 2/pi)
380  __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
381  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
382  __m512i n = _mm512_cvtps_epi32(n_f);
383 
384  // r = x - n * (pi/2), using extended precision
385  __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
386  r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
387 
388  // Evaluate both sin and cos polynomials
389  __m512 sin_r = _mm512_sin_poly_avx512(r);
390  __m512 cos_r = _mm512_cos_poly_avx512(r);
391 
392  // Reconstruct sin(x) based on quadrant (n mod 4):
393  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
394  // n&2 == 0: positive, n&2 == 2: negative
395  __m512i n_and_1 = _mm512_and_si512(n, ones);
396  __m512i n_and_2 = _mm512_and_si512(n, twos);
397 
398  // swap_mask: where n&1 != 0, we use cos instead of sin
399  __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
400  __m512 result = _mm512_mask_blend_ps(swap_mask, sin_r, cos_r);
401 
402  // neg_mask: where n&2 != 0, we negate the result (use integer xor for AVX512F)
403  __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_and_2, twos);
404  result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
405  neg_mask,
406  _mm512_castps_si512(result),
407  sign_bit));
408 
409  _mm512_storeu_ps(sinPtr, result);
410  inPtr += 16;
411  sinPtr += 16;
412  }
413 
414  number = sixteenPoints * 16;
415  for (; number < num_points; number++) {
416  *sinPtr++ = sinf(*inPtr++);
417  }
418 }
419 #endif /* LV_HAVE_AVX512F */
420 
421 #if LV_HAVE_AVX2 && LV_HAVE_FMA
422 #include <immintrin.h>
424 
425 static inline void
426 volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
427 {
428  float* bPtr = bVector;
429  const float* aPtr = aVector;
430 
431  unsigned int number = 0;
432  unsigned int eighthPoints = num_points / 8;
433 
434  // Constants for Cody-Waite argument reduction
435  // n = round(x * 2/pi), then r = x - n * pi/2
436  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
437  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
438  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
439 
440  const __m256i ones = _mm256_set1_epi32(1);
441  const __m256i twos = _mm256_set1_epi32(2);
442  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
443 
444  for (; number < eighthPoints; number++) {
445  __m256 x = _mm256_loadu_ps(aPtr);
446 
447  // Argument reduction: n = round(x * 2/pi)
448  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
449  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
450  __m256i n = _mm256_cvtps_epi32(n_f);
451 
452  // r = x - n * (pi/2), using extended precision
453  __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
454  r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
455 
456  // Evaluate both sin and cos polynomials
457  __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
458  __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
459 
460  // Reconstruct sin(x) based on quadrant (n mod 4):
461  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
462  // n&2 == 0: positive, n&2 == 2: negative
463  __m256i n_and_1 = _mm256_and_si256(n, ones);
464  __m256i n_and_2 = _mm256_and_si256(n, twos);
465 
466  // swap_mask: where n&1 != 0, we use cos instead of sin
467  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
468  __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
469 
470  // neg_mask: where n&2 != 0, we negate the result
471  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
472  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
473 
474  _mm256_storeu_ps(bPtr, result);
475  aPtr += 8;
476  bPtr += 8;
477  }
478 
479  number = eighthPoints * 8;
480  for (; number < num_points; number++) {
481  *bPtr++ = sinf(*aPtr++);
482  }
483 }
484 
485 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
486 
487 #ifdef LV_HAVE_AVX2
488 #include <immintrin.h>
490 
491 static inline void
492 volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
493 {
494  float* bPtr = bVector;
495  const float* aPtr = aVector;
496 
497  unsigned int number = 0;
498  unsigned int eighthPoints = num_points / 8;
499 
500  // Constants for Cody-Waite argument reduction
501  // n = round(x * 2/pi), then r = x - n * pi/2
502  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
503  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
504  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
505 
506  const __m256i ones = _mm256_set1_epi32(1);
507  const __m256i twos = _mm256_set1_epi32(2);
508  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
509 
510  for (; number < eighthPoints; number++) {
511  __m256 x = _mm256_loadu_ps(aPtr);
512 
513  // Argument reduction: n = round(x * 2/pi)
514  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
515  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
516  __m256i n = _mm256_cvtps_epi32(n_f);
517 
518  // r = x - n * (pi/2), using extended precision
519  __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
520  r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
521 
522  // Evaluate both sin and cos polynomials
523  __m256 sin_r = _mm256_sin_poly_avx2(r);
524  __m256 cos_r = _mm256_cos_poly_avx2(r);
525 
526  // Reconstruct sin(x) based on quadrant (n mod 4):
527  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
528  // n&2 == 0: positive, n&2 == 2: negative
529  __m256i n_and_1 = _mm256_and_si256(n, ones);
530  __m256i n_and_2 = _mm256_and_si256(n, twos);
531 
532  // swap_mask: where n&1 != 0, we use cos instead of sin
533  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
534  __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
535 
536  // neg_mask: where n&2 != 0, we negate the result
537  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
538  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
539 
540  _mm256_storeu_ps(bPtr, result);
541  aPtr += 8;
542  bPtr += 8;
543  }
544 
545  number = eighthPoints * 8;
546  for (; number < num_points; number++) {
547  *bPtr++ = sinf(*aPtr++);
548  }
549 }
550 
551 #endif /* LV_HAVE_AVX2 */
552 
553 
554 #ifdef LV_HAVE_SSE4_1
555 #include <smmintrin.h>
557 
558 static inline void
559 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
560 {
561  float* bPtr = bVector;
562  const float* aPtr = aVector;
563 
564  unsigned int number = 0;
565  unsigned int quarterPoints = num_points / 4;
566 
567  // Constants for Cody-Waite argument reduction
568  // n = round(x * 2/pi), then r = x - n * pi/2
569  const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
570  const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
571  const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
572 
573  const __m128i ones = _mm_set1_epi32(1);
574  const __m128i twos = _mm_set1_epi32(2);
575  const __m128 sign_bit = _mm_set1_ps(-0.0f);
576 
577  for (; number < quarterPoints; number++) {
578  __m128 x = _mm_loadu_ps(aPtr);
579 
580  // Argument reduction: n = round(x * 2/pi)
581  __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
582  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
583  __m128i n = _mm_cvtps_epi32(n_f);
584 
585  // r = x - n * (pi/2), using extended precision
586  __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
587  r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
588 
589  // Evaluate both sin and cos polynomials
590  __m128 sin_r = _mm_sin_poly_sse(r);
591  __m128 cos_r = _mm_cos_poly_sse(r);
592 
593  // Reconstruct sin(x) based on quadrant (n mod 4):
594  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
595  // n&2 == 0: positive, n&2 == 2: negative
596  __m128i n_and_1 = _mm_and_si128(n, ones);
597  __m128i n_and_2 = _mm_and_si128(n, twos);
598 
599  // swap_mask: where n&1 != 0, we use cos instead of sin
600  __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
601  __m128 result = _mm_blendv_ps(sin_r, cos_r, swap_mask);
602 
603  // neg_mask: where n&2 != 0, we negate the result
604  __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
605  result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
606 
607  _mm_storeu_ps(bPtr, result);
608  aPtr += 4;
609  bPtr += 4;
610  }
611 
612  number = quarterPoints * 4;
613  for (; number < num_points; number++) {
614  *bPtr++ = sinf(*aPtr++);
615  }
616 }
617 
618 #endif /* LV_HAVE_SSE4_1 */
619 
620 
621 #ifdef LV_HAVE_GENERIC
622 
623 static inline void
624 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
625 {
626  float* bPtr = bVector;
627  const float* aPtr = aVector;
628  unsigned int number = 0;
629 
630  for (number = 0; number < num_points; number++) {
631  *bPtr++ = sinf(*aPtr++);
632  }
633 }
634 
635 #endif /* LV_HAVE_GENERIC */
636 
637 #ifdef LV_HAVE_NEON
638 #include <arm_neon.h>
640 
641 /* NEON polynomial-based sin using Cody-Waite argument reduction */
642 static inline void
643 volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
644 {
645  // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
646  const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
647  const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
648  const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
649 
650  const int32x4_t ones = vdupq_n_s32(1);
651  const int32x4_t twos = vdupq_n_s32(2);
652  const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
653  const float32x4_t half = vdupq_n_f32(0.5f);
654  const float32x4_t neg_half = vdupq_n_f32(-0.5f);
655  const float32x4_t fzeroes = vdupq_n_f32(0.0f);
656 
657  unsigned int number = 0;
658  const unsigned int quarterPoints = num_points / 4;
659 
660  for (; number < quarterPoints; number++) {
661  float32x4_t x = vld1q_f32(aVector);
662  aVector += 4;
663 
664  // n = round(x * 2/pi) - emulate round-to-nearest for ARMv7
665  float32x4_t scaled = vmulq_f32(x, two_over_pi);
666  uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
667  float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
668  float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
669  int32x4_t n = vcvtq_s32_f32(n_f);
670 
671  // r = x - n * (pi/2) using extended precision
672  float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
673  r = vmlsq_f32(r, n_f, pi_over_2_lo);
674 
675  // Evaluate sin and cos polynomials
676  float32x4_t sin_r = _vsin_poly_f32(r);
677  float32x4_t cos_r = _vcos_poly_f32(r);
678 
679  // Quadrant-based reconstruction:
680  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
681  // n&2 == 0: positive, n&2 == 2: negative
682  int32x4_t n_and_1 = vandq_s32(n, ones);
683  int32x4_t n_and_2 = vandq_s32(n, twos);
684 
685  uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
686  float32x4_t result = vbslq_f32(swap_mask, cos_r, sin_r);
687 
688  uint32x4_t neg_mask = vceqq_s32(n_and_2, twos);
689  result = vreinterpretq_f32_u32(
690  veorq_u32(vreinterpretq_u32_f32(result),
691  vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
692 
693  vst1q_f32(bVector, result);
694  bVector += 4;
695  }
696 
697  for (number = quarterPoints * 4; number < num_points; number++) {
698  *bVector++ = sinf(*aVector++);
699  }
700 }
701 #endif /* LV_HAVE_NEON */
702 
703 #ifdef LV_HAVE_NEONV8
704 #include <arm_neon.h>
706 
707 /* NEONv8 polynomial-based sin using Cody-Waite argument reduction with FMA */
708 static inline void
709 volk_32f_sin_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
710 {
711  // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
712  const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
713  const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
714  const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
715 
716  const int32x4_t ones = vdupq_n_s32(1);
717  const int32x4_t twos = vdupq_n_s32(2);
718  const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
719 
720  unsigned int number = 0;
721  const unsigned int quarterPoints = num_points / 4;
722 
723  for (; number < quarterPoints; number++) {
724  float32x4_t x = vld1q_f32(aVector);
725  aVector += 4;
726 
727  // n = round(x * 2/pi) using ARMv8 vrndnq_f32
728  float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
729  int32x4_t n = vcvtq_s32_f32(n_f);
730 
731  // r = x - n * (pi/2) using FMA for extended precision
732  float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
733  r = vfmsq_f32(r, n_f, pi_over_2_lo);
734 
735  // Evaluate sin and cos polynomials using FMA
736  float32x4_t sin_r = _vsin_poly_neonv8(r);
737  float32x4_t cos_r = _vcos_poly_neonv8(r);
738 
739  // Quadrant-based reconstruction:
740  // n&1 == 0: use sin_r, n&1 == 1: use cos_r
741  // n&2 == 0: positive, n&2 == 2: negative
742  int32x4_t n_and_1 = vandq_s32(n, ones);
743  int32x4_t n_and_2 = vandq_s32(n, twos);
744 
745  uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
746  float32x4_t result = vbslq_f32(swap_mask, cos_r, sin_r);
747 
748  uint32x4_t neg_mask = vceqq_s32(n_and_2, twos);
749  result = vreinterpretq_f32_u32(
750  veorq_u32(vreinterpretq_u32_f32(result),
751  vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
752 
753  vst1q_f32(bVector, result);
754  bVector += 4;
755  }
756 
757  for (number = quarterPoints * 4; number < num_points; number++) {
758  *bVector++ = sinf(*aVector++);
759  }
760 }
761 
762 #endif /* LV_HAVE_NEONV8 */
763 
764 #ifdef LV_HAVE_RVV
765 #include <riscv_vector.h>
766 
767 static inline void
768 volk_32f_sin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
769 {
770  size_t vlmax = __riscv_vsetvlmax_e32m2();
771 
772  const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
773  const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
774  const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
775  const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
776 
777  const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
778  const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
779 
780  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
781  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
782  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
783  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);
784 
785  size_t n = num_points;
786  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
787  vl = __riscv_vsetvl_e32m2(n);
788  vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
789  vfloat32m2_t s = __riscv_vfabs(v, vl);
790  vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
791  vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
792 
793  s = __riscv_vfnmsac(s, cPio4a, r, vl);
794  s = __riscv_vfnmsac(s, cPio4b, r, vl);
795  s = __riscv_vfnmsac(s, cPio4c, r, vl);
796 
797  s = __riscv_vfmul(s, 1 / 8.0f, vl);
798  s = __riscv_vfmul(s, s, vl);
799  vfloat32m2_t t = s;
800  s = __riscv_vfmsub(s, c5, c4, vl);
801  s = __riscv_vfmadd(s, t, c3, vl);
802  s = __riscv_vfmsub(s, t, c2, vl);
803  s = __riscv_vfmadd(s, t, cf1, vl);
804  s = __riscv_vfmul(s, t, vl);
805  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
806  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
807  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
808  s = __riscv_vfmul(s, 1 / 2.0f, vl);
809 
810  vfloat32m2_t sine =
811  __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
812  vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
813 
814  vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
815  vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
816  __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
817  vl);
818 
819  sine = __riscv_vmerge(sine, cosine, m1, vl);
820  sine = __riscv_vfneg_mu(m2, sine, sine, vl);
821 
822  __riscv_vse32(bVector, sine, vl);
823  }
824 }
825 #endif /*LV_HAVE_RVV*/
826 
827 #endif /* INCLUDED_volk_32f_sin_32f_u_H */
volk_32f_sin_32f_polynomial
static void volk_32f_sin_32f_polynomial(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:71
volk_avx512_intrinsics.h
_mm256_sin_poly_avx2_fma
static __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:79
_mm_sin_poly_sse
static __m128 _mm_sin_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:157
volk_32f_sin_32f_generic
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:624
_mm256_cos_poly_avx2_fma
static __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:99
_mm512_sin_poly_avx512
static __m512 _mm512_sin_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:206
volk_common.h
_mm512_cos_poly_avx512
static __m512 _mm512_cos_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:227
volk_sin
static float volk_sin(const float x)
Definition: volk_common.h:248
_vcos_poly_f32
static float32x4_t _vcos_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:396
volk_sse_intrinsics.h
_mm256_sin_poly_avx2
static __m256 _mm256_sin_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:382
volk_32f_sin_32f_neon
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:643
volk_avx2_fma_intrinsics.h
_mm256_cos_poly_avx2
static __m256 _mm256_cos_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:403
volk_neon_intrinsics.h
_mm_cos_poly_sse
static __m128 _mm_cos_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:177
volk_avx2_intrinsics.h
_vsin_poly_f32
static float32x4_t _vsin_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:376