Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
59 #include <inttypes.h>
60 #include <math.h>
61 #include <stdio.h>
62 
63 #ifndef INCLUDED_volk_32f_cos_32f_a_H
64 #define INCLUDED_volk_32f_cos_32f_a_H
65 
66 #ifdef LV_HAVE_GENERIC
67 
68 static inline void
69 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
70 {
71  float* bPtr = bVector;
72  const float* aPtr = aVector;
73  unsigned int number = 0;
74 
75  for (; number < num_points; number++) {
76  *bPtr++ = cosf(*aPtr++);
77  }
78 }
79 
80 #endif /* LV_HAVE_GENERIC */
81 
82 #ifdef LV_HAVE_GENERIC
83 #include <volk/volk_common.h>
84 
85 static inline void
86 volk_32f_cos_32f_polynomial(float* bVector, const float* aVector, unsigned int num_points)
87 {
88  for (unsigned int number = 0; number < num_points; number++) {
89  *bVector++ = volk_cos(*aVector++);
90  }
91 }
92 #endif /* LV_HAVE_GENERIC */
93 
94 #ifdef LV_HAVE_AVX512F
95 #include <immintrin.h>
97 
98 static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
99  const float* inVector,
100  unsigned int num_points)
101 {
102  float* cosPtr = cosVector;
103  const float* inPtr = inVector;
104 
105  unsigned int number = 0;
106  unsigned int sixteenPoints = num_points / 16;
107 
108  // Constants for Cody-Waite argument reduction
109  // n = round(x * 2/pi), then r = x - n * pi/2
110  const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
111  const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
112  const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
113 
114  const __m512i ones = _mm512_set1_epi32(1);
115  const __m512i twos = _mm512_set1_epi32(2);
116  const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
117 
118  for (; number < sixteenPoints; number++) {
119  __m512 x = _mm512_load_ps(inPtr);
120 
121  // Argument reduction: n = round(x * 2/pi)
122  __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
123  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
124  __m512i n = _mm512_cvtps_epi32(n_f);
125 
126  // r = x - n * (pi/2), using extended precision
127  __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
128  r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
129 
130  // Evaluate both sin and cos polynomials
131  __m512 sin_r = _mm512_sin_poly_avx512(r);
132  __m512 cos_r = _mm512_cos_poly_avx512(r);
133 
134  // Reconstruct cos(x) based on quadrant (n mod 4):
135  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
136  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
137  __m512i n_and_1 = _mm512_and_si512(n, ones);
138  __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
139 
140  // swap_mask: where n&1 != 0, we use sin instead of cos
141  __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
142  __m512 result = _mm512_mask_blend_ps(swap_mask, cos_r, sin_r);
143 
144  // neg_mask: where (n+1)&2 != 0, we negate the result (use integer xor for
145  // AVX512F)
146  __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
147  result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
148  neg_mask,
149  _mm512_castps_si512(result),
150  sign_bit));
151 
152  _mm512_store_ps(cosPtr, result);
153  inPtr += 16;
154  cosPtr += 16;
155  }
156 
157  number = sixteenPoints * 16;
158  for (; number < num_points; number++) {
159  *cosPtr++ = cosf(*inPtr++);
160  }
161 }
162 #endif /* LV_HAVE_AVX512F */
163 
164 #if LV_HAVE_AVX2 && LV_HAVE_FMA
165 #include <immintrin.h>
167 
168 static inline void
169 volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
170 {
171  float* bPtr = bVector;
172  const float* aPtr = aVector;
173 
174  unsigned int number = 0;
175  unsigned int eighthPoints = num_points / 8;
176 
177  // Constants for Cody-Waite argument reduction
178  // n = round(x * 2/pi), then r = x - n * pi/2
179  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
180  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
181  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
182 
183  const __m256i ones = _mm256_set1_epi32(1);
184  const __m256i twos = _mm256_set1_epi32(2);
185  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
186 
187  for (; number < eighthPoints; number++) {
188  __m256 x = _mm256_load_ps(aPtr);
189 
190  // Argument reduction: n = round(x * 2/pi)
191  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
192  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
193  __m256i n = _mm256_cvtps_epi32(n_f);
194 
195  // r = x - n * (pi/2), using extended precision
196  __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
197  r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
198 
199  // Evaluate both sin and cos polynomials
200  __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
201  __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
202 
203  // Reconstruct cos(x) based on quadrant (n mod 4):
204  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
205  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
206  __m256i n_and_1 = _mm256_and_si256(n, ones);
207  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
208 
209  // swap_mask: where n&1 != 0, we use sin instead of cos
210  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
211  __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
212 
213  // neg_mask: where (n+1)&2 != 0, we negate the result
214  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
215  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
216 
217  _mm256_store_ps(bPtr, result);
218  aPtr += 8;
219  bPtr += 8;
220  }
221 
222  number = eighthPoints * 8;
223  for (; number < num_points; number++) {
224  *bPtr++ = cosf(*aPtr++);
225  }
226 }
227 
228 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
229 
230 #ifdef LV_HAVE_AVX2
231 #include <immintrin.h>
233 
234 static inline void
235 volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
236 {
237  float* bPtr = bVector;
238  const float* aPtr = aVector;
239 
240  unsigned int number = 0;
241  unsigned int eighthPoints = num_points / 8;
242 
243  // Constants for Cody-Waite argument reduction
244  // n = round(x * 2/pi), then r = x - n * pi/2
245  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
246  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
247  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
248 
249  const __m256i ones = _mm256_set1_epi32(1);
250  const __m256i twos = _mm256_set1_epi32(2);
251  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
252 
253  for (; number < eighthPoints; number++) {
254  __m256 x = _mm256_load_ps(aPtr);
255 
256  // Argument reduction: n = round(x * 2/pi)
257  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
258  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
259  __m256i n = _mm256_cvtps_epi32(n_f);
260 
261  // r = x - n * (pi/2), using extended precision
262  __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
263  r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
264 
265  // Evaluate both sin and cos polynomials
266  __m256 sin_r = _mm256_sin_poly_avx2(r);
267  __m256 cos_r = _mm256_cos_poly_avx2(r);
268 
269  // Reconstruct cos(x) based on quadrant (n mod 4):
270  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
271  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
272  __m256i n_and_1 = _mm256_and_si256(n, ones);
273  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
274 
275  // swap_mask: where n&1 != 0, we use sin instead of cos
276  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
277  __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
278 
279  // neg_mask: where (n+1)&2 != 0, we negate the result
280  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
281  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
282 
283  _mm256_store_ps(bPtr, result);
284  aPtr += 8;
285  bPtr += 8;
286  }
287 
288  number = eighthPoints * 8;
289  for (; number < num_points; number++) {
290  *bPtr++ = cosf(*aPtr++);
291  }
292 }
293 
294 #endif /* LV_HAVE_AVX2 */
295 
296 #ifdef LV_HAVE_SSE4_1
297 #include <smmintrin.h>
299 
300 static inline void
301 volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
302 {
303  float* bPtr = bVector;
304  const float* aPtr = aVector;
305 
306  unsigned int number = 0;
307  unsigned int quarterPoints = num_points / 4;
308 
309  // Constants for Cody-Waite argument reduction
310  // n = round(x * 2/pi), then r = x - n * pi/2
311  const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
312  const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
313  const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
314 
315  const __m128i ones = _mm_set1_epi32(1);
316  const __m128i twos = _mm_set1_epi32(2);
317  const __m128 sign_bit = _mm_set1_ps(-0.0f);
318 
319  for (; number < quarterPoints; number++) {
320  __m128 x = _mm_load_ps(aPtr);
321 
322  // Argument reduction: n = round(x * 2/pi)
323  __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
324  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
325  __m128i n = _mm_cvtps_epi32(n_f);
326 
327  // r = x - n * (pi/2), using extended precision
328  __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
329  r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
330 
331  // Evaluate both sin and cos polynomials
332  __m128 sin_r = _mm_sin_poly_sse(r);
333  __m128 cos_r = _mm_cos_poly_sse(r);
334 
335  // Reconstruct cos(x) based on quadrant (n mod 4):
336  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
337  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
338  __m128i n_and_1 = _mm_and_si128(n, ones);
339  __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
340 
341  // swap_mask: where n&1 != 0, we use sin instead of cos
342  __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
343  __m128 result = _mm_blendv_ps(cos_r, sin_r, swap_mask);
344 
345  // neg_mask: where (n+1)&2 != 0, we negate the result
346  __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
347  result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
348 
349  _mm_store_ps(bPtr, result);
350  aPtr += 4;
351  bPtr += 4;
352  }
353 
354  number = quarterPoints * 4;
355  for (; number < num_points; number++) {
356  *bPtr++ = cosf(*aPtr++);
357  }
358 }
359 
360 #endif /* LV_HAVE_SSE4_1 */
361 
362 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
363 
364 
365 #ifndef INCLUDED_volk_32f_cos_32f_u_H
366 #define INCLUDED_volk_32f_cos_32f_u_H
367 
368 #ifdef LV_HAVE_AVX512F
369 #include <immintrin.h>
371 
372 static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
373  const float* inVector,
374  unsigned int num_points)
375 {
376  float* cosPtr = cosVector;
377  const float* inPtr = inVector;
378 
379  unsigned int number = 0;
380  unsigned int sixteenPoints = num_points / 16;
381 
382  // Constants for Cody-Waite argument reduction
383  // n = round(x * 2/pi), then r = x - n * pi/2
384  const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
385  const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
386  const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
387 
388  const __m512i ones = _mm512_set1_epi32(1);
389  const __m512i twos = _mm512_set1_epi32(2);
390  const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
391 
392  for (; number < sixteenPoints; number++) {
393  __m512 x = _mm512_loadu_ps(inPtr);
394 
395  // Argument reduction: n = round(x * 2/pi)
396  __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
397  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
398  __m512i n = _mm512_cvtps_epi32(n_f);
399 
400  // r = x - n * (pi/2), using extended precision
401  __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
402  r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
403 
404  // Evaluate both sin and cos polynomials
405  __m512 sin_r = _mm512_sin_poly_avx512(r);
406  __m512 cos_r = _mm512_cos_poly_avx512(r);
407 
408  // Reconstruct cos(x) based on quadrant (n mod 4):
409  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
410  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
411  __m512i n_and_1 = _mm512_and_si512(n, ones);
412  __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
413 
414  // swap_mask: where n&1 != 0, we use sin instead of cos
415  __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
416  __m512 result = _mm512_mask_blend_ps(swap_mask, cos_r, sin_r);
417 
418  // neg_mask: where (n+1)&2 != 0, we negate the result (use integer xor for
419  // AVX512F)
420  __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
421  result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
422  neg_mask,
423  _mm512_castps_si512(result),
424  sign_bit));
425 
426  _mm512_storeu_ps(cosPtr, result);
427  inPtr += 16;
428  cosPtr += 16;
429  }
430 
431  number = sixteenPoints * 16;
432  for (; number < num_points; number++) {
433  *cosPtr++ = cosf(*inPtr++);
434  }
435 }
436 #endif /* LV_HAVE_AVX512F */
437 
438 #if LV_HAVE_AVX2 && LV_HAVE_FMA
439 #include <immintrin.h>
441 
442 static inline void
443 volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
444 {
445  float* bPtr = bVector;
446  const float* aPtr = aVector;
447 
448  unsigned int number = 0;
449  unsigned int eighthPoints = num_points / 8;
450 
451  // Constants for Cody-Waite argument reduction
452  // n = round(x * 2/pi), then r = x - n * pi/2
453  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
454  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
455  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
456 
457  const __m256i ones = _mm256_set1_epi32(1);
458  const __m256i twos = _mm256_set1_epi32(2);
459  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
460 
461  for (; number < eighthPoints; number++) {
462  __m256 x = _mm256_loadu_ps(aPtr);
463 
464  // Argument reduction: n = round(x * 2/pi)
465  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
466  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
467  __m256i n = _mm256_cvtps_epi32(n_f);
468 
469  // r = x - n * (pi/2), using extended precision
470  __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
471  r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
472 
473  // Evaluate both sin and cos polynomials
474  __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
475  __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
476 
477  // Reconstruct cos(x) based on quadrant (n mod 4):
478  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
479  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
480  __m256i n_and_1 = _mm256_and_si256(n, ones);
481  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
482 
483  // swap_mask: where n&1 != 0, we use sin instead of cos
484  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
485  __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
486 
487  // neg_mask: where (n+1)&2 != 0, we negate the result
488  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
489  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
490 
491  _mm256_storeu_ps(bPtr, result);
492  aPtr += 8;
493  bPtr += 8;
494  }
495 
496  number = eighthPoints * 8;
497  for (; number < num_points; number++) {
498  *bPtr++ = cosf(*aPtr++);
499  }
500 }
501 
502 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
503 
504 #ifdef LV_HAVE_AVX2
505 #include <immintrin.h>
507 
508 static inline void
509 volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
510 {
511  float* bPtr = bVector;
512  const float* aPtr = aVector;
513 
514  unsigned int number = 0;
515  unsigned int eighthPoints = num_points / 8;
516 
517  // Constants for Cody-Waite argument reduction
518  // n = round(x * 2/pi), then r = x - n * pi/2
519  const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
520  const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
521  const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
522 
523  const __m256i ones = _mm256_set1_epi32(1);
524  const __m256i twos = _mm256_set1_epi32(2);
525  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
526 
527  for (; number < eighthPoints; number++) {
528  __m256 x = _mm256_loadu_ps(aPtr);
529 
530  // Argument reduction: n = round(x * 2/pi)
531  __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
532  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
533  __m256i n = _mm256_cvtps_epi32(n_f);
534 
535  // r = x - n * (pi/2), using extended precision
536  __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
537  r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
538 
539  // Evaluate both sin and cos polynomials
540  __m256 sin_r = _mm256_sin_poly_avx2(r);
541  __m256 cos_r = _mm256_cos_poly_avx2(r);
542 
543  // Reconstruct cos(x) based on quadrant (n mod 4):
544  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
545  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
546  __m256i n_and_1 = _mm256_and_si256(n, ones);
547  __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
548 
549  // swap_mask: where n&1 != 0, we use sin instead of cos
550  __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
551  __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
552 
553  // neg_mask: where (n+1)&2 != 0, we negate the result
554  __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
555  result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
556 
557  _mm256_storeu_ps(bPtr, result);
558  aPtr += 8;
559  bPtr += 8;
560  }
561 
562  number = eighthPoints * 8;
563  for (; number < num_points; number++) {
564  *bPtr++ = cosf(*aPtr++);
565  }
566 }
567 
568 #endif /* LV_HAVE_AVX2 */
569 
570 #ifdef LV_HAVE_SSE4_1
571 #include <smmintrin.h>
573 
574 static inline void
575 volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
576 {
577  float* bPtr = bVector;
578  const float* aPtr = aVector;
579 
580  unsigned int number = 0;
581  unsigned int quarterPoints = num_points / 4;
582 
583  // Constants for Cody-Waite argument reduction
584  // n = round(x * 2/pi), then r = x - n * pi/2
585  const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
586  const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
587  const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
588 
589  const __m128i ones = _mm_set1_epi32(1);
590  const __m128i twos = _mm_set1_epi32(2);
591  const __m128 sign_bit = _mm_set1_ps(-0.0f);
592 
593  for (; number < quarterPoints; number++) {
594  __m128 x = _mm_loadu_ps(aPtr);
595 
596  // Argument reduction: n = round(x * 2/pi)
597  __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
598  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
599  __m128i n = _mm_cvtps_epi32(n_f);
600 
601  // r = x - n * (pi/2), using extended precision
602  __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
603  r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
604 
605  // Evaluate both sin and cos polynomials
606  __m128 sin_r = _mm_sin_poly_sse(r);
607  __m128 cos_r = _mm_cos_poly_sse(r);
608 
609  // Reconstruct cos(x) based on quadrant (n mod 4):
610  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
611  // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
612  __m128i n_and_1 = _mm_and_si128(n, ones);
613  __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
614 
615  // swap_mask: where n&1 != 0, we use sin instead of cos
616  __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
617  __m128 result = _mm_blendv_ps(cos_r, sin_r, swap_mask);
618 
619  // neg_mask: where (n+1)&2 != 0, we negate the result
620  __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
621  result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
622 
623  _mm_storeu_ps(bPtr, result);
624  aPtr += 4;
625  bPtr += 4;
626  }
627 
628  number = quarterPoints * 4;
629  for (; number < num_points; number++) {
630  *bPtr++ = cosf(*aPtr++);
631  }
632 }
633 
634 #endif /* LV_HAVE_SSE4_1 */
635 
636 
637 #ifdef LV_HAVE_NEON
638 #include <arm_neon.h>
640 
641 /* NEON polynomial-based cos using Cody-Waite argument reduction */
642 static inline void
643 volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
644 {
645  // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
646  const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
647  const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
648  const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
649 
650  const int32x4_t ones = vdupq_n_s32(1);
651  const int32x4_t twos = vdupq_n_s32(2);
652  const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
653  const float32x4_t half = vdupq_n_f32(0.5f);
654  const float32x4_t neg_half = vdupq_n_f32(-0.5f);
655  const float32x4_t fzeroes = vdupq_n_f32(0.0f);
656 
657  unsigned int number = 0;
658  const unsigned int quarterPoints = num_points / 4;
659 
660  for (; number < quarterPoints; number++) {
661  float32x4_t x = vld1q_f32(aVector);
662  aVector += 4;
663 
664  // n = round(x * 2/pi) - emulate round-to-nearest for ARMv7
665  float32x4_t scaled = vmulq_f32(x, two_over_pi);
666  uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
667  float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
668  float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
669  int32x4_t n = vcvtq_s32_f32(n_f);
670 
671  // r = x - n * (pi/2) using extended precision
672  float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
673  r = vmlsq_f32(r, n_f, pi_over_2_lo);
674 
675  // Evaluate sin and cos polynomials
676  float32x4_t sin_r = _vsin_poly_f32(r);
677  float32x4_t cos_r = _vcos_poly_f32(r);
678 
679  // Quadrant-based reconstruction for cos:
680  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
681  // (n+1)&2 == 0: positive, (n+1)&2 == 2: negative
682  int32x4_t n_and_1 = vandq_s32(n, ones);
683  int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
684 
685  uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
686  float32x4_t result = vbslq_f32(swap_mask, sin_r, cos_r);
687 
688  uint32x4_t neg_mask = vceqq_s32(n_plus_1_and_2, twos);
689  result = vreinterpretq_f32_u32(
690  veorq_u32(vreinterpretq_u32_f32(result),
691  vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
692 
693  vst1q_f32(bVector, result);
694  bVector += 4;
695  }
696 
697  for (number = quarterPoints * 4; number < num_points; number++) {
698  *bVector++ = cosf(*aVector++);
699  }
700 }
701 #endif /* LV_HAVE_NEON */
702 
703 #ifdef LV_HAVE_NEONV8
704 #include <arm_neon.h>
706 
707 /* NEONv8 polynomial-based cos using Cody-Waite argument reduction with FMA */
708 static inline void
709 volk_32f_cos_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
710 {
711  // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
712  const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
713  const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
714  const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
715 
716  const int32x4_t ones = vdupq_n_s32(1);
717  const int32x4_t twos = vdupq_n_s32(2);
718  const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
719 
720  unsigned int number = 0;
721  const unsigned int quarterPoints = num_points / 4;
722 
723  for (; number < quarterPoints; number++) {
724  float32x4_t x = vld1q_f32(aVector);
725  aVector += 4;
726 
727  // n = round(x * 2/pi) using ARMv8 vrndnq_f32
728  float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
729  int32x4_t n = vcvtq_s32_f32(n_f);
730 
731  // r = x - n * (pi/2) using FMA for extended precision
732  float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
733  r = vfmsq_f32(r, n_f, pi_over_2_lo);
734 
735  // Evaluate sin and cos polynomials using FMA
736  float32x4_t sin_r = _vsin_poly_neonv8(r);
737  float32x4_t cos_r = _vcos_poly_neonv8(r);
738 
739  // Quadrant-based reconstruction for cos:
740  // n&1 == 0: use cos_r, n&1 == 1: use sin_r
741  // (n+1)&2 == 0: positive, (n+1)&2 == 2: negative
742  int32x4_t n_and_1 = vandq_s32(n, ones);
743  int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
744 
745  uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
746  float32x4_t result = vbslq_f32(swap_mask, sin_r, cos_r);
747 
748  uint32x4_t neg_mask = vceqq_s32(n_plus_1_and_2, twos);
749  result = vreinterpretq_f32_u32(
750  veorq_u32(vreinterpretq_u32_f32(result),
751  vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
752 
753  vst1q_f32(bVector, result);
754  bVector += 4;
755  }
756 
757  for (number = quarterPoints * 4; number < num_points; number++) {
758  *bVector++ = cosf(*aVector++);
759  }
760 }
761 
762 #endif /* LV_HAVE_NEONV8 */
763 
764 #ifdef LV_HAVE_RVV
765 #include <riscv_vector.h>
766 
767 static inline void
768 volk_32f_cos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
769 {
770  size_t vlmax = __riscv_vsetvlmax_e32m2();
771 
772  const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
773  const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
774  const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
775  const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
776 
777  const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
778  const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
779 
780  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
781  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
782  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
783  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
784 
785  size_t n = num_points;
786  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
787  vl = __riscv_vsetvl_e32m2(n);
788  vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
789  vfloat32m2_t s = __riscv_vfabs(v, vl);
790  vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
791  vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
792 
793  s = __riscv_vfnmsac(s, cPio4a, r, vl);
794  s = __riscv_vfnmsac(s, cPio4b, r, vl);
795  s = __riscv_vfnmsac(s, cPio4c, r, vl);
796 
797  s = __riscv_vfmul(s, 1 / 8.0f, vl);
798  s = __riscv_vfmul(s, s, vl);
799  vfloat32m2_t t = s;
800  s = __riscv_vfmsub(s, c5, c4, vl);
801  s = __riscv_vfmadd(s, t, c3, vl);
802  s = __riscv_vfmsub(s, t, c2, vl);
803  s = __riscv_vfmadd(s, t, cf1, vl);
804  s = __riscv_vfmul(s, t, vl);
805  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
806  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
807  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
808  s = __riscv_vfmul(s, 1 / 2.0f, vl);
809 
810  vfloat32m2_t sine =
811  __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
812  vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
813 
814  vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
815  vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
816 
817  cosine = __riscv_vmerge(cosine, sine, m1, vl);
818  cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
819 
820  __riscv_vse32(bVector, cosine, vl);
821  }
822 }
823 #endif /*LV_HAVE_RVV*/
824 
825 #endif /* INCLUDED_volk_32f_cos_32f_u_H */
volk_avx512_intrinsics.h
volk_32f_cos_32f_neon
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:643
_mm256_sin_poly_avx2_fma
static __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:79
_mm_sin_poly_sse
static __m128 _mm_sin_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:157
_mm256_cos_poly_avx2_fma
static __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:99
volk_32f_cos_32f_generic
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:69
_mm512_sin_poly_avx512
static __m512 _mm512_sin_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:206
volk_cos
static float volk_cos(const float x)
Definition: volk_common.h:274
volk_32f_cos_32f_polynomial
static void volk_32f_cos_32f_polynomial(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:86
volk_common.h
_mm512_cos_poly_avx512
static __m512 _mm512_cos_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:227
_vcos_poly_f32
static float32x4_t _vcos_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:396
volk_sse_intrinsics.h
_mm256_sin_poly_avx2
static __m256 _mm256_sin_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:382
volk_avx2_fma_intrinsics.h
_mm256_cos_poly_avx2
static __m256 _mm256_cos_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:403
volk_neon_intrinsics.h
_mm_cos_poly_sse
static __m128 _mm_cos_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:177
volk_avx2_intrinsics.h
_vsin_poly_f32
static float32x4_t _vsin_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:376