Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator2_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
68 #ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
69 #define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
70 
71 
72 #include <math.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <volk/volk_complex.h>
76 #define ROTATOR_RELOAD 512
77 #define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
78 #define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
79 #define ROTATOR_RELOAD_8 (ROTATOR_RELOAD / 8)
80 
81 
82 #ifdef LV_HAVE_GENERIC
83 
85  const lv_32fc_t* inVector,
86  const lv_32fc_t* phase_inc,
87  lv_32fc_t* phase,
88  unsigned int num_points)
89 {
90  unsigned int i = 0;
91  int j = 0;
92  for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
93  for (j = 0; j < ROTATOR_RELOAD; ++j) {
94  *outVector++ = *inVector++ * (*phase);
95  (*phase) *= *phase_inc;
96  }
97 
98  (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
99  }
100  for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
101  *outVector++ = *inVector++ * (*phase);
102  (*phase) *= *phase_inc;
103  }
104  if (i) {
105  // Make sure, we normalize phase on every call!
106  (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
107  }
108 }
109 
110 #endif /* LV_HAVE_GENERIC */
111 
112 
113 #ifdef LV_HAVE_NEON
114 #include <arm_neon.h>
116 
117 #ifndef M_PI
118 #define M_PI 3.14159265358979323846
119 #endif
120 
127 static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector,
128  const lv_32fc_t* inVector,
129  const lv_32fc_t* phase_inc,
130  lv_32fc_t* phase,
131  unsigned int num_points)
132 {
133  lv_32fc_t* outputVectorPtr = outVector;
134  const lv_32fc_t* inputVectorPtr = inVector;
135 
136  // Extract angles using double precision
137  const double initial_angle =
138  atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
139  const double delta_angle =
140  atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));
141 
142  // Kahan summation state for angle accumulation
143  double angle_sum = initial_angle;
144  double angle_c = 0.0; // Compensation for lost low-order bits
145 
146  // Precompute block increment
147  const double block_delta = (double)ROTATOR_RELOAD * delta_angle;
148 
149  // Compute incr = phase_inc^4 from exact angle (more accurate than multiplication)
150  const double delta4 = 4.0 * delta_angle;
151  lv_32fc_t incr = lv_cmake((float)cos(delta4), (float)sin(delta4));
152 
153  float32x4x2_t input_vec;
154  float32x4x2_t output_vec;
155  lv_32fc_t phasePtr[4];
156 
157  const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
158  const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
159  float32x4x2_t phase_vec;
160 
161 // Helper macro for angle reduction
162 #define REDUCE_ANGLE(a) \
163  do { \
164  (a) = fmod((a), 2.0 * M_PI); \
165  if ((a) > M_PI) \
166  (a) -= 2.0 * M_PI; \
167  else if ((a) < -M_PI) \
168  (a) += 2.0 * M_PI; \
169  } while (0)
170 
171  // Initialize phase vector from exact angles
172  for (unsigned int k = 0; k < 4; ++k) {
173  double a = angle_sum + (double)k * delta_angle;
174  REDUCE_ANGLE(a);
175  phasePtr[k] = lv_cmake((float)cos(a), (float)sin(a));
176  }
177  phase_vec = vld2q_f32((float*)phasePtr);
178 
179  unsigned int i, j;
180 
181  // Main loop with periodic resync
182  for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
183  // Inner loop: use fast SIMD multiply
184  for (j = 0; j < ROTATOR_RELOAD_4; j++) {
185  input_vec = vld2q_f32((float*)inputVectorPtr);
186  __VOLK_PREFETCH(inputVectorPtr + 4);
187 
188  // Rotate
189  output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
190  // Increase phase
191  phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
192  // Store output
193  vst2q_f32((float*)outputVectorPtr, output_vec);
194 
195  outputVectorPtr += 4;
196  inputVectorPtr += 4;
197  }
198 
199  // Advance angle using Kahan summation (compensated for rounding error)
200  {
201  double y = block_delta - angle_c;
202  double t = angle_sum + y;
203  angle_c = (t - angle_sum) - y;
204  angle_sum = t;
205  }
206 
207  // Resync: recompute phase from accumulated angle (eliminates drift)
208  for (unsigned int k = 0; k < 4; ++k) {
209  double a = angle_sum + (double)k * delta_angle;
210  REDUCE_ANGLE(a);
211  phasePtr[k] = lv_cmake((float)cos(a), (float)sin(a));
212  }
213  phase_vec = vld2q_f32((float*)phasePtr);
214  }
215 
216  // Handle remainder
217  for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
218  input_vec = vld2q_f32((float*)inputVectorPtr);
219  __VOLK_PREFETCH(inputVectorPtr + 4);
220 
221  output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
222  phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
223  vst2q_f32((float*)outputVectorPtr, output_vec);
224 
225  outputVectorPtr += 4;
226  inputVectorPtr += 4;
227 
228  // Update angle for remainder
229  {
230  double y = (4.0 * delta_angle) - angle_c;
231  double t = angle_sum + y;
232  angle_c = (t - angle_sum) - y;
233  angle_sum = t;
234  }
235  }
236 
237  // Final phase output from exact angle
238  {
239  double a = angle_sum;
240  REDUCE_ANGLE(a);
241  *phase = lv_cmake((float)cos(a), (float)sin(a));
242  }
243 
244  // Scalar remainder
245  for (i = 0; i < num_points % 4; i++) {
246  *outputVectorPtr++ = *inputVectorPtr++ * (*phase);
247  {
248  double y = delta_angle - angle_c;
249  double t = angle_sum + y;
250  angle_c = (t - angle_sum) - y;
251  angle_sum = t;
252  }
253  double a = angle_sum;
254  REDUCE_ANGLE(a);
255  *phase = lv_cmake((float)cos(a), (float)sin(a));
256  }
257 
258 #undef REDUCE_ANGLE
259 }
260 
261 #endif /* LV_HAVE_NEON */
262 
263 
264 #ifdef LV_HAVE_AVX
265 #include <immintrin.h>
267 
268 #ifndef M_PI
269 #define M_PI 3.14159265358979323846
270 #endif
271 
278 static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector,
279  const lv_32fc_t* inVector,
280  const lv_32fc_t* phase_inc,
281  lv_32fc_t* phase,
282  unsigned int num_points)
283 {
284  lv_32fc_t* cPtr = outVector;
285  const lv_32fc_t* aPtr = inVector;
286 
287  // Extract angles using double precision
288  const double initial_angle =
289  atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
290  const double delta_angle =
291  atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));
292 
293  // Kahan summation state for angle accumulation
294  double angle_sum = initial_angle;
295  double angle_c = 0.0; // Compensation for lost low-order bits
296 
297  // Precompute block increment
298  const double block_delta = (double)ROTATOR_RELOAD * delta_angle;
299 
300  // Compute incr = phase_inc^4 from exact angle (more accurate than multiplication)
301  const double delta4 = 4.0 * delta_angle;
302  lv_32fc_t incr = lv_cmake((float)cos(delta4), (float)sin(delta4));
303 
304  __m256 aVal, phase_Val, z;
305  lv_32fc_t phase_Ptr[4];
306 
307  const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
308  lv_creal(incr),
309  lv_cimag(incr),
310  lv_creal(incr),
311  lv_cimag(incr),
312  lv_creal(incr),
313  lv_cimag(incr),
314  lv_creal(incr));
315 
316 // Helper macro for angle reduction
317 #define REDUCE_ANGLE(a) \
318  do { \
319  (a) = fmod((a), 2.0 * M_PI); \
320  if ((a) > M_PI) \
321  (a) -= 2.0 * M_PI; \
322  else if ((a) < -M_PI) \
323  (a) += 2.0 * M_PI; \
324  } while (0)
325 
326  // Initialize phase vector from exact angles
327  for (unsigned int k = 0; k < 4; ++k) {
328  double a = angle_sum + (double)k * delta_angle;
329  REDUCE_ANGLE(a);
330  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
331  }
332  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
333 
334  unsigned int i, j;
335 
336  // Main loop with periodic resync
337  for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
338  // Inner loop: use fast SIMD multiply
339  for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
340  aVal = _mm256_loadu_ps((float*)aPtr);
341 
342  z = _mm256_complexmul_ps(aVal, phase_Val);
343  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
344 
345  _mm256_storeu_ps((float*)cPtr, z);
346 
347  aPtr += 4;
348  cPtr += 4;
349  }
350 
351  // Advance angle using Kahan summation (compensated for rounding error)
352  {
353  double y = block_delta - angle_c;
354  double t = angle_sum + y;
355  angle_c = (t - angle_sum) - y;
356  angle_sum = t;
357  }
358 
359  // Resync: recompute phase from accumulated angle (eliminates drift)
360  for (unsigned int k = 0; k < 4; ++k) {
361  double a = angle_sum + (double)k * delta_angle;
362  REDUCE_ANGLE(a);
363  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
364  }
365  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
366  }
367 
368  // Handle remainder
369  for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
370  aVal = _mm256_loadu_ps((float*)aPtr);
371 
372  z = _mm256_complexmul_ps(aVal, phase_Val);
373  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
374 
375  _mm256_storeu_ps((float*)cPtr, z);
376 
377  aPtr += 4;
378  cPtr += 4;
379 
380  // Update angle for remainder
381  {
382  double y = (4.0 * delta_angle) - angle_c;
383  double t = angle_sum + y;
384  angle_c = (t - angle_sum) - y;
385  angle_sum = t;
386  }
387  }
388 
389  // Final phase output from exact angle
390  {
391  double a = angle_sum;
392  REDUCE_ANGLE(a);
393  *phase = lv_cmake((float)cos(a), (float)sin(a));
394  }
395 
396  // Scalar remainder
397  for (i = 0; i < num_points % 4; ++i) {
398  *cPtr++ = *aPtr++ * (*phase);
399  {
400  double y = delta_angle - angle_c;
401  double t = angle_sum + y;
402  angle_c = (t - angle_sum) - y;
403  angle_sum = t;
404  }
405  double a = angle_sum;
406  REDUCE_ANGLE(a);
407  *phase = lv_cmake((float)cos(a), (float)sin(a));
408  }
409 
410 #undef REDUCE_ANGLE
411 }
412 
413 #endif /* LV_HAVE_AVX */
414 
415 
416 #ifdef LV_HAVE_AVX
417 #include <immintrin.h>
418 
425 static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector,
426  const lv_32fc_t* inVector,
427  const lv_32fc_t* phase_inc,
428  lv_32fc_t* phase,
429  unsigned int num_points)
430 {
431  lv_32fc_t* cPtr = outVector;
432  const lv_32fc_t* aPtr = inVector;
433 
434  // Extract angles using double precision
435  const double initial_angle =
436  atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
437  const double delta_angle =
438  atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));
439 
440  // Kahan summation state for angle accumulation
441  double angle_sum = initial_angle;
442  double angle_c = 0.0;
443 
444  // Precompute block increment
445  const double block_delta = (double)ROTATOR_RELOAD * delta_angle;
446 
447  // Compute incr = phase_inc^4 from exact angle
448  const double delta4 = 4.0 * delta_angle;
449  lv_32fc_t incr = lv_cmake((float)cos(delta4), (float)sin(delta4));
450 
451  __m256 aVal, phase_Val, z;
452  lv_32fc_t phase_Ptr[4];
453 
454  const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
455  lv_creal(incr),
456  lv_cimag(incr),
457  lv_creal(incr),
458  lv_cimag(incr),
459  lv_creal(incr),
460  lv_cimag(incr),
461  lv_creal(incr));
462 
463 #define REDUCE_ANGLE(a) \
464  do { \
465  (a) = fmod((a), 2.0 * M_PI); \
466  if ((a) > M_PI) \
467  (a) -= 2.0 * M_PI; \
468  else if ((a) < -M_PI) \
469  (a) += 2.0 * M_PI; \
470  } while (0)
471 
472  // Initialize phase vector from exact angles
473  for (unsigned int k = 0; k < 4; ++k) {
474  double a = angle_sum + (double)k * delta_angle;
475  REDUCE_ANGLE(a);
476  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
477  }
478  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
479 
480  unsigned int i, j;
481 
482  // Main loop with periodic resync
483  for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
484  for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
485  aVal = _mm256_loadu_ps((float*)aPtr);
486  z = _mm256_complexmul_ps(aVal, phase_Val);
487  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
488  _mm256_storeu_ps((float*)cPtr, z);
489  aPtr += 4;
490  cPtr += 4;
491  }
492 
493  // Advance angle using Kahan summation
494  {
495  double y = block_delta - angle_c;
496  double t = angle_sum + y;
497  angle_c = (t - angle_sum) - y;
498  angle_sum = t;
499  }
500 
501  // Resync: recompute phase from accumulated angle
502  for (unsigned int k = 0; k < 4; ++k) {
503  double a = angle_sum + (double)k * delta_angle;
504  REDUCE_ANGLE(a);
505  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
506  }
507  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
508  }
509 
510  // Handle remainder
511  for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
512  aVal = _mm256_loadu_ps((float*)aPtr);
513  z = _mm256_complexmul_ps(aVal, phase_Val);
514  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
515  _mm256_storeu_ps((float*)cPtr, z);
516  aPtr += 4;
517  cPtr += 4;
518 
519  {
520  double y = (4.0 * delta_angle) - angle_c;
521  double t = angle_sum + y;
522  angle_c = (t - angle_sum) - y;
523  angle_sum = t;
524  }
525  }
526 
527  // Final phase output
528  {
529  double a = angle_sum;
530  REDUCE_ANGLE(a);
531  *phase = lv_cmake((float)cos(a), (float)sin(a));
532  }
533 
534  // Scalar remainder
535  for (i = 0; i < num_points % 4; ++i) {
536  *cPtr++ = *aPtr++ * (*phase);
537  {
538  double y = delta_angle - angle_c;
539  double t = angle_sum + y;
540  angle_c = (t - angle_sum) - y;
541  angle_sum = t;
542  }
543  double a = angle_sum;
544  REDUCE_ANGLE(a);
545  *phase = lv_cmake((float)cos(a), (float)sin(a));
546  }
547 
548 #undef REDUCE_ANGLE
549 }
550 
551 #endif /* LV_HAVE_AVX */
552 
553 
554 #ifdef LV_HAVE_AVX512F
555 #include <immintrin.h>
557 
564 static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx512f(lv_32fc_t* outVector,
565  const lv_32fc_t* inVector,
566  const lv_32fc_t* phase_inc,
567  lv_32fc_t* phase,
568  unsigned int num_points)
569 {
570  lv_32fc_t* cPtr = outVector;
571  const lv_32fc_t* aPtr = inVector;
572 
573  // Extract angles using double precision
574  const double initial_angle =
575  atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
576  const double delta_angle =
577  atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));
578 
579  // Kahan summation state for angle accumulation
580  double angle_sum = initial_angle;
581  double angle_c = 0.0;
582 
583  // Precompute block increment
584  const double block_delta = (double)ROTATOR_RELOAD * delta_angle;
585 
586  // Compute incr = phase_inc^8 from exact angle
587  const double delta8 = 8.0 * delta_angle;
588  lv_32fc_t incr = lv_cmake((float)cos(delta8), (float)sin(delta8));
589 
590  __m512 aVal, phase_Val, z;
592  lv_32fc_t phase_Ptr[8];
593 
594  const __m512 inc_Val = _mm512_set_ps(lv_cimag(incr),
595  lv_creal(incr),
596  lv_cimag(incr),
597  lv_creal(incr),
598  lv_cimag(incr),
599  lv_creal(incr),
600  lv_cimag(incr),
601  lv_creal(incr),
602  lv_cimag(incr),
603  lv_creal(incr),
604  lv_cimag(incr),
605  lv_creal(incr),
606  lv_cimag(incr),
607  lv_creal(incr),
608  lv_cimag(incr),
609  lv_creal(incr));
610 
611 #define REDUCE_ANGLE(a) \
612  do { \
613  (a) = fmod((a), 2.0 * M_PI); \
614  if ((a) > M_PI) \
615  (a) -= 2.0 * M_PI; \
616  else if ((a) < -M_PI) \
617  (a) += 2.0 * M_PI; \
618  } while (0)
619 
620  // Initialize phase vector from exact angles
621  for (unsigned int k = 0; k < 8; ++k) {
622  double a = angle_sum + (double)k * delta_angle;
623  REDUCE_ANGLE(a);
624  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
625  }
626  phase_Val = _mm512_load_ps((float*)phase_Ptr);
627 
628  unsigned int i, j;
629 
630  // Main loop with periodic resync
631  for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
632  for (j = 0; j < ROTATOR_RELOAD_8; ++j) {
633  aVal = _mm512_load_ps((float*)aPtr);
634  z = _mm512_complexmul_ps(aVal, phase_Val);
635  phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
636  _mm512_store_ps((float*)cPtr, z);
637  aPtr += 8;
638  cPtr += 8;
639  }
640 
641  // Advance angle using Kahan summation
642  {
643  double y = block_delta - angle_c;
644  double t = angle_sum + y;
645  angle_c = (t - angle_sum) - y;
646  angle_sum = t;
647  }
648 
649  // Resync: recompute phase from accumulated angle
650  for (unsigned int k = 0; k < 8; ++k) {
651  double a = angle_sum + (double)k * delta_angle;
652  REDUCE_ANGLE(a);
653  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
654  }
655  phase_Val = _mm512_load_ps((float*)phase_Ptr);
656  }
657 
658  // Handle remainder
659  for (i = 0; i < (num_points % ROTATOR_RELOAD) / 8; ++i) {
660  aVal = _mm512_load_ps((float*)aPtr);
661  z = _mm512_complexmul_ps(aVal, phase_Val);
662  phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
663  _mm512_store_ps((float*)cPtr, z);
664  aPtr += 8;
665  cPtr += 8;
666 
667  {
668  double y = (8.0 * delta_angle) - angle_c;
669  double t = angle_sum + y;
670  angle_c = (t - angle_sum) - y;
671  angle_sum = t;
672  }
673  }
674 
675  // Final phase output
676  {
677  double a = angle_sum;
678  REDUCE_ANGLE(a);
679  *phase = lv_cmake((float)cos(a), (float)sin(a));
680  }
681 
682  // Scalar remainder
683  for (i = 0; i < num_points % 8; ++i) {
684  *cPtr++ = *aPtr++ * (*phase);
685  {
686  double y = delta_angle - angle_c;
687  double t = angle_sum + y;
688  angle_c = (t - angle_sum) - y;
689  angle_sum = t;
690  }
691  double a = angle_sum;
692  REDUCE_ANGLE(a);
693  *phase = lv_cmake((float)cos(a), (float)sin(a));
694  }
695 
696 #undef REDUCE_ANGLE
697 }
698 
699 #endif /* LV_HAVE_AVX512F */
700 
701 
702 #ifdef LV_HAVE_AVX512F
703 #include <immintrin.h>
705 
710 static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx512f(lv_32fc_t* outVector,
711  const lv_32fc_t* inVector,
712  const lv_32fc_t* phase_inc,
713  lv_32fc_t* phase,
714  unsigned int num_points)
715 {
716  lv_32fc_t* cPtr = outVector;
717  const lv_32fc_t* aPtr = inVector;
718 
719  const double initial_angle =
720  atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
721  const double delta_angle =
722  atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));
723 
724  double angle_sum = initial_angle;
725  double angle_c = 0.0;
726 
727  const double block_delta = (double)ROTATOR_RELOAD * delta_angle;
728 
729  const double delta8 = 8.0 * delta_angle;
730  lv_32fc_t incr = lv_cmake((float)cos(delta8), (float)sin(delta8));
731 
732  __m512 aVal, phase_Val, z;
733  lv_32fc_t phase_Ptr[8];
734 
735  const __m512 inc_Val = _mm512_set_ps(lv_cimag(incr),
736  lv_creal(incr),
737  lv_cimag(incr),
738  lv_creal(incr),
739  lv_cimag(incr),
740  lv_creal(incr),
741  lv_cimag(incr),
742  lv_creal(incr),
743  lv_cimag(incr),
744  lv_creal(incr),
745  lv_cimag(incr),
746  lv_creal(incr),
747  lv_cimag(incr),
748  lv_creal(incr),
749  lv_cimag(incr),
750  lv_creal(incr));
751 
752 #define REDUCE_ANGLE(a) \
753  do { \
754  (a) = fmod((a), 2.0 * M_PI); \
755  if ((a) > M_PI) \
756  (a) -= 2.0 * M_PI; \
757  else if ((a) < -M_PI) \
758  (a) += 2.0 * M_PI; \
759  } while (0)
760 
761  for (unsigned int k = 0; k < 8; ++k) {
762  double a = angle_sum + (double)k * delta_angle;
763  REDUCE_ANGLE(a);
764  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
765  }
766  phase_Val = _mm512_loadu_ps((float*)phase_Ptr);
767 
768  unsigned int i, j;
769 
770  for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
771  for (j = 0; j < ROTATOR_RELOAD_8; ++j) {
772  aVal = _mm512_loadu_ps((float*)aPtr);
773  z = _mm512_complexmul_ps(aVal, phase_Val);
774  phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
775  _mm512_storeu_ps((float*)cPtr, z);
776  aPtr += 8;
777  cPtr += 8;
778  }
779 
780  {
781  double y = block_delta - angle_c;
782  double t = angle_sum + y;
783  angle_c = (t - angle_sum) - y;
784  angle_sum = t;
785  }
786 
787  for (unsigned int k = 0; k < 8; ++k) {
788  double a = angle_sum + (double)k * delta_angle;
789  REDUCE_ANGLE(a);
790  phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
791  }
792  phase_Val = _mm512_loadu_ps((float*)phase_Ptr);
793  }
794 
795  for (i = 0; i < (num_points % ROTATOR_RELOAD) / 8; ++i) {
796  aVal = _mm512_loadu_ps((float*)aPtr);
797  z = _mm512_complexmul_ps(aVal, phase_Val);
798  phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
799  _mm512_storeu_ps((float*)cPtr, z);
800  aPtr += 8;
801  cPtr += 8;
802 
803  {
804  double y = (8.0 * delta_angle) - angle_c;
805  double t = angle_sum + y;
806  angle_c = (t - angle_sum) - y;
807  angle_sum = t;
808  }
809  }
810 
811  {
812  double a = angle_sum;
813  REDUCE_ANGLE(a);
814  *phase = lv_cmake((float)cos(a), (float)sin(a));
815  }
816 
817  for (i = 0; i < num_points % 8; ++i) {
818  *cPtr++ = *aPtr++ * (*phase);
819  {
820  double y = delta_angle - angle_c;
821  double t = angle_sum + y;
822  angle_c = (t - angle_sum) - y;
823  angle_sum = t;
824  }
825  double a = angle_sum;
826  REDUCE_ANGLE(a);
827  *phase = lv_cmake((float)cos(a), (float)sin(a));
828  }
829 
830 #undef REDUCE_ANGLE
831 }
832 
833 #endif /* LV_HAVE_AVX512F */
834 
835 
836 /* Note on the RVV implementation:
837  * The complex multiply was expanded, because we don't care about the corner cases.
838  * Otherwise, without -ffast-math, the compiler would inserts function calls,
839  * which invalidates all vector registers and spills them on each loop iteration. */
840 
841 #ifdef LV_HAVE_RVV
842 #include <riscv_vector.h>
843 
844 static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(lv_32fc_t* outVector,
845  const lv_32fc_t* inVector,
846  const lv_32fc_t* phase_inc,
847  lv_32fc_t* phase,
848  unsigned int num_points)
849 {
850  size_t vlmax = __riscv_vsetvlmax_e32m2();
851  vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;
852 
853  lv_32fc_t inc = 1.0f;
854  vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
855  for (size_t i = 0; i < vlmax; ++i) {
856  lv_32fc_t ph =
857  lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
858  lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
859  phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
860  phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
861  inc = lv_cmake(
862  lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
863  lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
864  }
865  vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
866  vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);
867 
868  size_t vl = 0;
869  if (num_points > 0)
870  while (1) {
871  size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
872  num_points -= n;
873 
874  for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
875  // vl<vlmax can only happen on the last iteration of the loops
876  vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);
877 
878  vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)inVector, vl);
879  vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
880  vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
881 
882  vfloat32m2_t vr =
883  __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
884  vfloat32m2_t vi =
885  __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);
886 
887  vuint32m2_t vru = __riscv_vreinterpret_u32m2(vr);
888  vuint32m2_t viu = __riscv_vreinterpret_u32m2(vi);
889  vuint64m4_t res =
890  __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
891  __riscv_vse64((uint64_t*)outVector, res, vl);
892 
893  vfloat32m2_t tmp = phr;
894  phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
895  phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
896  }
897 
898  if (num_points <= 0)
899  break;
900 
901  // normalize
902  vfloat32m2_t scale =
903  __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
904  scale = __riscv_vfsqrt(scale, vl);
905  phr = __riscv_vfdiv(phr, scale, vl);
906  phi = __riscv_vfdiv(phi, scale, vl);
907  }
908 
909  lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi));
910  for (size_t i = 0; i < vlmax - vl; ++i) {
911  ph /= *phase_inc; // we're going backwards
912  }
913  *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph));
914 }
915 #endif /*LV_HAVE_RVV*/
916 
917 #ifdef LV_HAVE_RVVSEG
918 #include <riscv_vector.h>
919 
920 static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector,
921  const lv_32fc_t* inVector,
922  const lv_32fc_t* phase_inc,
923  lv_32fc_t* phase,
924  unsigned int num_points)
925 {
926  size_t vlmax = __riscv_vsetvlmax_e32m2();
927  vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;
928 
929  lv_32fc_t inc = 1.0f;
930  vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
931  for (size_t i = 0; i < vlmax; ++i) {
932  lv_32fc_t ph =
933  lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
934  lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
935  phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
936  phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
937  inc = lv_cmake(
938  lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
939  lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
940  }
941  vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
942  vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);
943 
944  size_t vl = 0;
945  if (num_points > 0)
946  while (1) {
947  size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
948  num_points -= n;
949 
950  for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
951  // vl<vlmax can only happen on the last iteration of the loops
952  vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);
953 
954  vfloat32m2x2_t va =
955  __riscv_vlseg2e32_v_f32m2x2((const float*)inVector, vl);
956  vfloat32m2_t var = __riscv_vget_f32m2(va, 0);
957  vfloat32m2_t vai = __riscv_vget_f32m2(va, 1);
958 
959  vfloat32m2_t vr =
960  __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
961  vfloat32m2_t vi =
962  __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);
963  vfloat32m2x2_t vc = __riscv_vcreate_v_f32m2x2(vr, vi);
964  __riscv_vsseg2e32_v_f32m2x2((float*)outVector, vc, vl);
965 
966  vfloat32m2_t tmp = phr;
967  phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
968  phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
969  }
970 
971  if (num_points <= 0)
972  break;
973 
974  // normalize
975  vfloat32m2_t scale =
976  __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
977  scale = __riscv_vfsqrt(scale, vl);
978  phr = __riscv_vfdiv(phr, scale, vl);
979  phi = __riscv_vfdiv(phi, scale, vl);
980  }
981 
982  lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi));
983  for (size_t i = 0; i < vlmax - vl; ++i) {
984  ph /= *phase_inc; // we're going backwards
985  }
986  *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph));
987 }
988 #endif /*LV_HAVE_RVVSEG*/
989 
990 #endif /* INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H */
ROTATOR_RELOAD_4
#define ROTATOR_RELOAD_4
Definition: volk_32fc_s32fc_x2_rotator2_32fc.h:78
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
volk_32fc_s32fc_x2_rotator2_32fc_generic
static void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t *phase_inc, lv_32fc_t *phase, unsigned int num_points)
Definition: volk_32fc_s32fc_x2_rotator2_32fc.h:84
_mm256_complexmul_ps
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:120
volk_avx512_intrinsics.h
REDUCE_ANGLE
#define REDUCE_ANGLE(a)
_mm512_complexmul_ps
static __m512 _mm512_complexmul_ps(const __m512 x, const __m512 y)
Definition: volk_avx512_intrinsics.h:124
ROTATOR_RELOAD_8
#define ROTATOR_RELOAD_8
Definition: volk_32fc_s32fc_x2_rotator2_32fc.h:79
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
ROTATOR_RELOAD
#define ROTATOR_RELOAD
Definition: volk_32fc_s32fc_x2_rotator2_32fc.h:76
i
for i
Definition: volk_config_fixed.tmpl.h:13
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
volk_32fc_s32fc_x2_rotator2_32fc_neon
static void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t *phase_inc, lv_32fc_t *phase, unsigned int num_points)
NEON implementation with angle-based resync for numerical stability.
Definition: volk_32fc_s32fc_x2_rotator2_32fc.h:127
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
volk_32fc_s32fc_x2_rotator2_32fc_a_avx
static void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t *phase_inc, lv_32fc_t *phase, unsigned int num_points)
AVX implementation with angle-based resync for numerical stability.
Definition: volk_32fc_s32fc_x2_rotator2_32fc.h:278
volk_neon_intrinsics.h
volk_32fc_s32fc_x2_rotator2_32fc_u_avx
static void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t *phase_inc, lv_32fc_t *phase, unsigned int num_points)
Unaligned AVX implementation with angle-based resync for numerical stability.
Definition: volk_32fc_s32fc_x2_rotator2_32fc.h:425
volk_avx_intrinsics.h
_vmultiply_complexq_f32
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:172
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96