Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
98 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
99 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
100 
101 #include <float.h>
102 #include <inttypes.h>
103 #include <volk/volk_complex.h>
104 
105 
106 #ifdef LV_HAVE_GENERIC
107 
108 static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
109  const lv_32fc_t* aVector,
110  const lv_32fc_t* bVector,
111  unsigned int num_points)
112 {
113  lv_32fc_t* cPtr = cVector;
114  const lv_32fc_t* aPtr = aVector;
115  const lv_32fc_t* bPtr = bVector;
116 
117  for (unsigned int number = 0; number < num_points; number++) {
118  *cPtr++ = (*aPtr++) / (*bPtr++);
119  }
120 }
121 #endif /* LV_HAVE_GENERIC */
122 
123 
124 #ifdef LV_HAVE_SSE3
125 #include <pmmintrin.h>
127 
128 static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
129  const lv_32fc_t* numeratorVector,
130  const lv_32fc_t* denumeratorVector,
131  unsigned int num_points)
132 {
133  /*
134  * we'll do the "classical"
135  * a a b*
136  * --- = -------
137  * b |b|^2
138  * */
139  unsigned int number = 0;
140  const unsigned int quarterPoints = num_points / 4;
141 
142  __m128 num01, num23, den01, den23, norm, result;
143  lv_32fc_t* c = cVector;
144  const lv_32fc_t* a = numeratorVector;
145  const lv_32fc_t* b = denumeratorVector;
146 
147  for (; number < quarterPoints; number++) {
148  num01 = _mm_loadu_ps((float*)a); // first pair
149  den01 = _mm_loadu_ps((float*)b); // first pair
150  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
151  a += 2;
152  b += 2;
153 
154  num23 = _mm_loadu_ps((float*)a); // second pair
155  den23 = _mm_loadu_ps((float*)b); // second pair
156  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
157  a += 2;
158  b += 2;
159 
160  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
161  den01 = _mm_unpacklo_ps(norm, norm);
162  den23 = _mm_unpackhi_ps(norm, norm);
163 
164  result = _mm_div_ps(num01, den01);
165  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
166  c += 2;
167  result = _mm_div_ps(num23, den23);
168  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
169  c += 2;
170  }
171 
172  number *= 4;
173  for (; number < num_points; number++) {
174  *c = (*a) / (*b);
175  a++;
176  b++;
177  c++;
178  }
179 }
180 #endif /* LV_HAVE_SSE3 */
181 
182 
183 #ifdef LV_HAVE_AVX
184 #include <immintrin.h>
186 
187 static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
188  const lv_32fc_t* numeratorVector,
189  const lv_32fc_t* denumeratorVector,
190  unsigned int num_points)
191 {
192  /*
193  * we'll do the "classical"
194  * a a b*
195  * --- = -------
196  * b |b|^2
197  * */
198  unsigned int number = 0;
199  const unsigned int quarterPoints = num_points / 4;
200 
201  __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
202  lv_32fc_t* c = cVector;
203  const lv_32fc_t* a = numeratorVector;
204  const lv_32fc_t* b = denumeratorVector;
205 
206  for (; number < quarterPoints; number++) {
207  num = _mm256_loadu_ps(
208  (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
209  denum = _mm256_loadu_ps(
210  (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
211  mul_conj = _mm256_complexconjugatemul_ps(num, denum);
212  sq = _mm256_mul_ps(denum, denum); // Square the values
213  mag_sq_un = _mm256_hadd_ps(
214  sq, sq); // obtain the actual squared magnitude, although out of order
215  mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
216  // best guide I found on using these functions:
217  // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
218  div = _mm256_div_ps(mul_conj, mag_sq);
219 
220  _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
221 
222  a += 4;
223  b += 4;
224  c += 4;
225  }
226 
227  number = quarterPoints * 4;
228 
229  for (; number < num_points; number++) {
230  *c++ = (*a++) / (*b++);
231  }
232 }
233 #endif /* LV_HAVE_AVX */
234 
235 #if LV_HAVE_AVX2 && LV_HAVE_FMA
236 #include <immintrin.h>
238 #include <volk/volk_complex.h>
239 
240 static inline void volk_32fc_x2_divide_32fc_u_avx2_fma(lv_32fc_t* cVector,
241  const lv_32fc_t* numeratorVector,
242  const lv_32fc_t* denumeratorVector,
243  unsigned int num_points)
244 {
245  lv_32fc_t* c = cVector;
246  const lv_32fc_t* a = numeratorVector;
247  const lv_32fc_t* b = denumeratorVector;
248 
249  const unsigned int eighthPoints = num_points / 8;
250 
251  __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
252 
253  for (unsigned int number = 0; number < eighthPoints; number++) {
254  num01 = _mm256_loadu_ps((float*)a);
255  denum01 = _mm256_loadu_ps((float*)b);
256  num01 = _mm256_complexconjugatemul_ps(num01, denum01);
257  a += 4;
258  b += 4;
259 
260  num23 = _mm256_loadu_ps((float*)a);
261  denum23 = _mm256_loadu_ps((float*)b);
262  num23 = _mm256_complexconjugatemul_ps(num23, denum23);
263  a += 4;
264  b += 4;
265 
266  complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
267  _mm256_mul_ps(denum23, denum23));
268 
269  denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
270  denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
271 
272  result0 = _mm256_div_ps(num01, denum01);
273  result1 = _mm256_div_ps(num23, denum23);
274 
275  _mm256_storeu_ps((float*)c, result0);
276  c += 4;
277  _mm256_storeu_ps((float*)c, result1);
278  c += 4;
279  }
280 
281  volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eighthPoints * 8);
282 }
283 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
284 
285 #ifdef LV_HAVE_AVX512F
286 #include <immintrin.h>
288 #include <volk/volk_complex.h>
289 
290 static inline void volk_32fc_x2_divide_32fc_u_avx512(lv_32fc_t* cVector,
291  const lv_32fc_t* numeratorVector,
292  const lv_32fc_t* denumeratorVector,
293  unsigned int num_points)
294 {
295  lv_32fc_t* c = cVector;
296  const lv_32fc_t* a = numeratorVector;
297  const lv_32fc_t* b = denumeratorVector;
298 
299  const unsigned int sixteenthPoints = num_points / 16;
300 
301  __m512 num01, num23, denum01, denum23;
302  __m512 mag_sq01_shuf, mag_sq23_shuf, mag_sq01, mag_sq23;
303  __m512 result0, result1;
304 
305  for (unsigned int number = 0; number < sixteenthPoints; number++) {
306  num01 = _mm512_loadu_ps((float*)a);
307  denum01 = _mm512_loadu_ps((float*)b);
308  num01 = _mm512_complexconjugatemul_ps(num01, denum01);
309  a += 8;
310  b += 8;
311 
312  num23 = _mm512_loadu_ps((float*)a);
313  denum23 = _mm512_loadu_ps((float*)b);
314  num23 = _mm512_complexconjugatemul_ps(num23, denum23);
315  a += 8;
316  b += 8;
317 
318  // Compute magnitude squared for both sets
319  mag_sq01_shuf = _mm512_shuffle_ps(denum01, denum01, 0xb1);
320  mag_sq01 = _mm512_add_ps(_mm512_mul_ps(denum01, denum01),
321  _mm512_mul_ps(mag_sq01_shuf, mag_sq01_shuf));
322 
323  mag_sq23_shuf = _mm512_shuffle_ps(denum23, denum23, 0xb1);
324  mag_sq23 = _mm512_add_ps(_mm512_mul_ps(denum23, denum23),
325  _mm512_mul_ps(mag_sq23_shuf, mag_sq23_shuf));
326 
327  result0 = _mm512_div_ps(num01, mag_sq01);
328  result1 = _mm512_div_ps(num23, mag_sq23);
329 
330  _mm512_storeu_ps((float*)c, result0);
331  c += 8;
332  _mm512_storeu_ps((float*)c, result1);
333  c += 8;
334  }
335 
336  volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - sixteenthPoints * 16);
337 }
338 #endif /* LV_HAVE_AVX512F */
339 
340 
341 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
342 
343 
344 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
345 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
346 
347 #include <float.h>
348 #include <inttypes.h>
349 #include <stdio.h>
350 #include <volk/volk_complex.h>
351 
352 #ifdef LV_HAVE_SSE3
353 #include <pmmintrin.h>
355 
356 static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
357  const lv_32fc_t* numeratorVector,
358  const lv_32fc_t* denumeratorVector,
359  unsigned int num_points)
360 {
361  /*
362  * we'll do the "classical"
363  * a a b*
364  * --- = -------
365  * b |b|^2
366  * */
367  unsigned int number = 0;
368  const unsigned int quarterPoints = num_points / 4;
369 
370  __m128 num01, num23, den01, den23, norm, result;
371  lv_32fc_t* c = cVector;
372  const lv_32fc_t* a = numeratorVector;
373  const lv_32fc_t* b = denumeratorVector;
374 
375  for (; number < quarterPoints; number++) {
376  num01 = _mm_load_ps((float*)a); // first pair
377  den01 = _mm_load_ps((float*)b); // first pair
378  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
379  a += 2;
380  b += 2;
381 
382  num23 = _mm_load_ps((float*)a); // second pair
383  den23 = _mm_load_ps((float*)b); // second pair
384  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
385  a += 2;
386  b += 2;
387 
388  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
389 
390  den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
391  den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
392 
393  result = _mm_div_ps(num01, den01);
394  _mm_store_ps((float*)c, result); // Store the results back into the C container
395  c += 2;
396  result = _mm_div_ps(num23, den23);
397  _mm_store_ps((float*)c, result); // Store the results back into the C container
398  c += 2;
399  }
400 
401  number *= 4;
402  for (; number < num_points; number++) {
403  *c = (*a) / (*b);
404  a++;
405  b++;
406  c++;
407  }
408 }
409 #endif /* LV_HAVE_SSE3 */
410 
411 #ifdef LV_HAVE_AVX
412 #include <immintrin.h>
414 
415 static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
416  const lv_32fc_t* numeratorVector,
417  const lv_32fc_t* denumeratorVector,
418  unsigned int num_points)
419 {
420  /*
421  * Guide to AVX intrisics:
422  * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
423  *
424  * we'll do the "classical"
425  * a a b*
426  * --- = -------
427  * b |b|^2
428  *
429  */
430  lv_32fc_t* c = cVector;
431  const lv_32fc_t* a = numeratorVector;
432  const lv_32fc_t* b = denumeratorVector;
433 
434  const unsigned int eigthPoints = num_points / 8;
435 
436  __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
437 
438  for (unsigned int number = 0; number < eigthPoints; number++) {
439  // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
440  num01 = _mm256_load_ps((float*)a);
441  denum01 = _mm256_load_ps((float*)b);
442 
443  num01 = _mm256_complexconjugatemul_ps(num01, denum01);
444  a += 4;
445  b += 4;
446 
447  num23 = _mm256_load_ps((float*)a);
448  denum23 = _mm256_load_ps((float*)b);
449  num23 = _mm256_complexconjugatemul_ps(num23, denum23);
450  a += 4;
451  b += 4;
452 
453  complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
454  _mm256_mul_ps(denum23, denum23));
455 
456  denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
457  denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
458 
459  result0 = _mm256_div_ps(num01, denum01);
460  result1 = _mm256_div_ps(num23, denum23);
461 
462  _mm256_store_ps((float*)c, result0);
463  c += 4;
464  _mm256_store_ps((float*)c, result1);
465  c += 4;
466  }
467 
468  volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
469 }
470 #endif /* LV_HAVE_AVX */
471 
472 #if LV_HAVE_AVX2 && LV_HAVE_FMA
473 #include <immintrin.h>
475 #include <volk/volk_complex.h>
476 
477 static inline void volk_32fc_x2_divide_32fc_a_avx2_fma(lv_32fc_t* cVector,
478  const lv_32fc_t* numeratorVector,
479  const lv_32fc_t* denumeratorVector,
480  unsigned int num_points)
481 {
482  lv_32fc_t* c = cVector;
483  const lv_32fc_t* a = numeratorVector;
484  const lv_32fc_t* b = denumeratorVector;
485 
486  const unsigned int eighthPoints = num_points / 8;
487 
488  __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
489 
490  for (unsigned int number = 0; number < eighthPoints; number++) {
491  num01 = _mm256_load_ps((float*)a);
492  denum01 = _mm256_load_ps((float*)b);
493  num01 = _mm256_complexconjugatemul_ps(num01, denum01);
494  a += 4;
495  b += 4;
496 
497  num23 = _mm256_load_ps((float*)a);
498  denum23 = _mm256_load_ps((float*)b);
499  num23 = _mm256_complexconjugatemul_ps(num23, denum23);
500  a += 4;
501  b += 4;
502 
503  complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
504  _mm256_mul_ps(denum23, denum23));
505 
506  denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
507  denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
508 
509  result0 = _mm256_div_ps(num01, denum01);
510  result1 = _mm256_div_ps(num23, denum23);
511 
512  _mm256_store_ps((float*)c, result0);
513  c += 4;
514  _mm256_store_ps((float*)c, result1);
515  c += 4;
516  }
517 
518  volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eighthPoints * 8);
519 }
520 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
521 
522 #ifdef LV_HAVE_AVX512F
523 #include <immintrin.h>
525 #include <volk/volk_complex.h>
526 
527 static inline void volk_32fc_x2_divide_32fc_a_avx512(lv_32fc_t* cVector,
528  const lv_32fc_t* numeratorVector,
529  const lv_32fc_t* denumeratorVector,
530  unsigned int num_points)
531 {
532  lv_32fc_t* c = cVector;
533  const lv_32fc_t* a = numeratorVector;
534  const lv_32fc_t* b = denumeratorVector;
535 
536  const unsigned int sixteenthPoints = num_points / 16;
537 
538  __m512 num01, num23, denum01, denum23;
539  __m512 mag_sq01_shuf, mag_sq23_shuf, mag_sq01, mag_sq23;
540  __m512 result0, result1;
541 
542  for (unsigned int number = 0; number < sixteenthPoints; number++) {
543  num01 = _mm512_load_ps((float*)a);
544  denum01 = _mm512_load_ps((float*)b);
545  num01 = _mm512_complexconjugatemul_ps(num01, denum01);
546  a += 8;
547  b += 8;
548 
549  num23 = _mm512_load_ps((float*)a);
550  denum23 = _mm512_load_ps((float*)b);
551  num23 = _mm512_complexconjugatemul_ps(num23, denum23);
552  a += 8;
553  b += 8;
554 
555  // Compute magnitude squared for both sets
556  mag_sq01_shuf = _mm512_shuffle_ps(denum01, denum01, 0xb1);
557  mag_sq01 = _mm512_add_ps(_mm512_mul_ps(denum01, denum01),
558  _mm512_mul_ps(mag_sq01_shuf, mag_sq01_shuf));
559 
560  mag_sq23_shuf = _mm512_shuffle_ps(denum23, denum23, 0xb1);
561  mag_sq23 = _mm512_add_ps(_mm512_mul_ps(denum23, denum23),
562  _mm512_mul_ps(mag_sq23_shuf, mag_sq23_shuf));
563 
564  result0 = _mm512_div_ps(num01, mag_sq01);
565  result1 = _mm512_div_ps(num23, mag_sq23);
566 
567  _mm512_store_ps((float*)c, result0);
568  c += 8;
569  _mm512_store_ps((float*)c, result1);
570  c += 8;
571  }
572 
573  volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - sixteenthPoints * 16);
574 }
575 #endif /* LV_HAVE_AVX512F */
576 
577 
578 #ifdef LV_HAVE_NEON
579 #include <arm_neon.h>
580 
/*
 * NEON kernel: cVector[i] = aVector[i] / bVector[i].
 *
 * Four complex points per iteration. vld2q_f32 de-interleaves the input so
 * val[0] holds the real parts and val[1] the imaginary parts. The quotient
 * is formed as (a * conj(b)) * (1 / |b|^2), where the reciprocal comes from
 * the vrecpeq_f32 estimate refined by two vrecpsq_f32 steps instead of a
 * true division. The remaining num_points % 4 points use scalar complex
 * division.
 */
static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    const unsigned int simdIters = num_points / 4;

    const lv_32fc_t* numPtr = aVector;
    const lv_32fc_t* denPtr = bVector;
    lv_32fc_t* outPtr = cVector;

    for (unsigned int iter = 0; iter < simdIters; iter++) {
        const float32x4x2_t num = vld2q_f32((const float*)numPtr);
        const float32x4x2_t den = vld2q_f32((const float*)denPtr);
        numPtr += 4;
        denPtr += 4;
        __VOLK_PREFETCH(numPtr + 4);
        __VOLK_PREFETCH(denPtr + 4);

        /* |b|^2 = br*br + bi*bi */
        float32x4_t magSq = vmulq_f32(den.val[0], den.val[0]);
        magSq = vmlaq_f32(magSq, den.val[1], den.val[1]);

        /* 1 / |b|^2: initial estimate followed by two refinement steps. */
        float32x4_t invMagSq = vrecpeq_f32(magSq);
        invMagSq = vmulq_f32(invMagSq, vrecpsq_f32(invMagSq, magSq));
        invMagSq = vmulq_f32(invMagSq, vrecpsq_f32(invMagSq, magSq));

        float32x4x2_t quot;

        /* real = (ar*br + ai*bi) / |b|^2 */
        float32x4_t realAcc = vmulq_f32(num.val[0], den.val[0]);
        realAcc = vmlaq_f32(realAcc, num.val[1], den.val[1]);
        quot.val[0] = vmulq_f32(realAcc, invMagSq);

        /* imag = (ai*br - ar*bi) / |b|^2 */
        float32x4_t imagAcc = vmulq_f32(num.val[1], den.val[0]);
        imagAcc = vmlsq_f32(imagAcc, num.val[0], den.val[1]);
        quot.val[1] = vmulq_f32(imagAcc, invMagSq);

        /* vst2q_f32 re-interleaves real and imaginary parts on store. */
        vst2q_f32((float*)outPtr, quot);
        outPtr += 4;
    }

    /* Scalar tail for the points that do not fill a whole SIMD iteration. */
    for (unsigned int i = simdIters * 4; i < num_points; i++) {
        *outPtr++ = (*numPtr++) / (*denPtr++);
    }
}
626 #endif /* LV_HAVE_NEON */
627 
628 #ifdef LV_HAVE_NEONV8
629 #include <arm_neon.h>
630 
/*
 * ARMv8 NEON kernel: cVector[i] = aVector[i] / bVector[i].
 *
 * Four complex points per iteration. vld2q_f32 de-interleaves the input so
 * val[0] holds the real parts and val[1] the imaginary parts. The quotient
 * is formed as (a * conj(b)) * (1 / |b|^2), using fused multiply-add/sub
 * for the products and vdivq_f32 for the reciprocal. The remaining
 * num_points % 4 points use scalar complex division.
 */
static inline void volk_32fc_x2_divide_32fc_neonv8(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    const unsigned int simdIters = num_points / 4;

    const lv_32fc_t* numPtr = aVector;
    const lv_32fc_t* denPtr = bVector;
    lv_32fc_t* outPtr = cVector;

    for (unsigned int iter = 0; iter < simdIters; iter++) {
        const float32x4x2_t num = vld2q_f32((const float*)numPtr);
        const float32x4x2_t den = vld2q_f32((const float*)denPtr);
        numPtr += 4;
        denPtr += 4;
        __VOLK_PREFETCH(numPtr + 4);
        __VOLK_PREFETCH(denPtr + 4);

        /* |b|^2 = br*br + bi*bi (second term fused into the first). */
        const float32x4_t brSq = vmulq_f32(den.val[0], den.val[0]);
        const float32x4_t magSq = vfmaq_f32(brSq, den.val[1], den.val[1]);

        /* 1 / |b|^2 through the native vector divide. */
        const float32x4_t invMagSq = vdivq_f32(vdupq_n_f32(1.0f), magSq);

        float32x4x2_t quot;

        /* real = (ar*br + ai*bi) / |b|^2 */
        const float32x4_t arbr = vmulq_f32(num.val[0], den.val[0]);
        quot.val[0] = vmulq_f32(vfmaq_f32(arbr, num.val[1], den.val[1]), invMagSq);

        /* imag = (ai*br - ar*bi) / |b|^2 */
        const float32x4_t aibr = vmulq_f32(num.val[1], den.val[0]);
        quot.val[1] = vmulq_f32(vfmsq_f32(aibr, num.val[0], den.val[1]), invMagSq);

        /* vst2q_f32 re-interleaves real and imaginary parts on store. */
        vst2q_f32((float*)outPtr, quot);
        outPtr += 4;
    }

    /* Scalar tail for the points that do not fill a whole SIMD iteration. */
    for (unsigned int i = simdIters * 4; i < num_points; i++) {
        *outPtr++ = (*numPtr++) / (*denPtr++);
    }
}
678 #endif /* LV_HAVE_NEONV8 */
679 
680 #ifdef LV_HAVE_RVV
681 #include <riscv_vector.h>
682 
683 
/*
 * RISC-V Vector kernel (no segment loads):
 * cVector[i] = aVector[i] / bVector[i].
 *
 * Each complex point is loaded as one 64-bit word. Narrowing shifts by 0 and
 * by 32 split it into its low and high 32-bit halves, which are used as the
 * real and imaginary parts respectively. The quotient is formed as
 * (a * conj(b)) * (1 / |b|^2), and the two 32-bit results are packed back
 * into 64-bit words before the store. The loop is strip-mined by the vector
 * length returned from vsetvl, so no scalar tail is needed.
 */
static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    const uint64_t* numIn = (const uint64_t*)aVector;
    const uint64_t* denIn = (const uint64_t*)bVector;
    uint64_t* out = (uint64_t*)cVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m4(remaining);

        /* One 64-bit element per complex point. */
        vuint64m8_t numWords = __riscv_vle64_v_u64m8(numIn, vl);
        vuint64m8_t denWords = __riscv_vle64_v_u64m8(denIn, vl);

        /* Low half -> real part, high half -> imaginary part. */
        vfloat32m4_t numRe = __riscv_vreinterpret_f32m4(__riscv_vnsrl(numWords, 0, vl));
        vfloat32m4_t denRe = __riscv_vreinterpret_f32m4(__riscv_vnsrl(denWords, 0, vl));
        vfloat32m4_t numIm = __riscv_vreinterpret_f32m4(__riscv_vnsrl(numWords, 32, vl));
        vfloat32m4_t denIm = __riscv_vreinterpret_f32m4(__riscv_vnsrl(denWords, 32, vl));

        /* 1 / |b|^2 with |b|^2 = bi*bi + br*br */
        vfloat32m4_t magSq =
            __riscv_vfmacc(__riscv_vfmul(denIm, denIm, vl), denRe, denRe, vl);
        vfloat32m4_t invMagSq = __riscv_vfrdiv(magSq, 1.0f, vl);

        /* real = (ar*br + ai*bi) / |b|^2 */
        vfloat32m4_t realAcc =
            __riscv_vfmacc(__riscv_vfmul(numRe, denRe, vl), numIm, denIm, vl);
        vfloat32m4_t quotRe = __riscv_vfmul(realAcc, invMagSq, vl);

        /* imag = (ai*br - ar*bi) / |b|^2 */
        vfloat32m4_t imagAcc =
            __riscv_vfnmsac(__riscv_vfmul(numIm, denRe, vl), numRe, denIm, vl);
        vfloat32m4_t quotIm = __riscv_vfmul(imagAcc, invMagSq, vl);

        /*
         * Pack without a shift: (re + im) + 0xFFFFFFFF * im == re + (im << 32)
         * in 64-bit arithmetic, i.e. re in the low half and im in the high half.
         */
        vuint32m4_t reBits = __riscv_vreinterpret_u32m4(quotRe);
        vuint32m4_t imBits = __riscv_vreinterpret_u32m4(quotIm);
        vuint64m8_t packed =
            __riscv_vwmaccu(__riscv_vwaddu_vv(reBits, imBits, vl), 0xFFFFFFFF, imBits, vl);
        __riscv_vse64(out, packed, vl);

        remaining -= vl;
        numIn += vl;
        denIn += vl;
        out += vl;
    }
}
712 #endif /*LV_HAVE_RVV*/
713 
714 #ifdef LV_HAVE_RVVSEG
715 #include <riscv_vector.h>
716 
/*
 * RISC-V Vector kernel using segment loads/stores:
 * cVector[i] = aVector[i] / bVector[i].
 *
 * Two-field segment loads de-interleave each input into a real vector
 * (field 0) and an imaginary vector (field 1). The quotient is formed as
 * (a * conj(b)) * (1 / |b|^2) and re-interleaved by a two-field segment
 * store. The loop is strip-mined by the vector length returned from vsetvl,
 * so no scalar tail is needed.
 */
static inline void volk_32fc_x2_divide_32fc_rvvseg(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    const lv_32fc_t* numPtr = aVector;
    const lv_32fc_t* denPtr = bVector;
    lv_32fc_t* outPtr = cVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m4(remaining);

        vfloat32m4x2_t numSeg = __riscv_vlseg2e32_v_f32m4x2((const float*)numPtr, vl);
        vfloat32m4x2_t denSeg = __riscv_vlseg2e32_v_f32m4x2((const float*)denPtr, vl);

        vfloat32m4_t numRe = __riscv_vget_f32m4(numSeg, 0);
        vfloat32m4_t numIm = __riscv_vget_f32m4(numSeg, 1);
        vfloat32m4_t denRe = __riscv_vget_f32m4(denSeg, 0);
        vfloat32m4_t denIm = __riscv_vget_f32m4(denSeg, 1);

        /* 1 / |b|^2 with |b|^2 = bi*bi + br*br */
        vfloat32m4_t magSq =
            __riscv_vfmacc(__riscv_vfmul(denIm, denIm, vl), denRe, denRe, vl);
        vfloat32m4_t invMagSq = __riscv_vfrdiv(magSq, 1.0f, vl);

        /* real = (ar*br + ai*bi) / |b|^2 */
        vfloat32m4_t realAcc =
            __riscv_vfmacc(__riscv_vfmul(numRe, denRe, vl), numIm, denIm, vl);
        vfloat32m4_t quotRe = __riscv_vfmul(realAcc, invMagSq, vl);

        /* imag = (ai*br - ar*bi) / |b|^2 */
        vfloat32m4_t imagAcc =
            __riscv_vfnmsac(__riscv_vfmul(numIm, denRe, vl), numRe, denIm, vl);
        vfloat32m4_t quotIm = __riscv_vfmul(imagAcc, invMagSq, vl);

        __riscv_vsseg2e32_v_f32m4x2(
            (float*)outPtr, __riscv_vcreate_v_f32m4x2(quotRe, quotIm), vl);

        remaining -= vl;
        numPtr += vl;
        denPtr += vl;
        outPtr += vl;
    }
}
739 
740 #endif /*LV_HAVE_RVVSEG*/
741 
742 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
volk_32fc_x2_divide_32fc_a_avx
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:415
volk_sse3_intrinsics.h
volk_avx512_intrinsics.h
volk_32fc_x2_divide_32fc_u_sse3
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:128
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_x2_divide_32fc_generic
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:108
_mm512_complexconjugatemul_ps
static __m512 _mm512_complexconjugatemul_ps(const __m512 x, const __m512 y)
Definition: volk_avx512_intrinsics.h:146
volk_32fc_x2_divide_32fc_u_avx
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:187
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_x2_divide_32fc_a_sse3
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:356
volk_complex.h
_mm_magnitudesquared_ps_sse3
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:38
_mm256_complexconjugatemul_ps
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:139
bit128::f
float f[4]
Definition: volk_common.h:120
volk_avx_intrinsics.h
_mm_complexconjugatemul_ps
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31
volk_32fc_x2_divide_32fc_neon
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:581