Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_conjugate_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
61 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
62 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
63 
64 
65 #include <volk/volk_complex.h>
66 
67 
68 #ifdef LV_HAVE_GENERIC
69 
71  const lv_32fc_t* input,
72  const lv_32fc_t* taps,
73  unsigned int num_points)
74 {
75  lv_32fc_t res = lv_cmake(0.f, 0.f);
76  for (unsigned int i = 0; i < num_points; ++i) {
77  res += (*input++) * lv_conj((*taps++));
78  }
79  *result = res;
80 }
81 
82 #endif /*LV_HAVE_GENERIC*/
83 
84 #ifdef LV_HAVE_GENERIC
85 
87  const lv_32fc_t* input,
88  const lv_32fc_t* taps,
89  unsigned int num_points)
90 {
91 
92  const unsigned int num_bytes = num_points * 8;
93 
94  float* res = (float*)result;
95  float* in = (float*)input;
96  float* tp = (float*)taps;
97  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
98 
99  float sum0[2] = { 0, 0 };
100  float sum1[2] = { 0, 0 };
101  unsigned int i = 0;
102 
103  for (i = 0; i < n_2_ccomplex_blocks; ++i) {
104  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
105  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
106  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
107  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
108 
109  in += 4;
110  tp += 4;
111  }
112 
113  res[0] = sum0[0] + sum1[0];
114  res[1] = sum0[1] + sum1[1];
115 
116  if (num_bytes >> 3 & 1) {
117  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
118  }
119 }
120 
121 #endif /*LV_HAVE_GENERIC*/
122 
123 #ifdef LV_HAVE_SSE3
124 
125 #include <pmmintrin.h>
126 #include <xmmintrin.h>
127 
129  const lv_32fc_t* input,
130  const lv_32fc_t* taps,
131  unsigned int num_points)
132 {
133  // Partial sums for indices i and i+1.
134  __m128 sum_a_mult_b_real = _mm_setzero_ps();
135  __m128 sum_a_mult_b_imag = _mm_setzero_ps();
136 
137  for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
138  /* Two complex elements a time are processed.
139  * (ar + j⋅ai)*conj(br + j⋅bi) =
140  * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
141  */
142 
143  /* Load input and taps, split and duplicate real und imaginary parts of taps.
144  * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
145  * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
146  * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
147  * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
148  */
149  __m128 a = _mm_loadu_ps((const float*)&input[i]);
150  __m128 b = _mm_loadu_ps((const float*)&taps[i]);
151  __m128 b_real = _mm_moveldup_ps(b);
152  __m128 b_imag = _mm_movehdup_ps(b);
153 
154  // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
155  sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
156  // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
157  sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
158  }
159 
160  // Swap position of −ar⋅bi and ai⋅bi.
161  sum_a_mult_b_imag =
162  _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
163  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
164  __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
165  // Sum the two partial sums.
166  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
167  // Store result.
168  _mm_storel_pi((__m64*)result, sum);
169 
170  // Handle the last element if num_points mod 2 is 1.
171  if (num_points & 1u) {
172  *result += lv_cmake(
173  lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
174  lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
175  lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
176  lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
177  }
178 }
179 
180 #endif /*LV_HAVE_SSE3*/
181 
182 #ifdef LV_HAVE_SSE3
183 
184 #include <pmmintrin.h>
185 #include <xmmintrin.h>
186 
188  const lv_32fc_t* input,
189  const lv_32fc_t* taps,
190  unsigned int num_points)
191 {
192  // Partial sums for indices i and i+1.
193  __m128 sum_a_mult_b_real = _mm_setzero_ps();
194  __m128 sum_a_mult_b_imag = _mm_setzero_ps();
195 
196  for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
197  /* Two complex elements a time are processed.
198  * (ar + j⋅ai)*conj(br + j⋅bi) =
199  * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
200  */
201 
202  /* Load input and taps, split and duplicate real und imaginary parts of taps.
203  * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
204  * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
205  * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
206  * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
207  */
208  __m128 a = _mm_load_ps((const float*)&input[i]);
209  __m128 b = _mm_load_ps((const float*)&taps[i]);
210  __m128 b_real = _mm_moveldup_ps(b);
211  __m128 b_imag = _mm_movehdup_ps(b);
212 
213  // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
214  sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
215  // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
216  sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
217  }
218 
219  // Swap position of −ar⋅bi and ai⋅bi.
220  sum_a_mult_b_imag =
221  _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
222  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
223  __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
224  // Sum the two partial sums.
225  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
226  // Store result.
227  _mm_storel_pi((__m64*)result, sum);
228 
229  // Handle the last element if num_points mod 2 is 1.
230  if (num_points & 1u) {
231  *result += lv_cmake(
232  lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
233  lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
234  lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
235  lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
236  }
237 }
238 
239 #endif /*LV_HAVE_SSE3*/
240 
241 #ifdef LV_HAVE_AVX
242 
243 #include <immintrin.h>
244 
246  const lv_32fc_t* input,
247  const lv_32fc_t* taps,
248  unsigned int num_points)
249 {
250  // Partial sums for indices i, i+1, i+2 and i+3.
251  __m256 sum_a_mult_b_real = _mm256_setzero_ps();
252  __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
253 
254  for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
255  /* Four complex elements a time are processed.
256  * (ar + j⋅ai)*conj(br + j⋅bi) =
257  * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
258  */
259 
260  /* Load input and taps, split and duplicate real und imaginary parts of taps.
261  * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
262  * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
263  * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
264  * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
265  */
266  __m256 a = _mm256_loadu_ps((const float*)&input[i]);
267  __m256 b = _mm256_loadu_ps((const float*)&taps[i]);
268  __m256 b_real = _mm256_moveldup_ps(b);
269  __m256 b_imag = _mm256_movehdup_ps(b);
270 
271  // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
272  sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
273  // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
274  sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
275  }
276 
277  // Swap position of −ar⋅bi and ai⋅bi.
278  sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
279  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
280  __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
281  /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
282  * s1 + s3 and s0 + s2 …
283  */
284  sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
285  // … and now (s0 + s2) + (s1 + s3)
286  sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
287  // Store result.
288  __m128 lower = _mm256_extractf128_ps(sum, 0);
289  _mm_storel_pi((__m64*)result, lower);
290 
291  // Handle the last elements if num_points mod 4 is bigger than 0.
292  for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
293  *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
294  lv_cimag(input[i]) * lv_cimag(taps[i]),
295  lv_cimag(input[i]) * lv_creal(taps[i]) -
296  lv_creal(input[i]) * lv_cimag(taps[i]));
297  }
298 }
299 
300 #endif /* LV_HAVE_AVX */
301 
302 #ifdef LV_HAVE_AVX
303 #include <immintrin.h>
304 
306  const lv_32fc_t* input,
307  const lv_32fc_t* taps,
308  unsigned int num_points)
309 {
310  // Partial sums for indices i, i+1, i+2 and i+3.
311  __m256 sum_a_mult_b_real = _mm256_setzero_ps();
312  __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
313 
314  for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
315  /* Four complex elements a time are processed.
316  * (ar + j⋅ai)*conj(br + j⋅bi) =
317  * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
318  */
319 
320  /* Load input and taps, split and duplicate real und imaginary parts of taps.
321  * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
322  * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
323  * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
324  * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
325  */
326  __m256 a = _mm256_load_ps((const float*)&input[i]);
327  __m256 b = _mm256_load_ps((const float*)&taps[i]);
328  __m256 b_real = _mm256_moveldup_ps(b);
329  __m256 b_imag = _mm256_movehdup_ps(b);
330 
331  // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
332  sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
333  // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
334  sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
335  }
336 
337  // Swap position of −ar⋅bi and ai⋅bi.
338  sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
339  // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
340  __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
341  /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
342  * s1 + s3 and s0 + s2 …
343  */
344  sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
345  // … and now (s0 + s2) + (s1 + s3)
346  sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
347  // Store result.
348  __m128 lower = _mm256_extractf128_ps(sum, 0);
349  _mm_storel_pi((__m64*)result, lower);
350 
351  // Handle the last elements if num_points mod 4 is bigger than 0.
352  for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
353  *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
354  lv_cimag(input[i]) * lv_cimag(taps[i]),
355  lv_cimag(input[i]) * lv_creal(taps[i]) -
356  lv_creal(input[i]) * lv_cimag(taps[i]));
357  }
358 }
359 #endif /* LV_HAVE_AVX */
360 
361 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
362 
363 #include <immintrin.h>
364 
365 static inline void
366 volk_32fc_x2_conjugate_dot_prod_32fc_u_avx512dq(lv_32fc_t* result,
367  const lv_32fc_t* input,
368  const lv_32fc_t* taps,
369  unsigned int num_points)
370 {
371  // Partial sums for indices i through i+7 (8 complex elements).
372  __m512 sum_a_mult_b_real = _mm512_setzero_ps();
373  __m512 sum_a_mult_b_imag = _mm512_setzero_ps();
374 
375  // Sign mask for emulating addsub: negate even indices (real parts)
376  const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set_epi32(0,
377  0x80000000,
378  0,
379  0x80000000,
380  0,
381  0x80000000,
382  0,
383  0x80000000,
384  0,
385  0x80000000,
386  0,
387  0x80000000,
388  0,
389  0x80000000,
390  0,
391  0x80000000));
392 
393  for (long unsigned i = 0; i < (num_points & ~7u); i += 8) {
394  /* Eight complex elements a time are processed.
395  * (ar + j⋅ai)*conj(br + j⋅bi) =
396  * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
397  */
398 
399  // Load input and taps.
400  __m512 a = _mm512_loadu_ps((const float*)&input[i]);
401  __m512 b = _mm512_loadu_ps((const float*)&taps[i]);
402 
403  // Duplicate real and imaginary parts of taps
404  __m512 b_real = _mm512_moveldup_ps(b); // duplicate even elements (real)
405  __m512 b_imag = _mm512_movehdup_ps(b); // duplicate odd elements (imag)
406 
407  // Accumulate ar⋅br and ai⋅br
408  sum_a_mult_b_real = _mm512_fmadd_ps(a, b_real, sum_a_mult_b_real);
409 
410  // Emulate addsub: compute a*b_imag then negate even indices
411  __m512 mult_imag = _mm512_mul_ps(a, b_imag);
412  mult_imag = _mm512_xor_ps(mult_imag, sign_mask); // flip sign of even indices
413  sum_a_mult_b_imag = _mm512_add_ps(sum_a_mult_b_imag, mult_imag);
414  }
415 
416  // Swap position of −ar⋅bi and ai⋅bi, then add
417  sum_a_mult_b_imag = _mm512_permute_ps(sum_a_mult_b_imag, 0xB1); // swap pairs
418  __m512 sum = _mm512_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
419 
420  // Horizontal sum: reduce 8 complex numbers to 1
421  // First reduce to 4 complex (256 bits)
422  __m256 sum_high = _mm512_extractf32x8_ps(sum, 1);
423  __m256 sum_low = _mm512_castps512_ps256(sum);
424  __m256 sum256 = _mm256_add_ps(sum_high, sum_low);
425 
426  // Reduce to 2 complex (128 bits)
427  __m128 sum128_high = _mm256_extractf128_ps(sum256, 1);
428  __m128 sum128_low = _mm256_castps256_ps128(sum256);
429  __m128 sum128 = _mm_add_ps(sum128_high, sum128_low);
430 
431  // Final reduction to 1 complex number
432  sum128 = _mm_add_ps(sum128, _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(1, 0, 3, 2)));
433 
434  // Store result
435  _mm_storel_pi((__m64*)result, sum128);
436 
437  // Handle remaining elements
438  for (long unsigned i = num_points & ~7u; i < num_points; ++i) {
439  *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
440  lv_cimag(input[i]) * lv_cimag(taps[i]),
441  lv_cimag(input[i]) * lv_creal(taps[i]) -
442  lv_creal(input[i]) * lv_cimag(taps[i]));
443  }
444 }
445 
446 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ */
447 
448 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
449 
450 #include <immintrin.h>
451 
452 static inline void
453 volk_32fc_x2_conjugate_dot_prod_32fc_a_avx512dq(lv_32fc_t* result,
454  const lv_32fc_t* input,
455  const lv_32fc_t* taps,
456  unsigned int num_points)
457 {
458  // Partial sums for indices i through i+7 (8 complex elements).
459  __m512 sum_a_mult_b_real = _mm512_setzero_ps();
460  __m512 sum_a_mult_b_imag = _mm512_setzero_ps();
461 
462  // Sign mask for emulating addsub: negate even indices (real parts)
463  const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set_epi32(0,
464  0x80000000,
465  0,
466  0x80000000,
467  0,
468  0x80000000,
469  0,
470  0x80000000,
471  0,
472  0x80000000,
473  0,
474  0x80000000,
475  0,
476  0x80000000,
477  0,
478  0x80000000));
479 
480  for (long unsigned i = 0; i < (num_points & ~7u); i += 8) {
481  /* Eight complex elements a time are processed.
482  * (ar + j⋅ai)*conj(br + j⋅bi) =
483  * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
484  */
485 
486  // Load input and taps (aligned).
487  __m512 a = _mm512_load_ps((const float*)&input[i]);
488  __m512 b = _mm512_load_ps((const float*)&taps[i]);
489 
490  // Duplicate real and imaginary parts of taps
491  __m512 b_real = _mm512_moveldup_ps(b); // duplicate even elements (real)
492  __m512 b_imag = _mm512_movehdup_ps(b); // duplicate odd elements (imag)
493 
494  // Accumulate ar⋅br and ai⋅br
495  sum_a_mult_b_real = _mm512_fmadd_ps(a, b_real, sum_a_mult_b_real);
496 
497  // Emulate addsub: compute a*b_imag then negate even indices
498  __m512 mult_imag = _mm512_mul_ps(a, b_imag);
499  mult_imag = _mm512_xor_ps(mult_imag, sign_mask); // flip sign of even indices
500  sum_a_mult_b_imag = _mm512_add_ps(sum_a_mult_b_imag, mult_imag);
501  }
502 
503  // Swap position of −ar⋅bi and ai⋅bi, then add
504  sum_a_mult_b_imag = _mm512_permute_ps(sum_a_mult_b_imag, 0xB1);
505  __m512 sum = _mm512_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
506 
507  // Horizontal sum: reduce 8 complex numbers to 1
508  __m256 sum_high = _mm512_extractf32x8_ps(sum, 1);
509  __m256 sum_low = _mm512_castps512_ps256(sum);
510  __m256 sum256 = _mm256_add_ps(sum_high, sum_low);
511 
512  __m128 sum128_high = _mm256_extractf128_ps(sum256, 1);
513  __m128 sum128_low = _mm256_castps256_ps128(sum256);
514  __m128 sum128 = _mm_add_ps(sum128_high, sum128_low);
515 
516  sum128 = _mm_add_ps(sum128, _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(1, 0, 3, 2)));
517 
518  _mm_storel_pi((__m64*)result, sum128);
519 
520  // Handle remaining elements
521  for (long unsigned i = num_points & ~7u; i < num_points; ++i) {
522  *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
523  lv_cimag(input[i]) * lv_cimag(taps[i]),
524  lv_cimag(input[i]) * lv_creal(taps[i]) -
525  lv_creal(input[i]) * lv_cimag(taps[i]));
526  }
527 }
528 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ */
529 
530 #ifdef LV_HAVE_NEON
531 #include <arm_neon.h>
533  const lv_32fc_t* input,
534  const lv_32fc_t* taps,
535  unsigned int num_points)
536 {
537 
538  unsigned int quarter_points = num_points / 4;
539  unsigned int number;
540 
541  lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
542  lv_32fc_t* b_ptr = (lv_32fc_t*)input;
543  // for 2-lane vectors, 1st lane holds the real part,
544  // 2nd lane holds the imaginary part
545  float32x4x2_t a_val, b_val, accumulator;
546  float32x4x2_t tmp_imag;
547  accumulator.val[0] = vdupq_n_f32(0);
548  accumulator.val[1] = vdupq_n_f32(0);
549 
550  for (number = 0; number < quarter_points; ++number) {
551  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
552  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
553  __VOLK_PREFETCH(a_ptr + 8);
554  __VOLK_PREFETCH(b_ptr + 8);
555 
556  // do the first multiply
557  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
558  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
559 
560  // use multiply accumulate/subtract to get result
561  tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
562  tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
563 
564  accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
565  accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
566 
567  // increment pointers
568  a_ptr += 4;
569  b_ptr += 4;
570  }
571  lv_32fc_t accum_result[4];
572  vst2q_f32((float*)accum_result, accumulator);
573  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
574 
575  // tail case
576  for (number = quarter_points * 4; number < num_points; ++number) {
577  *result += (*a_ptr++) * lv_conj(*b_ptr++);
578  }
579  *result = lv_conj(*result);
580 }
581 #endif /*LV_HAVE_NEON*/
582 
583 
584 #ifdef LV_HAVE_NEONV8
585 #include <arm_neon.h>
586 
587 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neonv8(lv_32fc_t* result,
588  const lv_32fc_t* input,
589  const lv_32fc_t* taps,
590  unsigned int num_points)
591 {
592  unsigned int n = num_points;
593  const lv_32fc_t* a = input; /* input */
594  const lv_32fc_t* b = taps; /* taps (will be conjugated) */
595 
596  /* Use 4 accumulators to break data dependencies */
597  float32x4_t acc0_r = vdupq_n_f32(0);
598  float32x4_t acc0_i = vdupq_n_f32(0);
599  float32x4_t acc1_r = vdupq_n_f32(0);
600  float32x4_t acc1_i = vdupq_n_f32(0);
601 
602  /* Process 8 complex numbers per iteration (2x unroll) */
603  while (n >= 8) {
604  float32x4x2_t a0 = vld2q_f32((const float*)a);
605  float32x4x2_t b0 = vld2q_f32((const float*)b);
606  float32x4x2_t a1 = vld2q_f32((const float*)(a + 4));
607  float32x4x2_t b1 = vld2q_f32((const float*)(b + 4));
608  __VOLK_PREFETCH(a + 8);
609  __VOLK_PREFETCH(b + 8);
610 
611  /* Conjugate dot product: input * conj(taps)
612  * (ar + j*ai) * (br - j*bi) = ar*br + ai*bi + j*(ai*br - ar*bi)
613  * real += ar*br + ai*bi
614  * imag += ai*br - ar*bi
615  */
616  acc0_r = vfmaq_f32(acc0_r, a0.val[0], b0.val[0]); /* ar*br */
617  acc0_r = vfmaq_f32(acc0_r, a0.val[1], b0.val[1]); /* + ai*bi */
618  acc0_i = vfmaq_f32(acc0_i, a0.val[1], b0.val[0]); /* ai*br */
619  acc0_i = vfmsq_f32(acc0_i, a0.val[0], b0.val[1]); /* - ar*bi */
620 
621  acc1_r = vfmaq_f32(acc1_r, a1.val[0], b1.val[0]);
622  acc1_r = vfmaq_f32(acc1_r, a1.val[1], b1.val[1]);
623  acc1_i = vfmaq_f32(acc1_i, a1.val[1], b1.val[0]);
624  acc1_i = vfmsq_f32(acc1_i, a1.val[0], b1.val[1]);
625 
626  a += 8;
627  b += 8;
628  n -= 8;
629  }
630 
631  /* Process remaining 4 */
632  if (n >= 4) {
633  float32x4x2_t a0 = vld2q_f32((const float*)a);
634  float32x4x2_t b0 = vld2q_f32((const float*)b);
635 
636  acc0_r = vfmaq_f32(acc0_r, a0.val[0], b0.val[0]);
637  acc0_r = vfmaq_f32(acc0_r, a0.val[1], b0.val[1]);
638  acc0_i = vfmaq_f32(acc0_i, a0.val[1], b0.val[0]);
639  acc0_i = vfmsq_f32(acc0_i, a0.val[0], b0.val[1]);
640 
641  a += 4;
642  b += 4;
643  n -= 4;
644  }
645 
646  /* Combine accumulators */
647  acc0_r = vaddq_f32(acc0_r, acc1_r);
648  acc0_i = vaddq_f32(acc0_i, acc1_i);
649 
650  /* Horizontal sum using pairwise add */
651  float32x2_t sum_r = vadd_f32(vget_low_f32(acc0_r), vget_high_f32(acc0_r));
652  float32x2_t sum_i = vadd_f32(vget_low_f32(acc0_i), vget_high_f32(acc0_i));
653  sum_r = vpadd_f32(sum_r, sum_r);
654  sum_i = vpadd_f32(sum_i, sum_i);
655 
656  float res_r = vget_lane_f32(sum_r, 0);
657  float res_i = vget_lane_f32(sum_i, 0);
658 
659  /* Scalar tail */
660  while (n > 0) {
661  res_r += lv_creal(*a) * lv_creal(*b) + lv_cimag(*a) * lv_cimag(*b);
662  res_i += lv_cimag(*a) * lv_creal(*b) - lv_creal(*a) * lv_cimag(*b);
663  a++;
664  b++;
665  n--;
666  }
667 
668  *result = lv_cmake(res_r, res_i);
669 }
670 
671 #endif /* LV_HAVE_NEONV8 */
672 
673 
674 #ifdef LV_HAVE_RVV
675 #include <riscv_vector.h>
677 
678 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvv(lv_32fc_t* result,
679  const lv_32fc_t* input,
680  const lv_32fc_t* taps,
681  unsigned int num_points)
682 {
683  vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
684  vfloat32m2_t vsumi = vsumr;
685  size_t n = num_points;
686  for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
687  vl = __riscv_vsetvl_e32m2(n);
688  vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
689  vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
690  vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
691  vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
692  vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
693  vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
694  vbi = __riscv_vfneg(vbi, vl);
695  vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
696  vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
697  vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
698  vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
699  }
700  size_t vl = __riscv_vsetvlmax_e32m1();
701  vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
702  vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
703  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
704  *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
705  __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
706 }
707 #endif /*LV_HAVE_RVV*/
708 
709 #ifdef LV_HAVE_RVVSEG
710 #include <riscv_vector.h>
712 
713 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvvseg(lv_32fc_t* result,
714  const lv_32fc_t* input,
715  const lv_32fc_t* taps,
716  unsigned int num_points)
717 {
718  vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
719  vfloat32m2_t vsumi = vsumr;
720  size_t n = num_points;
721  for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
722  vl = __riscv_vsetvl_e32m2(n);
723  vfloat32m2x2_t va = __riscv_vlseg2e32_v_f32m2x2((const float*)input, vl);
724  vfloat32m2x2_t vb = __riscv_vlseg2e32_v_f32m2x2((const float*)taps, vl);
725  vfloat32m2_t var = __riscv_vget_f32m2(va, 0), vai = __riscv_vget_f32m2(va, 1);
726  vfloat32m2_t vbr = __riscv_vget_f32m2(vb, 0), vbi = __riscv_vget_f32m2(vb, 1);
727  vbi = __riscv_vfneg(vbi, vl);
728  vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
729  vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
730  vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
731  vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
732  }
733  size_t vl = __riscv_vsetvlmax_e32m1();
734  vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
735  vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
736  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
737  *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
738  __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
739 }
740 #endif /*LV_HAVE_RVVSEG*/
741 
742 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
743 
744 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
745 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
746 
747 #include <stdio.h>
748 #include <volk/volk_common.h>
749 #include <volk/volk_complex.h>
750 
751 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
volk_32fc_x2_conjugate_dot_prod_32fc_u_avx
static void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_32fc_x2_conjugate_dot_prod_32fc.h:245
volk_32fc_x2_conjugate_dot_prod_32fc_generic
static void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_32fc_x2_conjugate_dot_prod_32fc.h:70
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3
static void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t *result, const lv_32fc_t *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_32fc_x2_conjugate_dot_prod_32fc.h:128
RISCV_SHRINK2
#define RISCV_SHRINK2(op, T, S, v)
Definition: volk_rvv_intrinsics.h:19
lv_conj
#define lv_conj(x)
Definition: volk_complex.h:100
i
for i
Definition: volk_config_fixed.tmpl.h:13
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
volk_common.h
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_x2_conjugate_dot_prod_32fc_block
static void volk_32fc_x2_conjugate_dot_prod_32fc_block(lv_32fc_t *result, const lv_32fc_t *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_32fc_x2_conjugate_dot_prod_32fc.h:86
volk_complex.h
volk_32fc_x2_conjugate_dot_prod_32fc_neon
static void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t *result, const lv_32fc_t *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_32fc_x2_conjugate_dot_prod_32fc.h:532
volk_rvv_intrinsics.h
bit128::f
float f[4]
Definition: volk_common.h:120
volk_32fc_x2_conjugate_dot_prod_32fc_a_avx
static void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_32fc_x2_conjugate_dot_prod_32fc.h:305
volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3
static void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t *result, const lv_32fc_t *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_32fc_x2_conjugate_dot_prod_32fc.h:187
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96