Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
50 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
52 
53 #include <stdio.h>
54 #include <volk/volk_common.h>
55 
56 #ifdef LV_HAVE_GENERIC
57 
59  const lv_32fc_t* input,
60  const float* taps,
61  unsigned int num_points)
62 {
63 
64  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
65  const float* aPtr = (float*)input;
66  const float* bPtr = taps;
67  unsigned int number = 0;
68 
69  for (number = 0; number < num_points; number++) {
70  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
71  aPtr += 2;
72  bPtr += 1;
73  }
74 
75  *result = returnValue;
76 }
77 
78 #endif /*LV_HAVE_GENERIC*/
79 
80 #ifdef LV_HAVE_AVX512F
81 
82 #include <immintrin.h>
83 
84 static inline void volk_32fc_32f_dot_prod_32fc_a_avx512f(lv_32fc_t* result,
85  const lv_32fc_t* input,
86  const float* taps,
87  unsigned int num_points)
88 {
89  unsigned int number = 0;
90  const unsigned int sixteenthPoints = num_points / 16;
91 
92  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
93  const float* aPtr = (float*)input;
94  const float* bPtr = taps;
95 
96  __m512 a0Val, a1Val;
97  __m512 b0Val, b1Val;
98  __m512 xVal;
99 
100  __m512 dotProdVal0 = _mm512_setzero_ps();
101  __m512 dotProdVal1 = _mm512_setzero_ps();
102 
103  // Create index patterns for duplication: 0,0,1,1,2,2,3,3,...,15,15
104  const __m512i idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
105  const __m512i idx2 =
106  _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
107 
108  for (; number < sixteenthPoints; number++) {
109  // Load 16 complex numbers (32 floats)
110  a0Val = _mm512_load_ps(aPtr); // 8 complex (I0,Q0,I1,Q1,...)
111  a1Val = _mm512_load_ps(aPtr + 16); // 8 complex (I8,Q8,I9,Q9,...)
112 
113  // Load 16 real taps
114  xVal = _mm512_load_ps(bPtr); // t0|t1|t2|...|t15
115 
116  // Duplicate each tap value to match complex format using permutexvar
117  b0Val = _mm512_permutexvar_ps(idx, xVal);
118  b1Val = _mm512_permutexvar_ps(idx2, xVal);
119 
120  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
121  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
122 
123  aPtr += 32;
124  bPtr += 16;
125  }
126 
127  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
128 
129  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
130  _mm512_store_ps(dotProductVector, dotProdVal0);
131 
132  for (unsigned int i = 0; i < 16; i += 2) {
133  returnValue += lv_cmake(dotProductVector[i], dotProductVector[i + 1]);
134  }
135 
136  number = sixteenthPoints * 16;
137  lv_32fc_t returnTail = lv_cmake(0.0f, 0.0f);
139  &returnTail, input + number, bPtr, num_points - number);
140  returnValue += returnTail;
141 
142  *result = returnValue;
143 }
144 
145 #endif /*LV_HAVE_AVX512F*/
146 
147 #if LV_HAVE_AVX2 && LV_HAVE_FMA
148 
149 #include <immintrin.h>
150 
151 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
152  const lv_32fc_t* input,
153  const float* taps,
154  unsigned int num_points)
155 {
156 
157  unsigned int number = 0;
158  const unsigned int sixteenthPoints = num_points / 16;
159 
160  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
161  const float* aPtr = (float*)input;
162  const float* bPtr = taps;
163 
164  __m256 a0Val, a1Val, a2Val, a3Val;
165  __m256 b0Val, b1Val, b2Val, b3Val;
166  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
167 
168  __m256 dotProdVal0 = _mm256_setzero_ps();
169  __m256 dotProdVal1 = _mm256_setzero_ps();
170  __m256 dotProdVal2 = _mm256_setzero_ps();
171  __m256 dotProdVal3 = _mm256_setzero_ps();
172 
173  for (; number < sixteenthPoints; number++) {
174 
175  a0Val = _mm256_load_ps(aPtr);
176  a1Val = _mm256_load_ps(aPtr + 8);
177  a2Val = _mm256_load_ps(aPtr + 16);
178  a3Val = _mm256_load_ps(aPtr + 24);
179 
180  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
181  x1Val = _mm256_load_ps(bPtr + 8);
182  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
183  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
184  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
185  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
186 
187  // TODO: it may be possible to rearrange swizzling to better pipeline data
188  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
189  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
190  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
191  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
192 
193  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
194  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
195  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
196  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
197 
198  aPtr += 32;
199  bPtr += 16;
200  }
201 
202  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
203  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
204  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
205 
206  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
207 
208  _mm256_store_ps(dotProductVector,
209  dotProdVal0); // Store the results back into the dot product vector
210 
211  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
212  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
213  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
214  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
215 
216  number = sixteenthPoints * 16;
217  lv_32fc_t returnTail = lv_cmake(0.0f, 0.0f);
219  &returnTail, input + number, bPtr, num_points - number);
220  returnValue += returnTail;
221 
222  *result = returnValue;
223 }
224 
225 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
226 
227 #ifdef LV_HAVE_AVX
228 
229 #include <immintrin.h>
230 
231 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
232  const lv_32fc_t* input,
233  const float* taps,
234  unsigned int num_points)
235 {
236 
237  unsigned int number = 0;
238  const unsigned int sixteenthPoints = num_points / 16;
239 
240  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
241  const float* aPtr = (float*)input;
242  const float* bPtr = taps;
243 
244  __m256 a0Val, a1Val, a2Val, a3Val;
245  __m256 b0Val, b1Val, b2Val, b3Val;
246  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
247  __m256 c0Val, c1Val, c2Val, c3Val;
248 
249  __m256 dotProdVal0 = _mm256_setzero_ps();
250  __m256 dotProdVal1 = _mm256_setzero_ps();
251  __m256 dotProdVal2 = _mm256_setzero_ps();
252  __m256 dotProdVal3 = _mm256_setzero_ps();
253 
254  for (; number < sixteenthPoints; number++) {
255 
256  a0Val = _mm256_load_ps(aPtr);
257  a1Val = _mm256_load_ps(aPtr + 8);
258  a2Val = _mm256_load_ps(aPtr + 16);
259  a3Val = _mm256_load_ps(aPtr + 24);
260 
261  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
262  x1Val = _mm256_load_ps(bPtr + 8);
263  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
264  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
265  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
266  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
267 
268  // TODO: it may be possible to rearrange swizzling to better pipeline data
269  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
270  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
271  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
272  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
273 
274  c0Val = _mm256_mul_ps(a0Val, b0Val);
275  c1Val = _mm256_mul_ps(a1Val, b1Val);
276  c2Val = _mm256_mul_ps(a2Val, b2Val);
277  c3Val = _mm256_mul_ps(a3Val, b3Val);
278 
279  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
280  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
281  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
282  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
283 
284  aPtr += 32;
285  bPtr += 16;
286  }
287 
288  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
289  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
290  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
291 
292  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
293 
294  _mm256_store_ps(dotProductVector,
295  dotProdVal0); // Store the results back into the dot product vector
296 
297  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
298  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
299  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
300  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
301 
302  number = sixteenthPoints * 16;
303  for (; number < num_points; number++) {
304  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
305  aPtr += 2;
306  bPtr += 1;
307  }
308 
309  *result = returnValue;
310 }
311 
312 #endif /*LV_HAVE_AVX*/
313 
314 
315 #ifdef LV_HAVE_SSE
316 
317 
318 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
319  const lv_32fc_t* input,
320  const float* taps,
321  unsigned int num_points)
322 {
323 
324  unsigned int number = 0;
325  const unsigned int eighthPoints = num_points / 8;
326 
327  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
328  const float* aPtr = (float*)input;
329  const float* bPtr = taps;
330 
331  __m128 a0Val, a1Val, a2Val, a3Val;
332  __m128 b0Val, b1Val, b2Val, b3Val;
333  __m128 x0Val, x1Val, x2Val, x3Val;
334  __m128 c0Val, c1Val, c2Val, c3Val;
335 
336  __m128 dotProdVal0 = _mm_setzero_ps();
337  __m128 dotProdVal1 = _mm_setzero_ps();
338  __m128 dotProdVal2 = _mm_setzero_ps();
339  __m128 dotProdVal3 = _mm_setzero_ps();
340 
341  for (; number < eighthPoints; number++) {
342 
343  a0Val = _mm_load_ps(aPtr);
344  a1Val = _mm_load_ps(aPtr + 4);
345  a2Val = _mm_load_ps(aPtr + 8);
346  a3Val = _mm_load_ps(aPtr + 12);
347 
348  x0Val = _mm_load_ps(bPtr);
349  x1Val = _mm_load_ps(bPtr);
350  x2Val = _mm_load_ps(bPtr + 4);
351  x3Val = _mm_load_ps(bPtr + 4);
352  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
353  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
354  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
355  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
356 
357  c0Val = _mm_mul_ps(a0Val, b0Val);
358  c1Val = _mm_mul_ps(a1Val, b1Val);
359  c2Val = _mm_mul_ps(a2Val, b2Val);
360  c3Val = _mm_mul_ps(a3Val, b3Val);
361 
362  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
363  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
364  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
365  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
366 
367  aPtr += 16;
368  bPtr += 8;
369  }
370 
371  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
372  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
373  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
374 
375  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
376 
377  _mm_store_ps(dotProductVector,
378  dotProdVal0); // Store the results back into the dot product vector
379 
380  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
381  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
382 
383  number = eighthPoints * 8;
384  for (; number < num_points; number++) {
385  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
386  aPtr += 2;
387  bPtr += 1;
388  }
389 
390  *result = returnValue;
391 }
392 
393 #endif /*LV_HAVE_SSE*/
394 
395 #ifdef LV_HAVE_AVX512F
396 
397 #include <immintrin.h>
398 
399 static inline void volk_32fc_32f_dot_prod_32fc_u_avx512f(lv_32fc_t* result,
400  const lv_32fc_t* input,
401  const float* taps,
402  unsigned int num_points)
403 {
404  unsigned int number = 0;
405  const unsigned int sixteenthPoints = num_points / 16;
406 
407  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
408  const float* aPtr = (float*)input;
409  const float* bPtr = taps;
410 
411  __m512 a0Val, a1Val;
412  __m512 b0Val, b1Val;
413  __m512 xVal;
414 
415  __m512 dotProdVal0 = _mm512_setzero_ps();
416  __m512 dotProdVal1 = _mm512_setzero_ps();
417 
418  // Create index patterns for duplication: 0,0,1,1,2,2,3,3,...,15,15
419  const __m512i idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
420  const __m512i idx2 =
421  _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
422 
423  for (; number < sixteenthPoints; number++) {
424  // Load 16 complex numbers (32 floats) - unaligned
425  a0Val = _mm512_loadu_ps(aPtr); // 8 complex (I0,Q0,I1,Q1,...)
426  a1Val = _mm512_loadu_ps(aPtr + 16); // 8 complex (I8,Q8,I9,Q9,...)
427 
428  // Load 16 real taps - unaligned
429  xVal = _mm512_loadu_ps(bPtr); // t0|t1|t2|...|t15
430 
431  // Duplicate each tap value to match complex format using permutexvar
432  b0Val = _mm512_permutexvar_ps(idx, xVal);
433  b1Val = _mm512_permutexvar_ps(idx2, xVal);
434 
435  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
436  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
437 
438  aPtr += 32;
439  bPtr += 16;
440  }
441 
442  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
443 
444  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
445  _mm512_store_ps(dotProductVector, dotProdVal0);
446 
447  for (unsigned int i = 0; i < 16; i += 2) {
448  returnValue += lv_cmake(dotProductVector[i], dotProductVector[i + 1]);
449  }
450 
451  number = sixteenthPoints * 16;
452  lv_32fc_t returnTail = lv_cmake(0.0f, 0.0f);
454  &returnTail, input + number, bPtr, num_points - number);
455  returnValue += returnTail;
456 
457  *result = returnValue;
458 }
459 
460 #endif /*LV_HAVE_AVX512F*/
461 
462 #if LV_HAVE_AVX2 && LV_HAVE_FMA
463 
464 #include <immintrin.h>
465 
466 static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
467  const lv_32fc_t* input,
468  const float* taps,
469  unsigned int num_points)
470 {
471 
472  unsigned int number = 0;
473  const unsigned int sixteenthPoints = num_points / 16;
474 
475  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
476  const float* aPtr = (float*)input;
477  const float* bPtr = taps;
478 
479  __m256 a0Val, a1Val, a2Val, a3Val;
480  __m256 b0Val, b1Val, b2Val, b3Val;
481  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
482 
483  __m256 dotProdVal0 = _mm256_setzero_ps();
484  __m256 dotProdVal1 = _mm256_setzero_ps();
485  __m256 dotProdVal2 = _mm256_setzero_ps();
486  __m256 dotProdVal3 = _mm256_setzero_ps();
487 
488  for (; number < sixteenthPoints; number++) {
489 
490  a0Val = _mm256_loadu_ps(aPtr);
491  a1Val = _mm256_loadu_ps(aPtr + 8);
492  a2Val = _mm256_loadu_ps(aPtr + 16);
493  a3Val = _mm256_loadu_ps(aPtr + 24);
494 
495  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
496  x1Val = _mm256_loadu_ps(bPtr + 8);
497  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
498  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
499  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
500  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
501 
502  // TODO: it may be possible to rearrange swizzling to better pipeline data
503  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
504  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
505  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
506  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
507 
508  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
509  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
510  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
511  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
512 
513  aPtr += 32;
514  bPtr += 16;
515  }
516 
517  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
518  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
519  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
520 
521  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
522 
523  _mm256_store_ps(dotProductVector,
524  dotProdVal0); // Store the results back into the dot product vector
525 
526  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
527  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
528  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
529  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
530 
531  number = sixteenthPoints * 16;
532  for (; number < num_points; number++) {
533  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
534  aPtr += 2;
535  bPtr += 1;
536  }
537 
538  *result = returnValue;
539 }
540 
541 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
542 
543 #ifdef LV_HAVE_AVX
544 
545 #include <immintrin.h>
546 
547 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
548  const lv_32fc_t* input,
549  const float* taps,
550  unsigned int num_points)
551 {
552 
553  unsigned int number = 0;
554  const unsigned int sixteenthPoints = num_points / 16;
555 
556  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
557  const float* aPtr = (float*)input;
558  const float* bPtr = taps;
559 
560  __m256 a0Val, a1Val, a2Val, a3Val;
561  __m256 b0Val, b1Val, b2Val, b3Val;
562  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
563  __m256 c0Val, c1Val, c2Val, c3Val;
564 
565  __m256 dotProdVal0 = _mm256_setzero_ps();
566  __m256 dotProdVal1 = _mm256_setzero_ps();
567  __m256 dotProdVal2 = _mm256_setzero_ps();
568  __m256 dotProdVal3 = _mm256_setzero_ps();
569 
570  for (; number < sixteenthPoints; number++) {
571 
572  a0Val = _mm256_loadu_ps(aPtr);
573  a1Val = _mm256_loadu_ps(aPtr + 8);
574  a2Val = _mm256_loadu_ps(aPtr + 16);
575  a3Val = _mm256_loadu_ps(aPtr + 24);
576 
577  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
578  x1Val = _mm256_loadu_ps(bPtr + 8);
579  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
580  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
581  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
582  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
583 
584  // TODO: it may be possible to rearrange swizzling to better pipeline data
585  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
586  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
587  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
588  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
589 
590  c0Val = _mm256_mul_ps(a0Val, b0Val);
591  c1Val = _mm256_mul_ps(a1Val, b1Val);
592  c2Val = _mm256_mul_ps(a2Val, b2Val);
593  c3Val = _mm256_mul_ps(a3Val, b3Val);
594 
595  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
596  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
597  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
598  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
599 
600  aPtr += 32;
601  bPtr += 16;
602  }
603 
604  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
605  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
606  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
607 
608  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
609 
610  _mm256_store_ps(dotProductVector,
611  dotProdVal0); // Store the results back into the dot product vector
612 
613  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
614  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
615  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
616  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
617 
618  number = sixteenthPoints * 16;
619  for (; number < num_points; number++) {
620  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
621  aPtr += 2;
622  bPtr += 1;
623  }
624 
625  *result = returnValue;
626 }
627 #endif /*LV_HAVE_AVX*/
628 
629 #ifdef LV_HAVE_NEON
630 #include <arm_neon.h>
631 
632 static inline void
634  const lv_32fc_t* __restrict input,
635  const float* __restrict taps,
636  unsigned int num_points)
637 {
638 
639  unsigned int number;
640  const unsigned int quarterPoints = num_points / 8;
641 
642  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
643  const float* inputPtr = (float*)input;
644  const float* tapsPtr = taps;
645  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
646  float accVector_real[4];
647  float accVector_imag[4];
648 
649  float32x4x2_t inputVector0, inputVector1;
650  float32x4_t tapsVector0, tapsVector1;
651  float32x4_t tmp_real0, tmp_imag0;
652  float32x4_t tmp_real1, tmp_imag1;
653  float32x4_t real_accumulator0, imag_accumulator0;
654  float32x4_t real_accumulator1, imag_accumulator1;
655 
656  // zero out accumulators
657  // take a *float, return float32x4_t
658  real_accumulator0 = vld1q_f32(zero);
659  imag_accumulator0 = vld1q_f32(zero);
660  real_accumulator1 = vld1q_f32(zero);
661  imag_accumulator1 = vld1q_f32(zero);
662 
663  for (number = 0; number < quarterPoints; number++) {
664  // load doublewords and duplicate in to second lane
665  tapsVector0 = vld1q_f32(tapsPtr);
666  tapsVector1 = vld1q_f32(tapsPtr + 4);
667 
668  // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
669  inputVector0 = vld2q_f32(inputPtr);
670  inputVector1 = vld2q_f32(inputPtr + 8);
671  // inputVector is now a struct of two vectors, 0th is real, 1st is imag
672 
673  tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
674  tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
675 
676  tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
677  tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
678 
679  real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
680  imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
681 
682  real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
683  imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
684 
685  tapsPtr += 8;
686  inputPtr += 16;
687  }
688 
689  real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
690  imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
691  // void vst1q_f32( float32_t * ptr, float32x4_t val);
692  // store results back to a complex (array of 2 floats)
693  vst1q_f32(accVector_real, real_accumulator0);
694  vst1q_f32(accVector_imag, imag_accumulator0);
695  returnValue += lv_cmake(
696  accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
697  accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
698 
699  // clean up the remainder
700  for (number = quarterPoints * 8; number < num_points; number++) {
701  returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
702  inputPtr += 2;
703  tapsPtr += 1;
704  }
705 
706  *result = returnValue;
707 }
708 
709 #endif /*LV_HAVE_NEON*/
710 
711 #ifdef LV_HAVE_NEON
712 #include <arm_neon.h>
713 
714 static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
715  const lv_32fc_t* __restrict input,
716  const float* __restrict taps,
717  unsigned int num_points)
718 {
719 
720  unsigned int number;
721  const unsigned int quarterPoints = num_points / 4;
722 
723  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
724  const float* inputPtr = (float*)input;
725  const float* tapsPtr = taps;
726  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
727  float accVector_real[4];
728  float accVector_imag[4];
729 
730  float32x4x2_t inputVector;
731  float32x4_t tapsVector;
732  float32x4_t tmp_real, tmp_imag;
733  float32x4_t real_accumulator, imag_accumulator;
734 
735 
736  // zero out accumulators
737  // take a *float, return float32x4_t
738  real_accumulator = vld1q_f32(zero);
739  imag_accumulator = vld1q_f32(zero);
740 
741  for (number = 0; number < quarterPoints; number++) {
742  // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
743  // load doublewords and duplicate in to second lane
744  tapsVector = vld1q_f32(tapsPtr);
745 
746  // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
747  inputVector = vld2q_f32(inputPtr);
748 
749  tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
750  tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
751 
752  real_accumulator = vaddq_f32(real_accumulator, tmp_real);
753  imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
754 
755 
756  tapsPtr += 4;
757  inputPtr += 8;
758  }
759 
760  // store results back to a complex (array of 2 floats)
761  vst1q_f32(accVector_real, real_accumulator);
762  vst1q_f32(accVector_imag, imag_accumulator);
763  returnValue += lv_cmake(
764  accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
765  accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
766 
767  // clean up the remainder
768  for (number = quarterPoints * 4; number < num_points; number++) {
769  returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
770  inputPtr += 2;
771  tapsPtr += 1;
772  }
773 
774  *result = returnValue;
775 }
776 
777 #endif /*LV_HAVE_NEON*/
778 
779 #ifdef LV_HAVE_NEONV8
780 #include <arm_neon.h>
781 
782 static inline void volk_32fc_32f_dot_prod_32fc_neonv8(lv_32fc_t* result,
783  const lv_32fc_t* input,
784  const float* taps,
785  unsigned int num_points)
786 {
787  const unsigned int eighthPoints = num_points / 8;
788  const float* inputPtr = (const float*)input;
789  const float* tapsPtr = taps;
790 
791  /* Use 2 independent real/imag accumulators for FMA pipelining */
792  float32x4_t real_acc0 = vdupq_n_f32(0);
793  float32x4_t imag_acc0 = vdupq_n_f32(0);
794  float32x4_t real_acc1 = vdupq_n_f32(0);
795  float32x4_t imag_acc1 = vdupq_n_f32(0);
796 
797  for (unsigned int number = 0; number < eighthPoints; number++) {
798  /* Load 8 complex values deinterleaved */
799  float32x4x2_t cplx0 = vld2q_f32(inputPtr);
800  float32x4x2_t cplx1 = vld2q_f32(inputPtr + 8);
801 
802  /* Load 8 real taps */
803  float32x4_t taps0 = vld1q_f32(tapsPtr);
804  float32x4_t taps1 = vld1q_f32(tapsPtr + 4);
805  __VOLK_PREFETCH(inputPtr + 32);
806  __VOLK_PREFETCH(tapsPtr + 16);
807 
808  /* FMA: acc += taps * complex */
809  real_acc0 = vfmaq_f32(real_acc0, taps0, cplx0.val[0]);
810  imag_acc0 = vfmaq_f32(imag_acc0, taps0, cplx0.val[1]);
811  real_acc1 = vfmaq_f32(real_acc1, taps1, cplx1.val[0]);
812  imag_acc1 = vfmaq_f32(imag_acc1, taps1, cplx1.val[1]);
813 
814  inputPtr += 16;
815  tapsPtr += 8;
816  }
817 
818  /* Combine accumulators */
819  real_acc0 = vaddq_f32(real_acc0, real_acc1);
820  imag_acc0 = vaddq_f32(imag_acc0, imag_acc1);
821 
822  /* Horizontal sum */
823  float real_sum = vaddvq_f32(real_acc0);
824  float imag_sum = vaddvq_f32(imag_acc0);
825 
826  lv_32fc_t returnValue = lv_cmake(real_sum, imag_sum);
827 
828  /* Handle remainder */
829  for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
830  returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
831  inputPtr += 2;
832  tapsPtr += 1;
833  }
834 
835  *result = returnValue;
836 }
837 #endif /*LV_HAVE_NEONV8*/
838 
839 #ifdef LV_HAVE_NEONV7
840 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
841  const lv_32fc_t* input,
842  const float* taps,
843  unsigned int num_points);
844 #endif /*LV_HAVE_NEONV7*/
845 
846 #ifdef LV_HAVE_NEONV7
847 extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
848  const lv_32fc_t* input,
849  const float* taps,
850  unsigned int num_points);
851 #endif /*LV_HAVE_NEONV7*/
852 
853 #ifdef LV_HAVE_NEONV7
854 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
855  const lv_32fc_t* input,
856  const float* taps,
857  unsigned int num_points);
858 #endif /*LV_HAVE_NEONV7*/
859 
860 #ifdef LV_HAVE_SSE
861 
862 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
863  const lv_32fc_t* input,
864  const float* taps,
865  unsigned int num_points)
866 {
867 
868  unsigned int number = 0;
869  const unsigned int eighthPoints = num_points / 8;
870 
871  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
872  const float* aPtr = (float*)input;
873  const float* bPtr = taps;
874 
875  __m128 a0Val, a1Val, a2Val, a3Val;
876  __m128 b0Val, b1Val, b2Val, b3Val;
877  __m128 x0Val, x1Val, x2Val, x3Val;
878  __m128 c0Val, c1Val, c2Val, c3Val;
879 
880  __m128 dotProdVal0 = _mm_setzero_ps();
881  __m128 dotProdVal1 = _mm_setzero_ps();
882  __m128 dotProdVal2 = _mm_setzero_ps();
883  __m128 dotProdVal3 = _mm_setzero_ps();
884 
885  for (; number < eighthPoints; number++) {
886 
887  a0Val = _mm_loadu_ps(aPtr);
888  a1Val = _mm_loadu_ps(aPtr + 4);
889  a2Val = _mm_loadu_ps(aPtr + 8);
890  a3Val = _mm_loadu_ps(aPtr + 12);
891 
892  x0Val = _mm_loadu_ps(bPtr);
893  x1Val = _mm_loadu_ps(bPtr);
894  x2Val = _mm_loadu_ps(bPtr + 4);
895  x3Val = _mm_loadu_ps(bPtr + 4);
896  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
897  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
898  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
899  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
900 
901  c0Val = _mm_mul_ps(a0Val, b0Val);
902  c1Val = _mm_mul_ps(a1Val, b1Val);
903  c2Val = _mm_mul_ps(a2Val, b2Val);
904  c3Val = _mm_mul_ps(a3Val, b3Val);
905 
906  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
907  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
908  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
909  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
910 
911  aPtr += 16;
912  bPtr += 8;
913  }
914 
915  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
916  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
917  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
918 
919  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
920 
921  _mm_store_ps(dotProductVector,
922  dotProdVal0); // Store the results back into the dot product vector
923 
924  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
925  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
926 
927  number = eighthPoints * 8;
928  for (; number < num_points; number++) {
929  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
930  aPtr += 2;
931  bPtr += 1;
932  }
933 
934  *result = returnValue;
935 }
936 
937 #endif /*LV_HAVE_SSE*/
938 
939 #ifdef LV_HAVE_RVV
940 #include <riscv_vector.h>
942 
943 static inline void volk_32fc_32f_dot_prod_32fc_rvv(lv_32fc_t* result,
944  const lv_32fc_t* input,
945  const float* taps,
946  unsigned int num_points)
947 {
948  vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
949  vfloat32m4_t vsumi = vsumr;
950  size_t n = num_points;
951  for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
952  vl = __riscv_vsetvl_e32m4(n);
953  vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)input, vl);
954  vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
955  vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
956  vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
957  vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
958  vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
959  }
960  size_t vl = __riscv_vsetvlmax_e32m1();
961  vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
962  vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
963  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
964  *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
965  __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
966 }
967 #endif /*LV_HAVE_RVV*/
968 
969 #ifdef LV_HAVE_RVVSEG
970 #include <riscv_vector.h>
972 
973 static inline void volk_32fc_32f_dot_prod_32fc_rvvseg(lv_32fc_t* result,
974  const lv_32fc_t* input,
975  const float* taps,
976  unsigned int num_points)
977 {
978  vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
979  vfloat32m4_t vsumi = vsumr;
980  size_t n = num_points;
981  for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
982  vl = __riscv_vsetvl_e32m4(n);
983  vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
984  vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
985  vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
986  vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
987  vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
988  }
989  size_t vl = __riscv_vsetvlmax_e32m1();
990  vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
991  vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
992  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
993  *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
994  __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
995 }
996 #endif /*LV_HAVE_RVVSEG*/
997 
998 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_32f_dot_prod_32fc_a_avx
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:231
i
for i
Definition: volk_config_fixed.tmpl.h:13
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
volk_common.h
volk_32fc_32f_dot_prod_32fc_u_sse
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:862
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_32f_dot_prod_32fc_neon_unroll
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:633
volk_32fc_32f_dot_prod_32fc_u_avx
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:547
volk_rvv_intrinsics.h
volk_32fc_32f_dot_prod_32fc_a_sse
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:318
volk_32fc_32f_dot_prod_32fc_a_neon
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:714
volk_32fc_32f_dot_prod_32fc_generic
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:58