Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
45 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
47 
48 #include <stdio.h>
49 #include <volk/volk_common.h>
50 
51 
52 #ifdef LV_HAVE_GENERIC
53 
55  const short* input,
56  const lv_32fc_t* taps,
57  unsigned int num_points)
58 {
59 
60  static const int N_UNROLL = 4;
61 
62  lv_32fc_t acc0 = 0;
63  lv_32fc_t acc1 = 0;
64  lv_32fc_t acc2 = 0;
65  lv_32fc_t acc3 = 0;
66 
67  unsigned i = 0;
68  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
69 
70  for (i = 0; i < n; i += N_UNROLL) {
71  acc0 += taps[i + 0] * (float)input[i + 0];
72  acc1 += taps[i + 1] * (float)input[i + 1];
73  acc2 += taps[i + 2] * (float)input[i + 2];
74  acc3 += taps[i + 3] * (float)input[i + 3];
75  }
76 
77  for (; i < num_points; i++) {
78  acc0 += taps[i] * (float)input[i];
79  }
80 
81  *result = acc0 + acc1 + acc2 + acc3;
82 }
83 
84 #endif /*LV_HAVE_GENERIC*/
85 
86 #ifdef LV_HAVE_NEON
87 #include <arm_neon.h>
/*!
 * NEON implementation of the mixed int16 / complex-float dot product.
 *
 * Processes four sample/tap pairs per iteration and finishes any remaining
 * points with scalar arithmetic.
 *
 * \param result     Receives the complex dot product.
 * \param input      num_points 16-bit integer samples.
 * \param taps       num_points complex float taps.
 * \param num_points Number of sample/tap pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned ii;
    unsigned quarter_points = num_points / 4;
    /* Both buffers are only read, so the const qualifiers are kept. */
    const lv_32fc_t* tapsPtr = taps;
    const short* inputPtr = input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    /* val[0] accumulates real parts, val[1] imaginary parts. */
    accumulator_val.val[0] = vdupq_n_f32(0.0);
    accumulator_val.val[1] = vdupq_n_f32(0.0);

    for (ii = 0; ii < quarter_points; ++ii) {
        /* de-interleaving load: val[0] = 4 reals, val[1] = 4 imaginaries */
        tapsVal = vld2q_f32((const float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        // widen 16-bit int to 32-bit int
        input32 = vmovl_s16(input16);
        // convert 32-bit int to float (plain conversion, no scaling)
        input_float = vcvtq_f32_s32(input32);

        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }
    /* Re-interleave the lanes into four complex partial sums and reduce them. */
    vst2q_f32((float*)accumulator_vec, accumulator_val);
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    /* Scalar tail for the num_points % 4 leftover points. */
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}
136 
137 #endif /*LV_HAVE_NEON*/
138 
139 #ifdef LV_HAVE_NEONV8
140 #include <arm_neon.h>
141 
/*!
 * ARMv8 NEON implementation of the mixed int16 / complex-float dot product.
 *
 * Consumes eight sample/tap pairs per iteration with fused multiply-adds into
 * two independent pairs of real/imaginary accumulators, then finishes the
 * remaining points with scalar arithmetic.
 *
 * \param result     Receives the complex dot product.
 * \param input      num_points 16-bit integer samples.
 * \param taps       num_points complex float taps.
 * \param num_points Number of sample/tap pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_neonv8(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 8;
    const short* samples = input;
    const lv_32fc_t* coeffs = taps;

    /* Two accumulator pairs so consecutive FMAs do not depend on each other. */
    float32x4_t re_first = vdupq_n_f32(0);
    float32x4_t im_first = vdupq_n_f32(0);
    float32x4_t re_second = vdupq_n_f32(0);
    float32x4_t im_second = vdupq_n_f32(0);

    for (unsigned int blk = 0; blk < num_blocks; blk++) {
        /* Eight int16 samples, widened to int32 and converted to float. */
        const int16x8_t raw = vld1q_s16(samples);
        const float32x4_t first_four = vcvtq_f32_s32(vmovl_s16(vget_low_s16(raw)));
        const float32x4_t last_four = vcvtq_f32_s32(vmovl_s16(vget_high_s16(raw)));

        /* Eight complex taps, split into real (val[0]) and imaginary (val[1]). */
        const float32x4x2_t taps_first = vld2q_f32((const float*)coeffs);
        const float32x4x2_t taps_second = vld2q_f32((const float*)(coeffs + 4));
        __VOLK_PREFETCH(samples + 16);
        __VOLK_PREFETCH(coeffs + 16);

        re_first = vfmaq_f32(re_first, first_four, taps_first.val[0]);
        im_first = vfmaq_f32(im_first, first_four, taps_first.val[1]);
        re_second = vfmaq_f32(re_second, last_four, taps_second.val[0]);
        im_second = vfmaq_f32(im_second, last_four, taps_second.val[1]);

        samples += 8;
        coeffs += 8;
    }

    /* Merge the two accumulator pairs, then sum across lanes. */
    const float re_total = vaddvq_f32(vaddq_f32(re_first, re_second));
    const float im_total = vaddvq_f32(vaddq_f32(im_first, im_second));

    lv_32fc_t total = lv_cmake(re_total, im_total);

    /* Scalar tail: walk the taps as interleaved (real, imag) floats. */
    const float* tail_taps = (const float*)coeffs;
    for (unsigned int k = num_blocks * 8; k < num_points; k++) {
        total += lv_cmake(samples[0] * tail_taps[0], samples[0] * tail_taps[1]);
        samples += 1;
        tail_taps += 2;
    }

    *result = total;
}
199 #endif /*LV_HAVE_NEONV8*/
200 
201 #if LV_HAVE_SSE && LV_HAVE_MMX
202 
203 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
204  const short* input,
205  const lv_32fc_t* taps,
206  unsigned int num_points)
207 {
208 
209  unsigned int number = 0;
210  const unsigned int eighthPoints = num_points / 8;
211 
212  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
213  const short* aPtr = input;
214  const float* bPtr = (float*)taps;
215 
216  __m64 m0, m1;
217  __m128 f0, f1, f2, f3;
218  __m128 a0Val, a1Val, a2Val, a3Val;
219  __m128 b0Val, b1Val, b2Val, b3Val;
220  __m128 c0Val, c1Val, c2Val, c3Val;
221 
222  __m128 dotProdVal0 = _mm_setzero_ps();
223  __m128 dotProdVal1 = _mm_setzero_ps();
224  __m128 dotProdVal2 = _mm_setzero_ps();
225  __m128 dotProdVal3 = _mm_setzero_ps();
226 
227  for (; number < eighthPoints; number++) {
228 
229  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
230  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
231  f0 = _mm_cvtpi16_ps(m0);
232  f1 = _mm_cvtpi16_ps(m0);
233  f2 = _mm_cvtpi16_ps(m1);
234  f3 = _mm_cvtpi16_ps(m1);
235 
236  a0Val = _mm_unpacklo_ps(f0, f1);
237  a1Val = _mm_unpackhi_ps(f0, f1);
238  a2Val = _mm_unpacklo_ps(f2, f3);
239  a3Val = _mm_unpackhi_ps(f2, f3);
240 
241  b0Val = _mm_loadu_ps(bPtr);
242  b1Val = _mm_loadu_ps(bPtr + 4);
243  b2Val = _mm_loadu_ps(bPtr + 8);
244  b3Val = _mm_loadu_ps(bPtr + 12);
245 
246  c0Val = _mm_mul_ps(a0Val, b0Val);
247  c1Val = _mm_mul_ps(a1Val, b1Val);
248  c2Val = _mm_mul_ps(a2Val, b2Val);
249  c3Val = _mm_mul_ps(a3Val, b3Val);
250 
251  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
252  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
253  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
254  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
255 
256  aPtr += 8;
257  bPtr += 16;
258  }
259 
260  _mm_empty(); // clear the mmx technology state
261 
262  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
263  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
264  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
265 
266  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
267 
268  _mm_store_ps(dotProductVector,
269  dotProdVal0); // Store the results back into the dot product vector
270 
271  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
272  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
273 
274  number = eighthPoints * 8;
275  for (; number < num_points; number++) {
276  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
277  aPtr += 1;
278  bPtr += 2;
279  }
280 
281  *result = returnValue;
282 }
283 
284 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
285 
286 
287 #if LV_HAVE_AVX2 && LV_HAVE_FMA
288 
289 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
290  const short* input,
291  const lv_32fc_t* taps,
292  unsigned int num_points)
293 {
294 
295  unsigned int number = 0;
296  const unsigned int sixteenthPoints = num_points / 16;
297 
298  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
299  const short* aPtr = input;
300  const float* bPtr = (float*)taps;
301 
302  __m128i m0, m1;
303  __m256i f0, f1;
304  __m256 g0, g1, h0, h1, h2, h3;
305  __m256 a0Val, a1Val, a2Val, a3Val;
306  __m256 b0Val, b1Val, b2Val, b3Val;
307 
308  __m256 dotProdVal0 = _mm256_setzero_ps();
309  __m256 dotProdVal1 = _mm256_setzero_ps();
310  __m256 dotProdVal2 = _mm256_setzero_ps();
311  __m256 dotProdVal3 = _mm256_setzero_ps();
312 
313  for (; number < sixteenthPoints; number++) {
314 
315  m0 = _mm_loadu_si128((__m128i const*)aPtr);
316  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
317 
318  f0 = _mm256_cvtepi16_epi32(m0);
319  g0 = _mm256_cvtepi32_ps(f0);
320  f1 = _mm256_cvtepi16_epi32(m1);
321  g1 = _mm256_cvtepi32_ps(f1);
322 
323  h0 = _mm256_unpacklo_ps(g0, g0);
324  h1 = _mm256_unpackhi_ps(g0, g0);
325  h2 = _mm256_unpacklo_ps(g1, g1);
326  h3 = _mm256_unpackhi_ps(g1, g1);
327 
328  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
329  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
330  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
331  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
332 
333  b0Val = _mm256_loadu_ps(bPtr);
334  b1Val = _mm256_loadu_ps(bPtr + 8);
335  b2Val = _mm256_loadu_ps(bPtr + 16);
336  b3Val = _mm256_loadu_ps(bPtr + 24);
337 
338  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
339  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
340  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
341  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
342 
343  aPtr += 16;
344  bPtr += 32;
345  }
346 
347  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
348  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
349  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
350 
351  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
352 
353  _mm256_store_ps(dotProductVector,
354  dotProdVal0); // Store the results back into the dot product vector
355 
356  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
357  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
358  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
359  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
360 
361  number = sixteenthPoints * 16;
362  for (; number < num_points; number++) {
363  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
364  aPtr += 1;
365  bPtr += 2;
366  }
367 
368  *result = returnValue;
369 }
370 
371 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
372 
373 
374 #ifdef LV_HAVE_AVX2
375 
376 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
377  const short* input,
378  const lv_32fc_t* taps,
379  unsigned int num_points)
380 {
381 
382  unsigned int number = 0;
383  const unsigned int sixteenthPoints = num_points / 16;
384 
385  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
386  const short* aPtr = input;
387  const float* bPtr = (float*)taps;
388 
389  __m128i m0, m1;
390  __m256i f0, f1;
391  __m256 g0, g1, h0, h1, h2, h3;
392  __m256 a0Val, a1Val, a2Val, a3Val;
393  __m256 b0Val, b1Val, b2Val, b3Val;
394  __m256 c0Val, c1Val, c2Val, c3Val;
395 
396  __m256 dotProdVal0 = _mm256_setzero_ps();
397  __m256 dotProdVal1 = _mm256_setzero_ps();
398  __m256 dotProdVal2 = _mm256_setzero_ps();
399  __m256 dotProdVal3 = _mm256_setzero_ps();
400 
401  for (; number < sixteenthPoints; number++) {
402 
403  m0 = _mm_loadu_si128((__m128i const*)aPtr);
404  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
405 
406  f0 = _mm256_cvtepi16_epi32(m0);
407  g0 = _mm256_cvtepi32_ps(f0);
408  f1 = _mm256_cvtepi16_epi32(m1);
409  g1 = _mm256_cvtepi32_ps(f1);
410 
411  h0 = _mm256_unpacklo_ps(g0, g0);
412  h1 = _mm256_unpackhi_ps(g0, g0);
413  h2 = _mm256_unpacklo_ps(g1, g1);
414  h3 = _mm256_unpackhi_ps(g1, g1);
415 
416  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
417  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
418  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
419  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
420 
421  b0Val = _mm256_loadu_ps(bPtr);
422  b1Val = _mm256_loadu_ps(bPtr + 8);
423  b2Val = _mm256_loadu_ps(bPtr + 16);
424  b3Val = _mm256_loadu_ps(bPtr + 24);
425 
426  c0Val = _mm256_mul_ps(a0Val, b0Val);
427  c1Val = _mm256_mul_ps(a1Val, b1Val);
428  c2Val = _mm256_mul_ps(a2Val, b2Val);
429  c3Val = _mm256_mul_ps(a3Val, b3Val);
430 
431  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
432  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
433  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
434  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
435 
436  aPtr += 16;
437  bPtr += 32;
438  }
439 
440  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
441  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
442  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
443 
444  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
445 
446  _mm256_store_ps(dotProductVector,
447  dotProdVal0); // Store the results back into the dot product vector
448 
449  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
450  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
451  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
452  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
453 
454  number = sixteenthPoints * 16;
455  for (; number < num_points; number++) {
456  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
457  aPtr += 1;
458  bPtr += 2;
459  }
460 
461  *result = returnValue;
462 }
463 
464 #endif /*LV_HAVE_AVX2*/
465 
466 
467 #if LV_HAVE_SSE && LV_HAVE_MMX
468 
469 
470 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
471  const short* input,
472  const lv_32fc_t* taps,
473  unsigned int num_points)
474 {
475 
476  unsigned int number = 0;
477  const unsigned int eighthPoints = num_points / 8;
478 
479  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
480  const short* aPtr = input;
481  const float* bPtr = (float*)taps;
482 
483  __m64 m0, m1;
484  __m128 f0, f1, f2, f3;
485  __m128 a0Val, a1Val, a2Val, a3Val;
486  __m128 b0Val, b1Val, b2Val, b3Val;
487  __m128 c0Val, c1Val, c2Val, c3Val;
488 
489  __m128 dotProdVal0 = _mm_setzero_ps();
490  __m128 dotProdVal1 = _mm_setzero_ps();
491  __m128 dotProdVal2 = _mm_setzero_ps();
492  __m128 dotProdVal3 = _mm_setzero_ps();
493 
494  for (; number < eighthPoints; number++) {
495 
496  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
497  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
498  f0 = _mm_cvtpi16_ps(m0);
499  f1 = _mm_cvtpi16_ps(m0);
500  f2 = _mm_cvtpi16_ps(m1);
501  f3 = _mm_cvtpi16_ps(m1);
502 
503  a0Val = _mm_unpacklo_ps(f0, f1);
504  a1Val = _mm_unpackhi_ps(f0, f1);
505  a2Val = _mm_unpacklo_ps(f2, f3);
506  a3Val = _mm_unpackhi_ps(f2, f3);
507 
508  b0Val = _mm_load_ps(bPtr);
509  b1Val = _mm_load_ps(bPtr + 4);
510  b2Val = _mm_load_ps(bPtr + 8);
511  b3Val = _mm_load_ps(bPtr + 12);
512 
513  c0Val = _mm_mul_ps(a0Val, b0Val);
514  c1Val = _mm_mul_ps(a1Val, b1Val);
515  c2Val = _mm_mul_ps(a2Val, b2Val);
516  c3Val = _mm_mul_ps(a3Val, b3Val);
517 
518  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
519  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
520  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
521  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
522 
523  aPtr += 8;
524  bPtr += 16;
525  }
526 
527  _mm_empty(); // clear the mmx technology state
528 
529  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
530  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
531  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
532 
533  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
534 
535  _mm_store_ps(dotProductVector,
536  dotProdVal0); // Store the results back into the dot product vector
537 
538  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
539  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
540 
541  number = eighthPoints * 8;
542  for (; number < num_points; number++) {
543  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
544  aPtr += 1;
545  bPtr += 2;
546  }
547 
548  *result = returnValue;
549 }
550 
551 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
552 
553 #ifdef LV_HAVE_AVX2
554 
555 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
556  const short* input,
557  const lv_32fc_t* taps,
558  unsigned int num_points)
559 {
560 
561  unsigned int number = 0;
562  const unsigned int sixteenthPoints = num_points / 16;
563 
564  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
565  const short* aPtr = input;
566  const float* bPtr = (float*)taps;
567 
568  __m128i m0, m1;
569  __m256i f0, f1;
570  __m256 g0, g1, h0, h1, h2, h3;
571  __m256 a0Val, a1Val, a2Val, a3Val;
572  __m256 b0Val, b1Val, b2Val, b3Val;
573  __m256 c0Val, c1Val, c2Val, c3Val;
574 
575  __m256 dotProdVal0 = _mm256_setzero_ps();
576  __m256 dotProdVal1 = _mm256_setzero_ps();
577  __m256 dotProdVal2 = _mm256_setzero_ps();
578  __m256 dotProdVal3 = _mm256_setzero_ps();
579 
580  for (; number < sixteenthPoints; number++) {
581 
582  m0 = _mm_load_si128((__m128i const*)aPtr);
583  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
584 
585  f0 = _mm256_cvtepi16_epi32(m0);
586  g0 = _mm256_cvtepi32_ps(f0);
587  f1 = _mm256_cvtepi16_epi32(m1);
588  g1 = _mm256_cvtepi32_ps(f1);
589 
590  h0 = _mm256_unpacklo_ps(g0, g0);
591  h1 = _mm256_unpackhi_ps(g0, g0);
592  h2 = _mm256_unpacklo_ps(g1, g1);
593  h3 = _mm256_unpackhi_ps(g1, g1);
594 
595  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
596  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
597  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
598  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
599 
600  b0Val = _mm256_load_ps(bPtr);
601  b1Val = _mm256_load_ps(bPtr + 8);
602  b2Val = _mm256_load_ps(bPtr + 16);
603  b3Val = _mm256_load_ps(bPtr + 24);
604 
605  c0Val = _mm256_mul_ps(a0Val, b0Val);
606  c1Val = _mm256_mul_ps(a1Val, b1Val);
607  c2Val = _mm256_mul_ps(a2Val, b2Val);
608  c3Val = _mm256_mul_ps(a3Val, b3Val);
609 
610  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
611  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
612  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
613  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
614 
615  aPtr += 16;
616  bPtr += 32;
617  }
618 
619  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
620  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
621  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
622 
623  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
624 
625  _mm256_store_ps(dotProductVector,
626  dotProdVal0); // Store the results back into the dot product vector
627 
628  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
629  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
630  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
631  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
632 
633  number = sixteenthPoints * 16;
634  for (; number < num_points; number++) {
635  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
636  aPtr += 1;
637  bPtr += 2;
638  }
639 
640  *result = returnValue;
641 }
642 
643 
644 #endif /*LV_HAVE_AVX2*/
645 
646 #if LV_HAVE_AVX2 && LV_HAVE_FMA
647 
648 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
649  const short* input,
650  const lv_32fc_t* taps,
651  unsigned int num_points)
652 {
653 
654  unsigned int number = 0;
655  const unsigned int sixteenthPoints = num_points / 16;
656 
657  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
658  const short* aPtr = input;
659  const float* bPtr = (float*)taps;
660 
661  __m128i m0, m1;
662  __m256i f0, f1;
663  __m256 g0, g1, h0, h1, h2, h3;
664  __m256 a0Val, a1Val, a2Val, a3Val;
665  __m256 b0Val, b1Val, b2Val, b3Val;
666 
667  __m256 dotProdVal0 = _mm256_setzero_ps();
668  __m256 dotProdVal1 = _mm256_setzero_ps();
669  __m256 dotProdVal2 = _mm256_setzero_ps();
670  __m256 dotProdVal3 = _mm256_setzero_ps();
671 
672  for (; number < sixteenthPoints; number++) {
673 
674  m0 = _mm_load_si128((__m128i const*)aPtr);
675  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
676 
677  f0 = _mm256_cvtepi16_epi32(m0);
678  g0 = _mm256_cvtepi32_ps(f0);
679  f1 = _mm256_cvtepi16_epi32(m1);
680  g1 = _mm256_cvtepi32_ps(f1);
681 
682  h0 = _mm256_unpacklo_ps(g0, g0);
683  h1 = _mm256_unpackhi_ps(g0, g0);
684  h2 = _mm256_unpacklo_ps(g1, g1);
685  h3 = _mm256_unpackhi_ps(g1, g1);
686 
687  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
688  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
689  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
690  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
691 
692  b0Val = _mm256_load_ps(bPtr);
693  b1Val = _mm256_load_ps(bPtr + 8);
694  b2Val = _mm256_load_ps(bPtr + 16);
695  b3Val = _mm256_load_ps(bPtr + 24);
696 
697  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
698  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
699  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
700  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
701 
702  aPtr += 16;
703  bPtr += 32;
704  }
705 
706  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
707  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
708  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
709 
710  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
711 
712  _mm256_store_ps(dotProductVector,
713  dotProdVal0); // Store the results back into the dot product vector
714 
715  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
716  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
717  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
718  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
719 
720  number = sixteenthPoints * 16;
721  for (; number < num_points; number++) {
722  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
723  aPtr += 1;
724  bPtr += 2;
725  }
726 
727  *result = returnValue;
728 }
729 
730 
731 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
732 
733 #ifdef LV_HAVE_RVV
734 #include <riscv_vector.h>
736 
/*!
 * RISC-V Vector implementation of the mixed int16 / complex-float dot product.
 *
 * Each complex tap is loaded as one 64-bit word and split into its two 32-bit
 * halves with narrowing shifts; the samples are widened to float and
 * accumulated with vector multiply-accumulate.
 *
 * \param result     Receives the complex dot product.
 * \param input      num_points 16-bit integer samples.
 * \param taps       num_points complex float taps.
 * \param num_points Number of sample/tap pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                   const short* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{
    /* Zero both accumulators over the maximum vector length. */
    vfloat32m4_t acc_re = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t acc_im = acc_re;

    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t chunk = __riscv_vsetvl_e32m4(remaining);

        /* Shift 0 keeps the low half of each tap word, shift 32 the high half. */
        vuint64m8_t packed = __riscv_vle64_v_u64m8((const uint64_t*)taps, chunk);
        vfloat32m4_t tap_re = __riscv_vreinterpret_f32m4(__riscv_vnsrl(packed, 0, chunk));
        vfloat32m4_t tap_im =
            __riscv_vreinterpret_f32m4(__riscv_vnsrl(packed, 32, chunk));

        vfloat32m4_t samples = __riscv_vfwcvt_f(
            __riscv_vle16_v_i16m2((const int16_t*)input, chunk), chunk);

        /* Tail-undisturbed, so lanes past `chunk` keep their running sums. */
        acc_re = __riscv_vfmacc_tu(acc_re, tap_re, samples, chunk);
        acc_im = __riscv_vfmacc_tu(acc_im, tap_im, samples, chunk);

        remaining -= chunk;
        input += chunk;
        taps += chunk;
    }

    /* Fold each m4 accumulator to m1, then reduce it to a scalar. */
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, acc_re);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, acc_im);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
762 #endif /*LV_HAVE_RVV*/
763 
764 #ifdef LV_HAVE_RVVSEG
765 #include <riscv_vector.h>
767 
/*!
 * RISC-V Vector implementation of the mixed int16 / complex-float dot product
 * using a two-field segment load for the taps.
 *
 * The segment load splits the interleaved tap floats into two vectors
 * (field 0 and field 1); the samples are widened to float and accumulated
 * with vector multiply-accumulate.
 *
 * \param result     Receives the complex dot product.
 * \param input      num_points 16-bit integer samples.
 * \param taps       num_points complex float taps.
 * \param num_points Number of sample/tap pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    /* Zero both accumulators over the maximum vector length. */
    vfloat32m4_t acc_re = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t acc_im = acc_re;

    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t chunk = __riscv_vsetvl_e32m4(remaining);

        /* Field 0 feeds the real accumulator, field 1 the imaginary one. */
        vfloat32m4x2_t split = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, chunk);
        vfloat32m4_t tap_re = __riscv_vget_f32m4(split, 0);
        vfloat32m4_t tap_im = __riscv_vget_f32m4(split, 1);

        vfloat32m4_t samples = __riscv_vfwcvt_f(
            __riscv_vle16_v_i16m2((const int16_t*)input, chunk), chunk);

        /* Tail-undisturbed, so lanes past `chunk` keep their running sums. */
        acc_re = __riscv_vfmacc_tu(acc_re, tap_re, samples, chunk);
        acc_im = __riscv_vfmacc_tu(acc_im, tap_im, samples, chunk);

        remaining -= chunk;
        input += chunk;
        taps += chunk;
    }

    /* Fold each m4 accumulator to m1, then reduce it to a scalar. */
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, acc_re);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, acc_im);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
793 #endif /*LV_HAVE_RVVSEG*/
794 
795 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
volk_16i_32fc_dot_prod_32fc_neon
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:88
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
volk_16i_32fc_dot_prod_32fc_generic
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:54
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
i
for i
Definition: volk_config_fixed.tmpl.h:13
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
volk_common.h
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_rvv_intrinsics.h