Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_accumulator_s32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
51 #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52 #define INCLUDED_volk_32fc_accumulator_s32fc_a_H
53 
54 #include <inttypes.h>
55 #include <volk/volk_common.h>
56 
57 #ifdef LV_HAVE_AVX512F
58 #include <immintrin.h>
59 
60 static inline void volk_32fc_accumulator_s32fc_a_avx512f(lv_32fc_t* result,
61  const lv_32fc_t* inputBuffer,
62  unsigned int num_points)
63 {
64  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
65  unsigned int number = 0;
66  const unsigned int eighthPoints = num_points / 8;
67 
68  const lv_32fc_t* aPtr = inputBuffer;
69  __VOLK_ATTR_ALIGNED(64) float tempBuffer[16];
70 
71  __m512 accumulator = _mm512_setzero_ps();
72  __m512 aVal = _mm512_setzero_ps();
73 
74  for (; number < eighthPoints; number++) {
75  aVal = _mm512_load_ps((float*)aPtr);
76  accumulator = _mm512_add_ps(accumulator, aVal);
77  aPtr += 8;
78  }
79 
80  _mm512_store_ps(tempBuffer, accumulator);
81 
82  // Sum pairs as complex numbers
83  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
84  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
85  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
86  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
87  returnValue += lv_cmake(tempBuffer[8], tempBuffer[9]);
88  returnValue += lv_cmake(tempBuffer[10], tempBuffer[11]);
89  returnValue += lv_cmake(tempBuffer[12], tempBuffer[13]);
90  returnValue += lv_cmake(tempBuffer[14], tempBuffer[15]);
91 
92  number = eighthPoints * 8;
93  for (; number < num_points; number++) {
94  returnValue += (*aPtr++);
95  }
96  *result = returnValue;
97 }
98 #endif /* LV_HAVE_AVX512F */
99 
100 
101 #ifdef LV_HAVE_AVX512F
102 #include <immintrin.h>
103 
104 static inline void volk_32fc_accumulator_s32fc_u_avx512f(lv_32fc_t* result,
105  const lv_32fc_t* inputBuffer,
106  unsigned int num_points)
107 {
108  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
109  unsigned int number = 0;
110  const unsigned int eighthPoints = num_points / 8;
111 
112  const lv_32fc_t* aPtr = inputBuffer;
113  __VOLK_ATTR_ALIGNED(64) float tempBuffer[16];
114 
115  __m512 accumulator = _mm512_setzero_ps();
116  __m512 aVal = _mm512_setzero_ps();
117 
118  for (; number < eighthPoints; number++) {
119  aVal = _mm512_loadu_ps((float*)aPtr);
120  accumulator = _mm512_add_ps(accumulator, aVal);
121  aPtr += 8;
122  }
123 
124  _mm512_store_ps(tempBuffer, accumulator);
125 
126  // Sum pairs as complex numbers
127  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
128  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
129  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
130  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
131  returnValue += lv_cmake(tempBuffer[8], tempBuffer[9]);
132  returnValue += lv_cmake(tempBuffer[10], tempBuffer[11]);
133  returnValue += lv_cmake(tempBuffer[12], tempBuffer[13]);
134  returnValue += lv_cmake(tempBuffer[14], tempBuffer[15]);
135 
136  number = eighthPoints * 8;
137  for (; number < num_points; number++) {
138  returnValue += (*aPtr++);
139  }
140  *result = returnValue;
141 }
142 #endif /* LV_HAVE_AVX512F */
143 
144 
145 #ifdef LV_HAVE_GENERIC
147  const lv_32fc_t* inputBuffer,
148  unsigned int num_points)
149 {
150  const lv_32fc_t* aPtr = inputBuffer;
151  unsigned int number = 0;
152  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
153 
154  for (; number < num_points; number++) {
155  returnValue += (*aPtr++);
156  }
157  *result = returnValue;
158 }
159 #endif /* LV_HAVE_GENERIC */
160 
161 #ifdef LV_HAVE_AVX
162 #include <immintrin.h>
163 
164 static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result,
165  const lv_32fc_t* inputBuffer,
166  unsigned int num_points)
167 {
168  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
169  unsigned int number = 0;
170  const unsigned int quarterPoints = num_points / 4;
171 
172  const lv_32fc_t* aPtr = inputBuffer;
173  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
174 
175  __m256 accumulator = _mm256_setzero_ps();
176  __m256 aVal = _mm256_setzero_ps();
177 
178  for (; number < quarterPoints; number++) {
179  aVal = _mm256_loadu_ps((float*)aPtr);
180  accumulator = _mm256_add_ps(accumulator, aVal);
181  aPtr += 4;
182  }
183 
184  _mm256_store_ps(tempBuffer, accumulator);
185 
186  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
187  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
188  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
189  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
190 
191  number = quarterPoints * 4;
192  for (; number < num_points; number++) {
193  returnValue += (*aPtr++);
194  }
195  *result = returnValue;
196 }
197 #endif /* LV_HAVE_AVX */
198 
199 #ifdef LV_HAVE_SSE
200 #include <xmmintrin.h>
201 
202 static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result,
203  const lv_32fc_t* inputBuffer,
204  unsigned int num_points)
205 {
206  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
207  unsigned int number = 0;
208  const unsigned int halfPoints = num_points / 2;
209 
210  const lv_32fc_t* aPtr = inputBuffer;
211  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
212 
213  __m128 accumulator = _mm_setzero_ps();
214  __m128 aVal = _mm_setzero_ps();
215 
216  for (; number < halfPoints; number++) {
217  aVal = _mm_loadu_ps((float*)aPtr);
218  accumulator = _mm_add_ps(accumulator, aVal);
219  aPtr += 2;
220  }
221 
222  _mm_store_ps(tempBuffer, accumulator);
223 
224  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
225  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
226 
227  number = halfPoints * 2;
228  for (; number < num_points; number++) {
229  returnValue += (*aPtr++);
230  }
231  *result = returnValue;
232 }
233 #endif /* LV_HAVE_SSE */
234 
235 #ifdef LV_HAVE_AVX
236 #include <immintrin.h>
237 
238 static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result,
239  const lv_32fc_t* inputBuffer,
240  unsigned int num_points)
241 {
242  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
243  unsigned int number = 0;
244  const unsigned int quarterPoints = num_points / 4;
245 
246  const lv_32fc_t* aPtr = inputBuffer;
247  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
248 
249  __m256 accumulator = _mm256_setzero_ps();
250  __m256 aVal = _mm256_setzero_ps();
251 
252  for (; number < quarterPoints; number++) {
253  aVal = _mm256_load_ps((float*)aPtr);
254  accumulator = _mm256_add_ps(accumulator, aVal);
255  aPtr += 4;
256  }
257 
258  _mm256_store_ps(tempBuffer, accumulator);
259 
260  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
261  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
262  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
263  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
264 
265  number = quarterPoints * 4;
266  for (; number < num_points; number++) {
267  returnValue += (*aPtr++);
268  }
269  *result = returnValue;
270 }
271 #endif /* LV_HAVE_AVX */
272 
273 #ifdef LV_HAVE_SSE
274 #include <xmmintrin.h>
275 
276 static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result,
277  const lv_32fc_t* inputBuffer,
278  unsigned int num_points)
279 {
280  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
281  unsigned int number = 0;
282  const unsigned int halfPoints = num_points / 2;
283 
284  const lv_32fc_t* aPtr = inputBuffer;
285  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
286 
287  __m128 accumulator = _mm_setzero_ps();
288  __m128 aVal = _mm_setzero_ps();
289 
290  for (; number < halfPoints; number++) {
291  aVal = _mm_load_ps((float*)aPtr);
292  accumulator = _mm_add_ps(accumulator, aVal);
293  aPtr += 2;
294  }
295 
296  _mm_store_ps(tempBuffer, accumulator);
297 
298  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
299  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
300 
301  number = halfPoints * 2;
302  for (; number < num_points; number++) {
303  returnValue += (*aPtr++);
304  }
305  *result = returnValue;
306 }
307 #endif /* LV_HAVE_SSE */
308 
309 #ifdef LV_HAVE_NEON
310 #include <arm_neon.h>
311 static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
312  const lv_32fc_t* inputBuffer,
313  unsigned int num_points)
314 {
315  const lv_32fc_t* aPtr = inputBuffer;
316  unsigned int number = 0;
317  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
318  unsigned int eighthPoints = num_points / 8;
319  float32x4_t in_vec;
320  float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
321  float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
322  float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
323  float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
324  __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];
325 
326  for (; number < eighthPoints; number++) {
327  in_vec = vld1q_f32((float*)aPtr);
328  out_vec0 = vaddq_f32(in_vec, out_vec0);
329  aPtr += 2;
330 
331  in_vec = vld1q_f32((float*)aPtr);
332  out_vec1 = vaddq_f32(in_vec, out_vec1);
333  aPtr += 2;
334 
335  in_vec = vld1q_f32((float*)aPtr);
336  out_vec2 = vaddq_f32(in_vec, out_vec2);
337  aPtr += 2;
338 
339  in_vec = vld1q_f32((float*)aPtr);
340  out_vec3 = vaddq_f32(in_vec, out_vec3);
341  aPtr += 2;
342  }
343  vst1q_f32(tempBuffer, out_vec0);
344  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
345  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
346 
347  vst1q_f32(tempBuffer, out_vec1);
348  returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
349  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
350 
351  vst1q_f32(tempBuffer, out_vec2);
352  returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
353  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
354 
355  vst1q_f32(tempBuffer, out_vec3);
356  returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
357  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
358 
359  number = eighthPoints * 8;
360  for (; number < num_points; number++) {
361  returnValue += (*aPtr++);
362  }
363  *result = returnValue;
364 }
365 #endif /* LV_HAVE_NEON */
366 
367 #ifdef LV_HAVE_NEONV8
368 #include <arm_neon.h>
369 
370 static inline void volk_32fc_accumulator_s32fc_neonv8(lv_32fc_t* result,
371  const lv_32fc_t* inputBuffer,
372  unsigned int num_points)
373 {
374  const lv_32fc_t* aPtr = inputBuffer;
375  unsigned int number = 0;
376  const unsigned int eighthPoints = num_points / 8;
377 
378  /* Keep interleaved like neon version - vld1q is faster than vld2q */
379  float32x4_t in_vec;
380  float32x4_t out_vec0 = vdupq_n_f32(0.f);
381  float32x4_t out_vec1 = vdupq_n_f32(0.f);
382  float32x4_t out_vec2 = vdupq_n_f32(0.f);
383  float32x4_t out_vec3 = vdupq_n_f32(0.f);
384 
385  for (; number < eighthPoints; number++) {
386  in_vec = vld1q_f32((float*)aPtr);
387  out_vec0 = vaddq_f32(in_vec, out_vec0);
388  aPtr += 2;
389 
390  in_vec = vld1q_f32((float*)aPtr);
391  out_vec1 = vaddq_f32(in_vec, out_vec1);
392  aPtr += 2;
393 
394  in_vec = vld1q_f32((float*)aPtr);
395  out_vec2 = vaddq_f32(in_vec, out_vec2);
396  aPtr += 2;
397 
398  in_vec = vld1q_f32((float*)aPtr);
399  out_vec3 = vaddq_f32(in_vec, out_vec3);
400  aPtr += 2;
401  }
402 
403  /* Combine the 4 accumulators */
404  out_vec0 = vaddq_f32(out_vec0, out_vec1);
405  out_vec2 = vaddq_f32(out_vec2, out_vec3);
406  out_vec0 = vaddq_f32(out_vec0, out_vec2);
407 
408  /* Horizontal reduction: out_vec0 = [sum_r0, sum_i0, sum_r1, sum_i1] */
409  /* We need real = sum_r0 + sum_r1, imag = sum_i0 + sum_i1 */
410  float32x2_t low = vget_low_f32(out_vec0); /* [sum_r0, sum_i0] */
411  float32x2_t high = vget_high_f32(out_vec0); /* [sum_r1, sum_i1] */
412  float32x2_t sum = vadd_f32(low, high); /* [real_sum, imag_sum] */
413 
414  lv_32fc_t returnValue = lv_cmake(vget_lane_f32(sum, 0), vget_lane_f32(sum, 1));
415 
416  /* Tail case */
417  for (number = eighthPoints * 8; number < num_points; number++) {
418  returnValue += (*aPtr++);
419  }
420 
421  *result = returnValue;
422 }
423 
424 #endif /* LV_HAVE_NEONV8 */
425 
426 #ifdef LV_HAVE_RVV
427 #include <riscv_vector.h>
429 
430 static inline void volk_32fc_accumulator_s32fc_rvv(lv_32fc_t* result,
431  const lv_32fc_t* inputBuffer,
432  unsigned int num_points)
433 {
434  size_t vlmax = __riscv_vsetvlmax_e32m8();
435  vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, vlmax);
436  const float* in = (const float*)inputBuffer;
437  size_t n = num_points * 2;
438  for (size_t vl; n > 0; n -= vl, in += vl) {
439  vl = __riscv_vsetvl_e32m8(n < vlmax ? n : vlmax); /* force exact vl */
440  vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
441  vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
442  }
443  vuint64m8_t vsumu = __riscv_vreinterpret_u64m8(__riscv_vreinterpret_u32m8(vsum));
444  vfloat32m4_t vsum1 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 0, vlmax));
445  vfloat32m4_t vsum2 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 32, vlmax));
446  vlmax = __riscv_vsetvlmax_e32m1();
447  vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsum1);
448  vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsum2);
449  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vlmax);
450  *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vlmax)),
451  __riscv_vfmv_f(__riscv_vfredusum(vi, z, vlmax)));
452 }
453 #endif /*LV_HAVE_RVV*/
454 
455 #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
volk_32fc_accumulator_s32fc_u_sse
static void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:202
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
volk_common.h
volk_32fc_accumulator_s32fc_u_avx
static void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:164
volk_32fc_accumulator_s32fc_a_avx
static void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:238
volk_32fc_accumulator_s32fc_a_sse
static void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:276
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_accumulator_s32fc_generic
static void volk_32fc_accumulator_s32fc_generic(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:146
volk_rvv_intrinsics.h
volk_32fc_accumulator_s32fc_neon
static void volk_32fc_accumulator_s32fc_neon(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:311