Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_accumulator_s32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
50 #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
51 #define INCLUDED_volk_32f_accumulator_s32f_a_H
52 
53 #include <inttypes.h>
54 #include <volk/volk_common.h>
55 
56 #ifdef LV_HAVE_AVX512F
57 #include <immintrin.h>
58 
59 static inline void volk_32f_accumulator_s32f_a_avx512f(float* result,
60  const float* inputBuffer,
61  unsigned int num_points)
62 {
63  float returnValue = 0;
64  unsigned int number = 0;
65  const unsigned int sixteenthPoints = num_points / 16;
66 
67  const float* aPtr = inputBuffer;
68 
69  __m512 accumulator = _mm512_setzero_ps();
70  __m512 aVal = _mm512_setzero_ps();
71 
72  for (; number < sixteenthPoints; number++) {
73  aVal = _mm512_load_ps(aPtr);
74  accumulator = _mm512_add_ps(accumulator, aVal);
75  aPtr += 16;
76  }
77 
78  // Horizontal sum using AVX512 reduce instruction
79  returnValue = _mm512_reduce_add_ps(accumulator);
80 
81  number = sixteenthPoints * 16;
82  for (; number < num_points; number++) {
83  returnValue += (*aPtr++);
84  }
85  *result = returnValue;
86 }
87 #endif /* LV_HAVE_AVX512F */
88 
89 
90 #ifdef LV_HAVE_AVX
91 #include <immintrin.h>
92 
93 static inline void volk_32f_accumulator_s32f_a_avx(float* result,
94  const float* inputBuffer,
95  unsigned int num_points)
96 {
97  float returnValue = 0;
98  unsigned int number = 0;
99  const unsigned int eighthPoints = num_points / 8;
100 
101  const float* aPtr = inputBuffer;
102  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
103 
104  __m256 accumulator = _mm256_setzero_ps();
105  __m256 aVal = _mm256_setzero_ps();
106 
107  for (; number < eighthPoints; number++) {
108  aVal = _mm256_load_ps(aPtr);
109  accumulator = _mm256_add_ps(accumulator, aVal);
110  aPtr += 8;
111  }
112 
113  _mm256_store_ps(tempBuffer, accumulator);
114 
115  returnValue = tempBuffer[0];
116  returnValue += tempBuffer[1];
117  returnValue += tempBuffer[2];
118  returnValue += tempBuffer[3];
119  returnValue += tempBuffer[4];
120  returnValue += tempBuffer[5];
121  returnValue += tempBuffer[6];
122  returnValue += tempBuffer[7];
123 
124  number = eighthPoints * 8;
125  for (; number < num_points; number++) {
126  returnValue += (*aPtr++);
127  }
128  *result = returnValue;
129 }
130 #endif /* LV_HAVE_AVX */
131 
132 
133 #ifdef LV_HAVE_AVX512F
134 #include <immintrin.h>
135 
136 static inline void volk_32f_accumulator_s32f_u_avx512f(float* result,
137  const float* inputBuffer,
138  unsigned int num_points)
139 {
140  float returnValue = 0;
141  unsigned int number = 0;
142  const unsigned int sixteenthPoints = num_points / 16;
143 
144  const float* aPtr = inputBuffer;
145 
146  __m512 accumulator = _mm512_setzero_ps();
147  __m512 aVal = _mm512_setzero_ps();
148 
149  for (; number < sixteenthPoints; number++) {
150  aVal = _mm512_loadu_ps(aPtr);
151  accumulator = _mm512_add_ps(accumulator, aVal);
152  aPtr += 16;
153  }
154 
155  // Horizontal sum using AVX512 reduce instruction
156  returnValue = _mm512_reduce_add_ps(accumulator);
157 
158  number = sixteenthPoints * 16;
159  for (; number < num_points; number++) {
160  returnValue += (*aPtr++);
161  }
162  *result = returnValue;
163 }
164 #endif /* LV_HAVE_AVX512F */
165 
166 
167 #ifdef LV_HAVE_AVX
168 #include <immintrin.h>
169 
170 static inline void volk_32f_accumulator_s32f_u_avx(float* result,
171  const float* inputBuffer,
172  unsigned int num_points)
173 {
174  float returnValue = 0;
175  unsigned int number = 0;
176  const unsigned int eighthPoints = num_points / 8;
177 
178  const float* aPtr = inputBuffer;
179  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
180 
181  __m256 accumulator = _mm256_setzero_ps();
182  __m256 aVal = _mm256_setzero_ps();
183 
184  for (; number < eighthPoints; number++) {
185  aVal = _mm256_loadu_ps(aPtr);
186  accumulator = _mm256_add_ps(accumulator, aVal);
187  aPtr += 8;
188  }
189 
190  _mm256_store_ps(tempBuffer, accumulator);
191 
192  returnValue = tempBuffer[0];
193  returnValue += tempBuffer[1];
194  returnValue += tempBuffer[2];
195  returnValue += tempBuffer[3];
196  returnValue += tempBuffer[4];
197  returnValue += tempBuffer[5];
198  returnValue += tempBuffer[6];
199  returnValue += tempBuffer[7];
200 
201  number = eighthPoints * 8;
202  for (; number < num_points; number++) {
203  returnValue += (*aPtr++);
204  }
205  *result = returnValue;
206 }
207 #endif /* LV_HAVE_AVX */
208 
209 
210 #ifdef LV_HAVE_SSE
211 #include <xmmintrin.h>
212 
/*
 * Sums num_points floats from inputBuffer and writes the total to *result.
 *
 * SSE variant for aligned input: whole groups of four floats are read with
 * _mm_load_ps, which requires inputBuffer to be 16-byte aligned. The four
 * lane sums are then added in lane order, followed by any leftover points
 * that did not fill a complete vector.
 */
static inline void volk_32f_accumulator_s32f_a_sse(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;
    const float* src = inputBuffer;
    __m128 vecSum = _mm_setzero_ps();

    // Keep four independent running sums, one per vector lane.
    for (unsigned int block = 0; block < blockCount; block++, src += 4) {
        vecSum = _mm_add_ps(vecSum, _mm_load_ps(src));
    }

    // Spill the lanes to memory; the unaligned store places no alignment
    // requirement on the scratch array.
    float lanes[4];
    _mm_storeu_ps(lanes, vecSum);

    // Add the lanes strictly left to right, starting from lane 0.
    float total = lanes[0];
    for (unsigned int lane = 1; lane < 4; lane++) {
        total += lanes[lane];
    }

    // Scalar tail: src now points at the first element not covered above.
    for (unsigned int idx = blockCount * 4; idx < num_points; idx++) {
        total += *src++;
    }

    *result = total;
}
246 #endif /* LV_HAVE_SSE */
247 
248 
249 #ifdef LV_HAVE_SSE
250 #include <xmmintrin.h>
251 
/*
 * Sums num_points floats from inputBuffer and writes the total to *result.
 *
 * SSE variant for unaligned input: whole groups of four floats are read
 * with _mm_loadu_ps, so inputBuffer needs no particular alignment. The four
 * lane sums are then added in lane order, followed by any leftover points
 * that did not fill a complete vector.
 */
static inline void volk_32f_accumulator_s32f_u_sse(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;
    const float* src = inputBuffer;
    __m128 vecSum = _mm_setzero_ps();

    // Keep four independent running sums, one per vector lane.
    for (unsigned int block = 0; block < blockCount; block++, src += 4) {
        vecSum = _mm_add_ps(vecSum, _mm_loadu_ps(src));
    }

    // Spill the lanes to memory; the unaligned store places no alignment
    // requirement on the scratch array.
    float lanes[4];
    _mm_storeu_ps(lanes, vecSum);

    // Add the lanes strictly left to right, starting from lane 0.
    float total = lanes[0];
    for (unsigned int lane = 1; lane < 4; lane++) {
        total += lanes[lane];
    }

    // Scalar tail: src now points at the first element not covered above.
    for (unsigned int idx = blockCount * 4; idx < num_points; idx++) {
        total += *src++;
    }

    *result = total;
}
285 #endif /* LV_HAVE_SSE */
286 
287 
288 #ifdef LV_HAVE_NEON
289 #include <arm_neon.h>
290 
/*
 * Sums num_points floats from inputBuffer and writes the total to *result.
 *
 * NEON variant: whole groups of four floats are accumulated lane-wise, the
 * four lane sums are folded with 64-bit vector adds (no ARMv8-only
 * reduction intrinsic is used), and leftover points are added one by one.
 */
static inline void volk_32f_accumulator_s32f_neon(float* result,
                                                  const float* inputBuffer,
                                                  unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;
    const float* src = inputBuffer;
    float32x4_t vecSum = vdupq_n_f32(0.0f);

    // Keep four independent running sums, one per vector lane.
    for (unsigned int block = 0; block < blockCount; block++, src += 4) {
        vecSum = vaddq_f32(vecSum, vld1q_f32(src));
    }

    // Fold the upper half onto the lower half, then add the two survivors.
    const float32x2_t halves = vadd_f32(vget_low_f32(vecSum), vget_high_f32(vecSum));
    const float32x2_t folded = vpadd_f32(halves, halves);
    float total = vget_lane_f32(folded, 0);

    // Scalar tail: src now points at the first element not covered above.
    for (unsigned int idx = blockCount * 4; idx < num_points; idx++) {
        total += *src++;
    }

    *result = total;
}
321 #endif /* LV_HAVE_NEON */
322 
323 
324 #ifdef LV_HAVE_NEONV8
325 #include <arm_neon.h>
326 
/*
 * Sums num_points floats from inputBuffer and writes the total to *result.
 *
 * ARMv8 NEON variant: processes eight floats per iteration into two
 * separate vector accumulators, merges them, reduces the four lanes with
 * vaddvq_f32, and finally adds leftover points one by one.
 */
static inline void volk_32f_accumulator_s32f_neonv8(float* result,
                                                    const float* inputBuffer,
                                                    unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    const float* src = inputBuffer;
    float32x4_t sumFirst = vdupq_n_f32(0.0f);
    float32x4_t sumSecond = vdupq_n_f32(0.0f);

    // Two accumulators so consecutive adds do not depend on each other.
    for (unsigned int block = 0; block < blockCount; block++) {
        const float32x4_t first = vld1q_f32(src);
        const float32x4_t second = vld1q_f32(src + 4);
        // Hint the address of the next eight-float group.
        __VOLK_PREFETCH(src + 8);
        sumFirst = vaddq_f32(sumFirst, first);
        sumSecond = vaddq_f32(sumSecond, second);
        src += 8;
    }

    // Merge both accumulators, then reduce the four lanes to a scalar.
    float total = vaddvq_f32(vaddq_f32(sumFirst, sumSecond));

    // Scalar tail: src now points at the first element not covered above.
    for (unsigned int idx = blockCount * 8; idx < num_points; idx++) {
        total += *src++;
    }

    *result = total;
}
361 #endif /* LV_HAVE_NEONV8 */
362 
363 
364 #ifdef LV_HAVE_GENERIC
/*
 * Sums num_points floats from inputBuffer and writes the total to *result.
 *
 * Portable reference variant: adds the elements one at a time, in order,
 * into a single-precision running total. With num_points == 0 the result
 * is 0.
 */
static inline void volk_32f_accumulator_s32f_generic(float* result,
                                                     const float* inputBuffer,
                                                     unsigned int num_points)
{
    float total = 0;

    for (unsigned int idx = 0; idx < num_points; idx++) {
        total += inputBuffer[idx];
    }

    *result = total;
}
378 #endif /* LV_HAVE_GENERIC */
379 
380 #ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

/*
 * Sums num_points floats from inputBuffer and writes the total to *result.
 *
 * RISC-V Vector variant: accumulates lane-wise into an LMUL=8 register
 * group using whatever vector length the hardware grants per iteration,
 * then reduces the partial sums to one scalar.
 */
static inline void volk_32f_accumulator_s32f_rvv(float* result,
                                                 const float* inputBuffer,
                                                 unsigned int num_points)
{
    // Start every lane of the widest register group at zero.
    vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputBuffer += vl) {
        // vl is the number of elements handled this pass (at most n).
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
        // Tail-undisturbed add: lanes at index >= vl keep their previous
        // partial sums, which matters when the last pass is shorter.
        vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    // RISCV_SHRINK8 comes from volk_rvv_intrinsics.h; presumably it folds the
    // m8 group down to a single m1 register with vfadd - confirm in that header.
    vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
    // Zero seed for the reduction, held in element 0.
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    // Unordered sum reduction of v plus the seed; element 0 holds the result.
    *result = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
}
400 #endif /*LV_HAVE_RVV*/
401 
402 #endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
volk_32f_accumulator_s32f_neon
static void volk_32f_accumulator_s32f_neon(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:291
volk_32f_accumulator_s32f_u_sse
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:252
volk_32f_accumulator_s32f_generic
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:365
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_common.h
volk_32f_accumulator_s32f_a_sse
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:213
RISCV_SHRINK8
#define RISCV_SHRINK8(op, T, S, v)
Definition: volk_rvv_intrinsics.h:33
volk_rvv_intrinsics.h
volk_32f_accumulator_s32f_u_avx
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:170
volk_32f_accumulator_s32f_a_avx
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:93