Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_64f_x2_dot_prod_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
53 #ifndef INCLUDED_volk_64f_x2_dot_prod_64f_u_H
54 #define INCLUDED_volk_64f_x2_dot_prod_64f_u_H
55 
56 #include <volk/volk_common.h>
57 
58 #ifdef LV_HAVE_GENERIC
59 
/*!
 * \brief Portable scalar dot product of two double-precision vectors.
 *
 * Multiplies input[k] by taps[k] for every k in [0, num_points) and adds the
 * products to a running sum in increasing index order. When num_points is 0
 * no element is read and 0.0 is written.
 *
 * \param result     Destination for the single accumulated double.
 * \param input      First operand vector (num_points elements are read).
 * \param taps       Second operand vector (num_points elements are read).
 * \param num_points Number of element pairs to multiply and accumulate.
 */
static inline void volk_64f_x2_dot_prod_64f_generic(double* result,
                                                    const double* input,
                                                    const double* taps,
                                                    unsigned int num_points)
{
    double sum = 0.0;

    /* Count down the remaining pairs while walking both vectors in step. */
    for (unsigned int remaining = num_points; remaining != 0; --remaining) {
        sum += (*input++) * (*taps++);
    }

    *result = sum;
}
71 
72 #endif /* LV_HAVE_GENERIC */
73 
74 
75 #ifdef LV_HAVE_SSE2
76 #include <emmintrin.h>
77 
78 static inline void volk_64f_x2_dot_prod_64f_u_sse2(double* result,
79  const double* input,
80  const double* taps,
81  unsigned int num_points)
82 {
83  const unsigned int eighthPoints = num_points / 8;
84  unsigned int number = 0;
85 
86  __m128d acc0 = _mm_setzero_pd();
87  __m128d acc1 = _mm_setzero_pd();
88  __m128d acc2 = _mm_setzero_pd();
89  __m128d acc3 = _mm_setzero_pd();
90 
91  for (; number < eighthPoints; number++) {
92  acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_loadu_pd(input), _mm_loadu_pd(taps)));
93  acc1 =
94  _mm_add_pd(acc1, _mm_mul_pd(_mm_loadu_pd(input + 2), _mm_loadu_pd(taps + 2)));
95  acc2 =
96  _mm_add_pd(acc2, _mm_mul_pd(_mm_loadu_pd(input + 4), _mm_loadu_pd(taps + 4)));
97  acc3 =
98  _mm_add_pd(acc3, _mm_mul_pd(_mm_loadu_pd(input + 6), _mm_loadu_pd(taps + 6)));
99  input += 8;
100  taps += 8;
101  }
102 
103  acc0 = _mm_add_pd(acc0, acc1);
104  acc2 = _mm_add_pd(acc2, acc3);
105  acc0 = _mm_add_pd(acc0, acc2);
106 
107  __VOLK_ATTR_ALIGNED(16) double tmp[2];
108  _mm_store_pd(tmp, acc0);
109  double dot = tmp[0] + tmp[1];
110 
111  for (number = eighthPoints * 8; number < num_points; number++) {
112  dot += (*input++) * (*taps++);
113  }
114  *result = dot;
115 }
116 
117 #endif /* LV_HAVE_SSE2 */
118 
119 
120 #ifdef LV_HAVE_AVX
121 #include <immintrin.h>
122 
123 static inline void volk_64f_x2_dot_prod_64f_u_avx(double* result,
124  const double* input,
125  const double* taps,
126  unsigned int num_points)
127 {
128  const unsigned int eighthPoints = num_points / 8;
129  unsigned int number = 0;
130 
131  __m256d acc0 = _mm256_setzero_pd();
132  __m256d acc1 = _mm256_setzero_pd();
133 
134  for (; number < eighthPoints; number++) {
135  acc0 = _mm256_add_pd(
136  acc0, _mm256_mul_pd(_mm256_loadu_pd(input), _mm256_loadu_pd(taps)));
137  acc1 = _mm256_add_pd(
138  acc1, _mm256_mul_pd(_mm256_loadu_pd(input + 4), _mm256_loadu_pd(taps + 4)));
139  input += 8;
140  taps += 8;
141  }
142 
143  acc0 = _mm256_add_pd(acc0, acc1);
144 
145  __VOLK_ATTR_ALIGNED(32) double tmp[4];
146  _mm256_storeu_pd(tmp, acc0);
147  double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
148 
149  for (number = eighthPoints * 8; number < num_points; number++) {
150  dot += (*input++) * (*taps++);
151  }
152  *result = dot;
153 }
154 
155 #endif /* LV_HAVE_AVX */
156 
157 
158 #if LV_HAVE_AVX2 && LV_HAVE_FMA
159 #include <immintrin.h>
160 
161 static inline void volk_64f_x2_dot_prod_64f_u_avx2_fma(double* result,
162  const double* input,
163  const double* taps,
164  unsigned int num_points)
165 {
166  const unsigned int eighthPoints = num_points / 8;
167  unsigned int number = 0;
168 
169  __m256d acc0 = _mm256_setzero_pd();
170  __m256d acc1 = _mm256_setzero_pd();
171 
172  for (; number < eighthPoints; number++) {
173  acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(input), _mm256_loadu_pd(taps), acc0);
174  acc1 =
175  _mm256_fmadd_pd(_mm256_loadu_pd(input + 4), _mm256_loadu_pd(taps + 4), acc1);
176  input += 8;
177  taps += 8;
178  }
179 
180  acc0 = _mm256_add_pd(acc0, acc1);
181 
182  __VOLK_ATTR_ALIGNED(32) double tmp[4];
183  _mm256_storeu_pd(tmp, acc0);
184  double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
185 
186  for (number = eighthPoints * 8; number < num_points; number++) {
187  dot += (*input++) * (*taps++);
188  }
189  *result = dot;
190 }
191 
192 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
193 
194 
195 #ifdef LV_HAVE_AVX512F
196 #include <immintrin.h>
197 
/*!
 * \brief AVX-512F dot product of two double vectors using unaligned loads.
 *
 * Each iteration loads one 8-element block from both vectors and folds it
 * into a single partial-sum register with _mm512_fmadd_pd. The register is
 * collapsed with _mm512_reduce_add_pd, and the trailing (num_points % 8)
 * elements are accumulated one at a time.
 *
 * \param result     Destination for the single accumulated double.
 * \param input      First operand vector; loaded with _mm512_loadu_pd.
 * \param taps       Second operand vector; loaded with _mm512_loadu_pd.
 * \param num_points Number of element pairs to multiply and accumulate.
 */
static inline void volk_64f_x2_dot_prod_64f_u_avx512f(double* result,
                                                      const double* input,
                                                      const double* taps,
                                                      unsigned int num_points)
{
    const unsigned int block_count = num_points / 8;
    const unsigned int tail_count = num_points % 8;

    __m512d partial = _mm512_setzero_pd();

    for (unsigned int block = 0; block < block_count; block++) {
        const __m512d in_vec = _mm512_loadu_pd(input);
        const __m512d tap_vec = _mm512_loadu_pd(taps);
        partial = _mm512_fmadd_pd(in_vec, tap_vec, partial);

        input += 8;
        taps += 8;
    }

    double dot = _mm512_reduce_add_pd(partial);

    /* Scalar clean-up for the elements that did not fill a whole block. */
    for (unsigned int k = 0; k < tail_count; k++) {
        dot += input[k] * taps[k];
    }
    *result = dot;
}
221 
222 #endif /* LV_HAVE_AVX512F */
223 
224 
225 #endif /* INCLUDED_volk_64f_x2_dot_prod_64f_u_H */
226 
227 
228 #ifndef INCLUDED_volk_64f_x2_dot_prod_64f_a_H
229 #define INCLUDED_volk_64f_x2_dot_prod_64f_a_H
230 
231 #include <volk/volk_common.h>
232 
233 #ifdef LV_HAVE_SSE2
234 #include <emmintrin.h>
235 
236 static inline void volk_64f_x2_dot_prod_64f_a_sse2(double* result,
237  const double* input,
238  const double* taps,
239  unsigned int num_points)
240 {
241  const unsigned int eighthPoints = num_points / 8;
242  unsigned int number = 0;
243 
244  __m128d acc0 = _mm_setzero_pd();
245  __m128d acc1 = _mm_setzero_pd();
246  __m128d acc2 = _mm_setzero_pd();
247  __m128d acc3 = _mm_setzero_pd();
248 
249  for (; number < eighthPoints; number++) {
250  acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_load_pd(input), _mm_load_pd(taps)));
251  acc1 =
252  _mm_add_pd(acc1, _mm_mul_pd(_mm_load_pd(input + 2), _mm_load_pd(taps + 2)));
253  acc2 =
254  _mm_add_pd(acc2, _mm_mul_pd(_mm_load_pd(input + 4), _mm_load_pd(taps + 4)));
255  acc3 =
256  _mm_add_pd(acc3, _mm_mul_pd(_mm_load_pd(input + 6), _mm_load_pd(taps + 6)));
257  input += 8;
258  taps += 8;
259  }
260 
261  acc0 = _mm_add_pd(acc0, acc1);
262  acc2 = _mm_add_pd(acc2, acc3);
263  acc0 = _mm_add_pd(acc0, acc2);
264 
265  __VOLK_ATTR_ALIGNED(16) double tmp[2];
266  _mm_store_pd(tmp, acc0);
267  double dot = tmp[0] + tmp[1];
268 
269  for (number = eighthPoints * 8; number < num_points; number++) {
270  dot += (*input++) * (*taps++);
271  }
272  *result = dot;
273 }
274 
275 #endif /* LV_HAVE_SSE2 */
276 
277 
278 #ifdef LV_HAVE_AVX
279 #include <immintrin.h>
280 
281 static inline void volk_64f_x2_dot_prod_64f_a_avx(double* result,
282  const double* input,
283  const double* taps,
284  unsigned int num_points)
285 {
286  const unsigned int eighthPoints = num_points / 8;
287  unsigned int number = 0;
288 
289  __m256d acc0 = _mm256_setzero_pd();
290  __m256d acc1 = _mm256_setzero_pd();
291 
292  for (; number < eighthPoints; number++) {
293  acc0 = _mm256_add_pd(acc0,
294  _mm256_mul_pd(_mm256_load_pd(input), _mm256_load_pd(taps)));
295  acc1 = _mm256_add_pd(
296  acc1, _mm256_mul_pd(_mm256_load_pd(input + 4), _mm256_load_pd(taps + 4)));
297  input += 8;
298  taps += 8;
299  }
300 
301  acc0 = _mm256_add_pd(acc0, acc1);
302 
303  __VOLK_ATTR_ALIGNED(32) double tmp[4];
304  _mm256_store_pd(tmp, acc0);
305  double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
306 
307  for (number = eighthPoints * 8; number < num_points; number++) {
308  dot += (*input++) * (*taps++);
309  }
310  *result = dot;
311 }
312 
313 #endif /* LV_HAVE_AVX */
314 
315 
316 #if LV_HAVE_AVX2 && LV_HAVE_FMA
317 #include <immintrin.h>
318 
319 static inline void volk_64f_x2_dot_prod_64f_a_avx2_fma(double* result,
320  const double* input,
321  const double* taps,
322  unsigned int num_points)
323 {
324  const unsigned int eighthPoints = num_points / 8;
325  unsigned int number = 0;
326 
327  __m256d acc0 = _mm256_setzero_pd();
328  __m256d acc1 = _mm256_setzero_pd();
329 
330  for (; number < eighthPoints; number++) {
331  acc0 = _mm256_fmadd_pd(_mm256_load_pd(input), _mm256_load_pd(taps), acc0);
332  acc1 = _mm256_fmadd_pd(_mm256_load_pd(input + 4), _mm256_load_pd(taps + 4), acc1);
333  input += 8;
334  taps += 8;
335  }
336 
337  acc0 = _mm256_add_pd(acc0, acc1);
338 
339  __VOLK_ATTR_ALIGNED(32) double tmp[4];
340  _mm256_store_pd(tmp, acc0);
341  double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
342 
343  for (number = eighthPoints * 8; number < num_points; number++) {
344  dot += (*input++) * (*taps++);
345  }
346  *result = dot;
347 }
348 
349 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
350 
351 
352 #ifdef LV_HAVE_AVX512F
353 #include <immintrin.h>
354 
/*!
 * \brief AVX-512F dot product of two double vectors using aligned loads.
 *
 * Each iteration loads one 8-element block from both vectors and folds it
 * into a single partial-sum register with _mm512_fmadd_pd. The register is
 * collapsed with _mm512_reduce_add_pd, and the trailing (num_points % 8)
 * elements are accumulated one at a time.
 *
 * Both vectors are read with _mm512_load_pd, which requires 64-byte-aligned
 * addresses.
 *
 * \param result     Destination for the single accumulated double.
 * \param input      First operand vector (64-byte aligned).
 * \param taps       Second operand vector (64-byte aligned).
 * \param num_points Number of element pairs to multiply and accumulate.
 */
static inline void volk_64f_x2_dot_prod_64f_a_avx512f(double* result,
                                                      const double* input,
                                                      const double* taps,
                                                      unsigned int num_points)
{
    const unsigned int block_count = num_points / 8;
    const unsigned int tail_count = num_points % 8;

    __m512d partial = _mm512_setzero_pd();

    for (unsigned int block = 0; block < block_count; block++) {
        const __m512d in_vec = _mm512_load_pd(input);
        const __m512d tap_vec = _mm512_load_pd(taps);
        partial = _mm512_fmadd_pd(in_vec, tap_vec, partial);

        input += 8;
        taps += 8;
    }

    double dot = _mm512_reduce_add_pd(partial);

    /* Scalar clean-up for the elements that did not fill a whole block. */
    for (unsigned int k = 0; k < tail_count; k++) {
        dot += input[k] * taps[k];
    }
    *result = dot;
}
378 
379 #endif /* LV_HAVE_AVX512F */
380 
381 
382 #ifdef LV_HAVE_NEONV8
383 #include <arm_neon.h>
384 
/*!
 * \brief ARMv8 NEON dot product of two double vectors.
 *
 * Processes the data in blocks of 8 elements. Each block is split into four
 * 2-lane slices, and each slice is folded into its own partial-sum register
 * with vfmaq_f64. The partial sums are combined pairwise, the final register
 * is collapsed with vaddvq_f64, and the trailing (num_points % 8) elements
 * are accumulated one at a time.
 *
 * \param result     Destination for the single accumulated double.
 * \param input      First operand vector; loaded with vld1q_f64.
 * \param taps       Second operand vector; loaded with vld1q_f64.
 * \param num_points Number of element pairs to multiply and accumulate.
 */
static inline void volk_64f_x2_dot_prod_64f_neonv8(double* result,
                                                   const double* input,
                                                   const double* taps,
                                                   unsigned int num_points)
{
    const unsigned int block_count = num_points / 8;
    const unsigned int tail_count = num_points % 8;

    /* One partial sum per 16-byte slice of an 8-element block. */
    float64x2_t sum_a = vdupq_n_f64(0.0);
    float64x2_t sum_b = vdupq_n_f64(0.0);
    float64x2_t sum_c = vdupq_n_f64(0.0);
    float64x2_t sum_d = vdupq_n_f64(0.0);

    for (unsigned int block = 0; block < block_count; block++) {
        /* Hand __VOLK_PREFETCH the addresses 16 elements (two blocks) ahead. */
        __VOLK_PREFETCH(input + 16);
        __VOLK_PREFETCH(taps + 16);

        const float64x2_t in_a = vld1q_f64(input);
        const float64x2_t tap_a = vld1q_f64(taps);
        sum_a = vfmaq_f64(sum_a, in_a, tap_a);

        const float64x2_t in_b = vld1q_f64(input + 2);
        const float64x2_t tap_b = vld1q_f64(taps + 2);
        sum_b = vfmaq_f64(sum_b, in_b, tap_b);

        const float64x2_t in_c = vld1q_f64(input + 4);
        const float64x2_t tap_c = vld1q_f64(taps + 4);
        sum_c = vfmaq_f64(sum_c, in_c, tap_c);

        const float64x2_t in_d = vld1q_f64(input + 6);
        const float64x2_t tap_d = vld1q_f64(taps + 6);
        sum_d = vfmaq_f64(sum_d, in_d, tap_d);

        input += 8;
        taps += 8;
    }

    /* Pairwise combine: (a + b) + (c + d), then add across the two lanes. */
    const float64x2_t sum_ab = vaddq_f64(sum_a, sum_b);
    const float64x2_t sum_cd = vaddq_f64(sum_c, sum_d);
    double dot = vaddvq_f64(vaddq_f64(sum_ab, sum_cd));

    /* Scalar clean-up for the elements that did not fill a whole block. */
    for (unsigned int k = 0; k < tail_count; k++) {
        dot += input[k] * taps[k];
    }
    *result = dot;
}
421 
422 #endif /* LV_HAVE_NEONV8 */
423 
424 
425 #ifdef LV_HAVE_RVV
426 #include <riscv_vector.h>
428 
/*!
 * \brief RISC-V Vector dot product of two double vectors.
 *
 * Keeps an LMUL=8 register group of partial sums, initialised to zero over
 * the maximum vector length. Each pass asks __riscv_vsetvl_e64m8 how many of
 * the remaining elements to process, loads that many from both vectors and
 * folds them in with __riscv_vfmacc_tu. The group is then narrowed to a
 * single m1 register through RISCV_SHRINK8 and reduced to one scalar with
 * __riscv_vfredusum.
 *
 * \param result     Destination for the single accumulated double.
 * \param input      First operand vector (num_points elements are read).
 * \param taps       Second operand vector (num_points elements are read).
 * \param num_points Number of element pairs to multiply and accumulate.
 */
static inline void volk_64f_x2_dot_prod_64f_rvv(double* result,
                                                const double* input,
                                                const double* taps,
                                                unsigned int num_points)
{
    vfloat64m8_t partial = __riscv_vfmv_v_f_f64m8(0, __riscv_vsetvlmax_e64m8());

    size_t remaining = num_points;
    while (remaining > 0) {
        size_t step = __riscv_vsetvl_e64m8(remaining);
        vfloat64m8_t in_vec = __riscv_vle64_v_f64m8(input, step);
        vfloat64m8_t tap_vec = __riscv_vle64_v_f64m8(taps, step);

        /* _tu variant: the intrinsic is given the running sum as destination. */
        partial = __riscv_vfmacc_tu(partial, in_vec, tap_vec, step);

        remaining -= step;
        input += step;
        taps += step;
    }

    /* Narrow the m8 group to one m1 register, then reduce it to a scalar. */
    size_t vl = __riscv_vsetvlmax_e64m1();
    vfloat64m1_t folded = RISCV_SHRINK8(vfadd, f, 64, partial);
    folded = __riscv_vfredusum(folded, __riscv_vfmv_s_f_f64m1(0, vl), vl);
    *result = __riscv_vfmv_f(folded);
}
447 
448 #endif /* LV_HAVE_RVV */
449 
450 
451 #endif /* INCLUDED_volk_64f_x2_dot_prod_64f_a_H */
volk_64f_x2_dot_prod_64f_generic
static void volk_64f_x2_dot_prod_64f_generic(double *result, const double *input, const double *taps, unsigned int num_points)
Definition: volk_64f_x2_dot_prod_64f.h:60
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_64f_x2_dot_prod_64f_u_avx
static void volk_64f_x2_dot_prod_64f_u_avx(double *result, const double *input, const double *taps, unsigned int num_points)
Definition: volk_64f_x2_dot_prod_64f.h:123
volk_64f_x2_dot_prod_64f_u_sse2
static void volk_64f_x2_dot_prod_64f_u_sse2(double *result, const double *input, const double *taps, unsigned int num_points)
Definition: volk_64f_x2_dot_prod_64f.h:78
RISCV_SHRINK8
#define RISCV_SHRINK8(op, T, S, v)
Definition: volk_rvv_intrinsics.h:33
volk_64f_x2_dot_prod_64f_a_avx
static void volk_64f_x2_dot_prod_64f_a_avx(double *result, const double *input, const double *taps, unsigned int num_points)
Definition: volk_64f_x2_dot_prod_64f.h:281
volk_64f_x2_dot_prod_64f_a_sse2
static void volk_64f_x2_dot_prod_64f_a_sse2(double *result, const double *input, const double *taps, unsigned int num_points)
Definition: volk_64f_x2_dot_prod_64f.h:236
volk_rvv_intrinsics.h
bit128::f
float f[4]
Definition: volk_common.h:120