Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_64f_x2_multiply_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_64f_x2_multiply_64f_H
59 #define INCLUDED_volk_64f_x2_multiply_64f_H
60 
61 #include <inttypes.h>
62 
63 
64 #ifdef LV_HAVE_GENERIC
65 
/*
 * Portable element-wise product of two double-precision vectors.
 *
 * cVector    - output; receives aVector[i] * bVector[i] for i in [0, num_points)
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process; 0 touches no memory
 */
static inline void volk_64f_x2_multiply_64f_generic(double* cVector,
                                                    const double* aVector,
                                                    const double* bVector,
                                                    unsigned int num_points)
{
    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
80 
81 #endif /* LV_HAVE_GENERIC */
82 
83 
84 #ifdef LV_HAVE_NEONV8
85 #include <arm_neon.h>
86 
87 static inline void volk_64f_x2_multiply_64f_neonv8(double* cVector,
88  const double* aVector,
89  const double* bVector,
90  unsigned int num_points)
91 {
92  unsigned int number = 0;
93  const unsigned int quarter_points = num_points / 4;
94 
95  double* cPtr = cVector;
96  const double* aPtr = aVector;
97  const double* bPtr = bVector;
98 
99  for (; number < quarter_points; number++) {
100  float64x2_t aVal0 = vld1q_f64(aPtr);
101  float64x2_t aVal1 = vld1q_f64(aPtr + 2);
102  float64x2_t bVal0 = vld1q_f64(bPtr);
103  float64x2_t bVal1 = vld1q_f64(bPtr + 2);
104  __VOLK_PREFETCH(aPtr + 4);
105  __VOLK_PREFETCH(bPtr + 4);
106 
107  float64x2_t cVal0 = vmulq_f64(aVal0, bVal0);
108  float64x2_t cVal1 = vmulq_f64(aVal1, bVal1);
109 
110  vst1q_f64(cPtr, cVal0);
111  vst1q_f64(cPtr + 2, cVal1);
112 
113  aPtr += 4;
114  bPtr += 4;
115  cPtr += 4;
116  }
117 
118  number = quarter_points * 4;
119  for (; number < num_points; number++) {
120  *cPtr++ = (*aPtr++) * (*bPtr++);
121  }
122 }
123 
124 #endif /* LV_HAVE_NEONV8 */
125 
126 
127 /*
128  * Unaligned versions
129  */
130 
131 #ifdef LV_HAVE_SSE2
132 
133 #include <emmintrin.h>
134 
/*
 * Element-wise product of two double-precision vectors using SSE2,
 * unaligned variant.
 *
 * cVector    - output; receives aVector[i] * bVector[i] for i in [0, num_points)
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Uses the unaligned load/store intrinsics, so this variant itself places no
 * alignment requirement on the buffers. Two doubles are processed per
 * iteration; an odd trailing element is handled by the scalar tail.
 */
static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector,
                                                   const double* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < half_points; number++) {
        const __m128d aVal = _mm_loadu_pd(aPtr);
        const __m128d bVal = _mm_loadu_pd(bPtr);
        const __m128d cVal = _mm_mul_pd(aVal, bVal);

        _mm_storeu_pd(cPtr, cVal); /* write the pair of products to the output */

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    /* Scalar tail: at most one element remains. */
    for (number = half_points * 2; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
166 
167 #endif /* LV_HAVE_SSE2 */
168 
169 
170 #ifdef LV_HAVE_AVX
171 
172 #include <immintrin.h>
173 
/*
 * Element-wise product of two double-precision vectors using AVX,
 * unaligned variant.
 *
 * cVector    - output; receives aVector[i] * bVector[i] for i in [0, num_points)
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Uses the unaligned 256-bit load/store intrinsics, so this variant itself
 * places no alignment requirement on the buffers. Four doubles are processed
 * per iteration; the remaining 0-3 elements go through the scalar tail.
 */
static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector,
                                                  const double* aVector,
                                                  const double* bVector,
                                                  unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < quarter_points; number++) {
        const __m256d aVal = _mm256_loadu_pd(aPtr);
        const __m256d bVal = _mm256_loadu_pd(bPtr);
        const __m256d cVal = _mm256_mul_pd(aVal, bVal);

        _mm256_storeu_pd(cPtr, cVal); /* write the four products to the output */

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a group of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
206 
207 #endif /* LV_HAVE_AVX */
208 
209 /*
210  * Aligned versions
211  */
212 
213 #ifdef LV_HAVE_SSE2
214 
215 #include <emmintrin.h>
216 
/*
 * Element-wise product of two double-precision vectors using SSE2,
 * aligned variant.
 *
 * cVector    - output; receives aVector[i] * bVector[i] for i in [0, num_points)
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * All three pointers must be 16-byte aligned: the loop uses _mm_load_pd and
 * _mm_store_pd, the aligned forms of the 128-bit load/store. Two doubles are
 * processed per iteration; an odd trailing element is handled by the scalar
 * tail.
 */
static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector,
                                                   const double* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < half_points; number++) {
        const __m128d aVal = _mm_load_pd(aPtr);
        const __m128d bVal = _mm_load_pd(bPtr);
        const __m128d cVal = _mm_mul_pd(aVal, bVal);

        _mm_store_pd(cPtr, cVal); /* write the pair of products to the output */

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    /* Scalar tail: at most one element remains. */
    for (number = half_points * 2; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
248 
249 #endif /* LV_HAVE_SSE2 */
250 
251 
252 #ifdef LV_HAVE_AVX
253 
254 #include <immintrin.h>
255 
/*
 * Element-wise product of two double-precision vectors using AVX,
 * aligned variant.
 *
 * cVector    - output; receives aVector[i] * bVector[i] for i in [0, num_points)
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * All three pointers must be 32-byte aligned: the loop uses _mm256_load_pd
 * and _mm256_store_pd, the aligned forms of the 256-bit load/store. Four
 * doubles are processed per iteration; the remaining 0-3 elements go through
 * the scalar tail.
 */
static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
                                                  const double* aVector,
                                                  const double* bVector,
                                                  unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < quarter_points; number++) {
        const __m256d aVal = _mm256_load_pd(aPtr);
        const __m256d bVal = _mm256_load_pd(bPtr);
        const __m256d cVal = _mm256_mul_pd(aVal, bVal);

        _mm256_store_pd(cPtr, cVal); /* write the four products to the output */

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a group of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
288 
289 #endif /* LV_HAVE_AVX */
290 
291 #ifdef LV_HAVE_RVV
292 #include <riscv_vector.h>
293 
294 static inline void volk_64f_x2_multiply_64f_rvv(double* cVector,
295  const double* aVector,
296  const double* bVector,
297  unsigned int num_points)
298 {
299  size_t n = num_points;
300  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
301  vl = __riscv_vsetvl_e64m8(n);
302  vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
303  vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
304  __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl);
305  }
306 }
307 #endif /*LV_HAVE_RVV*/
308 
309 #endif /* INCLUDED_volk_64f_x2_multiply_64f_H */
volk_64f_x2_multiply_64f_u_avx
static void volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:174
volk_64f_x2_multiply_64f_u_sse2
static void volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:135
volk_64f_x2_multiply_64f_generic
static void volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:66
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_64f_x2_multiply_64f_a_avx
static void volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:256
volk_64f_x2_multiply_64f_a_sse2
static void volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:217