Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_64f_add_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_64f_add_64f_H
61 #define INCLUDED_volk_32f_64f_add_64f_H
62 
63 #include <inttypes.h>
64 
65 #ifdef LV_HAVE_GENERIC
66 
/*!
 * \brief Adds a float vector to a double vector, element by element.
 *
 * Portable reference implementation: each float from \p aVector is widened
 * to double and added to the matching element of \p bVector; the sum is
 * written to \p cVector.
 *
 * \param cVector    Output buffer; receives \p num_points doubles.
 * \param aVector    First input; \p num_points floats are read.
 * \param bVector    Second input; \p num_points doubles are read.
 * \param num_points Number of elements to process. Zero is allowed and
 *                   leaves \p cVector untouched.
 */
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        /* Widen explicitly so the addition is carried out in double. */
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
81 
82 #endif /* LV_HAVE_GENERIC */
83 
84 #ifdef LV_HAVE_NEONV8
85 #include <arm_neon.h>
86 
87 static inline void volk_32f_64f_add_64f_neonv8(double* cVector,
88  const float* aVector,
89  const double* bVector,
90  unsigned int num_points)
91 {
92  unsigned int number = 0;
93  const unsigned int quarter_points = num_points / 4;
94 
95  double* cPtr = cVector;
96  const float* aPtr = aVector;
97  const double* bPtr = bVector;
98 
99  for (; number < quarter_points; number++) {
100  // Load 4 floats
101  float32x4_t aVal_f32 = vld1q_f32(aPtr);
102  // Load 4 doubles (2x2)
103  float64x2_t bVal0 = vld1q_f64(bPtr);
104  float64x2_t bVal1 = vld1q_f64(bPtr + 2);
105  __VOLK_PREFETCH(aPtr + 4);
106  __VOLK_PREFETCH(bPtr + 4);
107 
108  // Convert float to double (low and high halves)
109  float64x2_t aVal0 = vcvt_f64_f32(vget_low_f32(aVal_f32));
110  float64x2_t aVal1 = vcvt_f64_f32(vget_high_f32(aVal_f32));
111 
112  // Add
113  float64x2_t cVal0 = vaddq_f64(aVal0, bVal0);
114  float64x2_t cVal1 = vaddq_f64(aVal1, bVal1);
115 
116  // Store
117  vst1q_f64(cPtr, cVal0);
118  vst1q_f64(cPtr + 2, cVal1);
119 
120  aPtr += 4;
121  bPtr += 4;
122  cPtr += 4;
123  }
124 
125  number = quarter_points * 4;
126  for (; number < num_points; number++) {
127  *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
128  }
129 }
130 
131 #endif /* LV_HAVE_NEONV8 */
132 
133 #ifdef LV_HAVE_AVX
134 
135 #include <immintrin.h>
136 #include <xmmintrin.h>
137 
138 static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
139  const float* aVector,
140  const double* bVector,
141  unsigned int num_points)
142 {
143  unsigned int number = 0;
144  const unsigned int eighth_points = num_points / 8;
145 
146  double* cPtr = cVector;
147  const float* aPtr = aVector;
148  const double* bPtr = bVector;
149 
150  __m256 aVal;
151  __m128 aVal1, aVal2;
152  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
153  for (; number < eighth_points; number++) {
154 
155  aVal = _mm256_loadu_ps(aPtr);
156  bVal1 = _mm256_loadu_pd(bPtr);
157  bVal2 = _mm256_loadu_pd(bPtr + 4);
158 
159  aVal1 = _mm256_extractf128_ps(aVal, 0);
160  aVal2 = _mm256_extractf128_ps(aVal, 1);
161 
162  aDbl1 = _mm256_cvtps_pd(aVal1);
163  aDbl2 = _mm256_cvtps_pd(aVal2);
164 
165  cVal1 = _mm256_add_pd(aDbl1, bVal1);
166  cVal2 = _mm256_add_pd(aDbl2, bVal2);
167 
168  _mm256_storeu_pd(cPtr,
169  cVal1); // Store the results back into the C container
170  _mm256_storeu_pd(cPtr + 4,
171  cVal2); // Store the results back into the C container
172 
173  aPtr += 8;
174  bPtr += 8;
175  cPtr += 8;
176  }
177 
178  number = eighth_points * 8;
179  for (; number < num_points; number++) {
180  *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
181  }
182 }
183 
184 #endif /* LV_HAVE_AVX */
185 
186 #ifdef LV_HAVE_AVX
187 
188 #include <immintrin.h>
189 #include <xmmintrin.h>
190 
191 static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
192  const float* aVector,
193  const double* bVector,
194  unsigned int num_points)
195 {
196  unsigned int number = 0;
197  const unsigned int eighth_points = num_points / 8;
198 
199  double* cPtr = cVector;
200  const float* aPtr = aVector;
201  const double* bPtr = bVector;
202 
203  __m256 aVal;
204  __m128 aVal1, aVal2;
205  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
206  for (; number < eighth_points; number++) {
207 
208  aVal = _mm256_load_ps(aPtr);
209  bVal1 = _mm256_load_pd(bPtr);
210  bVal2 = _mm256_load_pd(bPtr + 4);
211 
212  aVal1 = _mm256_extractf128_ps(aVal, 0);
213  aVal2 = _mm256_extractf128_ps(aVal, 1);
214 
215  aDbl1 = _mm256_cvtps_pd(aVal1);
216  aDbl2 = _mm256_cvtps_pd(aVal2);
217 
218  cVal1 = _mm256_add_pd(aDbl1, bVal1);
219  cVal2 = _mm256_add_pd(aDbl2, bVal2);
220 
221  _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
222  _mm256_store_pd(cPtr + 4,
223  cVal2); // Store the results back into the C container
224 
225  aPtr += 8;
226  bPtr += 8;
227  cPtr += 8;
228  }
229 
230  number = eighth_points * 8;
231  for (; number < num_points; number++) {
232  *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
233  }
234 }
235 
236 #endif /* LV_HAVE_AVX */
237 
238 #ifdef LV_HAVE_RVV
239 #include <riscv_vector.h>
240 
241 static inline void volk_32f_64f_add_64f_rvv(double* cVector,
242  const float* aVector,
243  const double* bVector,
244  unsigned int num_points)
245 {
246  size_t n = num_points;
247  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
248  vl = __riscv_vsetvl_e64m8(n);
249  vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
250  vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
251  __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
252  }
253 }
254 #endif /*LV_HAVE_RVV*/
255 
256 #endif /* INCLUDED_volk_32f_64f_add_64f_H */
volk_32f_64f_add_64f_generic
static void volk_32f_64f_add_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:67
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_64f_add_64f_a_avx
static void volk_32f_64f_add_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:191
volk_32f_64f_add_64f_u_avx
static void volk_32f_64f_add_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:138