Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
61 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
62 #define INCLUDED_volk_32fc_32f_add_32fc_u_H
63 
64 #ifdef LV_HAVE_GENERIC
65 
66 static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
67  const lv_32fc_t* aVector,
68  const float* bVector,
69  unsigned int num_points)
70 {
71  lv_32fc_t* cPtr = cVector;
72  const lv_32fc_t* aPtr = aVector;
73  const float* bPtr = bVector;
74  unsigned int number = 0;
75 
76  for (number = 0; number < num_points; number++) {
77  *cPtr++ = (*aPtr++) + (*bPtr++);
78  }
79 }
80 #endif /* LV_HAVE_GENERIC */
81 
82 
83 #ifdef LV_HAVE_AVX
84 #include <immintrin.h>
85 
86 static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
87  const lv_32fc_t* aVector,
88  const float* bVector,
89  unsigned int num_points)
90 {
91  unsigned int number = 0;
92  const unsigned int eighthPoints = num_points / 8;
93 
94  lv_32fc_t* cPtr = cVector;
95  const lv_32fc_t* aPtr = aVector;
96  const float* bPtr = bVector;
97 
98  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
99  __m256 cpx_b1, cpx_b2;
100  __m256 zero;
101  zero = _mm256_setzero_ps();
102  __m256 tmp1, tmp2;
103  for (; number < eighthPoints; number++) {
104 
105  aVal1 = _mm256_loadu_ps((float*)aPtr);
106  aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
107  bVal = _mm256_loadu_ps(bPtr);
108  cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
109  cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
110 
111  tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
112  tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
113 
114  cVal1 = _mm256_add_ps(aVal1, tmp1);
115  cVal2 = _mm256_add_ps(aVal2, tmp2);
116 
117  _mm256_storeu_ps((float*)cPtr,
118  cVal1); // Store the results back into the C container
119  _mm256_storeu_ps((float*)(cPtr + 4),
120  cVal2); // Store the results back into the C container
121 
122  aPtr += 8;
123  bPtr += 8;
124  cPtr += 8;
125  }
126 
127  number = eighthPoints * 8;
128  for (; number < num_points; number++) {
129  *cPtr++ = (*aPtr++) + (*bPtr++);
130  }
131 }
132 #endif /* LV_HAVE_AVX */
133 
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
136 
137 static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
138  const lv_32fc_t* aVector,
139  const float* bVector,
140  unsigned int num_points)
141 {
142  unsigned int number = 0;
143  const unsigned int eighthPoints = num_points / 8;
144 
145  lv_32fc_t* cPtr = cVector;
146  const lv_32fc_t* aPtr = aVector;
147  const float* bPtr = bVector;
148 
149  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
150  __m256 cpx_b1, cpx_b2;
151  __m256 zero;
152  zero = _mm256_setzero_ps();
153  __m256 tmp1, tmp2;
154  for (; number < eighthPoints; number++) {
155 
156  aVal1 = _mm256_load_ps((float*)aPtr);
157  aVal2 = _mm256_load_ps((float*)(aPtr + 4));
158  bVal = _mm256_load_ps(bPtr);
159  cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
160  cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
161 
162  tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
163  tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
164 
165  cVal1 = _mm256_add_ps(aVal1, tmp1);
166  cVal2 = _mm256_add_ps(aVal2, tmp2);
167 
168  _mm256_store_ps((float*)cPtr,
169  cVal1); // Store the results back into the C container
170  _mm256_store_ps((float*)(cPtr + 4),
171  cVal2); // Store the results back into the C container
172 
173  aPtr += 8;
174  bPtr += 8;
175  cPtr += 8;
176  }
177 
178  number = eighthPoints * 8;
179  for (; number < num_points; number++) {
180  *cPtr++ = (*aPtr++) + (*bPtr++);
181  }
182 }
183 #endif /* LV_HAVE_AVX */
184 
185 #ifdef LV_HAVE_NEON
186 #include <arm_neon.h>
187 
/*
 * NEON kernel.
 *
 * Adds the real samples in bVector to the complex samples in aVector and
 * writes the complex sums to cVector. Sixteen samples are handled per vector
 * iteration (two groups of eight); the remaining (num_points % 16) samples
 * are handled by a scalar loop.
 *
 * cVector    - output buffer, num_points complex samples
 * aVector    - complex input, num_points samples
 * bVector    - real input, num_points samples
 * num_points - number of samples to process
 */
static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;

    const lv_32fc_t* src_cpx = aVector;
    const float* src_real = bVector;
    lv_32fc_t* dst = cVector;

    for (unsigned int blk = 0; blk < blocks; blk++) {
        /*
         * vld4q de-interleaves 16 floats with stride 4, so for eight complex
         * samples val[0]/val[1] hold re/im of the even-indexed samples and
         * val[2]/val[3] hold re/im of the odd-indexed samples.
         */
        float32x4x4_t a_first = vld4q_f32((const float*)src_cpx);
        float32x4x4_t a_second = vld4q_f32((const float*)(src_cpx + 8));
        src_cpx += 16;
        __VOLK_PREFETCH(src_cpx + 16);

        /* vld2q splits eight reals into even-indexed (val[0]) and
         * odd-indexed (val[1]) samples, matching the layout above. */
        const float32x4x2_t b_first = vld2q_f32(src_real);
        const float32x4x2_t b_second = vld2q_f32(src_real + 8);
        src_real += 16;
        __VOLK_PREFETCH(src_real + 16);

        /* Only the real parts change; imaginary parts pass through. */
        a_first.val[0] = vaddq_f32(a_first.val[0], b_first.val[0]);
        a_first.val[2] = vaddq_f32(a_first.val[2], b_first.val[1]);
        a_second.val[0] = vaddq_f32(a_second.val[0], b_second.val[0]);
        a_second.val[2] = vaddq_f32(a_second.val[2], b_second.val[1]);

        /* vst4q re-interleaves the four lanes back into memory order. */
        vst4q_f32((float*)dst, a_first);
        vst4q_f32((float*)(dst + 8), a_second);
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a whole block. */
    for (unsigned int i = blocks * 16; i < num_points; i++) {
        cVector[i] = aVector[i] + bVector[i];
    }
}
231 #endif /* LV_HAVE_NEON */
232 
233 #ifdef LV_HAVE_NEONV8
234 #include <arm_neon.h>
235 
/*
 * NEON (ARMv8) kernel.
 *
 * Adds the real samples in bVector to the complex samples in aVector and
 * writes the complex sums to cVector. Eight samples are handled per vector
 * iteration; the remaining (num_points % 8) samples are handled by a scalar
 * loop.
 *
 * cVector    - output buffer, num_points complex samples
 * aVector    - complex input, num_points samples
 * bVector    - real input, num_points samples
 * num_points - number of samples to process
 */
static inline void volk_32fc_32f_add_32fc_neonv8(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    const unsigned int blocks = num_points / 8;

    /* Work on the complex buffers as flat float arrays (re, im, re, im, ...). */
    const float* src_cpx = (const float*)aVector;
    const float* src_real = bVector;
    float* dst = (float*)cVector;

    for (unsigned int blk = 0; blk < blocks; blk++) {
        /* Eight complex samples = sixteen floats = four q-registers. */
        const float32x4_t cpx0 = vld1q_f32(src_cpx);
        const float32x4_t cpx1 = vld1q_f32(src_cpx + 4);
        const float32x4_t cpx2 = vld1q_f32(src_cpx + 8);
        const float32x4_t cpx3 = vld1q_f32(src_cpx + 12);

        /* Eight real samples = two q-registers. */
        const float32x4_t real_lo = vld1q_f32(src_real);
        const float32x4_t real_hi = vld1q_f32(src_real + 4);
        __VOLK_PREFETCH(src_cpx + 32);
        __VOLK_PREFETCH(src_real + 16);

        /* Zip each real with a zero so it lines up as (b, 0) against (re, im). */
        const float32x4_t zeros = vdupq_n_f32(0);
        const float32x4x2_t widened_lo = vzipq_f32(real_lo, zeros);
        const float32x4x2_t widened_hi = vzipq_f32(real_hi, zeros);

        vst1q_f32(dst, vaddq_f32(cpx0, widened_lo.val[0]));
        vst1q_f32(dst + 4, vaddq_f32(cpx1, widened_lo.val[1]));
        vst1q_f32(dst + 8, vaddq_f32(cpx2, widened_hi.val[0]));
        vst1q_f32(dst + 12, vaddq_f32(cpx3, widened_hi.val[1]));

        src_cpx += 16;
        src_real += 8;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a whole block. */
    const unsigned int done = blocks * 8;
    for (unsigned int i = done; i < num_points; i++) {
        cVector[i] = aVector[i] + bVector[i];
    }
}
279 #endif /* LV_HAVE_NEONV8 */
280 
281 #ifdef LV_HAVE_RVV
282 #include <riscv_vector.h>
283 
/*
 * RISC-V Vector kernel.
 *
 * Adds the real samples in bVector to the complex samples in aVector and
 * writes the complex sums to cVector. Each pass asks the hardware how many
 * samples it will take (vsetvl) and advances by that amount, so no separate
 * scalar tail is needed.
 *
 * cVector    - output buffer, num_points complex samples
 * aVector    - complex input, num_points samples
 * bVector    - real input, num_points samples
 * num_points - number of samples to process; 0 is a no-op
 */
static inline void volk_32fc_32f_add_32fc_rvv(lv_32fc_t* cVector,
                                              const lv_32fc_t* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* vl counts samples; each complex sample is two 32-bit floats. */
        const size_t vl = __riscv_vsetvl_e32m4(remaining);
        const size_t vl_floats = vl * 2;

        const vfloat32m8_t a =
            __riscv_vle32_v_f32m8((const float*)aVector, vl_floats);

        /* Load the reals as raw 32-bit words and zero-extend each to 64 bits;
         * reinterpreted as 32-bit lanes this places a zero word next to each
         * real value so it can be added to the (re, im) pairs. */
        const vuint32m4_t b_bits =
            __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl);
        const vuint64m8_t b_wide = __riscv_vzext_vf2_u64m8(b_bits, vl);
        const vfloat32m8_t b_cpx =
            __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8(b_wide));

        __riscv_vse32((float*)cVector, __riscv_vfadd(a, b_cpx, vl_floats), vl_floats);

        remaining -= vl;
        cVector += vl;
        aVector += vl;
        bVector += vl;
    }
}
299 #endif /*LV_HAVE_RVV*/
300 
301 #endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
volk_32fc_32f_add_32fc_a_avx
static void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:137
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_32f_add_32fc_neon
static void volk_32fc_32f_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:188
volk_32fc_32f_add_32fc_u_avx
static void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:86
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_32f_add_32fc_generic
static void volk_32fc_32f_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:66