Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
42 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
43 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
44 
45 #include <inttypes.h>
46 #include <stdio.h>
47 
48 #ifdef LV_HAVE_AVX
49 #include <immintrin.h>
50 
51 static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
52  const lv_32fc_t* aVector,
53  const float* bVector,
54  unsigned int num_points)
55 {
56  unsigned int number = 0;
57  const unsigned int eighthPoints = num_points / 8;
58 
59  lv_32fc_t* cPtr = cVector;
60  const lv_32fc_t* aPtr = aVector;
61  const float* bPtr = bVector;
62 
63  __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
64 
65  __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
66 
67  for (; number < eighthPoints; number++) {
68 
69  aVal1 = _mm256_load_ps((float*)aPtr);
70  aPtr += 4;
71 
72  aVal2 = _mm256_load_ps((float*)aPtr);
73  aPtr += 4;
74 
75  bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
76  bPtr += 8;
77 
78  bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
79  bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
80 
81  bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
82  bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
83 
84  cVal1 = _mm256_mul_ps(aVal1, bVal1);
85  cVal2 = _mm256_mul_ps(aVal2, bVal2);
86 
87  _mm256_store_ps((float*)cPtr,
88  cVal1); // Store the results back into the C container
89  cPtr += 4;
90 
91  _mm256_store_ps((float*)cPtr,
92  cVal2); // Store the results back into the C container
93  cPtr += 4;
94  }
95 
96  number = eighthPoints * 8;
97  for (; number < num_points; ++number) {
98  *cPtr++ = (*aPtr++) * (*bPtr++);
99  }
100 }
101 #endif /* LV_HAVE_AVX */
102 
103 
104 #ifdef LV_HAVE_SSE
105 #include <xmmintrin.h>
106 
107 static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
108  const lv_32fc_t* aVector,
109  const float* bVector,
110  unsigned int num_points)
111 {
112  unsigned int number = 0;
113  const unsigned int quarterPoints = num_points / 4;
114 
115  lv_32fc_t* cPtr = cVector;
116  const lv_32fc_t* aPtr = aVector;
117  const float* bPtr = bVector;
118 
119  __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
120  for (; number < quarterPoints; number++) {
121 
122  aVal1 = _mm_load_ps((const float*)aPtr);
123  aPtr += 2;
124 
125  aVal2 = _mm_load_ps((const float*)aPtr);
126  aPtr += 2;
127 
128  bVal = _mm_load_ps(bPtr);
129  bPtr += 4;
130 
131  bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
132  bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
133 
134  cVal = _mm_mul_ps(aVal1, bVal1);
135 
136  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
137  cPtr += 2;
138 
139  cVal = _mm_mul_ps(aVal2, bVal2);
140 
141  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
142 
143  cPtr += 2;
144  }
145 
146  number = quarterPoints * 4;
147  for (; number < num_points; number++) {
148  *cPtr++ = (*aPtr++) * (*bPtr);
149  bPtr++;
150  }
151 }
152 #endif /* LV_HAVE_SSE */
153 
154 
155 #ifdef LV_HAVE_GENERIC
156 
157 static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
158  const lv_32fc_t* aVector,
159  const float* bVector,
160  unsigned int num_points)
161 {
162  lv_32fc_t* cPtr = cVector;
163  const lv_32fc_t* aPtr = aVector;
164  const float* bPtr = bVector;
165  unsigned int number = 0;
166 
167  for (number = 0; number < num_points; number++) {
168  *cPtr++ = (*aPtr++) * (*bPtr++);
169  }
170 }
171 #endif /* LV_HAVE_GENERIC */
172 
173 
174 #ifdef LV_HAVE_NEON
175 #include <arm_neon.h>
176 
/*
 * NEON kernel: multiply each complex sample of aVector by the matching real
 * scalar of bVector and store the product in cVector.
 *
 * cVector    - output buffer, num_points complex values
 * aVector    - complex input, num_points values
 * bVector    - real scale factors, num_points values
 * num_points - number of complex samples to process
 *
 * Four samples are processed per iteration; the remaining
 * (num_points % 4) samples fall through to a scalar loop.
 */
static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    for (number = 0; number < quarter_points; number++) {
        /* De-interleaving load: val[0] receives the even-indexed floats and
         * val[1] the odd-indexed floats of four complex samples. The input is
         * only read, so it is accessed through a const pointer. */
        const float32x4x2_t inputVector = vld2q_f32((const float*)aPtr);
        const float32x4_t tapsVector = vld1q_f32(bPtr);
        float32x4x2_t outputVector;

        /* Both component planes are scaled by the same four real factors. */
        outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
        outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);

        /* Re-interleave the two planes while storing. */
        vst2q_f32((float*)cPtr, outputVector);
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
207 #endif /* LV_HAVE_NEON */
208 
209 
210 #ifdef LV_HAVE_NEONV8
211 #include <arm_neon.h>
212 
/*
 * NEONv8 kernel: cVector[i] = aVector[i] * bVector[i] with complex aVector /
 * cVector and real bVector.
 *
 * Work is split into three stages: groups of eight samples (two 4-wide
 * vectors per iteration), at most one further group of four, and finally a
 * scalar loop for whatever is left.
 */
static inline void volk_32fc_32f_multiply_32fc_neonv8(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    lv_32fc_t* dst = cVector;
    const lv_32fc_t* src = aVector;
    const float* scale = bVector;
    unsigned int remaining = num_points;

    /* Stage 1: eight complex samples per pass. */
    for (; remaining >= 8; remaining -= 8, src += 8, scale += 8, dst += 8) {
        const float32x4x2_t inLo = vld2q_f32((const float*)src);
        const float32x4x2_t inHi = vld2q_f32((const float*)(src + 4));
        const float32x4_t scaleLo = vld1q_f32(scale);
        const float32x4_t scaleHi = vld1q_f32(scale + 4);
        __VOLK_PREFETCH(src + 8);
        __VOLK_PREFETCH(scale + 8);

        /* A real factor scales both component planes identically. */
        float32x4x2_t outLo;
        float32x4x2_t outHi;
        outLo.val[0] = vmulq_f32(inLo.val[0], scaleLo);
        outLo.val[1] = vmulq_f32(inLo.val[1], scaleLo);
        outHi.val[0] = vmulq_f32(inHi.val[0], scaleHi);
        outHi.val[1] = vmulq_f32(inHi.val[1], scaleHi);

        vst2q_f32((float*)dst, outLo);
        vst2q_f32((float*)(dst + 4), outHi);
    }

    /* Stage 2: one more group of four, if available. */
    if (remaining >= 4) {
        const float32x4x2_t in = vld2q_f32((const float*)src);
        const float32x4_t factors = vld1q_f32(scale);
        float32x4x2_t out;
        out.val[0] = vmulq_f32(in.val[0], factors);
        out.val[1] = vmulq_f32(in.val[1], factors);
        vst2q_f32((float*)dst, out);

        src += 4;
        scale += 4;
        dst += 4;
        remaining -= 4;
    }

    /* Stage 3: scalar leftovers. */
    for (; remaining > 0; remaining--) {
        *dst++ = (*src++) * (*scale++);
    }
}
268 
269 #endif /* LV_HAVE_NEONV8 */
270 
271 
272 #ifdef LV_HAVE_ORC
273 
274 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
275  const lv_32fc_t* aVector,
276  const float* bVector,
277  int num_points);
278 
279 static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
280  const lv_32fc_t* aVector,
281  const float* bVector,
282  unsigned int num_points)
283 {
284  volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
285 }
286 
287 #endif /* LV_HAVE_ORC */
288 
289 #ifdef LV_HAVE_RVV
290 #include <riscv_vector.h>
291 
/*
 * RISC-V Vector kernel: cVector[i] = aVector[i] * bVector[i] with complex
 * aVector / cVector and real bVector.
 *
 * Each pass handles vl complex samples, where vl is what the hardware grants
 * for 32-bit elements at LMUL=4. The complex data is treated as 2*vl plain
 * floats at LMUL=8.
 */
static inline void volk_32fc_32f_multiply_32fc_rvv(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const float* bVector,
                                                   unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m4(remaining);
        const size_t numFloats = vl * 2;

        /* Complex input viewed as a flat run of floats. */
        vfloat32m8_t samples = __riscv_vle32_v_f32m8((const float*)aVector, numFloats);

        /* Scale factors are loaded as raw 32-bit patterns so they can be
         * duplicated with integer arithmetic. */
        vuint32m4_t bits = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl);

        /* Widening add gives 2*b; the widening multiply-accumulate adds
         * 0xFFFFFFFF*b, for a total of b * (2^32 + 1): the 32-bit pattern b
         * repeated in both halves of each 64-bit element. */
        vuint64m8_t doubled = __riscv_vwaddu_vv(bits, bits, vl);
        vuint64m8_t paired = __riscv_vwmaccu(doubled, 0xFFFFFFFF, bits, vl);

        /* Reinterpret the duplicated patterns as floats. */
        vfloat32m8_t factors =
            __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8(paired));

        __riscv_vse32(
            (float*)cVector, __riscv_vfmul(samples, factors, numFloats), numFloats);

        remaining -= vl;
        cVector += vl;
        aVector += vl;
        bVector += vl;
    }
}
307 #endif /*LV_HAVE_RVV*/
308 
309 #endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_32f_multiply_32fc_a_avx
static void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:51
volk_32fc_32f_multiply_32fc_a_sse
static void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:107
volk_32fc_32f_multiply_32fc_generic
static void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:157
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_32f_multiply_32fc_neon
static void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:177