Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
74 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
75 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
76 
77 #include <float.h>
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <volk/volk_complex.h>
81 
82 
83 #ifdef LV_HAVE_GENERIC
84 
85 static inline void
87  const lv_32fc_t* aVector,
88  const lv_32fc_t* bVector,
89  const lv_32fc_t* scalar,
90  unsigned int num_points)
91 {
92  const lv_32fc_t* aPtr = aVector;
93  const lv_32fc_t* bPtr = bVector;
94  lv_32fc_t* cPtr = cVector;
95  unsigned int number = num_points;
96 
97  // unwrap loop
98  while (number >= 8) {
99  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
100  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
101  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
102  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
103  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
104  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
105  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
106  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
107  number -= 8;
108  }
109 
110  // clean up any remaining
111  while (number-- > 0) {
112  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
113  }
114 }
115 #endif /* LV_HAVE_GENERIC */
116 
117 
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX kernel (unaligned buffers):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * (*scalar).
 *
 * Uses unaligned load/store intrinsics, so the buffers do not need any
 * particular alignment.
 *
 * \param cVector    Output buffer; receives num_points complex results.
 * \param aVector    Complex input that is added to the scaled product.
 * \param bVector    Complex input that is conjugated before scaling.
 * \param scalar     Pointer to the single complex scale factor.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    // Samples (0..3) that do not fill a whole 256-bit register.
    const unsigned int tailPoints = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector (the scalar replicated in all four lanes)
    s = _mm256_loadu_ps((const float*)v_scalar);

    // Four complex samples (eight floats) per iteration.
    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((const float*)b);
        y = _mm256_loadu_ps((const float*)a);
        // Same argument order as the SSE3 kernels: z = scalar * conj(b),
        // which must agree with the scalar tail loop below.
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar clean-up for the remaining samples.
    for (i = num_points - tailPoints; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */
161 
162 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 kernel (unaligned buffers):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * (*scalar).
 *
 * Uses unaligned load/store intrinsics, so the buffers do not need any
 * particular alignment.
 *
 * \param cVector    Output buffer; receives num_points complex results.
 * \param aVector    Complex input that is added to the scaled product.
 * \param bVector    Complex input that is conjugated before scaling.
 * \param scalar     Pointer to the single complex scale factor.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector (the scalar replicated in both lanes)
    s = _mm_loadu_ps((const float*)v_scalar);

    // Two complex samples (four floats) per iteration.
    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((const float*)b);
        y = _mm_loadu_ps((const float*)a);
        // z = scalar * conj(b); must agree with the scalar tail below.
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // At most one sample is left when num_points is odd.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */
204 
205 
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX kernel (aligned buffers):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * (*scalar).
 *
 * Uses the aligned load/store intrinsics (_mm256_load_ps / _mm256_store_ps),
 * so aVector, bVector and cVector must be 32-byte aligned.
 *
 * \param cVector    Output buffer; receives num_points complex results.
 * \param aVector    Complex input that is added to the scaled product.
 * \param bVector    Complex input that is conjugated before scaling.
 * \param scalar     Pointer to the single complex scale factor (no alignment
 *                   requirement; it is copied to a local array first).
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    // Samples (0..3) that do not fill a whole 256-bit register.
    const unsigned int tailPoints = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector. The local array has no guaranteed 32-byte
    // alignment, hence the unaligned load here.
    s = _mm256_loadu_ps((const float*)v_scalar);

    // Four complex samples (eight floats) per iteration.
    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((const float*)b);
        y = _mm256_load_ps((const float*)a);
        // Same argument order as the SSE3 kernels: z = scalar * conj(b),
        // which must agree with the scalar tail loop below.
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar clean-up for the remaining samples.
    for (i = num_points - tailPoints; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */
249 
250 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 kernel (aligned buffers):
 *        cVector[i] = aVector[i] + conj(bVector[i]) * (*scalar).
 *
 * Uses the aligned load/store intrinsics (_mm_load_ps / _mm_store_ps),
 * so aVector, bVector and cVector must be 16-byte aligned.
 *
 * \param cVector    Output buffer; receives num_points complex results.
 * \param aVector    Complex input that is added to the scaled product.
 * \param bVector    Complex input that is conjugated before scaling.
 * \param scalar     Pointer to the single complex scale factor (no alignment
 *                   requirement; it is copied to a local array first).
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector. The local array has no guaranteed 16-byte
    // alignment, hence the unaligned load here.
    s = _mm_loadu_ps((const float*)v_scalar);

    // Two complex samples (four floats) per iteration.
    for (; number < halfPoints; number++) {
        x = _mm_load_ps((const float*)b);
        y = _mm_load_ps((const float*)a);
        // z = scalar * conj(b); must agree with the scalar tail below.
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // At most one sample is left when num_points is odd.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */
292 
293 
294 #ifdef LV_HAVE_NEON
295 #include <arm_neon.h>
296 
297 static inline void
299  const lv_32fc_t* aVector,
300  const lv_32fc_t* bVector,
301  const lv_32fc_t* scalar,
302  unsigned int num_points)
303 {
304  const lv_32fc_t* bPtr = bVector;
305  const lv_32fc_t* aPtr = aVector;
306  lv_32fc_t* cPtr = cVector;
307  unsigned int number = 0;
308  unsigned int quarter_points = num_points / 4;
309 
310  float32x4x2_t a_val, b_val, c_val, scalar_val;
311  float32x4x2_t tmp_val;
312 
313  scalar_val.val[0] = vld1q_dup_f32((const float*)scalar);
314  scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1);
315 
316  for (number = 0; number < quarter_points; ++number) {
317  a_val = vld2q_f32((float*)aPtr);
318  b_val = vld2q_f32((float*)bPtr);
319  b_val.val[1] = vnegq_f32(b_val.val[1]);
320  __VOLK_PREFETCH(aPtr + 8);
321  __VOLK_PREFETCH(bPtr + 8);
322 
323  tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
324  tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
325 
326  tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
327  tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
328 
329  c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
330  c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
331 
332  vst2q_f32((float*)cPtr, c_val);
333 
334  aPtr += 4;
335  bPtr += 4;
336  cPtr += 4;
337  }
338 
339  for (number = quarter_points * 4; number < num_points; number++) {
340  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
341  }
342 }
343 #endif /* LV_HAVE_NEON */
344 
345 #ifdef LV_HAVE_NEONV8
346 #include <arm_neon.h>
347 
348 static inline void
349 volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neonv8(lv_32fc_t* cVector,
350  const lv_32fc_t* aVector,
351  const lv_32fc_t* bVector,
352  const lv_32fc_t* scalar,
353  unsigned int num_points)
354 {
355  const lv_32fc_t* bPtr = bVector;
356  const lv_32fc_t* aPtr = aVector;
357  lv_32fc_t* cPtr = cVector;
358  unsigned int number = 0;
359  unsigned int quarter_points = num_points / 4;
360 
361  float32x4x2_t a_val, b_val, c_val;
362  float32x4x2_t tmp_val;
363 
364  float32x4_t scalar_real = vdupq_n_f32(lv_creal(*scalar));
365  float32x4_t scalar_imag = vdupq_n_f32(lv_cimag(*scalar));
366 
367  for (number = 0; number < quarter_points; ++number) {
368  a_val = vld2q_f32((float*)aPtr);
369  b_val = vld2q_f32((float*)bPtr);
370  __VOLK_PREFETCH(aPtr + 8);
371  __VOLK_PREFETCH(bPtr + 8);
372 
373  /* Conjugate b: negate imaginary part */
374  /* tmp_real = br*sr - (-bi)*si = br*sr + bi*si */
375  tmp_val.val[0] =
376  vfmaq_f32(vmulq_f32(b_val.val[0], scalar_real), b_val.val[1], scalar_imag);
377 
378  /* tmp_imag = (-bi)*sr + br*si = br*si - bi*sr */
379  tmp_val.val[1] =
380  vfmsq_f32(vmulq_f32(b_val.val[0], scalar_imag), b_val.val[1], scalar_real);
381 
382  /* c = a + tmp */
383  c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
384  c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
385 
386  vst2q_f32((float*)cPtr, c_val);
387 
388  aPtr += 4;
389  bPtr += 4;
390  cPtr += 4;
391  }
392 
393  for (number = quarter_points * 4; number < num_points; number++) {
394  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
395  }
396 }
397 #endif /* LV_HAVE_NEONV8 */
398 
399 #ifdef LV_HAVE_RVV
400 #include <riscv_vector.h>
401 
402 static inline void
403 volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvv(lv_32fc_t* cVector,
404  const lv_32fc_t* aVector,
405  const lv_32fc_t* bVector,
406  const lv_32fc_t* scalar,
407  unsigned int num_points)
408 {
409  vfloat32m2_t vbr =
410  __riscv_vfmv_v_f_f32m2(lv_creal(*scalar), __riscv_vsetvlmax_e32m2());
411  vfloat32m2_t vbi =
412  __riscv_vfmv_v_f_f32m2(lv_cimag(*scalar), __riscv_vsetvlmax_e32m2());
413  size_t n = num_points;
414  for (size_t vl; n > 0; n -= vl, bVector += vl, aVector += vl, cVector += vl) {
415  vl = __riscv_vsetvl_e32m2(n);
416  vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)bVector, vl);
417  vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)aVector, vl);
418  vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
419  vfloat32m2_t vcr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl));
420  vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
421  vfloat32m2_t vci = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl));
422  vfloat32m2_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
423  vfloat32m2_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
424  vuint32m2_t vru = __riscv_vreinterpret_u32m2(__riscv_vfadd(vr, vcr, vl));
425  vuint32m2_t viu = __riscv_vreinterpret_u32m2(__riscv_vfadd(vi, vci, vl));
426  vuint64m4_t v =
427  __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
428  __riscv_vse64((uint64_t*)cVector, v, vl);
429  }
430 }
431 #endif /*LV_HAVE_RVV*/
432 
433 #ifdef LV_HAVE_RVVSEG
434 #include <riscv_vector.h>
435 
436 static inline void
437 volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvvseg(lv_32fc_t* cVector,
438  const lv_32fc_t* aVector,
439  const lv_32fc_t* bVector,
440  const lv_32fc_t* scalar,
441  unsigned int num_points)
442 {
443  vfloat32m4_t vbr =
444  __riscv_vfmv_v_f_f32m4(lv_creal(*scalar), __riscv_vsetvlmax_e32m4());
445  vfloat32m4_t vbi =
446  __riscv_vfmv_v_f_f32m4(lv_cimag(*scalar), __riscv_vsetvlmax_e32m4());
447  size_t n = num_points;
448  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
449  vl = __riscv_vsetvl_e32m4(n);
450  vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
451  vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
452  vfloat32m4_t vcr = __riscv_vget_f32m4(vc, 0), vci = __riscv_vget_f32m4(vc, 1);
453  vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
454  vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
455  vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
456  vr = __riscv_vfadd(vr, vcr, vl);
457  vi = __riscv_vfadd(vi, vci, vl);
458  __riscv_vsseg2e32_v_f32m4x2(
459  (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
460  }
461 }
462 #endif /*LV_HAVE_RVVSEG*/
463 
464 #endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:86
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
volk_sse3_intrinsics.h
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:211
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:256
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:168
lv_conj
#define lv_conj(x)
Definition: volk_complex.h:100
i
for i
Definition: volk_config_fixed.tmpl.h:13
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
_mm256_complexconjugatemul_ps
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:139
volk_avx_intrinsics.h
_mm_complexconjugatemul_ps
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:123
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:298
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96