Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
44 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46 
47 #include <inttypes.h>
48 #include <stdio.h>
49 #include <volk/volk_complex.h>
50 
51 #ifdef LV_HAVE_AVX2
52 #include <immintrin.h>
53 
54 static inline void
55 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
56  const lv_8sc_t* aVector,
57  const lv_8sc_t* bVector,
58  const float scalar,
59  unsigned int num_points)
60 {
61  unsigned int number = 0;
62  const unsigned int oneEigthPoints = num_points / 8;
63 
64  __m256i x, y, realz, imagz;
65  __m256 ret, retlo, rethi;
66  lv_32fc_t* c = cVector;
67  const lv_8sc_t* a = aVector;
68  const lv_8sc_t* b = bVector;
69  __m256i conjugateSign =
70  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
71 
72  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
73 
74  for (; number < oneEigthPoints; number++) {
75  // Convert 8 bit values into 16 bit values
76  x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
77  y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
78 
79  // Calculate the ar*cr - ai*(-ci) portions
80  realz = _mm256_madd_epi16(x, y);
81 
82  // Calculate the complex conjugate of the cr + ci j values
83  y = _mm256_sign_epi16(y, conjugateSign);
84 
85  // Shift the order of the cr and ci values
86  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
87  _MM_SHUFFLE(2, 3, 0, 1));
88 
89  // Calculate the ar*(-ci) + cr*(ai)
90  imagz = _mm256_madd_epi16(x, y);
91 
92  // Interleave real and imaginary and then convert to float values
93  retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
94 
95  // Normalize the floating point values
96  retlo = _mm256_mul_ps(retlo, invScalar);
97 
98  // Interleave real and imaginary and then convert to float values
99  rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
100 
101  // Normalize the floating point values
102  rethi = _mm256_mul_ps(rethi, invScalar);
103 
104  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
105  _mm256_store_ps((float*)c, ret);
106  c += 4;
107 
108  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
109  _mm256_store_ps((float*)c, ret);
110  c += 4;
111 
112  a += 8;
113  b += 8;
114  }
115 
116  number = oneEigthPoints * 8;
117  float* cFloatPtr = (float*)&cVector[number];
118  int8_t* a8Ptr = (int8_t*)&aVector[number];
119  int8_t* b8Ptr = (int8_t*)&bVector[number];
120  for (; number < num_points; number++) {
121  float aReal = (float)*a8Ptr++;
122  float aImag = (float)*a8Ptr++;
123  lv_32fc_t aVal = lv_cmake(aReal, aImag);
124  float bReal = (float)*b8Ptr++;
125  float bImag = (float)*b8Ptr++;
126  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
127  lv_32fc_t temp = aVal * bVal;
128 
129  *cFloatPtr++ = lv_creal(temp) / scalar;
130  *cFloatPtr++ = lv_cimag(temp) / scalar;
131  }
132 }
133 #endif /* LV_HAVE_AVX2*/
134 
135 
136 #ifdef LV_HAVE_SSE4_1
137 #include <smmintrin.h>
138 
139 static inline void
140 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
141  const lv_8sc_t* aVector,
142  const lv_8sc_t* bVector,
143  const float scalar,
144  unsigned int num_points)
145 {
146  unsigned int number = 0;
147  const unsigned int quarterPoints = num_points / 4;
148 
149  __m128i x, y, realz, imagz;
150  __m128 ret;
151  lv_32fc_t* c = cVector;
152  const lv_8sc_t* a = aVector;
153  const lv_8sc_t* b = bVector;
154  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
155 
156  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
157 
158  for (; number < quarterPoints; number++) {
159  // Convert into 8 bit values into 16 bit values
160  x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
161  y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
162 
163  // Calculate the ar*cr - ai*(-ci) portions
164  realz = _mm_madd_epi16(x, y);
165 
166  // Calculate the complex conjugate of the cr + ci j values
167  y = _mm_sign_epi16(y, conjugateSign);
168 
169  // Shift the order of the cr and ci values
170  y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
171  _MM_SHUFFLE(2, 3, 0, 1));
172 
173  // Calculate the ar*(-ci) + cr*(ai)
174  imagz = _mm_madd_epi16(x, y);
175 
176  // Interleave real and imaginary and then convert to float values
177  ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
178 
179  // Normalize the floating point values
180  ret = _mm_mul_ps(ret, invScalar);
181 
182  // Store the floating point values
183  _mm_store_ps((float*)c, ret);
184  c += 2;
185 
186  // Interleave real and imaginary and then convert to float values
187  ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
188 
189  // Normalize the floating point values
190  ret = _mm_mul_ps(ret, invScalar);
191 
192  // Store the floating point values
193  _mm_store_ps((float*)c, ret);
194  c += 2;
195 
196  a += 4;
197  b += 4;
198  }
199 
200  number = quarterPoints * 4;
201  float* cFloatPtr = (float*)&cVector[number];
202  int8_t* a8Ptr = (int8_t*)&aVector[number];
203  int8_t* b8Ptr = (int8_t*)&bVector[number];
204  for (; number < num_points; number++) {
205  float aReal = (float)*a8Ptr++;
206  float aImag = (float)*a8Ptr++;
207  lv_32fc_t aVal = lv_cmake(aReal, aImag);
208  float bReal = (float)*b8Ptr++;
209  float bImag = (float)*b8Ptr++;
210  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
211  lv_32fc_t temp = aVal * bVal;
212 
213  *cFloatPtr++ = lv_creal(temp) / scalar;
214  *cFloatPtr++ = lv_cimag(temp) / scalar;
215  }
216 }
217 #endif /* LV_HAVE_SSE4_1 */
218 
219 
220 #ifdef LV_HAVE_GENERIC
221 
222 static inline void
224  const lv_8sc_t* aVector,
225  const lv_8sc_t* bVector,
226  const float scalar,
227  unsigned int num_points)
228 {
229  unsigned int number = 0;
230  float* cPtr = (float*)cVector;
231  const float invScalar = 1.0 / scalar;
232  int8_t* a8Ptr = (int8_t*)aVector;
233  int8_t* b8Ptr = (int8_t*)bVector;
234  for (number = 0; number < num_points; number++) {
235  float aReal = (float)*a8Ptr++;
236  float aImag = (float)*a8Ptr++;
237  lv_32fc_t aVal = lv_cmake(aReal, aImag);
238  float bReal = (float)*b8Ptr++;
239  float bImag = (float)*b8Ptr++;
240  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
241  lv_32fc_t temp = aVal * bVal;
242 
243  *cPtr++ = (lv_creal(temp) * invScalar);
244  *cPtr++ = (lv_cimag(temp) * invScalar);
245  }
246 }
247 #endif /* LV_HAVE_GENERIC */
248 
249 
250 #ifdef LV_HAVE_NEON
251 #include <arm_neon.h>
252 
254  const lv_8sc_t* aVector,
255  const lv_8sc_t* bVector,
256  const float scalar,
257  unsigned int num_points)
258 {
259  unsigned int number = 0;
260  const unsigned int eighthPoints = num_points / 8;
261 
262  lv_32fc_t* cPtr = cVector;
263  const lv_8sc_t* aPtr = aVector;
264  const lv_8sc_t* bPtr = bVector;
265  const float invScalar = 1.0f / scalar;
266  float32x4_t vInvScalar = vdupq_n_f32(invScalar);
267 
268  int8x8x2_t aVal, bVal;
269  int16x8_t aReal, aImag, bReal, bImag;
270  int32x4_t realLo, realHi, imagLo, imagHi;
271  float32x4_t realFloatLo, realFloatHi, imagFloatLo, imagFloatHi;
272 
273  for (; number < eighthPoints; number++) {
274  // Load 8 complex 8-bit values (deinterleaved)
275  aVal = vld2_s8((const int8_t*)aPtr);
276  bVal = vld2_s8((const int8_t*)bPtr);
277 
278  // Widen to 16-bit
279  aReal = vmovl_s8(aVal.val[0]);
280  aImag = vmovl_s8(aVal.val[1]);
281  bReal = vmovl_s8(bVal.val[0]);
282  bImag = vmovl_s8(bVal.val[1]);
283 
284  // Complex multiply with conjugate: (ar + ai*j) * (br - bi*j)
285  // real = ar*br + ai*bi
286  // imag = ai*br - ar*bi
287 
288  // Low half (first 4 complex values)
289  realLo = vmlal_s16(vmull_s16(vget_low_s16(aReal), vget_low_s16(bReal)),
290  vget_low_s16(aImag),
291  vget_low_s16(bImag));
292  imagLo = vmlsl_s16(vmull_s16(vget_low_s16(aImag), vget_low_s16(bReal)),
293  vget_low_s16(aReal),
294  vget_low_s16(bImag));
295 
296  // High half (next 4 complex values)
297  realHi = vmlal_s16(vmull_s16(vget_high_s16(aReal), vget_high_s16(bReal)),
298  vget_high_s16(aImag),
299  vget_high_s16(bImag));
300  imagHi = vmlsl_s16(vmull_s16(vget_high_s16(aImag), vget_high_s16(bReal)),
301  vget_high_s16(aReal),
302  vget_high_s16(bImag));
303 
304  // Convert to float and scale
305  realFloatLo = vmulq_f32(vcvtq_f32_s32(realLo), vInvScalar);
306  imagFloatLo = vmulq_f32(vcvtq_f32_s32(imagLo), vInvScalar);
307  realFloatHi = vmulq_f32(vcvtq_f32_s32(realHi), vInvScalar);
308  imagFloatHi = vmulq_f32(vcvtq_f32_s32(imagHi), vInvScalar);
309 
310  // Store interleaved (first 4 complex values)
311  float32x4x2_t resultLo;
312  resultLo.val[0] = realFloatLo;
313  resultLo.val[1] = imagFloatLo;
314  vst2q_f32((float*)cPtr, resultLo);
315  cPtr += 4;
316 
317  // Store interleaved (next 4 complex values)
318  float32x4x2_t resultHi;
319  resultHi.val[0] = realFloatHi;
320  resultHi.val[1] = imagFloatHi;
321  vst2q_f32((float*)cPtr, resultHi);
322  cPtr += 4;
323 
324  aPtr += 8;
325  bPtr += 8;
326  }
327 
328  number = eighthPoints * 8;
329  float* cFloatPtr = (float*)&cVector[number];
330  int8_t* a8Ptr = (int8_t*)&aVector[number];
331  int8_t* b8Ptr = (int8_t*)&bVector[number];
332  for (; number < num_points; number++) {
333  float aReal_f = (float)*a8Ptr++;
334  float aImag_f = (float)*a8Ptr++;
335  lv_32fc_t aVal_c = lv_cmake(aReal_f, aImag_f);
336  float bReal_f = (float)*b8Ptr++;
337  float bImag_f = (float)*b8Ptr++;
338  lv_32fc_t bVal_c = lv_cmake(bReal_f, -bImag_f);
339  lv_32fc_t temp = aVal_c * bVal_c;
340 
341  *cFloatPtr++ = lv_creal(temp) * invScalar;
342  *cFloatPtr++ = lv_cimag(temp) * invScalar;
343  }
344 }
345 #endif /* LV_HAVE_NEON */
346 
347 
348 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
349 
350 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
351 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
352 
353 #include <inttypes.h>
354 #include <stdio.h>
355 #include <volk/volk_complex.h>
356 
357 #ifdef LV_HAVE_AVX2
358 #include <immintrin.h>
359 
360 static inline void
361 volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
362  const lv_8sc_t* aVector,
363  const lv_8sc_t* bVector,
364  const float scalar,
365  unsigned int num_points)
366 {
367  unsigned int number = 0;
368  const unsigned int oneEigthPoints = num_points / 8;
369 
370  __m256i x, y, realz, imagz;
371  __m256 ret, retlo, rethi;
372  lv_32fc_t* c = cVector;
373  const lv_8sc_t* a = aVector;
374  const lv_8sc_t* b = bVector;
375  __m256i conjugateSign =
376  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
377 
378  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
379 
380  for (; number < oneEigthPoints; number++) {
381  // Convert 8 bit values into 16 bit values
382  x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
383  y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
384 
385  // Calculate the ar*cr - ai*(-ci) portions
386  realz = _mm256_madd_epi16(x, y);
387 
388  // Calculate the complex conjugate of the cr + ci j values
389  y = _mm256_sign_epi16(y, conjugateSign);
390 
391  // Shift the order of the cr and ci values
392  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
393  _MM_SHUFFLE(2, 3, 0, 1));
394 
395  // Calculate the ar*(-ci) + cr*(ai)
396  imagz = _mm256_madd_epi16(x, y);
397 
398  // Interleave real and imaginary and then convert to float values
399  retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
400 
401  // Normalize the floating point values
402  retlo = _mm256_mul_ps(retlo, invScalar);
403 
404  // Interleave real and imaginary and then convert to float values
405  rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
406 
407  // Normalize the floating point values
408  rethi = _mm256_mul_ps(rethi, invScalar);
409 
410  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
411  _mm256_storeu_ps((float*)c, ret);
412  c += 4;
413 
414  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
415  _mm256_storeu_ps((float*)c, ret);
416  c += 4;
417 
418  a += 8;
419  b += 8;
420  }
421 
422  number = oneEigthPoints * 8;
423  float* cFloatPtr = (float*)&cVector[number];
424  int8_t* a8Ptr = (int8_t*)&aVector[number];
425  int8_t* b8Ptr = (int8_t*)&bVector[number];
426  for (; number < num_points; number++) {
427  float aReal = (float)*a8Ptr++;
428  float aImag = (float)*a8Ptr++;
429  lv_32fc_t aVal = lv_cmake(aReal, aImag);
430  float bReal = (float)*b8Ptr++;
431  float bImag = (float)*b8Ptr++;
432  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
433  lv_32fc_t temp = aVal * bVal;
434 
435  *cFloatPtr++ = lv_creal(temp) / scalar;
436  *cFloatPtr++ = lv_cimag(temp) / scalar;
437  }
438 }
439 #endif /* LV_HAVE_AVX2*/
440 
441 
442 #ifdef LV_HAVE_RVV
443 #include <riscv_vector.h>
444 
445 static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector,
446  const lv_8sc_t* aVector,
447  const lv_8sc_t* bVector,
448  const float scalar,
449  unsigned int num_points)
450 {
451  size_t n = num_points;
452  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
453  vl = __riscv_vsetvl_e8m1(n);
454  vint16m2_t va = __riscv_vle16_v_i16m2((const int16_t*)aVector, vl);
455  vint16m2_t vb = __riscv_vle16_v_i16m2((const int16_t*)bVector, vl);
456  vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
457  vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
458  vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
459  vint16m2_t vi =
460  __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
461  vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
462  vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
463  vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf);
464  vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif);
465  vuint64m8_t v =
466  __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
467  __riscv_vse64((uint64_t*)cVector, v, vl);
468  }
469 }
470 #endif /*LV_HAVE_RVV*/
471 
472 #ifdef LV_HAVE_RVVSEG
473 #include <riscv_vector.h>
474 
475 static inline void
476 volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector,
477  const lv_8sc_t* aVector,
478  const lv_8sc_t* bVector,
479  const float scalar,
480  unsigned int num_points)
481 {
482  size_t n = num_points;
483  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
484  vl = __riscv_vsetvl_e8m1(n);
485  vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)aVector, vl);
486  vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)bVector, vl);
487  vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1);
488  vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1);
489  vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
490  vint16m2_t vi =
491  __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
492  vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
493  vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
494  __riscv_vsseg2e32_v_f32m4x2(
495  (float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl);
496  }
497 }
498 
499 #endif /*LV_HAVE_RVVSEG*/
500 
501 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
volk_8ic_x2_s32f_multiply_conjugate_32fc_generic
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_x2_s32f_multiply_conjugate_32fc.h:223
volk_8ic_x2_s32f_multiply_conjugate_32fc_neon
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_neon(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_x2_s32f_multiply_conjugate_32fc.h:253
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96
lv_8sc_t
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70