Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_multiply_conjugate_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
11 #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
12 
13 #include <inttypes.h>
14 #include <limits.h>
15 #include <stdio.h>
16 #include <volk/volk_complex.h>
17 
18 #ifdef LV_HAVE_AVX2
19 #include <immintrin.h>
28 static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
29  const lv_8sc_t* aVector,
30  const lv_8sc_t* bVector,
31  unsigned int num_points)
32 {
33  unsigned int number = 0;
34  const unsigned int quarterPoints = num_points / 8;
35 
36  __m256i x, y, realz, imagz;
37  lv_16sc_t* c = cVector;
38  const lv_8sc_t* a = aVector;
39  const lv_8sc_t* b = bVector;
40  __m256i conjugateSign =
41  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
42 
43  for (; number < quarterPoints; number++) {
44  // Convert 8 bit values into 16 bit values
45  x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
46  y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
47 
48  // Calculate the ar*cr - ai*(-ci) portions
49  realz = _mm256_madd_epi16(x, y);
50 
51  // Calculate the complex conjugate of the cr + ci j values
52  y = _mm256_sign_epi16(y, conjugateSign);
53 
54  // Shift the order of the cr and ci values
55  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
56  _MM_SHUFFLE(2, 3, 0, 1));
57 
58  // Calculate the ar*(-ci) + cr*(ai)
59  imagz = _mm256_madd_epi16(x, y);
60 
61  // Perform the addition of products
62 
63  _mm256_store_si256((__m256i*)c,
64  _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
65  _mm256_unpackhi_epi32(realz, imagz)));
66 
67  a += 8;
68  b += 8;
69  c += 8;
70  }
71 
72  number = quarterPoints * 8;
73  int16_t* c16Ptr = (int16_t*)&cVector[number];
74  int8_t* a8Ptr = (int8_t*)&aVector[number];
75  int8_t* b8Ptr = (int8_t*)&bVector[number];
76  for (; number < num_points; number++) {
77  float aReal = (float)*a8Ptr++;
78  float aImag = (float)*a8Ptr++;
79  lv_32fc_t aVal = lv_cmake(aReal, aImag);
80  float bReal = (float)*b8Ptr++;
81  float bImag = (float)*b8Ptr++;
82  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
83  lv_32fc_t temp = aVal * bVal;
84 
85  *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
86  *c16Ptr++ = (int16_t)lv_cimag(temp);
87  }
88 }
89 #endif /* LV_HAVE_AVX2 */
90 
91 
92 #ifdef LV_HAVE_SSE4_1
93 #include <smmintrin.h>
102 static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
103  const lv_8sc_t* aVector,
104  const lv_8sc_t* bVector,
105  unsigned int num_points)
106 {
107  unsigned int number = 0;
108  const unsigned int quarterPoints = num_points / 4;
109 
110  __m128i x, y, realz, imagz;
111  lv_16sc_t* c = cVector;
112  const lv_8sc_t* a = aVector;
113  const lv_8sc_t* b = bVector;
114  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
115 
116  for (; number < quarterPoints; number++) {
117  // Convert into 8 bit values into 16 bit values
118  x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
119  y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
120 
121  // Calculate the ar*cr - ai*(-ci) portions
122  realz = _mm_madd_epi16(x, y);
123 
124  // Calculate the complex conjugate of the cr + ci j values
125  y = _mm_sign_epi16(y, conjugateSign);
126 
127  // Shift the order of the cr and ci values
128  y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
129  _MM_SHUFFLE(2, 3, 0, 1));
130 
131  // Calculate the ar*(-ci) + cr*(ai)
132  imagz = _mm_madd_epi16(x, y);
133 
134  _mm_store_si128((__m128i*)c,
135  _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
136  _mm_unpackhi_epi32(realz, imagz)));
137 
138  a += 4;
139  b += 4;
140  c += 4;
141  }
142 
143  number = quarterPoints * 4;
144  int16_t* c16Ptr = (int16_t*)&cVector[number];
145  int8_t* a8Ptr = (int8_t*)&aVector[number];
146  int8_t* b8Ptr = (int8_t*)&bVector[number];
147  for (; number < num_points; number++) {
148  float aReal = (float)*a8Ptr++;
149  float aImag = (float)*a8Ptr++;
150  lv_32fc_t aVal = lv_cmake(aReal, aImag);
151  float bReal = (float)*b8Ptr++;
152  float bImag = (float)*b8Ptr++;
153  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
154  lv_32fc_t temp = aVal * bVal;
155 
156  *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
157  *c16Ptr++ = (int16_t)lv_cimag(temp);
158  }
159 }
160 #endif /* LV_HAVE_SSE4_1 */
161 
162 #ifdef LV_HAVE_GENERIC
163 
172  const lv_8sc_t* aVector,
173  const lv_8sc_t* bVector,
174  unsigned int num_points)
175 {
176  unsigned int number = 0;
177  int16_t* c16Ptr = (int16_t*)cVector;
178  int8_t* a8Ptr = (int8_t*)aVector;
179  int8_t* b8Ptr = (int8_t*)bVector;
180  for (number = 0; number < num_points; number++) {
181  float aReal = (float)*a8Ptr++;
182  float aImag = (float)*a8Ptr++;
183  lv_32fc_t aVal = lv_cmake(aReal, aImag);
184  float bReal = (float)*b8Ptr++;
185  float bImag = (float)*b8Ptr++;
186  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
187  lv_32fc_t temp = aVal * bVal;
188 
189  *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
190  *c16Ptr++ = (int16_t)lv_cimag(temp);
191  }
192 }
193 #endif /* LV_HAVE_GENERIC */
194 
195 
196 #ifdef LV_HAVE_NEON
197 #include <arm_neon.h>
198 
200  const lv_8sc_t* aVector,
201  const lv_8sc_t* bVector,
202  unsigned int num_points)
203 {
204  unsigned int number = 0;
205  const unsigned int eighthPoints = num_points / 8;
206 
207  lv_16sc_t* cPtr = cVector;
208  const lv_8sc_t* aPtr = aVector;
209  const lv_8sc_t* bPtr = bVector;
210 
211  int8x8x2_t aVal, bVal;
212  int16x8_t aReal, aImag, bReal, bImag;
213  int32x4_t realLo, realHi, imagLo, imagHi;
214  int16x4_t realNarrowLo, realNarrowHi, imagNarrowLo, imagNarrowHi;
215 
216  for (; number < eighthPoints; number++) {
217  // Load 8 complex 8-bit values (deinterleaved)
218  aVal = vld2_s8((const int8_t*)aPtr);
219  bVal = vld2_s8((const int8_t*)bPtr);
220 
221  // Widen to 16-bit
222  aReal = vmovl_s8(aVal.val[0]);
223  aImag = vmovl_s8(aVal.val[1]);
224  bReal = vmovl_s8(bVal.val[0]);
225  bImag = vmovl_s8(bVal.val[1]);
226 
227  // Complex multiply with conjugate: (ar + ai*j) * (br - bi*j)
228  // real = ar*br + ai*bi
229  // imag = ai*br - ar*bi
230 
231  // Low half (first 4 complex values)
232  realLo = vmlal_s16(vmull_s16(vget_low_s16(aReal), vget_low_s16(bReal)),
233  vget_low_s16(aImag),
234  vget_low_s16(bImag));
235  imagLo = vmlsl_s16(vmull_s16(vget_low_s16(aImag), vget_low_s16(bReal)),
236  vget_low_s16(aReal),
237  vget_low_s16(bImag));
238 
239  // High half (next 4 complex values)
240  realHi = vmlal_s16(vmull_s16(vget_high_s16(aReal), vget_high_s16(bReal)),
241  vget_high_s16(aImag),
242  vget_high_s16(bImag));
243  imagHi = vmlsl_s16(vmull_s16(vget_high_s16(aImag), vget_high_s16(bReal)),
244  vget_high_s16(aReal),
245  vget_high_s16(bImag));
246 
247  // Narrow with saturation to 16-bit
248  realNarrowLo = vqmovn_s32(realLo);
249  realNarrowHi = vqmovn_s32(realHi);
250  imagNarrowLo = vqmovn_s32(imagLo);
251  imagNarrowHi = vqmovn_s32(imagHi);
252 
253  // Interleave real and imaginary
254  int16x8_t realResult = vcombine_s16(realNarrowLo, realNarrowHi);
255  int16x8_t imagResult = vcombine_s16(imagNarrowLo, imagNarrowHi);
256 
257  // Store interleaved
258  int16x8x2_t result;
259  result.val[0] = realResult;
260  result.val[1] = imagResult;
261  vst2q_s16((int16_t*)cPtr, result);
262 
263  aPtr += 8;
264  bPtr += 8;
265  cPtr += 8;
266  }
267 
268  number = eighthPoints * 8;
269  int16_t* c16Ptr = (int16_t*)&cVector[number];
270  int8_t* a8Ptr = (int8_t*)&aVector[number];
271  int8_t* b8Ptr = (int8_t*)&bVector[number];
272  for (; number < num_points; number++) {
273  float aReal_f = (float)*a8Ptr++;
274  float aImag_f = (float)*a8Ptr++;
275  lv_32fc_t aVal_c = lv_cmake(aReal_f, aImag_f);
276  float bReal_f = (float)*b8Ptr++;
277  float bImag_f = (float)*b8Ptr++;
278  lv_32fc_t bVal_c = lv_cmake(bReal_f, -bImag_f);
279  lv_32fc_t temp = aVal_c * bVal_c;
280 
281  *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
282  *c16Ptr++ = (int16_t)lv_cimag(temp);
283  }
284 }
285 #endif /* LV_HAVE_NEON */
286 
287 
288 #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
289 
290 #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
291 #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
292 
293 #include <inttypes.h>
294 #include <stdio.h>
295 #include <volk/volk_complex.h>
296 
297 #ifdef LV_HAVE_AVX2
298 #include <immintrin.h>
307 static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
308  const lv_8sc_t* aVector,
309  const lv_8sc_t* bVector,
310  unsigned int num_points)
311 {
312  unsigned int number = 0;
313  const unsigned int oneEigthPoints = num_points / 8;
314 
315  __m256i x, y, realz, imagz;
316  lv_16sc_t* c = cVector;
317  const lv_8sc_t* a = aVector;
318  const lv_8sc_t* b = bVector;
319  __m256i conjugateSign =
320  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
321 
322  for (; number < oneEigthPoints; number++) {
323  // Convert 8 bit values into 16 bit values
324  x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
325  y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
326 
327  // Calculate the ar*cr - ai*(-ci) portions
328  realz = _mm256_madd_epi16(x, y);
329 
330  // Calculate the complex conjugate of the cr + ci j values
331  y = _mm256_sign_epi16(y, conjugateSign);
332 
333  // Shift the order of the cr and ci values
334  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
335  _MM_SHUFFLE(2, 3, 0, 1));
336 
337  // Calculate the ar*(-ci) + cr*(ai)
338  imagz = _mm256_madd_epi16(x, y);
339 
340  // Perform the addition of products
341 
342  _mm256_storeu_si256((__m256i*)c,
343  _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
344  _mm256_unpackhi_epi32(realz, imagz)));
345 
346  a += 8;
347  b += 8;
348  c += 8;
349  }
350 
351  number = oneEigthPoints * 8;
352  int16_t* c16Ptr = (int16_t*)&cVector[number];
353  int8_t* a8Ptr = (int8_t*)&aVector[number];
354  int8_t* b8Ptr = (int8_t*)&bVector[number];
355  for (; number < num_points; number++) {
356  float aReal = (float)*a8Ptr++;
357  float aImag = (float)*a8Ptr++;
358  lv_32fc_t aVal = lv_cmake(aReal, aImag);
359  float bReal = (float)*b8Ptr++;
360  float bImag = (float)*b8Ptr++;
361  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
362  lv_32fc_t temp = aVal * bVal;
363 
364  *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
365  *c16Ptr++ = (int16_t)lv_cimag(temp);
366  }
367 }
368 #endif /* LV_HAVE_AVX2 */
369 
370 #ifdef LV_HAVE_RVV
371 #include <riscv_vector.h>
372 
373 static inline void volk_8ic_x2_multiply_conjugate_16ic_rvv(lv_16sc_t* cVector,
374  const lv_8sc_t* aVector,
375  const lv_8sc_t* bVector,
376  unsigned int num_points)
377 {
378  size_t n = num_points;
379  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
380  vl = __riscv_vsetvl_e8m2(n);
381  vint16m4_t va = __riscv_vle16_v_i16m4((const int16_t*)aVector, vl);
382  vint16m4_t vb = __riscv_vle16_v_i16m4((const int16_t*)bVector, vl);
383  vint8m2_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
384  vint8m2_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
385  vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
386  vint16m4_t vi =
387  __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
388  vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr);
389  vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi);
390  vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl);
391  __riscv_vse32((uint32_t*)cVector, v, vl);
392  }
393 }
394 #endif /*LV_HAVE_RVV*/
395 
396 #ifdef LV_HAVE_RVVSEG
397 #include <riscv_vector.h>
398 
399 static inline void volk_8ic_x2_multiply_conjugate_16ic_rvvseg(lv_16sc_t* cVector,
400  const lv_8sc_t* aVector,
401  const lv_8sc_t* bVector,
402  unsigned int num_points)
403 {
404  size_t n = num_points;
405  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
406  vl = __riscv_vsetvl_e8m2(n);
407  vint8m2x2_t va = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)aVector, vl);
408  vint8m2x2_t vb = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)bVector, vl);
409  vint8m2_t var = __riscv_vget_i8m2(va, 0), vai = __riscv_vget_i8m2(va, 1);
410  vint8m2_t vbr = __riscv_vget_i8m2(vb, 0), vbi = __riscv_vget_i8m2(vb, 1);
411  vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
412  vint16m4_t vi =
413  __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
414  __riscv_vsseg2e16_v_i16m4x2(
415  (int16_t*)cVector, __riscv_vcreate_v_i16m4x2(vr, vi), vl);
416  }
417 }
418 
419 #endif /*LV_HAVE_RVVSEG*/
420 
421 #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
volk_8ic_x2_multiply_conjugate_16ic_generic
static void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, unsigned int num_points)
Multiplys the one complex vector with the complex conjugate of the second complex vector and stores t...
Definition: volk_8ic_x2_multiply_conjugate_16ic.h:171
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
lv_cmake
#define lv_cmake(r, i)
Definition: volk_complex.h:77
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
volk_8ic_x2_multiply_conjugate_16ic_neon
static void volk_8ic_x2_multiply_conjugate_16ic_neon(lv_16sc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, unsigned int num_points)
Definition: volk_8ic_x2_multiply_conjugate_16ic.h:199
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96
lv_8sc_t
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70