Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
42 #define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
49 static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
50  int16_t* qBuffer,
51  const lv_16sc_t* complexVector,
52  unsigned int num_points)
53 {
54  unsigned int number = 0;
55  const int8_t* complexVectorPtr = (int8_t*)complexVector;
56  int16_t* iBufferPtr = iBuffer;
57  int16_t* qBufferPtr = qBuffer;
58 
59  __m256i MoveMask = _mm256_set_epi8(15,
60  14,
61  11,
62  10,
63  7,
64  6,
65  3,
66  2,
67  13,
68  12,
69  9,
70  8,
71  5,
72  4,
73  1,
74  0,
75  15,
76  14,
77  11,
78  10,
79  7,
80  6,
81  3,
82  2,
83  13,
84  12,
85  9,
86  8,
87  5,
88  4,
89  1,
90  0);
91 
92  __m256i iMove2, iMove1;
93  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
94 
95  unsigned int sixteenthPoints = num_points / 16;
96 
97  for (number = 0; number < sixteenthPoints; number++) {
98  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
99  complexVectorPtr += 32;
100  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
101  complexVectorPtr += 32;
102 
103  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
104  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
105 
106  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
107  _mm256_permute4x64_epi64(iMove2, 0x80),
108  0x30);
109  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
110  _mm256_permute4x64_epi64(iMove2, 0xd0),
111  0x30);
112 
113  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
114  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
115 
116  iBufferPtr += 16;
117  qBufferPtr += 16;
118  }
119 
120  number = sixteenthPoints * 16;
121  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
122  for (; number < num_points; number++) {
123  *iBufferPtr++ = *int16ComplexVectorPtr++;
124  *qBufferPtr++ = *int16ComplexVectorPtr++;
125  }
126 }
127 #endif /* LV_HAVE_AVX2 */
128 
129 #ifdef LV_HAVE_SSSE3
130 #include <tmmintrin.h>
131 
132 static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
133  int16_t* qBuffer,
134  const lv_16sc_t* complexVector,
135  unsigned int num_points)
136 {
137  unsigned int number = 0;
138  const int8_t* complexVectorPtr = (int8_t*)complexVector;
139  int16_t* iBufferPtr = iBuffer;
140  int16_t* qBufferPtr = qBuffer;
141 
142  __m128i iMoveMask1 = _mm_set_epi8(
143  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
144  __m128i iMoveMask2 = _mm_set_epi8(
145  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
146 
147  __m128i qMoveMask1 = _mm_set_epi8(
148  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
149  __m128i qMoveMask2 = _mm_set_epi8(
150  15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
151 
152  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
153 
154  unsigned int eighthPoints = num_points / 8;
155 
156  for (number = 0; number < eighthPoints; number++) {
157  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
158  complexVectorPtr += 16;
159  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
160  complexVectorPtr += 16;
161 
162  iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
163  _mm_shuffle_epi8(complexVal2, iMoveMask2));
164  qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
165  _mm_shuffle_epi8(complexVal2, qMoveMask2));
166 
167  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
168  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
169 
170  iBufferPtr += 8;
171  qBufferPtr += 8;
172  }
173 
174  number = eighthPoints * 8;
175  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
176  for (; number < num_points; number++) {
177  *iBufferPtr++ = *int16ComplexVectorPtr++;
178  *qBufferPtr++ = *int16ComplexVectorPtr++;
179  }
180 }
181 #endif /* LV_HAVE_SSSE3 */
182 
183 #ifdef LV_HAVE_SSE2
184 #include <emmintrin.h>
185 
186 static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
187  int16_t* qBuffer,
188  const lv_16sc_t* complexVector,
189  unsigned int num_points)
190 {
191  unsigned int number = 0;
192  const int16_t* complexVectorPtr = (int16_t*)complexVector;
193  int16_t* iBufferPtr = iBuffer;
194  int16_t* qBufferPtr = qBuffer;
195  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
196  qComplexVal2, iOutputVal, qOutputVal;
197  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
198  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
199 
200  unsigned int eighthPoints = num_points / 8;
201 
202  for (number = 0; number < eighthPoints; number++) {
203  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
204  complexVectorPtr += 8;
205  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
206  complexVectorPtr += 8;
207 
208  iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
209 
210  iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
211 
212  iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
213 
214  iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
215 
216  iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
217 
218  iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
219 
220  iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
221  _mm_and_si128(iComplexVal2, highMask));
222 
223  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
224 
225  qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
226 
227  qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
228 
229  qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
230 
231  qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
232 
233  qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
234 
235  qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
236 
237  qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
238  _mm_and_si128(qComplexVal2, highMask));
239 
240  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
241 
242  iBufferPtr += 8;
243  qBufferPtr += 8;
244  }
245 
246  number = eighthPoints * 8;
247  for (; number < num_points; number++) {
248  *iBufferPtr++ = *complexVectorPtr++;
249  *qBufferPtr++ = *complexVectorPtr++;
250  }
251 }
252 #endif /* LV_HAVE_SSE2 */
253 
254 #ifdef LV_HAVE_GENERIC
255 
256 static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
257  int16_t* qBuffer,
258  const lv_16sc_t* complexVector,
259  unsigned int num_points)
260 {
261  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
262  int16_t* iBufferPtr = iBuffer;
263  int16_t* qBufferPtr = qBuffer;
264  unsigned int number;
265  for (number = 0; number < num_points; number++) {
266  *iBufferPtr++ = *complexVectorPtr++;
267  *qBufferPtr++ = *complexVectorPtr++;
268  }
269 }
270 #endif /* LV_HAVE_GENERIC */
271 
272 
273 #ifdef LV_HAVE_NEON
274 #include <arm_neon.h>
275 
276 static inline void volk_16ic_deinterleave_16i_x2_neon(int16_t* iBuffer,
277  int16_t* qBuffer,
278  const lv_16sc_t* complexVector,
279  unsigned int num_points)
280 {
281  unsigned int number = 0;
282  const unsigned int eighthPoints = num_points / 8;
283  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
284  int16_t* iBufferPtr = iBuffer;
285  int16_t* qBufferPtr = qBuffer;
286 
287  int16x8x2_t complexVal;
288 
289  for (; number < eighthPoints; number++) {
290  complexVal = vld2q_s16(complexVectorPtr);
291  vst1q_s16(iBufferPtr, complexVal.val[0]);
292  vst1q_s16(qBufferPtr, complexVal.val[1]);
293  complexVectorPtr += 16;
294  iBufferPtr += 8;
295  qBufferPtr += 8;
296  }
297 
298  number = eighthPoints * 8;
299  for (; number < num_points; number++) {
300  *iBufferPtr++ = *complexVectorPtr++;
301  *qBufferPtr++ = *complexVectorPtr++;
302  }
303 }
304 #endif /* LV_HAVE_NEON */
305 
306 
307 #ifdef LV_HAVE_NEONV8
308 #include <arm_neon.h>
309 
310 static inline void volk_16ic_deinterleave_16i_x2_neonv8(int16_t* iBuffer,
311  int16_t* qBuffer,
312  const lv_16sc_t* complexVector,
313  unsigned int num_points)
314 {
315  unsigned int number = 0;
316  const unsigned int sixteenthPoints = num_points / 16;
317  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
318  int16_t* iBufferPtr = iBuffer;
319  int16_t* qBufferPtr = qBuffer;
320 
321  int16x8x2_t complexVal0, complexVal1;
322 
323  for (; number < sixteenthPoints; number++) {
324  complexVal0 = vld2q_s16(complexVectorPtr);
325  complexVal1 = vld2q_s16(complexVectorPtr + 16);
326  __VOLK_PREFETCH(complexVectorPtr + 32);
327 
328  vst1q_s16(iBufferPtr, complexVal0.val[0]);
329  vst1q_s16(iBufferPtr + 8, complexVal1.val[0]);
330  vst1q_s16(qBufferPtr, complexVal0.val[1]);
331  vst1q_s16(qBufferPtr + 8, complexVal1.val[1]);
332 
333  complexVectorPtr += 32;
334  iBufferPtr += 16;
335  qBufferPtr += 16;
336  }
337 
338  number = sixteenthPoints * 16;
339  for (; number < num_points; number++) {
340  *iBufferPtr++ = *complexVectorPtr++;
341  *qBufferPtr++ = *complexVectorPtr++;
342  }
343 }
344 #endif /* LV_HAVE_NEONV8 */
345 
346 
347 #ifdef LV_HAVE_ORC
348 
349 extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
350  int16_t* qBuffer,
351  const lv_16sc_t* complexVector,
352  int num_points);
353 static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
354  int16_t* qBuffer,
355  const lv_16sc_t* complexVector,
356  unsigned int num_points)
357 {
358  volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
359 }
360 #endif /* LV_HAVE_ORC */
361 
362 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
363 
364 
365 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
366 #define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
367 
368 #include <inttypes.h>
369 #include <stdio.h>
370 #ifdef LV_HAVE_AVX2
371 #include <immintrin.h>
372 
373 static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
374  int16_t* qBuffer,
375  const lv_16sc_t* complexVector,
376  unsigned int num_points)
377 {
378  unsigned int number = 0;
379  const int8_t* complexVectorPtr = (int8_t*)complexVector;
380  int16_t* iBufferPtr = iBuffer;
381  int16_t* qBufferPtr = qBuffer;
382 
383  __m256i MoveMask = _mm256_set_epi8(15,
384  14,
385  11,
386  10,
387  7,
388  6,
389  3,
390  2,
391  13,
392  12,
393  9,
394  8,
395  5,
396  4,
397  1,
398  0,
399  15,
400  14,
401  11,
402  10,
403  7,
404  6,
405  3,
406  2,
407  13,
408  12,
409  9,
410  8,
411  5,
412  4,
413  1,
414  0);
415 
416  __m256i iMove2, iMove1;
417  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
418 
419  unsigned int sixteenthPoints = num_points / 16;
420 
421  for (number = 0; number < sixteenthPoints; number++) {
422  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
423  complexVectorPtr += 32;
424  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
425  complexVectorPtr += 32;
426 
427  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
428  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
429 
430  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
431  _mm256_permute4x64_epi64(iMove2, 0x80),
432  0x30);
433  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
434  _mm256_permute4x64_epi64(iMove2, 0xd0),
435  0x30);
436 
437  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
438  _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
439 
440  iBufferPtr += 16;
441  qBufferPtr += 16;
442  }
443 
444  number = sixteenthPoints * 16;
445  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
446  for (; number < num_points; number++) {
447  *iBufferPtr++ = *int16ComplexVectorPtr++;
448  *qBufferPtr++ = *int16ComplexVectorPtr++;
449  }
450 }
451 #endif /* LV_HAVE_AVX2 */
452 
453 #ifdef LV_HAVE_RVV
454 #include <riscv_vector.h>
455 
456 static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer,
457  int16_t* qBuffer,
458  const lv_16sc_t* complexVector,
459  unsigned int num_points)
460 {
461  size_t n = num_points;
462  for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
463  vl = __riscv_vsetvl_e16m4(n);
464  vuint32m8_t vc = __riscv_vle32_v_u32m8((const uint32_t*)complexVector, vl);
465  vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl);
466  vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl);
467  __riscv_vse16((uint16_t*)iBuffer, vr, vl);
468  __riscv_vse16((uint16_t*)qBuffer, vi, vl);
469  }
470 }
471 #endif /*LV_HAVE_RVV*/
472 
473 #ifdef LV_HAVE_RVVSEG
474 #include <riscv_vector.h>
475 
476 static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer,
477  int16_t* qBuffer,
478  const lv_16sc_t* complexVector,
479  unsigned int num_points)
480 {
481  size_t n = num_points;
482  for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
483  vl = __riscv_vsetvl_e16m4(n);
484  vuint16m4x2_t vc =
485  __riscv_vlseg2e16_v_u16m4x2((const uint16_t*)complexVector, vl);
486  vuint16m4_t vr = __riscv_vget_u16m4(vc, 0);
487  vuint16m4_t vi = __riscv_vget_u16m4(vc, 1);
488  __riscv_vse16((uint16_t*)iBuffer, vr, vl);
489  __riscv_vse16((uint16_t*)qBuffer, vi, vl);
490  }
491 }
492 #endif /*LV_HAVE_RVVSEG*/
493 
494 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */
volk_16ic_deinterleave_16i_x2_neon
static void volk_16ic_deinterleave_16i_x2_neon(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:276
volk_16ic_deinterleave_16i_x2_a_sse2
static void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:186
volk_16ic_deinterleave_16i_x2_a_ssse3
static void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:132
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_16ic_deinterleave_16i_x2_generic
static void volk_16ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:256