Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
42 #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 
48 #ifdef LV_HAVE_AVX2
49 #include <immintrin.h>
50 
51 static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
52  const lv_16sc_t* complexVector,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const int8_t* complexVectorPtr = (int8_t*)complexVector;
57  int8_t* iBufferPtr = iBuffer;
58  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
59  0x80,
60  0x80,
61  0x80,
62  0x80,
63  0x80,
64  0x80,
65  0x80,
66  13,
67  12,
68  9,
69  8,
70  5,
71  4,
72  1,
73  0,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  0x80,
81  0x80,
82  13,
83  12,
84  9,
85  8,
86  5,
87  4,
88  1,
89  0);
90  __m256i iMoveMask2 = _mm256_set_epi8(13,
91  12,
92  9,
93  8,
94  5,
95  4,
96  1,
97  0,
98  0x80,
99  0x80,
100  0x80,
101  0x80,
102  0x80,
103  0x80,
104  0x80,
105  0x80,
106  13,
107  12,
108  9,
109  8,
110  5,
111  4,
112  1,
113  0,
114  0x80,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  0x80,
120  0x80,
121  0x80);
122  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
123 
124  unsigned int thirtysecondPoints = num_points / 32;
125 
126  for (number = 0; number < thirtysecondPoints; number++) {
127  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
128  complexVectorPtr += 32;
129  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
130  complexVectorPtr += 32;
131 
132  complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
133  complexVectorPtr += 32;
134  complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
135  complexVectorPtr += 32;
136 
137  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
138  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
139 
140  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
141  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
142 
143  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
144  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
145 
146  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
147  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
148 
149  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
150  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
151 
152  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
153  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
154 
155  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
156 
157  iBufferPtr += 32;
158  }
159 
160  number = thirtysecondPoints * 32;
161  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
162  for (; number < num_points; number++) {
163  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
164  int16ComplexVectorPtr++;
165  }
166 }
167 #endif /* LV_HAVE_AVX2 */
168 
169 
170 #ifdef LV_HAVE_SSSE3
171 #include <tmmintrin.h>
172 
173 static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
174  const lv_16sc_t* complexVector,
175  unsigned int num_points)
176 {
177  unsigned int number = 0;
178  const int8_t* complexVectorPtr = (int8_t*)complexVector;
179  int8_t* iBufferPtr = iBuffer;
180  __m128i iMoveMask1 = _mm_set_epi8(
181  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
182  __m128i iMoveMask2 = _mm_set_epi8(
183  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
184  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
185 
186  unsigned int sixteenthPoints = num_points / 16;
187 
188  for (number = 0; number < sixteenthPoints; number++) {
189  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
190  complexVectorPtr += 16;
191  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
192  complexVectorPtr += 16;
193 
194  complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
195  complexVectorPtr += 16;
196  complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
197  complexVectorPtr += 16;
198 
199  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
200  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
201 
202  complexVal1 = _mm_or_si128(complexVal1, complexVal2);
203 
204  complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
205  complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
206 
207  complexVal3 = _mm_or_si128(complexVal3, complexVal4);
208 
209 
210  complexVal1 = _mm_srai_epi16(complexVal1, 8);
211  complexVal3 = _mm_srai_epi16(complexVal3, 8);
212 
213  iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
214 
215  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
216 
217  iBufferPtr += 16;
218  }
219 
220  number = sixteenthPoints * 16;
221  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
222  for (; number < num_points; number++) {
223  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
224  int16ComplexVectorPtr++;
225  }
226 }
227 #endif /* LV_HAVE_SSSE3 */
228 
229 #ifdef LV_HAVE_GENERIC
230 
231 static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
232  const lv_16sc_t* complexVector,
233  unsigned int num_points)
234 {
235  unsigned int number = 0;
236  int16_t* complexVectorPtr = (int16_t*)complexVector;
237  int8_t* iBufferPtr = iBuffer;
238  for (number = 0; number < num_points; number++) {
239  *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
240  complexVectorPtr++;
241  }
242 }
243 #endif /* LV_HAVE_GENERIC */
244 
245 #ifdef LV_HAVE_NEON
246 #include <arm_neon.h>
247 
/*
 * NEON kernel: for every complex sample, take the first 16-bit value of the
 * pair (the real part) and write its upper byte (value >> 8) to iBuffer.
 *
 * iBuffer       - output, one int8_t per complex sample (num_points bytes).
 * complexVector - input, num_points interleaved 16-bit complex samples.
 * num_points    - number of complex samples to process.
 */
static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                       const lv_16sc_t* complexVector,
                                                       unsigned int num_points)
{
    const int16_t* src = (const int16_t*)complexVector;
    int8_t* dst = iBuffer;

    /* Vector part: 8 complex samples (16 int16 values) per pass. */
    const unsigned int vector_passes = num_points / 8;
    unsigned int pass = 0;
    while (pass < vector_passes) {
        /* De-interleaving load: val[0] holds the 8 real parts, val[1] the
         * imaginary parts (unused). */
        const int16x8x2_t split = vld2q_s16(src);
        /* Shift right by 8 and narrow each lane to 8 bits. */
        vst1_s8(dst, vshrn_n_s16(split.val[0], 8));
        src += 16;
        dst += 8;
        pass++;
    }

    /* Scalar part: the remaining (num_points % 8) samples. */
    unsigned int done = vector_passes * 8;
    while (done < num_points) {
        *dst = (int8_t)(src[0] >> 8);
        dst += 1;
        src += 2; /* step over real and imaginary parts */
        done++;
    }
}
272 #endif
273 
274 #ifdef LV_HAVE_NEONV8
275 #include <arm_neon.h>
276 
/*
 * NEONv8 kernel: for every complex sample, take the first 16-bit value of the
 * pair (the real part) and write its upper byte (value >> 8) to iBuffer.
 * Processes 16 complex samples per pass using two de-interleaving loads.
 *
 * iBuffer       - output, one int8_t per complex sample (num_points bytes).
 * complexVector - input, num_points interleaved 16-bit complex samples.
 * num_points    - number of complex samples to process.
 */
static inline void volk_16ic_deinterleave_real_8i_neonv8(int8_t* iBuffer,
                                                         const lv_16sc_t* complexVector,
                                                         unsigned int num_points)
{
    const int16_t* src = (const int16_t*)complexVector;
    int8_t* dst = iBuffer;
    const unsigned int vector_passes = num_points / 16;

    unsigned int pass = 0;
    while (pass < vector_passes) {
        /* val[0] of each load holds 8 real parts; val[1] (imaginary) is unused. */
        const int16x8x2_t first_half = vld2q_s16(src);
        const int16x8x2_t second_half = vld2q_s16(src + 16);
        __VOLK_PREFETCH(src + 64);

        /* Shift right by 8 and narrow each lane to 8 bits. */
        const int8x8_t low8 = vshrn_n_s16(first_half.val[0], 8);
        const int8x8_t high8 = vshrn_n_s16(second_half.val[0], 8);

        vst1_s8(dst, low8);
        vst1_s8(dst + 8, high8);

        src += 32;
        dst += 16;
        pass++;
    }

    /* Scalar part: the remaining (num_points % 16) samples. */
    unsigned int done = vector_passes * 16;
    while (done < num_points) {
        *dst = (int8_t)(src[0] >> 8);
        dst += 1;
        src += 2; /* step over real and imaginary parts */
        done++;
    }
}
305 #endif /* LV_HAVE_NEONV8 */
306 
307 #ifdef LV_HAVE_ORC
308 
309 extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
310  const lv_16sc_t* complexVector,
311  int num_points);
312 
313 static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
314  const lv_16sc_t* complexVector,
315  unsigned int num_points)
316 {
317  volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
318 }
319 #endif /* LV_HAVE_ORC */
320 
321 
322 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
323 
324 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
325 #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
326 
327 #include <inttypes.h>
328 #include <stdio.h>
329 
330 
331 #ifdef LV_HAVE_AVX2
332 #include <immintrin.h>
333 
334 static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
335  const lv_16sc_t* complexVector,
336  unsigned int num_points)
337 {
338  unsigned int number = 0;
339  const int8_t* complexVectorPtr = (int8_t*)complexVector;
340  int8_t* iBufferPtr = iBuffer;
341  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
342  0x80,
343  0x80,
344  0x80,
345  0x80,
346  0x80,
347  0x80,
348  0x80,
349  13,
350  12,
351  9,
352  8,
353  5,
354  4,
355  1,
356  0,
357  0x80,
358  0x80,
359  0x80,
360  0x80,
361  0x80,
362  0x80,
363  0x80,
364  0x80,
365  13,
366  12,
367  9,
368  8,
369  5,
370  4,
371  1,
372  0);
373  __m256i iMoveMask2 = _mm256_set_epi8(13,
374  12,
375  9,
376  8,
377  5,
378  4,
379  1,
380  0,
381  0x80,
382  0x80,
383  0x80,
384  0x80,
385  0x80,
386  0x80,
387  0x80,
388  0x80,
389  13,
390  12,
391  9,
392  8,
393  5,
394  4,
395  1,
396  0,
397  0x80,
398  0x80,
399  0x80,
400  0x80,
401  0x80,
402  0x80,
403  0x80,
404  0x80);
405  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
406 
407  unsigned int thirtysecondPoints = num_points / 32;
408 
409  for (number = 0; number < thirtysecondPoints; number++) {
410  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
411  complexVectorPtr += 32;
412  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
413  complexVectorPtr += 32;
414 
415  complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
416  complexVectorPtr += 32;
417  complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
418  complexVectorPtr += 32;
419 
420  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
421  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
422 
423  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
424  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
425 
426  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
427  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
428 
429  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
430  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
431 
432  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
433  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
434 
435  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
436  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
437 
438  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
439 
440  iBufferPtr += 32;
441  }
442 
443  number = thirtysecondPoints * 32;
444  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
445  for (; number < num_points; number++) {
446  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
447  int16ComplexVectorPtr++;
448  }
449 }
450 #endif /* LV_HAVE_AVX2 */
451 
452 
453 #ifdef LV_HAVE_RVV
454 #include <riscv_vector.h>
455 
/*
 * RISC-V Vector kernel: treats each complex sample as one 32-bit word and
 * narrows it twice, so that bits 8..15 of every word end up in iBuffer.
 *
 * iBuffer       - output, one byte per complex sample (num_points bytes).
 * complexVector - input, num_points interleaved 16-bit complex samples.
 * num_points    - number of complex samples to process.
 */
static inline void volk_16ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
                                                      const lv_16sc_t* complexVector,
                                                      unsigned int num_points)
{
    const uint32_t* src = (const uint32_t*)complexVector;
    int8_t* dst = iBuffer;
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Number of 32-bit elements handled in this pass. */
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vuint32m8_t words = __riscv_vle32_v_u32m8(src, vl);

        /* Narrowing shift by 0 keeps bits 0..15 of every word. */
        const vuint16m4_t low_halves = __riscv_vnsrl(words, 0, vl);
        /* Narrowing shift by 8 keeps bits 8..15 of those halves. */
        const vuint8m2_t top_bytes = __riscv_vnsrl(low_halves, 8, vl);

        __riscv_vse8((uint8_t*)dst, top_bytes, vl);

        remaining -= vl;
        src += vl;
        dst += vl;
    }
}
469 #endif /*LV_HAVE_RVV*/
470 
471 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_16ic_deinterleave_real_8i_generic
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:231
volk_16ic_deinterleave_real_8i_a_ssse3
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:173
volk_16ic_deinterleave_real_8i_neon
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:248