Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
42 #define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 
48 #ifdef LV_HAVE_AVX2
49 #include <immintrin.h>
50 
51 static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
52  const lv_16sc_t* complexVector,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const int16_t* complexVectorPtr = (int16_t*)complexVector;
57  int16_t* iBufferPtr = iBuffer;
58 
59  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
60  0x80,
61  0x80,
62  0x80,
63  0x80,
64  0x80,
65  0x80,
66  0x80,
67  13,
68  12,
69  9,
70  8,
71  5,
72  4,
73  1,
74  0,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  0x80,
81  0x80,
82  0x80,
83  13,
84  12,
85  9,
86  8,
87  5,
88  4,
89  1,
90  0);
91  __m256i iMoveMask2 = _mm256_set_epi8(13,
92  12,
93  9,
94  8,
95  5,
96  4,
97  1,
98  0,
99  0x80,
100  0x80,
101  0x80,
102  0x80,
103  0x80,
104  0x80,
105  0x80,
106  0x80,
107  13,
108  12,
109  9,
110  8,
111  5,
112  4,
113  1,
114  0,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  0x80,
120  0x80,
121  0x80,
122  0x80);
123 
124  __m256i complexVal1, complexVal2, iOutputVal;
125 
126  unsigned int sixteenthPoints = num_points / 16;
127 
128  for (number = 0; number < sixteenthPoints; number++) {
129  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
130  complexVectorPtr += 16;
131  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
132  complexVectorPtr += 16;
133 
134  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
135  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
136 
137  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
138  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
139 
140  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
141 
142  iBufferPtr += 16;
143  }
144 
145  number = sixteenthPoints * 16;
146  for (; number < num_points; number++) {
147  *iBufferPtr++ = *complexVectorPtr++;
148  complexVectorPtr++;
149  }
150 }
151 #endif /* LV_HAVE_AVX2 */
152 
153 #ifdef LV_HAVE_SSSE3
154 #include <tmmintrin.h>
155 
156 static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
157  const lv_16sc_t* complexVector,
158  unsigned int num_points)
159 {
160  unsigned int number = 0;
161  const int16_t* complexVectorPtr = (int16_t*)complexVector;
162  int16_t* iBufferPtr = iBuffer;
163 
164  __m128i iMoveMask1 = _mm_set_epi8(
165  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
166  __m128i iMoveMask2 = _mm_set_epi8(
167  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
168 
169  __m128i complexVal1, complexVal2, iOutputVal;
170 
171  unsigned int eighthPoints = num_points / 8;
172 
173  for (number = 0; number < eighthPoints; number++) {
174  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
175  complexVectorPtr += 8;
176  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
177  complexVectorPtr += 8;
178 
179  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
180  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
181 
182  iOutputVal = _mm_or_si128(complexVal1, complexVal2);
183 
184  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
185 
186  iBufferPtr += 8;
187  }
188 
189  number = eighthPoints * 8;
190  for (; number < num_points; number++) {
191  *iBufferPtr++ = *complexVectorPtr++;
192  complexVectorPtr++;
193  }
194 }
195 #endif /* LV_HAVE_SSSE3 */
196 
197 
198 #ifdef LV_HAVE_SSE2
199 #include <emmintrin.h>
200 
201 static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
202  const lv_16sc_t* complexVector,
203  unsigned int num_points)
204 {
205  unsigned int number = 0;
206  const int16_t* complexVectorPtr = (int16_t*)complexVector;
207  int16_t* iBufferPtr = iBuffer;
208  __m128i complexVal1, complexVal2, iOutputVal;
209  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
210  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
211 
212  unsigned int eighthPoints = num_points / 8;
213 
214  for (number = 0; number < eighthPoints; number++) {
215  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
216  complexVectorPtr += 8;
217  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
218  complexVectorPtr += 8;
219 
220  complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
221 
222  complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
223 
224  complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
225 
226  complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
227 
228  complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
229 
230  complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
231 
232  iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
233  _mm_and_si128(complexVal2, highMask));
234 
235  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
236 
237  iBufferPtr += 8;
238  }
239 
240  number = eighthPoints * 8;
241  for (; number < num_points; number++) {
242  *iBufferPtr++ = *complexVectorPtr++;
243  complexVectorPtr++;
244  }
245 }
246 #endif /* LV_HAVE_SSE2 */
247 
248 #ifdef LV_HAVE_GENERIC
249 
250 static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
251  const lv_16sc_t* complexVector,
252  unsigned int num_points)
253 {
254  unsigned int number = 0;
255  const int16_t* complexVectorPtr = (int16_t*)complexVector;
256  int16_t* iBufferPtr = iBuffer;
257  for (number = 0; number < num_points; number++) {
258  *iBufferPtr++ = *complexVectorPtr++;
259  complexVectorPtr++;
260  }
261 }
262 #endif /* LV_HAVE_GENERIC */
263 
264 
265 #ifdef LV_HAVE_NEON
266 #include <arm_neon.h>
267 
268 static inline void volk_16ic_deinterleave_real_16i_neon(int16_t* iBuffer,
269  const lv_16sc_t* complexVector,
270  unsigned int num_points)
271 {
272  unsigned int number = 0;
273  const unsigned int eighthPoints = num_points / 8;
274  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
275  int16_t* iBufferPtr = iBuffer;
276 
277  int16x8x2_t complexVal;
278 
279  for (; number < eighthPoints; number++) {
280  complexVal = vld2q_s16(complexVectorPtr);
281  vst1q_s16(iBufferPtr, complexVal.val[0]);
282  complexVectorPtr += 16;
283  iBufferPtr += 8;
284  }
285 
286  number = eighthPoints * 8;
287  for (; number < num_points; number++) {
288  *iBufferPtr++ = *complexVectorPtr++;
289  complexVectorPtr++;
290  }
291 }
292 #endif /* LV_HAVE_NEON */
293 
294 
295 #ifdef LV_HAVE_NEONV8
296 #include <arm_neon.h>
297 
298 static inline void volk_16ic_deinterleave_real_16i_neonv8(int16_t* iBuffer,
299  const lv_16sc_t* complexVector,
300  unsigned int num_points)
301 {
302  unsigned int number = 0;
303  const unsigned int sixteenthPoints = num_points / 16;
304  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
305  int16_t* iBufferPtr = iBuffer;
306 
307  int16x8x2_t complexVal0, complexVal1;
308 
309  for (; number < sixteenthPoints; number++) {
310  complexVal0 = vld2q_s16(complexVectorPtr);
311  complexVal1 = vld2q_s16(complexVectorPtr + 16);
312  __VOLK_PREFETCH(complexVectorPtr + 32);
313 
314  vst1q_s16(iBufferPtr, complexVal0.val[0]);
315  vst1q_s16(iBufferPtr + 8, complexVal1.val[0]);
316 
317  complexVectorPtr += 32;
318  iBufferPtr += 16;
319  }
320 
321  number = sixteenthPoints * 16;
322  for (; number < num_points; number++) {
323  *iBufferPtr++ = *complexVectorPtr++;
324  complexVectorPtr++;
325  }
326 }
327 #endif /* LV_HAVE_NEONV8 */
328 
329 
330 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
331 
332 
333 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
334 #define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
335 
336 #include <inttypes.h>
337 #include <stdio.h>
338 
339 
340 #ifdef LV_HAVE_AVX2
341 #include <immintrin.h>
342 
343 static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
344  const lv_16sc_t* complexVector,
345  unsigned int num_points)
346 {
347  unsigned int number = 0;
348  const int16_t* complexVectorPtr = (int16_t*)complexVector;
349  int16_t* iBufferPtr = iBuffer;
350 
351  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
352  0x80,
353  0x80,
354  0x80,
355  0x80,
356  0x80,
357  0x80,
358  0x80,
359  13,
360  12,
361  9,
362  8,
363  5,
364  4,
365  1,
366  0,
367  0x80,
368  0x80,
369  0x80,
370  0x80,
371  0x80,
372  0x80,
373  0x80,
374  0x80,
375  13,
376  12,
377  9,
378  8,
379  5,
380  4,
381  1,
382  0);
383  __m256i iMoveMask2 = _mm256_set_epi8(13,
384  12,
385  9,
386  8,
387  5,
388  4,
389  1,
390  0,
391  0x80,
392  0x80,
393  0x80,
394  0x80,
395  0x80,
396  0x80,
397  0x80,
398  0x80,
399  13,
400  12,
401  9,
402  8,
403  5,
404  4,
405  1,
406  0,
407  0x80,
408  0x80,
409  0x80,
410  0x80,
411  0x80,
412  0x80,
413  0x80,
414  0x80);
415 
416  __m256i complexVal1, complexVal2, iOutputVal;
417 
418  unsigned int sixteenthPoints = num_points / 16;
419 
420  for (number = 0; number < sixteenthPoints; number++) {
421  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
422  complexVectorPtr += 16;
423  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
424  complexVectorPtr += 16;
425 
426  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
427  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
428 
429  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
430  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
431 
432  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
433 
434  iBufferPtr += 16;
435  }
436 
437  number = sixteenthPoints * 16;
438  for (; number < num_points; number++) {
439  *iBufferPtr++ = *complexVectorPtr++;
440  complexVectorPtr++;
441  }
442 }
443 #endif /* LV_HAVE_AVX2 */
444 
445 #ifdef LV_HAVE_RVV
446 #include <riscv_vector.h>
447 
448 static inline void volk_16ic_deinterleave_real_16i_rvv(int16_t* iBuffer,
449  const lv_16sc_t* complexVector,
450  unsigned int num_points)
451 {
452  const uint32_t* in = (const uint32_t*)complexVector;
453  size_t n = num_points;
454  for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
455  vl = __riscv_vsetvl_e32m8(n);
456  vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl);
457  __riscv_vse16((uint16_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl);
458  }
459 }
460 #endif /*LV_HAVE_RVV*/
461 
462 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */
volk_16ic_deinterleave_real_16i_a_sse2
static void volk_16ic_deinterleave_real_16i_a_sse2(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:201
volk_16ic_deinterleave_real_16i_neon
static void volk_16ic_deinterleave_real_16i_neon(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:268
volk_16ic_deinterleave_real_16i_generic
static void volk_16ic_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:250
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_16ic_deinterleave_real_16i_a_ssse3
static void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:156