Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
41 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
49 static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
50  const lv_8sc_t* complexVector,
51  unsigned int num_points)
52 {
53  unsigned int number = 0;
54  const int8_t* complexVectorPtr = (int8_t*)complexVector;
55  int8_t* iBufferPtr = iBuffer;
56  __m256i moveMask1 = _mm256_set_epi8(0x80,
57  0x80,
58  0x80,
59  0x80,
60  0x80,
61  0x80,
62  0x80,
63  0x80,
64  14,
65  12,
66  10,
67  8,
68  6,
69  4,
70  2,
71  0,
72  0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  14,
81  12,
82  10,
83  8,
84  6,
85  4,
86  2,
87  0);
88  __m256i moveMask2 = _mm256_set_epi8(14,
89  12,
90  10,
91  8,
92  6,
93  4,
94  2,
95  0,
96  0x80,
97  0x80,
98  0x80,
99  0x80,
100  0x80,
101  0x80,
102  0x80,
103  0x80,
104  14,
105  12,
106  10,
107  8,
108  6,
109  4,
110  2,
111  0,
112  0x80,
113  0x80,
114  0x80,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  0x80);
120  __m256i complexVal1, complexVal2, outputVal;
121 
122  unsigned int thirtysecondPoints = num_points / 32;
123 
124  for (number = 0; number < thirtysecondPoints; number++) {
125 
126  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
127  complexVectorPtr += 32;
128  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
129  complexVectorPtr += 32;
130 
131  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
132  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
133  outputVal = _mm256_or_si256(complexVal1, complexVal2);
134  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
135 
136  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
137  iBufferPtr += 32;
138  }
139 
140  number = thirtysecondPoints * 32;
141  for (; number < num_points; number++) {
142  *iBufferPtr++ = *complexVectorPtr++;
143  complexVectorPtr++;
144  }
145 }
146 #endif /* LV_HAVE_AVX2 */
147 
148 
149 #ifdef LV_HAVE_SSSE3
150 #include <tmmintrin.h>
151 
152 static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
153  const lv_8sc_t* complexVector,
154  unsigned int num_points)
155 {
156  unsigned int number = 0;
157  const int8_t* complexVectorPtr = (int8_t*)complexVector;
158  int8_t* iBufferPtr = iBuffer;
159  __m128i moveMask1 = _mm_set_epi8(
160  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
161  __m128i moveMask2 = _mm_set_epi8(
162  14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
163  __m128i complexVal1, complexVal2, outputVal;
164 
165  unsigned int sixteenthPoints = num_points / 16;
166 
167  for (number = 0; number < sixteenthPoints; number++) {
168  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
169  complexVectorPtr += 16;
170  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
171  complexVectorPtr += 16;
172 
173  complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
174  complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
175 
176  outputVal = _mm_or_si128(complexVal1, complexVal2);
177 
178  _mm_store_si128((__m128i*)iBufferPtr, outputVal);
179  iBufferPtr += 16;
180  }
181 
182  number = sixteenthPoints * 16;
183  for (; number < num_points; number++) {
184  *iBufferPtr++ = *complexVectorPtr++;
185  complexVectorPtr++;
186  }
187 }
188 #endif /* LV_HAVE_SSSE3 */
189 
190 
191 #ifdef LV_HAVE_AVX
192 #include <immintrin.h>
193 
194 static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
195  const lv_8sc_t* complexVector,
196  unsigned int num_points)
197 {
198  unsigned int number = 0;
199  const int8_t* complexVectorPtr = (int8_t*)complexVector;
200  int8_t* iBufferPtr = iBuffer;
201  __m128i moveMaskL = _mm_set_epi8(
202  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
203  __m128i moveMaskH = _mm_set_epi8(
204  14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
205  __m256i complexVal1, complexVal2, outputVal;
206  __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
207  outputVal2;
208 
209  unsigned int thirtysecondPoints = num_points / 32;
210 
211  for (number = 0; number < thirtysecondPoints; number++) {
212 
213  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
214  complexVectorPtr += 32;
215  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
216  complexVectorPtr += 32;
217 
218  complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
219  complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
220  complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
221  complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
222 
223  complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
224  complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
225  outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
226 
227 
228  complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
229  complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
230  outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
231 
232  __m256i dummy = _mm256_setzero_si256();
233  outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
234  outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
235 
236 
237  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
238  iBufferPtr += 32;
239  }
240 
241  number = thirtysecondPoints * 32;
242  for (; number < num_points; number++) {
243  *iBufferPtr++ = *complexVectorPtr++;
244  complexVectorPtr++;
245  }
246 }
247 #endif /* LV_HAVE_AVX */
248 
249 
250 #ifdef LV_HAVE_GENERIC
251 
252 static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
253  const lv_8sc_t* complexVector,
254  unsigned int num_points)
255 {
256  unsigned int number = 0;
257  const int8_t* complexVectorPtr = (int8_t*)complexVector;
258  int8_t* iBufferPtr = iBuffer;
259  for (number = 0; number < num_points; number++) {
260  *iBufferPtr++ = *complexVectorPtr++;
261  complexVectorPtr++;
262  }
263 }
264 #endif /* LV_HAVE_GENERIC */
265 
266 
267 #ifdef LV_HAVE_NEON
268 #include <arm_neon.h>
269 
/*!
 * \brief Copies the first (real) byte of every interleaved 8-bit complex
 *        sample into iBuffer, 16 samples per iteration, using NEON.
 *
 * vld2q_s8 performs a de-interleaving load of 32 bytes; element 0 of the
 * result holds the even-indexed bytes, which are stored as-is. Samples left
 * over after the last full block of 16 are copied one at a time.
 *
 * \param iBuffer       Destination for num_points real bytes.
 * \param complexVector Source of num_points interleaved (real, imag) byte pairs.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                      const lv_8sc_t* complexVector,
                                                      unsigned int num_points)
{
    const int8_t* src = (const int8_t*)complexVector;
    int8_t* dst = iBuffer;
    const unsigned int blockCount = num_points / 16;
    unsigned int block;

    for (block = 0; block < blockCount; ++block) {
        const int8x16x2_t pairs = vld2q_s8(src);
        vst1q_s8(dst, pairs.val[0]);
        src += 32; /* 16 complex samples = 32 input bytes */
        dst += 16;
    }

    /* Scalar tail: keep the first byte of each remaining pair. */
    unsigned int remaining = num_points - blockCount * 16;
    for (; remaining > 0; --remaining) {
        *dst++ = *src;
        src += 2;
    }
}
292 #endif /* LV_HAVE_NEON */
293 
294 #ifdef LV_HAVE_NEONV8
295 #include <arm_neon.h>
296 
/*!
 * \brief Copies the first (real) byte of every interleaved 8-bit complex
 *        sample into iBuffer, 32 samples per iteration, using NEON.
 *
 * Each iteration issues two de-interleaving 32-byte loads, requests a
 * prefetch of the next 64-byte block through __VOLK_PREFETCH, and stores
 * element 0 (the even-indexed bytes) of each load. Samples left over after
 * the last full block of 32 are copied one at a time.
 *
 * \param iBuffer       Destination for num_points real bytes.
 * \param complexVector Source of num_points interleaved (real, imag) byte pairs.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_8ic_deinterleave_real_8i_neonv8(int8_t* iBuffer,
                                                        const lv_8sc_t* complexVector,
                                                        unsigned int num_points)
{
    const int8_t* src = (const int8_t*)complexVector;
    int8_t* dst = iBuffer;
    const unsigned int blockCount = num_points / 32;

    for (unsigned int block = 0; block < blockCount; ++block) {
        const int8x16x2_t firstPairs = vld2q_s8(src);
        const int8x16x2_t secondPairs = vld2q_s8(src + 32);
        __VOLK_PREFETCH(src + 64);

        vst1q_s8(dst, firstPairs.val[0]);
        vst1q_s8(dst + 16, secondPairs.val[0]);

        src += 64; /* 32 complex samples = 64 input bytes */
        dst += 32;
    }

    /* Scalar tail: keep the first byte of each remaining pair. */
    for (unsigned int remaining = num_points - blockCount * 32; remaining > 0;
         --remaining) {
        *dst++ = *src;
        src += 2;
    }
}
321 #endif /* LV_HAVE_NEONV8 */
322 
323 
324 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
325 
326 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
327 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
328 
329 #include <inttypes.h>
330 #include <stdio.h>
331 
332 #ifdef LV_HAVE_AVX2
333 #include <immintrin.h>
334 
335 static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
336  const lv_8sc_t* complexVector,
337  unsigned int num_points)
338 {
339  unsigned int number = 0;
340  const int8_t* complexVectorPtr = (int8_t*)complexVector;
341  int8_t* iBufferPtr = iBuffer;
342  __m256i moveMask1 = _mm256_set_epi8(0x80,
343  0x80,
344  0x80,
345  0x80,
346  0x80,
347  0x80,
348  0x80,
349  0x80,
350  14,
351  12,
352  10,
353  8,
354  6,
355  4,
356  2,
357  0,
358  0x80,
359  0x80,
360  0x80,
361  0x80,
362  0x80,
363  0x80,
364  0x80,
365  0x80,
366  14,
367  12,
368  10,
369  8,
370  6,
371  4,
372  2,
373  0);
374  __m256i moveMask2 = _mm256_set_epi8(14,
375  12,
376  10,
377  8,
378  6,
379  4,
380  2,
381  0,
382  0x80,
383  0x80,
384  0x80,
385  0x80,
386  0x80,
387  0x80,
388  0x80,
389  0x80,
390  14,
391  12,
392  10,
393  8,
394  6,
395  4,
396  2,
397  0,
398  0x80,
399  0x80,
400  0x80,
401  0x80,
402  0x80,
403  0x80,
404  0x80,
405  0x80);
406  __m256i complexVal1, complexVal2, outputVal;
407 
408  unsigned int thirtysecondPoints = num_points / 32;
409 
410  for (number = 0; number < thirtysecondPoints; number++) {
411 
412  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
413  complexVectorPtr += 32;
414  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
415  complexVectorPtr += 32;
416 
417  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
418  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
419  outputVal = _mm256_or_si256(complexVal1, complexVal2);
420  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
421 
422  _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
423  iBufferPtr += 32;
424  }
425 
426  number = thirtysecondPoints * 32;
427  for (; number < num_points; number++) {
428  *iBufferPtr++ = *complexVectorPtr++;
429  complexVectorPtr++;
430  }
431 }
432 #endif /* LV_HAVE_AVX2 */
433 
434 #ifdef LV_HAVE_RVV
435 #include <riscv_vector.h>
436 
/*!
 * \brief Copies one byte of every interleaved 8-bit complex sample into
 *        iBuffer using the RISC-V vector extension.
 *
 * Each complex sample is loaded as a single 16-bit element and narrowed to
 * 8 bits with a shift of 0, which keeps the low byte of the element.
 * NOTE(review): that low byte is the first (real) byte of the pair only on
 * a little-endian target - confirm for the supported platforms.
 *
 * \param iBuffer       Destination for num_points bytes.
 * \param complexVector Source of num_points interleaved (real, imag) byte pairs.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_8ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
                                                     const lv_8sc_t* complexVector,
                                                     unsigned int num_points)
{
    const uint16_t* pairs = (const uint16_t*)complexVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Ask the hardware how many 16-bit elements it will process this pass. */
        const size_t vl = __riscv_vsetvl_e16m8(remaining);
        const vuint16m8_t packed = __riscv_vle16_v_u16m8(pairs, vl);

        __riscv_vse8((uint8_t*)iBuffer, __riscv_vnsrl(packed, 0, vl), vl);

        remaining -= vl;
        pairs += vl;
        iBuffer += vl;
    }
}
449 #endif /*LV_HAVE_RVV*/
450 
451 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
volk_8ic_deinterleave_real_8i_a_avx
static void volk_8ic_deinterleave_real_8i_a_avx(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:194
volk_8ic_deinterleave_real_8i_generic
static void volk_8ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:252
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_8ic_deinterleave_real_8i_neon
static void volk_8ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:270
volk_8ic_deinterleave_real_8i_a_ssse3
static void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:152
lv_8sc_t
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70