Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
61 #define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 
66 #ifdef LV_HAVE_GENERIC
67 
68 static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
69  float* qBuffer,
70  const lv_32fc_t* complexVector,
71  unsigned int num_points)
72 {
73  const float* complexVectorPtr = (float*)complexVector;
74  float* iBufferPtr = iBuffer;
75  float* qBufferPtr = qBuffer;
76  unsigned int number;
77  for (number = 0; number < num_points; number++) {
78  *iBufferPtr++ = *complexVectorPtr++;
79  *qBufferPtr++ = *complexVectorPtr++;
80  }
81 }
82 #endif /* LV_HAVE_GENERIC */
83 
84 #ifdef LV_HAVE_AVX512F
85 #include <immintrin.h>
86 
87 static inline void volk_32fc_deinterleave_32f_x2_a_avx512f(float* iBuffer,
88  float* qBuffer,
89  const lv_32fc_t* complexVector,
90  unsigned int num_points)
91 {
92  const float* complexVectorPtr = (float*)complexVector;
93  float* iBufferPtr = iBuffer;
94  float* qBufferPtr = qBuffer;
95 
96  unsigned int number = 0;
97  const unsigned int eighthPoints = num_points / 8;
98 
99  __m512 cplxValue;
100  __m512 iValue, qValue;
101 
102  for (; number < eighthPoints; number++) {
103  // Load 8 complex numbers (16 floats): I0,Q0,I1,Q1,...,I7,Q7
104  cplxValue = _mm512_load_ps(complexVectorPtr);
105 
106  // Deinterleave using permute
107  // Extract all I values (even indices: 0,2,4,6,8,10,12,14)
108  iValue = _mm512_permutexvar_ps(
109  _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
110  cplxValue);
111 
112  // Extract all Q values (odd indices: 1,3,5,7,9,11,13,15)
113  qValue = _mm512_permutexvar_ps(
114  _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
115  cplxValue);
116 
117  // Store only the first 8 results (lower 256 bits)
118  _mm256_store_ps(iBufferPtr, _mm512_castps512_ps256(iValue));
119  _mm256_store_ps(qBufferPtr, _mm512_castps512_ps256(qValue));
120 
121  complexVectorPtr += 16;
122  iBufferPtr += 8;
123  qBufferPtr += 8;
124  }
125 
126  number = eighthPoints * 8;
128  iBufferPtr, qBufferPtr, (const lv_32fc_t*)complexVectorPtr, num_points - number);
129 }
130 #endif /* LV_HAVE_AVX512F */
131 
132 #ifdef LV_HAVE_AVX
133 #include <immintrin.h>
134 static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
135  float* qBuffer,
136  const lv_32fc_t* complexVector,
137  unsigned int num_points)
138 {
139  const float* complexVectorPtr = (float*)complexVector;
140  float* iBufferPtr = iBuffer;
141  float* qBufferPtr = qBuffer;
142 
143  unsigned int number = 0;
144  // Mask for real and imaginary parts
145  const unsigned int eighthPoints = num_points / 8;
146  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
147  for (; number < eighthPoints; number++) {
148  cplxValue1 = _mm256_load_ps(complexVectorPtr);
149  complexVectorPtr += 8;
150 
151  cplxValue2 = _mm256_load_ps(complexVectorPtr);
152  complexVectorPtr += 8;
153 
154  complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
155  complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
156 
157  // Arrange in i1i2i3i4 format
158  iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
159  // Arrange in q1q2q3q4 format
160  qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
161 
162  _mm256_store_ps(iBufferPtr, iValue);
163  _mm256_store_ps(qBufferPtr, qValue);
164 
165  iBufferPtr += 8;
166  qBufferPtr += 8;
167  }
168 
169  number = eighthPoints * 8;
170  for (; number < num_points; number++) {
171  *iBufferPtr++ = *complexVectorPtr++;
172  *qBufferPtr++ = *complexVectorPtr++;
173  }
174 }
175 #endif /* LV_HAVE_AVX */
176 
177 #ifdef LV_HAVE_SSE
178 #include <xmmintrin.h>
179 
180 static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
181  float* qBuffer,
182  const lv_32fc_t* complexVector,
183  unsigned int num_points)
184 {
185  const float* complexVectorPtr = (float*)complexVector;
186  float* iBufferPtr = iBuffer;
187  float* qBufferPtr = qBuffer;
188 
189  unsigned int number = 0;
190  const unsigned int quarterPoints = num_points / 4;
191  __m128 cplxValue1, cplxValue2, iValue, qValue;
192  for (; number < quarterPoints; number++) {
193  cplxValue1 = _mm_load_ps(complexVectorPtr);
194  complexVectorPtr += 4;
195 
196  cplxValue2 = _mm_load_ps(complexVectorPtr);
197  complexVectorPtr += 4;
198 
199  // Arrange in i1i2i3i4 format
200  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
201  // Arrange in q1q2q3q4 format
202  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
203 
204  _mm_store_ps(iBufferPtr, iValue);
205  _mm_store_ps(qBufferPtr, qValue);
206 
207  iBufferPtr += 4;
208  qBufferPtr += 4;
209  }
210 
211  number = quarterPoints * 4;
212  for (; number < num_points; number++) {
213  *iBufferPtr++ = *complexVectorPtr++;
214  *qBufferPtr++ = *complexVectorPtr++;
215  }
216 }
217 #endif /* LV_HAVE_SSE */
218 
219 
220 #ifdef LV_HAVE_NEON
221 #include <arm_neon.h>
222 
/*
 * NEON kernel: splits an interleaved complex stream into separate I and Q
 * float arrays, 4 complex samples per iteration.
 *
 * iBuffer       - output; receives the first float of every complex sample
 * qBuffer       - output; receives the second float of every complex sample
 * complexVector - input; read as 2 * num_points consecutive floats
 * num_points    - number of complex samples to process
 *
 * The final num_points % 4 samples are handled by a scalar loop.
 */
static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
                                                      float* qBuffer,
                                                      const lv_32fc_t* complexVector,
                                                      unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* realOut = iBuffer;
    float* imagOut = qBuffer;

    const unsigned int fullBlocks = num_points / 4;
    const unsigned int leftover = num_points - fullBlocks * 4;

    for (unsigned int blk = 0; blk < fullBlocks; blk++) {
        // The 2-way structure load de-interleaves 8 floats:
        // val[0] gets the even-indexed floats, val[1] the odd-indexed ones.
        const float32x4x2_t split = vld2q_f32(in);
        vst1q_f32(realOut, split.val[0]);
        vst1q_f32(imagOut, split.val[1]);

        in += 8;
        realOut += 4;
        imagOut += 4;
    }

    // Scalar tail for the samples that do not fill a whole block.
    for (unsigned int k = 0; k < leftover; k++) {
        *realOut++ = *in++;
        *imagOut++ = *in++;
    }
}
249 #endif /* LV_HAVE_NEON */
250 
251 #ifdef LV_HAVE_NEONV8
252 #include <arm_neon.h>
253 
/*
 * NEON (ARMv8) kernel: splits an interleaved complex stream into separate
 * I and Q float arrays, 8 complex samples per iteration (two 4-sample
 * structure loads back to back).
 *
 * iBuffer       - output; receives the first float of every complex sample
 * qBuffer       - output; receives the second float of every complex sample
 * complexVector - input; read as 2 * num_points consecutive floats
 * num_points    - number of complex samples to process
 *
 * The final num_points % 8 samples are handled by a scalar loop.
 */
static inline void volk_32fc_deinterleave_32f_x2_neonv8(float* iBuffer,
                                                        float* qBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* realOut = iBuffer;
    float* imagOut = qBuffer;

    const unsigned int fullBlocks = num_points / 8;
    const unsigned int leftover = num_points - fullBlocks * 8;

    for (unsigned int blk = 0; blk < fullBlocks; blk++) {
        // Each structure load de-interleaves 8 floats into
        // val[0] (even-indexed) and val[1] (odd-indexed).
        const float32x4x2_t front = vld2q_f32(in);
        const float32x4x2_t back = vld2q_f32(in + 8);
        // NOTE(review): __VOLK_PREFETCH is defined outside this file;
        // presumably a prefetch hint for input 32 floats ahead - confirm.
        __VOLK_PREFETCH(in + 32);

        vst1q_f32(realOut, front.val[0]);
        vst1q_f32(realOut + 4, back.val[0]);
        vst1q_f32(imagOut, front.val[1]);
        vst1q_f32(imagOut + 4, back.val[1]);

        in += 16;
        realOut += 8;
        imagOut += 8;
    }

    // Scalar tail for the samples that do not fill a whole block.
    for (unsigned int k = 0; k < leftover; k++) {
        *realOut++ = *in++;
        *imagOut++ = *in++;
    }
}
284 #endif /* LV_HAVE_NEONV8 */
285 
286 #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
287 
288 
289 #ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
290 #define INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
291 
292 #include <inttypes.h>
293 #include <stdio.h>
294 
295 #ifdef LV_HAVE_AVX512F
296 #include <immintrin.h>
297 
298 static inline void volk_32fc_deinterleave_32f_x2_u_avx512f(float* iBuffer,
299  float* qBuffer,
300  const lv_32fc_t* complexVector,
301  unsigned int num_points)
302 {
303  const float* complexVectorPtr = (float*)complexVector;
304  float* iBufferPtr = iBuffer;
305  float* qBufferPtr = qBuffer;
306 
307  unsigned int number = 0;
308  const unsigned int eighthPoints = num_points / 8;
309 
310  __m512 cplxValue;
311  __m512 iValue, qValue;
312 
313  for (; number < eighthPoints; number++) {
314  // Load 8 complex numbers (16 floats): I0,Q0,I1,Q1,...,I7,Q7 - unaligned
315  cplxValue = _mm512_loadu_ps(complexVectorPtr);
316 
317  // Deinterleave using permute
318  // Extract all I values (even indices: 0,2,4,6,8,10,12,14)
319  iValue = _mm512_permutexvar_ps(
320  _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
321  cplxValue);
322 
323  // Extract all Q values (odd indices: 1,3,5,7,9,11,13,15)
324  qValue = _mm512_permutexvar_ps(
325  _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
326  cplxValue);
327 
328  // Store only the first 8 results (lower 256 bits) - unaligned
329  _mm256_storeu_ps(iBufferPtr, _mm512_castps512_ps256(iValue));
330  _mm256_storeu_ps(qBufferPtr, _mm512_castps512_ps256(qValue));
331 
332  complexVectorPtr += 16;
333  iBufferPtr += 8;
334  qBufferPtr += 8;
335  }
336 
337  number = eighthPoints * 8;
339  iBufferPtr, qBufferPtr, (const lv_32fc_t*)complexVectorPtr, num_points - number);
340 }
341 #endif /* LV_HAVE_AVX512F */
342 
343 #ifdef LV_HAVE_AVX
344 #include <immintrin.h>
345 static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
346  float* qBuffer,
347  const lv_32fc_t* complexVector,
348  unsigned int num_points)
349 {
350  const float* complexVectorPtr = (float*)complexVector;
351  float* iBufferPtr = iBuffer;
352  float* qBufferPtr = qBuffer;
353 
354  unsigned int number = 0;
355  // Mask for real and imaginary parts
356  const unsigned int eighthPoints = num_points / 8;
357  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
358  for (; number < eighthPoints; number++) {
359  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
360  complexVectorPtr += 8;
361 
362  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
363  complexVectorPtr += 8;
364 
365  complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
366  complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
367 
368  // Arrange in i1i2i3i4 format
369  iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
370  // Arrange in q1q2q3q4 format
371  qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
372 
373  _mm256_storeu_ps(iBufferPtr, iValue);
374  _mm256_storeu_ps(qBufferPtr, qValue);
375 
376  iBufferPtr += 8;
377  qBufferPtr += 8;
378  }
379 
380  number = eighthPoints * 8;
381  for (; number < num_points; number++) {
382  *iBufferPtr++ = *complexVectorPtr++;
383  *qBufferPtr++ = *complexVectorPtr++;
384  }
385 }
386 #endif /* LV_HAVE_AVX */
387 
388 #ifdef LV_HAVE_RVV
389 #include <riscv_vector.h>
390 
/*
 * RISC-V Vector kernel: splits an interleaved complex stream into separate
 * I and Q float arrays by loading each complex sample as one 64-bit element
 * and narrowing it twice.
 *
 * iBuffer       - output; receives one 32-bit half of every complex sample
 * qBuffer       - output; receives the other 32-bit half
 * complexVector - input; read as num_points 64-bit elements
 * num_points    - number of complex samples to process
 *
 * The vector length is requested per iteration, so no scalar tail is needed.
 */
static inline void volk_32fc_deinterleave_32f_x2_rvv(float* iBuffer,
                                                     float* qBuffer,
                                                     const lv_32fc_t* complexVector,
                                                     unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        // Number of samples the hardware takes this round.
        size_t vl = __riscv_vsetvl_e32m4(remaining);

        // One 64-bit element per complex sample.
        vuint64m8_t packed = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);

        // Narrowing shift by 0 keeps bits 31..0, by 32 keeps bits 63..32.
        // NOTE(review): bits 31..0 being the I float assumes a little-endian
        // memory layout - confirm for the target.
        vuint32m4_t lowHalf = __riscv_vnsrl(packed, 0, vl);
        vuint32m4_t highHalf = __riscv_vnsrl(packed, 32, vl);

        __riscv_vse32((uint32_t*)iBuffer, lowHalf, vl);
        __riscv_vse32((uint32_t*)qBuffer, highHalf, vl);

        remaining -= vl;
        complexVector += vl;
        iBuffer += vl;
        qBuffer += vl;
    }
}
406 #endif /*LV_HAVE_RVV*/
407 
408 #ifdef LV_HAVE_RVVSEG
409 #include <riscv_vector.h>
410 
/*
 * RISC-V Vector kernel (segment-load variant): splits an interleaved complex
 * stream into separate I and Q float arrays using a 2-field segment load.
 *
 * iBuffer       - output; receives field 0 of every 2-float segment
 * qBuffer       - output; receives field 1 of every 2-float segment
 * complexVector - input; read as num_points segments of two 32-bit values
 * num_points    - number of complex samples to process
 *
 * The vector length is requested per iteration, so no scalar tail is needed.
 */
static inline void volk_32fc_deinterleave_32f_x2_rvvseg(float* iBuffer,
                                                        float* qBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        // Number of samples the hardware takes this round.
        size_t vl = __riscv_vsetvl_e32m4(remaining);

        // The segment load separates the two 32-bit fields of each sample.
        vuint32m4x2_t fields =
            __riscv_vlseg2e32_v_u32m4x2((const uint32_t*)complexVector, vl);
        vuint32m4_t firstField = __riscv_vget_u32m4(fields, 0);
        vuint32m4_t secondField = __riscv_vget_u32m4(fields, 1);

        __riscv_vse32((uint32_t*)iBuffer, firstField, vl);
        __riscv_vse32((uint32_t*)qBuffer, secondField, vl);

        remaining -= vl;
        complexVector += vl;
        iBuffer += vl;
        qBuffer += vl;
    }
}
427 #endif /*LV_HAVE_RVVSEG*/
428 
429 #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_deinterleave_32f_x2_neon
static void volk_32fc_deinterleave_32f_x2_neon(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:223
volk_32fc_deinterleave_32f_x2_a_sse
static void volk_32fc_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:180
volk_32fc_deinterleave_32f_x2_u_avx
static void volk_32fc_deinterleave_32f_x2_u_avx(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:345
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_deinterleave_32f_x2_generic
static void volk_32fc_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:68
volk_32fc_deinterleave_32f_x2_a_avx
static void volk_32fc_deinterleave_32f_x2_a_avx(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:134