Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_x2_interleave_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H
61 #define INCLUDED_volk_32f_x2_interleave_32fc_a_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 
69 static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector,
70  const float* iBuffer,
71  const float* qBuffer,
72  unsigned int num_points)
73 {
74  unsigned int number = 0;
75  float* complexVectorPtr = (float*)complexVector;
76  const float* iBufferPtr = iBuffer;
77  const float* qBufferPtr = qBuffer;
78 
79  const uint64_t eighthPoints = num_points / 8;
80 
81  __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
82  for (; number < eighthPoints; number++) {
83  iValue = _mm256_load_ps(iBufferPtr);
84  qValue = _mm256_load_ps(qBufferPtr);
85 
86  // Interleaves the lower two values in the i and q variables into one buffer
87  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
88  // Interleaves the upper two values in the i and q variables into one buffer
89  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
90 
91  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
92  _mm256_store_ps(complexVectorPtr, cplxValue);
93  complexVectorPtr += 8;
94 
95  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
96  _mm256_store_ps(complexVectorPtr, cplxValue);
97  complexVectorPtr += 8;
98 
99  iBufferPtr += 8;
100  qBufferPtr += 8;
101  }
102 
103  number = eighthPoints * 8;
104  for (; number < num_points; number++) {
105  *complexVectorPtr++ = *iBufferPtr++;
106  *complexVectorPtr++ = *qBufferPtr++;
107  }
108 }
109 
110 #endif /* LV_HAVE_AVX */
111 
112 #ifdef LV_HAVE_SSE
113 #include <xmmintrin.h>
114 
115 static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector,
116  const float* iBuffer,
117  const float* qBuffer,
118  unsigned int num_points)
119 {
120  unsigned int number = 0;
121  float* complexVectorPtr = (float*)complexVector;
122  const float* iBufferPtr = iBuffer;
123  const float* qBufferPtr = qBuffer;
124 
125  const uint64_t quarterPoints = num_points / 4;
126 
127  __m128 iValue, qValue, cplxValue;
128  for (; number < quarterPoints; number++) {
129  iValue = _mm_load_ps(iBufferPtr);
130  qValue = _mm_load_ps(qBufferPtr);
131 
132  // Interleaves the lower two values in the i and q variables into one buffer
133  cplxValue = _mm_unpacklo_ps(iValue, qValue);
134  _mm_store_ps(complexVectorPtr, cplxValue);
135  complexVectorPtr += 4;
136 
137  // Interleaves the upper two values in the i and q variables into one buffer
138  cplxValue = _mm_unpackhi_ps(iValue, qValue);
139  _mm_store_ps(complexVectorPtr, cplxValue);
140  complexVectorPtr += 4;
141 
142  iBufferPtr += 4;
143  qBufferPtr += 4;
144  }
145 
146  number = quarterPoints * 4;
147  for (; number < num_points; number++) {
148  *complexVectorPtr++ = *iBufferPtr++;
149  *complexVectorPtr++ = *qBufferPtr++;
150  }
151 }
152 #endif /* LV_HAVE_SSE */
153 
154 
155 #ifdef LV_HAVE_NEON
156 #include <arm_neon.h>
157 
158 static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector,
159  const float* iBuffer,
160  const float* qBuffer,
161  unsigned int num_points)
162 {
163  unsigned int quarter_points = num_points / 4;
164  unsigned int number;
165  float* complexVectorPtr = (float*)complexVector;
166 
167  float32x4x2_t complex_vec;
168  for (number = 0; number < quarter_points; ++number) {
169  complex_vec.val[0] = vld1q_f32(iBuffer);
170  complex_vec.val[1] = vld1q_f32(qBuffer);
171  vst2q_f32(complexVectorPtr, complex_vec);
172  iBuffer += 4;
173  qBuffer += 4;
174  complexVectorPtr += 8;
175  }
176 
177  for (number = quarter_points * 4; number < num_points; ++number) {
178  *complexVectorPtr++ = *iBuffer++;
179  *complexVectorPtr++ = *qBuffer++;
180  }
181 }
182 #endif /* LV_HAVE_NEON */
183 
184 #ifdef LV_HAVE_NEONV8
185 #include <arm_neon.h>
186 
187 static inline void volk_32f_x2_interleave_32fc_neonv8(lv_32fc_t* complexVector,
188  const float* iBuffer,
189  const float* qBuffer,
190  unsigned int num_points)
191 {
192  const unsigned int eighthPoints = num_points / 8;
193 
194  float* outPtr = (float*)complexVector;
195  const float* iPtr = iBuffer;
196  const float* qPtr = qBuffer;
197 
198  for (unsigned int number = 0; number < eighthPoints; number++) {
199  float32x4x2_t cplx0, cplx1;
200  cplx0.val[0] = vld1q_f32(iPtr);
201  cplx0.val[1] = vld1q_f32(qPtr);
202  cplx1.val[0] = vld1q_f32(iPtr + 4);
203  cplx1.val[1] = vld1q_f32(qPtr + 4);
204  __VOLK_PREFETCH(iPtr + 16);
205  __VOLK_PREFETCH(qPtr + 16);
206 
207  vst2q_f32(outPtr, cplx0);
208  vst2q_f32(outPtr + 8, cplx1);
209 
210  iPtr += 8;
211  qPtr += 8;
212  outPtr += 16;
213  }
214 
215  for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
216  *outPtr++ = *iPtr++;
217  *outPtr++ = *qPtr++;
218  }
219 }
220 #endif /* LV_HAVE_NEONV8 */
221 
222 
223 #ifdef LV_HAVE_GENERIC
224 
225 static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector,
226  const float* iBuffer,
227  const float* qBuffer,
228  unsigned int num_points)
229 {
230  float* complexVectorPtr = (float*)complexVector;
231  const float* iBufferPtr = iBuffer;
232  const float* qBufferPtr = qBuffer;
233  unsigned int number;
234 
235  for (number = 0; number < num_points; number++) {
236  *complexVectorPtr++ = *iBufferPtr++;
237  *complexVectorPtr++ = *qBufferPtr++;
238  }
239 }
240 #endif /* LV_HAVE_GENERIC */
241 
242 
243 #endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
244 
245 #ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H
246 #define INCLUDED_volk_32f_x2_interleave_32fc_u_H
247 
248 #include <inttypes.h>
249 #include <stdio.h>
250 
251 #ifdef LV_HAVE_AVX
252 #include <immintrin.h>
253 
254 static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
255  const float* iBuffer,
256  const float* qBuffer,
257  unsigned int num_points)
258 {
259  unsigned int number = 0;
260  float* complexVectorPtr = (float*)complexVector;
261  const float* iBufferPtr = iBuffer;
262  const float* qBufferPtr = qBuffer;
263 
264  const uint64_t eighthPoints = num_points / 8;
265 
266  __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
267  for (; number < eighthPoints; number++) {
268  iValue = _mm256_loadu_ps(iBufferPtr);
269  qValue = _mm256_loadu_ps(qBufferPtr);
270 
271  // Interleaves the lower two values in the i and q variables into one buffer
272  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
273  // Interleaves the upper two values in the i and q variables into one buffer
274  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
275 
276  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
277  _mm256_storeu_ps(complexVectorPtr, cplxValue);
278  complexVectorPtr += 8;
279 
280  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
281  _mm256_storeu_ps(complexVectorPtr, cplxValue);
282  complexVectorPtr += 8;
283 
284  iBufferPtr += 8;
285  qBufferPtr += 8;
286  }
287 
288  number = eighthPoints * 8;
289  for (; number < num_points; number++) {
290  *complexVectorPtr++ = *iBufferPtr++;
291  *complexVectorPtr++ = *qBufferPtr++;
292  }
293 }
294 #endif /* LV_HAVE_AVX */
295 
296 #ifdef LV_HAVE_RVV
297 #include <riscv_vector.h>
298 
299 static inline void volk_32f_x2_interleave_32fc_rvv(lv_32fc_t* complexVector,
300  const float* iBuffer,
301  const float* qBuffer,
302  unsigned int num_points)
303 {
304  uint64_t* out = (uint64_t*)complexVector;
305  size_t n = num_points;
306  for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
307  vl = __riscv_vsetvl_e32m4(n);
308  vuint32m4_t vr = __riscv_vle32_v_u32m4((const uint32_t*)iBuffer, vl);
309  vuint32m4_t vi = __riscv_vle32_v_u32m4((const uint32_t*)qBuffer, vl);
310  vuint64m8_t vc =
311  __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFFFFFF, vi, vl);
312  __riscv_vse64(out, vc, vl);
313  }
314 }
315 #endif /*LV_HAVE_RVV*/
316 
317 #ifdef LV_HAVE_RVVSEG
318 #include <riscv_vector.h>
319 
320 static inline void volk_32f_x2_interleave_32fc_rvvseg(lv_32fc_t* complexVector,
321  const float* iBuffer,
322  const float* qBuffer,
323  unsigned int num_points)
324 {
325  size_t n = num_points;
326  for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
327  vl = __riscv_vsetvl_e32m4(n);
328  vfloat32m4_t vr = __riscv_vle32_v_f32m4(iBuffer, vl);
329  vfloat32m4_t vi = __riscv_vle32_v_f32m4(qBuffer, vl);
330  __riscv_vsseg2e32((float*)complexVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
331  }
332 }
333 #endif /*LV_HAVE_RVVSEG*/
334 
335 #endif /* INCLUDED_volk_32f_x2_interleave_32fc_u_H */
volk_32f_x2_interleave_32fc_neon
static void volk_32f_x2_interleave_32fc_neon(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:158
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_x2_interleave_32fc_a_avx
static void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:69
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32f_x2_interleave_32fc_a_sse
static void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:115
volk_32f_x2_interleave_32fc_u_avx
static void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:254
volk_32f_x2_interleave_32fc_generic
static void volk_32f_x2_interleave_32fc_generic(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:225