Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_deinterleave_real_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H
58 #define INCLUDED_volk_32fc_deinterleave_real_32f_a_H
59 
60 #include <inttypes.h>
61 #include <stdio.h>
62 
63 #ifdef LV_HAVE_AVX2
64 #include <immintrin.h>
65 
66 static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer,
67  const lv_32fc_t* complexVector,
68  unsigned int num_points)
69 {
70  unsigned int number = 0;
71  const unsigned int eighthPoints = num_points / 8;
72 
73  const float* complexVectorPtr = (const float*)complexVector;
74  float* iBufferPtr = iBuffer;
75 
76  __m256 cplxValue1, cplxValue2;
77  __m256 iValue;
78  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
79  for (; number < eighthPoints; number++) {
80 
81  cplxValue1 = _mm256_load_ps(complexVectorPtr);
82  complexVectorPtr += 8;
83 
84  cplxValue2 = _mm256_load_ps(complexVectorPtr);
85  complexVectorPtr += 8;
86 
87  // Arrange in i1i2i3i4 format
88  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
89  iValue = _mm256_permutevar8x32_ps(iValue, idx);
90 
91  _mm256_store_ps(iBufferPtr, iValue);
92 
93  iBufferPtr += 8;
94  }
95 
96  number = eighthPoints * 8;
97  for (; number < num_points; number++) {
98  *iBufferPtr++ = *complexVectorPtr++;
99  complexVectorPtr++;
100  }
101 }
102 #endif /* LV_HAVE_AVX2 */
103 
104 #ifdef LV_HAVE_SSE
105 #include <xmmintrin.h>
106 
107 static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer,
108  const lv_32fc_t* complexVector,
109  unsigned int num_points)
110 {
111  unsigned int number = 0;
112  const unsigned int quarterPoints = num_points / 4;
113 
114  const float* complexVectorPtr = (const float*)complexVector;
115  float* iBufferPtr = iBuffer;
116 
117  __m128 cplxValue1, cplxValue2, iValue;
118  for (; number < quarterPoints; number++) {
119 
120  cplxValue1 = _mm_load_ps(complexVectorPtr);
121  complexVectorPtr += 4;
122 
123  cplxValue2 = _mm_load_ps(complexVectorPtr);
124  complexVectorPtr += 4;
125 
126  // Arrange in i1i2i3i4 format
127  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
128 
129  _mm_store_ps(iBufferPtr, iValue);
130 
131  iBufferPtr += 4;
132  }
133 
134  number = quarterPoints * 4;
135  for (; number < num_points; number++) {
136  *iBufferPtr++ = *complexVectorPtr++;
137  complexVectorPtr++;
138  }
139 }
140 #endif /* LV_HAVE_SSE */
141 
142 
143 #ifdef LV_HAVE_GENERIC
144 
145 static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer,
146  const lv_32fc_t* complexVector,
147  unsigned int num_points)
148 {
149  unsigned int number = 0;
150  const float* complexVectorPtr = (float*)complexVector;
151  float* iBufferPtr = iBuffer;
152  for (number = 0; number < num_points; number++) {
153  *iBufferPtr++ = *complexVectorPtr++;
154  complexVectorPtr++;
155  }
156 }
157 #endif /* LV_HAVE_GENERIC */
158 
159 
160 #ifdef LV_HAVE_NEON
161 #include <arm_neon.h>
162 
/*
 * Copies the real component of every complex sample in complexVector into
 * iBuffer, discarding the imaginary component. NEON variant.
 *
 * iBuffer       - output; receives num_points floats
 * complexVector - input; num_points interleaved (real, imag) float pairs
 * num_points    - number of complex samples to process
 */
static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    // View the complex input as raw floats without discarding its const
    // qualifier, matching the cast used by the SSE/AVX2 kernels.
    const float* complexVectorPtr = (const float*)complexVector;
    float* iBufferPtr = iBuffer;

    // Each pass consumes 4 complex samples (8 floats) and emits 4 reals.
    for (unsigned int number = 0; number < quarter_points; number++) {
        // vld2q_f32 de-interleaves on load: val[0] holds the even-indexed
        // floats (the reals), val[1] the odd-indexed ones (the imaginaries).
        const float32x4x2_t complexInput = vld2q_f32(complexVectorPtr);
        vst1q_f32(iBufferPtr, complexInput.val[0]);
        complexVectorPtr += 8;
        iBufferPtr += 4;
    }

    // Scalar tail for the num_points % 4 samples the vector loop did not cover.
    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
        *iBufferPtr++ = *complexVectorPtr++; // copy the real part
        complexVectorPtr++;                  // skip the imaginary part
    }
}
185 #endif /* LV_HAVE_NEON */
186 
187 #ifdef LV_HAVE_NEONV8
188 #include <arm_neon.h>
189 
/*
 * Copies the real component of every complex sample in complexVector into
 * iBuffer, discarding the imaginary component. NEONv8 variant that handles
 * two de-interleaving loads per loop iteration.
 *
 * iBuffer       - output; receives num_points floats
 * complexVector - input; num_points interleaved (real, imag) float pairs
 * num_points    - number of complex samples to process
 */
static inline void volk_32fc_deinterleave_real_32f_neonv8(float* iBuffer,
                                                          const lv_32fc_t* complexVector,
                                                          unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    // View the complex input as raw floats without discarding its const
    // qualifier, matching the cast used by the SSE/AVX2 kernels.
    const float* complexVectorPtr = (const float*)complexVector;
    float* iBufferPtr = iBuffer;

    // Each pass consumes 8 complex samples (16 floats) and emits 8 reals.
    for (unsigned int number = 0; number < eighthPoints; number++) {
        // vld2q_f32 de-interleaves on load: val[0] holds the reals.
        const float32x4x2_t cplx0 = vld2q_f32(complexVectorPtr);
        const float32x4x2_t cplx1 = vld2q_f32(complexVectorPtr + 8);
        // Prefetch hint for input 32 floats ahead of the current position.
        __VOLK_PREFETCH(complexVectorPtr + 32);

        vst1q_f32(iBufferPtr, cplx0.val[0]);
        vst1q_f32(iBufferPtr + 4, cplx1.val[0]);

        complexVectorPtr += 16;
        iBufferPtr += 8;
    }

    // Scalar tail for the num_points % 8 samples the vector loop did not cover.
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        *iBufferPtr++ = *complexVectorPtr++; // copy the real part
        complexVectorPtr++;                  // skip the imaginary part
    }
}
215 #endif /* LV_HAVE_NEONV8 */
216 
217 #endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
218 
219 
220 #ifndef INCLUDED_volk_32fc_deinterleave_real_32f_u_H
221 #define INCLUDED_volk_32fc_deinterleave_real_32f_u_H
222 
223 #include <inttypes.h>
224 #include <stdio.h>
225 
226 #ifdef LV_HAVE_AVX2
227 #include <immintrin.h>
228 
229 static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer,
230  const lv_32fc_t* complexVector,
231  unsigned int num_points)
232 {
233  unsigned int number = 0;
234  const unsigned int eighthPoints = num_points / 8;
235 
236  const float* complexVectorPtr = (const float*)complexVector;
237  float* iBufferPtr = iBuffer;
238 
239  __m256 cplxValue1, cplxValue2;
240  __m256 iValue;
241  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
242  for (; number < eighthPoints; number++) {
243 
244  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
245  complexVectorPtr += 8;
246 
247  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
248  complexVectorPtr += 8;
249 
250  // Arrange in i1i2i3i4 format
251  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
252  iValue = _mm256_permutevar8x32_ps(iValue, idx);
253 
254  _mm256_storeu_ps(iBufferPtr, iValue);
255 
256  iBufferPtr += 8;
257  }
258 
259  number = eighthPoints * 8;
260  for (; number < num_points; number++) {
261  *iBufferPtr++ = *complexVectorPtr++;
262  complexVectorPtr++;
263  }
264 }
265 #endif /* LV_HAVE_AVX2 */
266 
267 #ifdef LV_HAVE_RVV
268 #include <riscv_vector.h>
269 
/*
 * Copies the real component of every complex sample in complexVector into
 * iBuffer, discarding the imaginary component. RISC-V Vector variant.
 *
 * iBuffer       - output; receives num_points floats
 * complexVector - input; num_points interleaved (real, imag) float pairs
 * num_points    - number of complex samples to process
 *
 * Each complex sample is loaded as one 64-bit element and narrowed to its
 * low 32 bits; on a little-endian target those bits are the real part.
 */
static inline void volk_32fc_deinterleave_real_32f_rvv(float* iBuffer,
                                                       const lv_32fc_t* complexVector,
                                                       unsigned int num_points)
{
    const uint64_t* in = (const uint64_t*)complexVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        // Number of 64-bit elements the hardware will process this pass.
        const size_t vl = __riscv_vsetvl_e64m8(remaining);

        const vuint64m8_t packed = __riscv_vle64_v_u64m8(in, vl);
        // Narrowing shift by 0 keeps the low 32 bits of every 64-bit element.
        const vuint32m4_t lowHalves = __riscv_vnsrl(packed, 0, vl);
        __riscv_vse32((uint32_t*)iBuffer, lowHalves, vl);

        remaining -= vl;
        in += vl;
        iBuffer += vl;
    }
}
282 #endif /*LV_HAVE_RVV*/
283 
284 #endif /* INCLUDED_volk_32fc_deinterleave_real_32f_u_H */
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_deinterleave_real_32f_a_sse
static void volk_32fc_deinterleave_real_32f_a_sse(float *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_real_32f.h:107
volk_32fc_deinterleave_real_32f_neon
static void volk_32fc_deinterleave_real_32f_neon(float *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_real_32f.h:163
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_deinterleave_real_32f_generic
static void volk_32fc_deinterleave_real_32f_generic(float *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_real_32f.h:145