Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_deinterleave_real_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
43 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
44 #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
45 
46 #include <inttypes.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void
54 volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
55  const lv_16sc_t* complexVector,
56  const float scalar,
57  unsigned int num_points)
58 {
59  float* iBufferPtr = iBuffer;
60 
61  unsigned int number = 0;
62  const unsigned int eighthPoints = num_points / 8;
63 
64  __m256 iFloatValue;
65 
66  const float iScalar = 1.0 / scalar;
67  __m256 invScalar = _mm256_set1_ps(iScalar);
68  __m256i complexVal, iIntVal;
69  __m128i complexVal128;
70  int8_t* complexVectorPtr = (int8_t*)complexVector;
71 
72  __m256i moveMask = _mm256_set_epi8(0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  13,
81  12,
82  9,
83  8,
84  5,
85  4,
86  1,
87  0,
88  0x80,
89  0x80,
90  0x80,
91  0x80,
92  0x80,
93  0x80,
94  0x80,
95  0x80,
96  13,
97  12,
98  9,
99  8,
100  5,
101  4,
102  1,
103  0);
104 
105  for (; number < eighthPoints; number++) {
106  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
107  complexVectorPtr += 32;
108  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
109  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
110  complexVal128 = _mm256_extracti128_si256(complexVal, 0);
111 
112  iIntVal = _mm256_cvtepi16_epi32(complexVal128);
113  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
114 
115  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
116 
117  _mm256_store_ps(iBufferPtr, iFloatValue);
118 
119  iBufferPtr += 8;
120  }
121 
122  number = eighthPoints * 8;
123  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
124  for (; number < num_points; number++) {
125  *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
126  sixteenTComplexVectorPtr++;
127  }
128 }
129 #endif /* LV_HAVE_AVX2 */
130 
131 #ifdef LV_HAVE_SSE4_1
132 #include <smmintrin.h>
133 
134 static inline void
135 volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
136  const lv_16sc_t* complexVector,
137  const float scalar,
138  unsigned int num_points)
139 {
140  float* iBufferPtr = iBuffer;
141 
142  unsigned int number = 0;
143  const unsigned int quarterPoints = num_points / 4;
144 
145  __m128 iFloatValue;
146 
147  const float iScalar = 1.0 / scalar;
148  __m128 invScalar = _mm_set_ps1(iScalar);
149  __m128i complexVal, iIntVal;
150  int8_t* complexVectorPtr = (int8_t*)complexVector;
151 
152  __m128i moveMask = _mm_set_epi8(
153  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
154 
155  for (; number < quarterPoints; number++) {
156  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
157  complexVectorPtr += 16;
158  complexVal = _mm_shuffle_epi8(complexVal, moveMask);
159 
160  iIntVal = _mm_cvtepi16_epi32(complexVal);
161  iFloatValue = _mm_cvtepi32_ps(iIntVal);
162 
163  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
164 
165  _mm_store_ps(iBufferPtr, iFloatValue);
166 
167  iBufferPtr += 4;
168  }
169 
170  number = quarterPoints * 4;
171  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
172  for (; number < num_points; number++) {
173  *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
174  sixteenTComplexVectorPtr++;
175  }
176 }
177 #endif /* LV_HAVE_SSE4_1 */
178 
179 #ifdef LV_HAVE_SSE
180 #include <xmmintrin.h>
181 
182 static inline void
184  const lv_16sc_t* complexVector,
185  const float scalar,
186  unsigned int num_points)
187 {
188  float* iBufferPtr = iBuffer;
189 
190  unsigned int number = 0;
191  const unsigned int quarterPoints = num_points / 4;
192  __m128 iValue;
193 
194  const float iScalar = 1.0 / scalar;
195  __m128 invScalar = _mm_set_ps1(iScalar);
196  int16_t* complexVectorPtr = (int16_t*)complexVector;
197 
198  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
199 
200  for (; number < quarterPoints; number++) {
201  floatBuffer[0] = (float)(*complexVectorPtr);
202  complexVectorPtr += 2;
203  floatBuffer[1] = (float)(*complexVectorPtr);
204  complexVectorPtr += 2;
205  floatBuffer[2] = (float)(*complexVectorPtr);
206  complexVectorPtr += 2;
207  floatBuffer[3] = (float)(*complexVectorPtr);
208  complexVectorPtr += 2;
209 
210  iValue = _mm_load_ps(floatBuffer);
211 
212  iValue = _mm_mul_ps(iValue, invScalar);
213 
214  _mm_store_ps(iBufferPtr, iValue);
215 
216  iBufferPtr += 4;
217  }
218 
219  number = quarterPoints * 4;
220  complexVectorPtr = (int16_t*)&complexVector[number];
221  for (; number < num_points; number++) {
222  *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
223  complexVectorPtr++;
224  }
225 }
226 #endif /* LV_HAVE_SSE */
227 
228 #ifdef LV_HAVE_GENERIC
229 static inline void
231  const lv_16sc_t* complexVector,
232  const float scalar,
233  unsigned int num_points)
234 {
235  unsigned int number = 0;
236  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
237  float* iBufferPtr = iBuffer;
238  const float invScalar = 1.0 / scalar;
239  for (number = 0; number < num_points; number++) {
240  *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
241  complexVectorPtr++;
242  }
243 }
244 #endif /* LV_HAVE_GENERIC */
245 
246 
247 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */
248 
249 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
250 #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
251 
252 #include <inttypes.h>
253 #include <stdio.h>
254 #include <volk/volk_common.h>
255 
256 #ifdef LV_HAVE_AVX2
257 #include <immintrin.h>
258 
259 static inline void
260 volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
261  const lv_16sc_t* complexVector,
262  const float scalar,
263  unsigned int num_points)
264 {
265  float* iBufferPtr = iBuffer;
266 
267  unsigned int number = 0;
268  const unsigned int eighthPoints = num_points / 8;
269 
270  __m256 iFloatValue;
271 
272  const float iScalar = 1.0 / scalar;
273  __m256 invScalar = _mm256_set1_ps(iScalar);
274  __m256i complexVal, iIntVal;
275  __m128i complexVal128;
276  int8_t* complexVectorPtr = (int8_t*)complexVector;
277 
278  __m256i moveMask = _mm256_set_epi8(0x80,
279  0x80,
280  0x80,
281  0x80,
282  0x80,
283  0x80,
284  0x80,
285  0x80,
286  13,
287  12,
288  9,
289  8,
290  5,
291  4,
292  1,
293  0,
294  0x80,
295  0x80,
296  0x80,
297  0x80,
298  0x80,
299  0x80,
300  0x80,
301  0x80,
302  13,
303  12,
304  9,
305  8,
306  5,
307  4,
308  1,
309  0);
310 
311  for (; number < eighthPoints; number++) {
312  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
313  complexVectorPtr += 32;
314  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
315  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
316  complexVal128 = _mm256_extracti128_si256(complexVal, 0);
317 
318  iIntVal = _mm256_cvtepi16_epi32(complexVal128);
319  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
320 
321  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
322 
323  _mm256_storeu_ps(iBufferPtr, iFloatValue);
324 
325  iBufferPtr += 8;
326  }
327 
328  number = eighthPoints * 8;
329  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
330  for (; number < num_points; number++) {
331  *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
332  sixteenTComplexVectorPtr++;
333  }
334 }
335 #endif /* LV_HAVE_AVX2 */
336 
337 #ifdef LV_HAVE_NEON
338 #include <arm_neon.h>
339 
340 static inline void
342  const lv_16sc_t* complexVector,
343  const float scalar,
344  unsigned int num_points)
345 {
346  unsigned int number = 0;
347  const unsigned int quarter_points = num_points / 4;
348 
349  float* iBufferPtr = iBuffer;
350  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
351  const float invScalar = 1.0f / scalar;
352  float32x4_t vInvScalar = vdupq_n_f32(invScalar);
353 
354  for (; number < quarter_points; number++) {
355  int16x4x2_t input = vld2_s16(complexVectorPtr);
356  complexVectorPtr += 8;
357 
358  int32x4_t iInt = vmovl_s16(input.val[0]);
359  float32x4_t iFloat = vcvtq_f32_s32(iInt);
360  iFloat = vmulq_f32(iFloat, vInvScalar);
361 
362  vst1q_f32(iBufferPtr, iFloat);
363  iBufferPtr += 4;
364  }
365 
366  number = quarter_points * 4;
367  complexVectorPtr = (const int16_t*)&complexVector[number];
368  for (; number < num_points; number++) {
369  *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
370  complexVectorPtr++;
371  }
372 }
373 #endif /* LV_HAVE_NEON */
374 
375 #ifdef LV_HAVE_RVV
376 #include <riscv_vector.h>
377 
378 static inline void
379 volk_16ic_s32f_deinterleave_real_32f_rvv(float* iBuffer,
380  const lv_16sc_t* complexVector,
381  const float scalar,
382  unsigned int num_points)
383 {
384  const int32_t* in = (const int32_t*)complexVector;
385  size_t n = num_points;
386  for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
387  vl = __riscv_vsetvl_e32m8(n);
388  vint32m8_t vc = __riscv_vle32_v_i32m8(in, vl);
389  vfloat32m8_t vr = __riscv_vfwcvt_f(__riscv_vncvt_x(vc, vl), vl);
390  __riscv_vse32(iBuffer, __riscv_vfmul(vr, 1.0f / scalar, vl), vl);
391  }
392 }
393 #endif /*LV_HAVE_RVV*/
394 
395 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H */
volk_16ic_s32f_deinterleave_real_32f_neon
static void volk_16ic_s32f_deinterleave_real_32f_neon(float *iBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_real_32f.h:341
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
volk_16ic_s32f_deinterleave_real_32f_a_sse
static void volk_16ic_s32f_deinterleave_real_32f_a_sse(float *iBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_real_32f.h:183
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:71
volk_common.h
volk_16ic_s32f_deinterleave_real_32f_generic
static void volk_16ic_s32f_deinterleave_real_32f_generic(float *iBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_real_32f.h:230