Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
61 #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 #include <volk/volk_common.h>
66 
67 
68 #ifdef LV_HAVE_AVX2
69 #include <immintrin.h>
70 
71 static inline void
72 volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
73  const lv_32fc_t* complexVector,
74  const float scalar,
75  unsigned int num_points)
76 {
77  unsigned int number = 0;
78  const unsigned int eighthPoints = num_points / 8;
79 
80  const float* complexVectorPtr = (float*)complexVector;
81  int16_t* iBufferPtr = iBuffer;
82 
83  __m256 vScalar = _mm256_set1_ps(scalar);
84 
85  __m256 cplxValue1, cplxValue2, iValue;
86  __m256i a;
87  __m128i b;
88 
89  __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
90 
91  for (; number < eighthPoints; number++) {
92  cplxValue1 = _mm256_load_ps(complexVectorPtr);
93  complexVectorPtr += 8;
94 
95  cplxValue2 = _mm256_load_ps(complexVectorPtr);
96  complexVectorPtr += 8;
97 
98  // Arrange in i1i2i3i4 format
99  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
100 
101  iValue = _mm256_mul_ps(iValue, vScalar);
102 
103  a = _mm256_cvtps_epi32(iValue);
104  a = _mm256_packs_epi32(a, a);
105  a = _mm256_permutevar8x32_epi32(a, idx);
106  b = _mm256_extracti128_si256(a, 0);
107 
108  _mm_store_si128((__m128i*)iBufferPtr, b);
109  iBufferPtr += 8;
110  }
111 
112  number = eighthPoints * 8;
113  iBufferPtr = &iBuffer[number];
114  for (; number < num_points; number++) {
115  *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
116  complexVectorPtr++;
117  }
118 }
119 
120 
121 #endif /* LV_HAVE_AVX2 */
122 
123 #ifdef LV_HAVE_SSE
124 #include <xmmintrin.h>
125 
126 static inline void
128  const lv_32fc_t* complexVector,
129  const float scalar,
130  unsigned int num_points)
131 {
132  unsigned int number = 0;
133  const unsigned int quarterPoints = num_points / 4;
134 
135  const float* complexVectorPtr = (float*)complexVector;
136  int16_t* iBufferPtr = iBuffer;
137 
138  __m128 vScalar = _mm_set_ps1(scalar);
139 
140  __m128 cplxValue1, cplxValue2, iValue;
141 
142  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
143 
144  for (; number < quarterPoints; number++) {
145  cplxValue1 = _mm_load_ps(complexVectorPtr);
146  complexVectorPtr += 4;
147 
148  cplxValue2 = _mm_load_ps(complexVectorPtr);
149  complexVectorPtr += 4;
150 
151  // Arrange in i1i2i3i4 format
152  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
153 
154  iValue = _mm_mul_ps(iValue, vScalar);
155 
156  _mm_store_ps(floatBuffer, iValue);
157  *iBufferPtr++ = (int16_t)rintf(floatBuffer[0]);
158  *iBufferPtr++ = (int16_t)rintf(floatBuffer[1]);
159  *iBufferPtr++ = (int16_t)rintf(floatBuffer[2]);
160  *iBufferPtr++ = (int16_t)rintf(floatBuffer[3]);
161  }
162 
163  number = quarterPoints * 4;
164  iBufferPtr = &iBuffer[number];
165  for (; number < num_points; number++) {
166  *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
167  complexVectorPtr++;
168  }
169 }
170 
171 #endif /* LV_HAVE_SSE */
172 
173 
174 #ifdef LV_HAVE_GENERIC
175 
176 static inline void
178  const lv_32fc_t* complexVector,
179  const float scalar,
180  unsigned int num_points)
181 {
182  const float* complexVectorPtr = (float*)complexVector;
183  int16_t* iBufferPtr = iBuffer;
184  unsigned int number = 0;
185  for (number = 0; number < num_points; number++) {
186  *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
187  complexVectorPtr++;
188  }
189 }
190 
191 #endif /* LV_HAVE_GENERIC */
192 
193 #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
194 
195 #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
196 #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
197 
198 #include <inttypes.h>
199 #include <stdio.h>
200 #include <volk/volk_common.h>
201 
202 #ifdef LV_HAVE_AVX2
203 #include <immintrin.h>
204 
205 static inline void
206 volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
207  const lv_32fc_t* complexVector,
208  const float scalar,
209  unsigned int num_points)
210 {
211  unsigned int number = 0;
212  const unsigned int eighthPoints = num_points / 8;
213 
214  const float* complexVectorPtr = (float*)complexVector;
215  int16_t* iBufferPtr = iBuffer;
216 
217  __m256 vScalar = _mm256_set1_ps(scalar);
218 
219  __m256 cplxValue1, cplxValue2, iValue;
220  __m256i a;
221  __m128i b;
222 
223  __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
224 
225  for (; number < eighthPoints; number++) {
226  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
227  complexVectorPtr += 8;
228 
229  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
230  complexVectorPtr += 8;
231 
232  // Arrange in i1i2i3i4 format
233  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
234 
235  iValue = _mm256_mul_ps(iValue, vScalar);
236 
237  a = _mm256_cvtps_epi32(iValue);
238  a = _mm256_packs_epi32(a, a);
239  a = _mm256_permutevar8x32_epi32(a, idx);
240  b = _mm256_extracti128_si256(a, 0);
241 
242  _mm_storeu_si128((__m128i*)iBufferPtr, b);
243  iBufferPtr += 8;
244  }
245 
246  number = eighthPoints * 8;
247  iBufferPtr = &iBuffer[number];
248  for (; number < num_points; number++) {
249  *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
250  complexVectorPtr++;
251  }
252 }
253 
254 #endif /* LV_HAVE_AVX2 */
255 
256 #ifdef LV_HAVE_NEON
257 #include <arm_neon.h>
258 
259 static inline void
261  const lv_32fc_t* complexVector,
262  const float scalar,
263  unsigned int num_points)
264 {
265  unsigned int number = 0;
266  const unsigned int quarter_points = num_points / 4;
267 
268  const float* complexVectorPtr = (float*)complexVector;
269  int16_t* iBufferPtr = iBuffer;
270  float32x4_t vScalar = vdupq_n_f32(scalar);
271 
272  float32x4_t half = vdupq_n_f32(0.5f);
273  float32x4_t neg_half = vdupq_n_f32(-0.5f);
274  float32x4_t zero = vdupq_n_f32(0.0f);
275 
276  for (; number < quarter_points; number++) {
277  float32x4x2_t input = vld2q_f32(complexVectorPtr);
278  complexVectorPtr += 8;
279 
280  float32x4_t scaled = vmulq_f32(input.val[0], vScalar);
281  // Round to nearest: add copysign(0.5, x) before truncating
282  uint32x4_t neg = vcltq_f32(scaled, zero);
283  scaled = vaddq_f32(scaled, vbslq_f32(neg, neg_half, half));
284  int32x4_t intVal = vcvtq_s32_f32(scaled);
285  int16x4_t shortVal = vqmovn_s32(intVal);
286 
287  vst1_s16(iBufferPtr, shortVal);
288  iBufferPtr += 4;
289  }
290 
291  number = quarter_points * 4;
292  for (; number < num_points; number++) {
293  *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
294  complexVectorPtr++;
295  }
296 }
297 #endif /* LV_HAVE_NEON */
298 
299 #ifdef LV_HAVE_NEONV8
300 #include <arm_neon.h>
301 
302 static inline void
303 volk_32fc_s32f_deinterleave_real_16i_neonv8(int16_t* iBuffer,
304  const lv_32fc_t* complexVector,
305  const float scalar,
306  unsigned int num_points)
307 {
308  unsigned int number = 0;
309  const unsigned int eighth_points = num_points / 8;
310 
311  const float* complexVectorPtr = (float*)complexVector;
312  int16_t* iBufferPtr = iBuffer;
313  float32x4_t vScalar = vdupq_n_f32(scalar);
314 
315  for (; number < eighth_points; number++) {
316  float32x4x2_t input0 = vld2q_f32(complexVectorPtr);
317  float32x4x2_t input1 = vld2q_f32(complexVectorPtr + 8);
318  complexVectorPtr += 16;
319  __VOLK_PREFETCH(complexVectorPtr + 16);
320 
321  float32x4_t scaled0 = vmulq_f32(input0.val[0], vScalar);
322  float32x4_t scaled1 = vmulq_f32(input1.val[0], vScalar);
323 
324  int32x4_t intVal0 = vcvtnq_s32_f32(scaled0);
325  int32x4_t intVal1 = vcvtnq_s32_f32(scaled1);
326 
327  int16x4_t shortVal0 = vqmovn_s32(intVal0);
328  int16x4_t shortVal1 = vqmovn_s32(intVal1);
329 
330  vst1_s16(iBufferPtr, shortVal0);
331  vst1_s16(iBufferPtr + 4, shortVal1);
332  iBufferPtr += 8;
333  }
334 
335  number = eighth_points * 8;
336  for (; number < num_points; number++) {
337  *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
338  complexVectorPtr++;
339  }
340 }
341 #endif /* LV_HAVE_NEONV8 */
342 
343 #ifdef LV_HAVE_RVV
344 #include <riscv_vector.h>
345 
346 static inline void
347 volk_32fc_s32f_deinterleave_real_16i_rvv(int16_t* iBuffer,
348  const lv_32fc_t* complexVector,
349  const float scalar,
350  unsigned int num_points)
351 {
352  const uint64_t* in = (const uint64_t*)complexVector;
353  size_t n = num_points;
354  for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
355  vl = __riscv_vsetvl_e64m8(n);
356  vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl);
357  vfloat32m4_t vif = __riscv_vfmul(__riscv_vreinterpret_f32m4(vi), scalar, vl);
358  __riscv_vse16(iBuffer, __riscv_vncvt_x(__riscv_vfcvt_x(vif, vl), vl), vl);
359  }
360 }
361 #endif /*LV_HAVE_RVV*/
362 
363 #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */
volk_32fc_s32f_deinterleave_real_16i_neon
static void volk_32fc_s32f_deinterleave_real_16i_neon(int16_t *iBuffer, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_deinterleave_real_16i.h:260
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_s32f_deinterleave_real_16i_a_sse
static void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t *iBuffer, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_deinterleave_real_16i.h:127
volk_common.h
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_s32f_deinterleave_real_16i_generic
static void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_deinterleave_real_16i.h:177
bit128::f
float f[4]
Definition: volk_common.h:120
rintf
static float rintf(float x)
Definition: config.h:45