Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16i_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_16i_convert_8i_u_H
41 #define INCLUDED_volk_16i_convert_8i_u_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
49 static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
50  const int16_t* inputVector,
51  unsigned int num_points)
52 {
53  unsigned int number = 0;
54  const unsigned int thirtysecondPoints = num_points / 32;
55 
56  int8_t* outputVectorPtr = outputVector;
57  int16_t* inputPtr = (int16_t*)inputVector;
58  __m256i inputVal1;
59  __m256i inputVal2;
60  __m256i ret;
61 
62  for (; number < thirtysecondPoints; number++) {
63 
64  // Load the 16 values
65  inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr);
66  inputPtr += 16;
67  inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr);
68  inputPtr += 16;
69 
70  inputVal1 = _mm256_srai_epi16(inputVal1, 8);
71  inputVal2 = _mm256_srai_epi16(inputVal2, 8);
72 
73  ret = _mm256_packs_epi16(inputVal1, inputVal2);
74  ret = _mm256_permute4x64_epi64(ret, 0b11011000);
75 
76  _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
77 
78  outputVectorPtr += 32;
79  }
80 
81  number = thirtysecondPoints * 32;
82  for (; number < num_points; number++) {
83  outputVector[number] = (int8_t)(inputVector[number] >> 8);
84  }
85 }
86 #endif /* LV_HAVE_AVX2 */
87 
88 #ifdef LV_HAVE_AVX512BW
89 #include <immintrin.h>
90 
91 static inline void volk_16i_convert_8i_u_avx512bw(int8_t* outputVector,
92  const int16_t* inputVector,
93  unsigned int num_points)
94 {
95  unsigned int number = 0;
96  const unsigned int sixtyfourthPoints = num_points / 64;
97 
98  int8_t* outputVectorPtr = outputVector;
99  int16_t* inputPtr = (int16_t*)inputVector;
100  __m512i inputVal1;
101  __m512i inputVal2;
102  __m512i shifted1, shifted2;
103  __m256i ret1, ret2;
104 
105  for (; number < sixtyfourthPoints; number++) {
106 
107  // Load 64 int16 values
108  inputVal1 = _mm512_loadu_si512((__m512i*)inputPtr);
109  inputPtr += 32;
110  inputVal2 = _mm512_loadu_si512((__m512i*)inputPtr);
111  inputPtr += 32;
112 
113  shifted1 = _mm512_srai_epi16(inputVal1, 8);
114  shifted2 = _mm512_srai_epi16(inputVal2, 8);
115 
116  ret1 = _mm512_cvtsepi16_epi8(shifted1);
117  ret2 = _mm512_cvtsepi16_epi8(shifted2);
118 
119  _mm256_storeu_si256((__m256i*)outputVectorPtr, ret1);
120  outputVectorPtr += 32;
121  _mm256_storeu_si256((__m256i*)outputVectorPtr, ret2);
122  outputVectorPtr += 32;
123  }
124 
125  number = sixtyfourthPoints * 64;
126  for (; number < num_points; number++) {
127  outputVector[number] = (int8_t)(inputVector[number] >> 8);
128  }
129 }
130 #endif /* LV_HAVE_AVX512BW */
131 
132 
133 #ifdef LV_HAVE_SSE2
134 #include <emmintrin.h>
135 
/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the high
 * byte of every input value (arithmetic shift right by 8).
 *
 * SSE2 variant using unaligned loads and stores.
 *
 * outputVector: destination buffer, receives num_points int8_t values.
 * inputVector:  source buffer holding num_points int16_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    __m128i inputVal1;
    __m128i inputVal2;
    __m128i ret;

    for (; number < sixteenthPoints; number++) {

        // Load 16 int16 values (two 128-bit vectors of 8 values each)
        inputVal1 = _mm_loadu_si128((const __m128i*)inputPtr);
        inputPtr += 8;
        inputVal2 = _mm_loadu_si128((const __m128i*)inputPtr);
        inputPtr += 8;

        // Keep the high byte; every lane is now within [-128, 127]
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        // Narrow both vectors into one vector of 16 int8 values
        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, ret);

        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a whole 16-sample block
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
172 #endif /* LV_HAVE_SSE2 */
173 
174 
175 #ifdef LV_HAVE_GENERIC
176 
/*
 * Portable reference kernel: writes the high byte of each 16-bit signed
 * input sample (arithmetic shift right by 8) to the 8-bit output.
 *
 * outputVector: destination buffer, receives num_points int8_t values.
 * inputVector:  source buffer holding num_points int16_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
                                               const int16_t* inputVector,
                                               unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        const int16_t sample = inputVector[idx];
        outputVector[idx] = (int8_t)(sample >> 8);
    }
}
189 #endif /* LV_HAVE_GENERIC */
190 
191 
192 #endif /* INCLUDED_volk_16i_convert_8i_u_H */
193 #ifndef INCLUDED_volk_16i_convert_8i_a_H
194 #define INCLUDED_volk_16i_convert_8i_a_H
195 
196 #include <inttypes.h>
197 #include <stdio.h>
198 
199 #ifdef LV_HAVE_AVX2
200 #include <immintrin.h>
201 
202 static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
203  const int16_t* inputVector,
204  unsigned int num_points)
205 {
206  unsigned int number = 0;
207  const unsigned int thirtysecondPoints = num_points / 32;
208 
209  int8_t* outputVectorPtr = outputVector;
210  int16_t* inputPtr = (int16_t*)inputVector;
211  __m256i inputVal1;
212  __m256i inputVal2;
213  __m256i ret;
214 
215  for (; number < thirtysecondPoints; number++) {
216 
217  // Load the 16 values
218  inputVal1 = _mm256_load_si256((__m256i*)inputPtr);
219  inputPtr += 16;
220  inputVal2 = _mm256_load_si256((__m256i*)inputPtr);
221  inputPtr += 16;
222 
223  inputVal1 = _mm256_srai_epi16(inputVal1, 8);
224  inputVal2 = _mm256_srai_epi16(inputVal2, 8);
225 
226  ret = _mm256_packs_epi16(inputVal1, inputVal2);
227  ret = _mm256_permute4x64_epi64(ret, 0b11011000);
228 
229  _mm256_store_si256((__m256i*)outputVectorPtr, ret);
230 
231  outputVectorPtr += 32;
232  }
233 
234  number = thirtysecondPoints * 32;
235  for (; number < num_points; number++) {
236  outputVector[number] = (int8_t)(inputVector[number] >> 8);
237  }
238 }
239 #endif /* LV_HAVE_AVX2 */
240 
241 #ifdef LV_HAVE_AVX512BW
242 #include <immintrin.h>
243 
244 static inline void volk_16i_convert_8i_a_avx512bw(int8_t* outputVector,
245  const int16_t* inputVector,
246  unsigned int num_points)
247 {
248  unsigned int number = 0;
249  const unsigned int sixtyfourthPoints = num_points / 64;
250 
251  int8_t* outputVectorPtr = outputVector;
252  int16_t* inputPtr = (int16_t*)inputVector;
253  __m512i inputVal1;
254  __m512i inputVal2;
255  __m512i shifted1, shifted2;
256  __m256i ret1, ret2;
257 
258  for (; number < sixtyfourthPoints; number++) {
259 
260  // Load 64 int16 values
261  inputVal1 = _mm512_load_si512((__m512i*)inputPtr);
262  inputPtr += 32;
263  inputVal2 = _mm512_load_si512((__m512i*)inputPtr);
264  inputPtr += 32;
265 
266  shifted1 = _mm512_srai_epi16(inputVal1, 8);
267  shifted2 = _mm512_srai_epi16(inputVal2, 8);
268 
269  ret1 = _mm512_cvtsepi16_epi8(shifted1);
270  ret2 = _mm512_cvtsepi16_epi8(shifted2);
271 
272  _mm256_store_si256((__m256i*)outputVectorPtr, ret1);
273  outputVectorPtr += 32;
274  _mm256_store_si256((__m256i*)outputVectorPtr, ret2);
275  outputVectorPtr += 32;
276  }
277 
278  number = sixtyfourthPoints * 64;
279  for (; number < num_points; number++) {
280  outputVector[number] = (int8_t)(inputVector[number] >> 8);
281  }
282 }
283 #endif /* LV_HAVE_AVX512BW */
284 
285 
286 #ifdef LV_HAVE_SSE2
287 #include <emmintrin.h>
288 
/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the high
 * byte of every input value (arithmetic shift right by 8).
 *
 * SSE2 variant using aligned 128-bit loads and stores, so both buffers must be
 * 16-byte aligned.
 *
 * outputVector: destination buffer, receives num_points int8_t values.
 * inputVector:  source buffer holding num_points int16_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    __m128i inputVal1;
    __m128i inputVal2;
    __m128i ret;

    for (; number < sixteenthPoints; number++) {

        // Load 16 int16 values (two 128-bit vectors of 8 values each)
        inputVal1 = _mm_load_si128((const __m128i*)inputPtr);
        inputPtr += 8;
        inputVal2 = _mm_load_si128((const __m128i*)inputPtr);
        inputPtr += 8;

        // Keep the high byte; every lane is now within [-128, 127]
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        // Narrow both vectors into one vector of 16 int8 values
        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, ret);

        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a whole 16-sample block
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
325 #endif /* LV_HAVE_SSE2 */
326 
327 
328 #ifdef LV_HAVE_NEON
329 #include <arm_neon.h>
330 
/*
 * NEON kernel: writes the high byte of each 16-bit signed input sample
 * (shift right by 8, then narrow) to the 8-bit output, 16 samples per
 * vector iteration, followed by a scalar tail.
 *
 * outputVector: destination buffer, receives num_points int8_t values.
 * inputVector:  source buffer holding num_points int16_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
                                            const int16_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;
    const int16_t* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < blocks; ++blk) {
        // Two vectors of eight int16 samples each
        const int16x8_t first = vld1q_s16(src);
        const int16x8_t second = vld1q_s16(src + 8);

        // Narrowing shift keeps the high byte of every lane
        const int8x8_t firstNarrow = vshrn_n_s16(first, 8);
        const int8x8_t secondNarrow = vshrn_n_s16(second, 8);

        // Join the halves and store 16 output bytes at once
        vst1q_s8(dst, vcombine_s8(firstNarrow, secondNarrow));

        src += 16;
        dst += 16;
    }

    // Scalar tail for the samples that do not fill a whole 16-sample block
    for (unsigned int idx = blocks * 16; idx < num_points; ++idx) {
        const int16_t sample = *src++;
        *dst++ = (int8_t)(sample >> 8);
    }
}
364 #endif /* LV_HAVE_NEON */
365 
366 #ifdef LV_HAVE_NEONV8
367 #include <arm_neon.h>
368 
/*
 * NEON (ARMv8) kernel: writes the high byte of each 16-bit signed input
 * sample (shift right by 8, then narrow) to the 8-bit output, 32 samples per
 * vector iteration, followed by a scalar tail.
 *
 * outputVector: destination buffer, receives num_points int8_t values.
 * inputVector:  source buffer holding num_points int16_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_16i_convert_8i_neonv8(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int blocks = num_points / 32;
    const int16_t* src = inputVector;
    int8_t* dst = outputVector;
    unsigned int blk;
    unsigned int idx;

    for (blk = 0; blk < blocks; ++blk) {
        // Four vectors of eight int16 samples each
        const int16x8_t a = vld1q_s16(src);
        const int16x8_t b = vld1q_s16(src + 8);
        const int16x8_t c = vld1q_s16(src + 16);
        const int16x8_t d = vld1q_s16(src + 24);
        // Hint at input 64 samples ahead of the current block
        __VOLK_PREFETCH(src + 64);

        // Narrowing shift keeps the high byte of every lane
        const int8x8_t aNarrow = vshrn_n_s16(a, 8);
        const int8x8_t bNarrow = vshrn_n_s16(b, 8);
        const int8x8_t cNarrow = vshrn_n_s16(c, 8);
        const int8x8_t dNarrow = vshrn_n_s16(d, 8);

        // Two 16-byte stores cover the 32 converted samples
        vst1q_s8(dst, vcombine_s8(aNarrow, bNarrow));
        vst1q_s8(dst + 16, vcombine_s8(cNarrow, dNarrow));

        src += 32;
        dst += 32;
    }

    // Scalar tail for the samples that do not fill a whole 32-sample block
    for (idx = blocks * 32; idx < num_points; ++idx) {
        const int16_t sample = *src++;
        *dst++ = (int8_t)(sample >> 8);
    }
}
400 #endif /* LV_HAVE_NEONV8 */
401 
402 #ifdef LV_HAVE_RVV
403 #include <riscv_vector.h>
404 
/*
 * RISC-V Vector kernel: writes the high byte of each 16-bit signed input
 * sample (narrowing arithmetic shift right by 8) to the 8-bit output. Each
 * pass handles as many samples as the vector length returned by vsetvl.
 *
 * outputVector: destination buffer, receives num_points int8_t values.
 * inputVector:  source buffer holding num_points int16_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_16i_convert_8i_rvv(int8_t* outputVector,
                                           const int16_t* inputVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        // Number of elements processed in this pass
        size_t vl = __riscv_vsetvl_e16m8(remaining);

        vint16m8_t wide = __riscv_vle16_v_i16m8(inputVector, vl);
        __riscv_vse8(outputVector, __riscv_vnsra(wide, 8, vl), vl);

        inputVector += vl;
        outputVector += vl;
        remaining -= vl;
    }
}
416 #endif /*LV_HAVE_RVV*/
417 
418 #endif /* INCLUDED_volk_16i_convert_8i_a_H */
volk_16i_convert_8i_neon
static void volk_16i_convert_8i_neon(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:331
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_16i_convert_8i_u_sse2
static void volk_16i_convert_8i_u_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:136
volk_16i_convert_8i_generic
static void volk_16i_convert_8i_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:177
volk_16i_convert_8i_a_sse2
static void volk_16i_convert_8i_a_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:289