Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_8i_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_8i_convert_16i_u_H
41 #define INCLUDED_volk_8i_convert_16i_u_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256 (the input byte ends up in the high byte of the output).
 *
 * Blocks of 16 samples are processed with AVX2 using unaligned loads and
 * stores; the remaining samples are handled by a scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const __m128i* src = (const __m128i*)inputVector;
    __m256i* dst = (__m256i*)outputVector;

    for (unsigned int blk = 0; blk < full_blocks; ++blk) {
        // Sign-extend 16 bytes to 16 words, then shift left by 8 (x256).
        const __m128i bytes = _mm_loadu_si128(src + blk);
        const __m256i words = _mm256_slli_epi16(_mm256_cvtepi8_epi16(bytes), 8);
        _mm256_storeu_si256(dst + blk, words);
    }

    // Scalar tail: samples that do not fill a complete 16-element block.
    for (unsigned int idx = full_blocks * 16; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
76 #endif /* LV_HAVE_AVX2 */
77 
78 #ifdef LV_HAVE_AVX512BW
79 #include <immintrin.h>
80 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256.
 *
 * Blocks of 32 samples are processed with AVX-512BW using unaligned loads
 * and stores; the remaining samples are handled by a scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_avx512bw(int16_t* outputVector,
                                                  const int8_t* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 32;
    const __m256i* src = (const __m256i*)inputVector;
    __m512i* dst = (__m512i*)outputVector;

    for (unsigned int blk = 0; blk < full_blocks; ++blk) {
        // Sign-extend 32 bytes to 32 words, then shift left by 8 (x256).
        const __m256i bytes = _mm256_loadu_si256(src + blk);
        const __m512i words = _mm512_slli_epi16(_mm512_cvtepi8_epi16(bytes), 8);
        _mm512_storeu_si512(dst + blk, words);
    }

    // Scalar tail: samples that do not fill a complete 32-element block.
    for (unsigned int idx = full_blocks * 32; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
108 #endif /* LV_HAVE_AVX512BW */
109 
110 
111 #ifdef LV_HAVE_SSE4_1
112 #include <smmintrin.h>
113 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256.
 *
 * Each 16-byte unaligned load is converted in two halves of eight samples,
 * producing two unaligned 128-bit stores; the remaining samples are handled
 * by a scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const __m128i* src = (const __m128i*)inputVector;
    __m128i* dst = (__m128i*)outputVector;

    for (unsigned int blk = 0; blk < full_blocks; ++blk, ++src, dst += 2) {
        const __m128i bytes = _mm_loadu_si128(src);

        // Low eight bytes: sign-extend to 16 bits, then shift left by 8 (x256).
        const __m128i low_words = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
        _mm_storeu_si128(dst, low_words);

        // Bring the high eight bytes down to the low half and repeat.
        const __m128i high_bytes = _mm_srli_si128(bytes, 8);
        const __m128i high_words = _mm_slli_epi16(_mm_cvtepi8_epi16(high_bytes), 8);
        _mm_storeu_si128(dst + 1, high_words);
    }

    // Scalar tail: samples that do not fill a complete 16-element block.
    for (unsigned int idx = full_blocks * 16; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
149 #endif /* LV_HAVE_SSE4_1 */
150 
151 
152 #ifdef LV_HAVE_GENERIC
153 
/*
 * Portable scalar kernel: converts num_points signed 8-bit samples into
 * signed 16-bit samples, each scaled by 256.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert; 0 leaves the output untouched.
 */
static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
                                               const int8_t* inputVector,
                                               unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        // Widen first so the multiplication is not done on an 8-bit value.
        outputVector[idx] = ((int16_t)inputVector[idx]) * 256;
    }
}
166 #endif /* LV_HAVE_GENERIC */
167 
168 
169 #endif /* INCLUDED_volk_8i_convert_16i_u_H */
170 
171 
172 #ifndef INCLUDED_volk_8i_convert_16i_a_H
173 #define INCLUDED_volk_8i_convert_16i_a_H
174 
175 #include <inttypes.h>
176 #include <stdio.h>
177 
178 #ifdef LV_HAVE_AVX2
179 #include <immintrin.h>
180 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256.
 *
 * Blocks of 16 samples are processed with AVX2 using aligned load/store
 * intrinsics, so inputVector must be 16-byte aligned and outputVector
 * 32-byte aligned. The remaining samples are handled by a scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const __m128i* src = (const __m128i*)inputVector;
    __m256i* dst = (__m256i*)outputVector;

    for (unsigned int blk = 0; blk < full_blocks; ++blk) {
        // Sign-extend 16 bytes to 16 words, then shift left by 8 (x256).
        const __m128i bytes = _mm_load_si128(src + blk);
        const __m256i words = _mm256_slli_epi16(_mm256_cvtepi8_epi16(bytes), 8);
        _mm256_store_si256(dst + blk, words);
    }

    // Scalar tail: samples that do not fill a complete 16-element block.
    for (unsigned int idx = full_blocks * 16; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
208 #endif /* LV_HAVE_AVX2 */
209 
210 #ifdef LV_HAVE_AVX512BW
211 #include <immintrin.h>
212 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256.
 *
 * Blocks of 32 samples are processed with AVX-512BW using aligned
 * load/store intrinsics, so inputVector must be 32-byte aligned and
 * outputVector 64-byte aligned. The remaining samples are handled by a
 * scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_avx512bw(int16_t* outputVector,
                                                  const int8_t* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 32;
    const __m256i* src = (const __m256i*)inputVector;
    __m512i* dst = (__m512i*)outputVector;

    for (unsigned int blk = 0; blk < full_blocks; ++blk) {
        // Sign-extend 32 bytes to 32 words, then shift left by 8 (x256).
        const __m256i bytes = _mm256_load_si256(src + blk);
        const __m512i words = _mm512_slli_epi16(_mm512_cvtepi8_epi16(bytes), 8);
        _mm512_store_si512(dst + blk, words);
    }

    // Scalar tail: samples that do not fill a complete 32-element block.
    for (unsigned int idx = full_blocks * 32; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
240 #endif /* LV_HAVE_AVX512BW */
241 
242 
243 #ifdef LV_HAVE_SSE4_1
244 #include <smmintrin.h>
245 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256.
 *
 * Each 16-byte aligned load is converted in two halves of eight samples,
 * producing two aligned 128-bit stores, so both inputVector and
 * outputVector must be 16-byte aligned. The remaining samples are handled
 * by a scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const __m128i* src = (const __m128i*)inputVector;
    __m128i* dst = (__m128i*)outputVector;

    for (unsigned int blk = 0; blk < full_blocks; ++blk, ++src, dst += 2) {
        const __m128i bytes = _mm_load_si128(src);

        // Low eight bytes: sign-extend to 16 bits, then shift left by 8 (x256).
        const __m128i low_words = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
        _mm_store_si128(dst, low_words);

        // Bring the high eight bytes down to the low half and repeat.
        const __m128i high_bytes = _mm_srli_si128(bytes, 8);
        const __m128i high_words = _mm_slli_epi16(_mm_cvtepi8_epi16(high_bytes), 8);
        _mm_store_si128(dst + 1, high_words);
    }

    // Scalar tail: samples that do not fill a complete 16-element block.
    for (unsigned int idx = full_blocks * 16; idx < num_points; ++idx) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
281 #endif /* LV_HAVE_SSE4_1 */
282 
283 
284 #ifdef LV_HAVE_NEON
285 #include <arm_neon.h>
286 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256.
 *
 * Blocks of eight samples are loaded as an int8x8_t, widened to int16x8_t
 * and shifted left by 8; the remaining samples are handled by a scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
                                            const int8_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int vector_len = (num_points / 8) * 8;
    unsigned int idx = 0;

    for (; idx < vector_len; idx += 8) {
        // Widen eight bytes to 16-bit lanes, then shift left by 8 (x256).
        const int8x8_t bytes = vld1_s8(inputVector + idx);
        const int16x8_t words = vshlq_n_s16(vmovl_s8(bytes), 8);
        vst1q_s16(outputVector + idx, words);
    }

    // Scalar tail: samples that do not fill a complete 8-element block.
    for (; idx < num_points; ++idx) {
        outputVector[idx] = ((int16_t)inputVector[idx]) * 256;
    }
}
317 #endif /* LV_HAVE_NEON */
318 
319 #ifdef LV_HAVE_NEONV8
320 #include <arm_neon.h>
321 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256.
 *
 * Blocks of 16 samples are loaded as one int8x16_t; the low and high halves
 * are each widened and shifted left by 8 with vshll_n_s8. The remaining
 * samples are handled by a scalar loop.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_neonv8(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const int8_t* src = inputVector;
    int16_t* dst = outputVector;

    unsigned int blk = 0;
    while (blk < full_blocks) {
        const int8x16_t bytes = vld1q_s8(src);
        // Hint for data two blocks ahead of the one just loaded.
        __VOLK_PREFETCH(src + 32);

        // Widening shift: 8-bit lanes become 16-bit lanes shifted by 8 (x256).
        const int16x8_t low_words = vshll_n_s8(vget_low_s8(bytes), 8);
        const int16x8_t high_words = vshll_n_s8(vget_high_s8(bytes), 8);

        vst1q_s16(dst, low_words);
        vst1q_s16(dst + 8, high_words);

        src += 16;
        dst += 16;
        ++blk;
    }

    // Scalar tail: samples that do not fill a complete 16-element block.
    unsigned int idx = full_blocks * 16;
    while (idx < num_points) {
        *dst++ = ((int16_t)(*src++)) * 256;
        ++idx;
    }
}
348 #endif /* LV_HAVE_NEONV8 */
349 
350 
351 #ifdef LV_HAVE_ORC
/* ORC kernel implemented outside this header; note it takes a signed count. */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           int num_points);

/*
 * Thin wrapper that forwards the conversion to
 * volk_8i_convert_16i_a_orc_impl.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert; converted to int for the call.
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    // Same unsigned-to-int conversion the call would otherwise do implicitly.
    const int count = (int)num_points;
    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, count);
}
362 #endif /* LV_HAVE_ORC */
363 
364 #ifdef LV_HAVE_RVV
365 #include <riscv_vector.h>
366 
/*
 * Converts num_points signed 8-bit samples into signed 16-bit samples, each
 * scaled by 256, using the RISC-V vector extension.
 *
 * Each pass asks the hardware how many elements it will process (vl),
 * sign-extends that many bytes to 16 bits, shifts them left by 8 and stores
 * them, then advances both pointers by vl until nothing remains.
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_rvv(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e8m4(remaining);

        vint8m4_t bytes = __riscv_vle8_v_i8m4(inputVector, vl);
        vint16m8_t words = __riscv_vsext_vf2(bytes, vl);
        __riscv_vse16(outputVector, __riscv_vsll(words, 8, vl), vl);

        remaining -= vl;
        inputVector += vl;
        outputVector += vl;
    }
}
378 #endif /*LV_HAVE_RVV*/
379 
380 #endif /* INCLUDED_volk_8i_convert_16i_a_H */
volk_8i_convert_16i_generic
static void volk_8i_convert_16i_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:154
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_8i_convert_16i_neon
static void volk_8i_convert_16i_neon(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:287