Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_16u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_16u_byteswap_u_H
41 #define INCLUDED_volk_16u_byteswap_u_H
42 
43 #include <inttypes.h>
44 
45 #ifdef LV_HAVE_GENERIC
46 
/*
 * Portable in-place byte swap of 16-bit words.
 *
 * intsToSwap  buffer of 16-bit words; every processed word is overwritten
 *             with its two bytes exchanged.
 * num_points  number of 16-bit words to process; 0 leaves the buffer alone.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        const uint16_t word = intsToSwap[idx];
        /* High byte moves down, low byte moves up; the masks keep the
         * promoted int arithmetic confined to 16 bits. */
        intsToSwap[idx] = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
    }
}
58 #endif /* LV_HAVE_GENERIC */
59 
60 
61 #if LV_HAVE_AVX2
62 #include <immintrin.h>
/*
 * In-place byte swap of 16-bit words using AVX2 with aligned memory access.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 *             It is read and written with _mm256_load_si256 /
 *             _mm256_store_si256, which require a 32-byte aligned address.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
{
    /* Byte-shuffle control: every adjacent byte pair is exchanged. The
     * shuffle works per 128-bit lane and uses only the low four bits of
     * each index, so entries 16..31 address the upper lane's own bytes. */
    const uint8_t swapPairs[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                    10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                    23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
    const __m256i swapMask = _mm256_loadu_si256((__m256i*)&swapPairs[0]);

    const unsigned int vectorCount = num_points / 16;
    const unsigned int tailCount = num_points % 16;

    /* Sixteen words (one 256-bit vector) are swapped per iteration. */
    __m256i* vecPtr = (__m256i*)intsToSwap;
    for (unsigned int v = 0; v < vectorCount; ++v, ++vecPtr) {
        const __m256i words = _mm256_load_si256(vecPtr);
        _mm256_store_si256(vecPtr, _mm256_shuffle_epi8(words, swapMask));
    }

    /* Scalar swap for the words that do not fill a whole vector. */
    uint16_t* tail = (uint16_t*)vecPtr;
    for (unsigned int t = 0; t < tailCount; ++t) {
        const uint16_t word = tail[t];
        tail[t] = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
    }
}
94 #endif /* LV_HAVE_AVX2 */
95 
96 
97 #if LV_HAVE_AVX2
98 #include <immintrin.h>
/*
 * In-place byte swap of 16-bit words using AVX2 with unaligned memory access.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 *             It is accessed with _mm256_loadu_si256 / _mm256_storeu_si256,
 *             so no 32-byte alignment is needed.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
{
    /* Byte-shuffle control: every adjacent byte pair is exchanged. The
     * shuffle works per 128-bit lane and uses only the low four bits of
     * each index, so entries 16..31 address the upper lane's own bytes. */
    const uint8_t swapPairs[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                    10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                    23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
    const __m256i swapMask = _mm256_loadu_si256((__m256i*)&swapPairs[0]);

    uint16_t* cursor = intsToSwap;

    /* Sixteen words (one 256-bit vector) are swapped per iteration. */
    for (unsigned int blocksLeft = num_points / 16; blocksLeft > 0; --blocksLeft) {
        __m256i* const vec = (__m256i*)cursor;
        const __m256i words = _mm256_loadu_si256(vec);
        _mm256_storeu_si256(vec, _mm256_shuffle_epi8(words, swapMask));
        cursor += 16;
    }

    /* Scalar swap for the words that do not fill a whole vector. */
    for (unsigned int wordsLeft = num_points % 16; wordsLeft > 0; --wordsLeft) {
        const uint16_t word = *cursor;
        *cursor++ = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
    }
}
130 #endif /* LV_HAVE_AVX2 */
131 
132 
133 #ifdef LV_HAVE_SSE2
134 #include <emmintrin.h>
135 
/*
 * In-place byte swap of 16-bit words using SSE2 with unaligned memory access.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 *             It is accessed with _mm_loadu_si128 / _mm_storeu_si128, so no
 *             16-byte alignment is needed.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    uint16_t* cursor = intsToSwap;

    /* Eight words (one 128-bit vector) are swapped per iteration. */
    for (unsigned int blocksLeft = num_points / 8; blocksLeft > 0; --blocksLeft) {
        __m128i* const vec = (__m128i*)cursor;
        const __m128i words = _mm_loadu_si128(vec);
        /* Per 16-bit lane: low byte shifted up OR high byte shifted down. */
        const __m128i lowToHigh = _mm_slli_epi16(words, 8);
        const __m128i highToLow = _mm_srli_epi16(words, 8);
        _mm_storeu_si128(vec, _mm_or_si128(lowToHigh, highToLow));
        cursor += 8;
    }

    /* Scalar swap for the words that do not fill a whole vector. */
    for (unsigned int wordsLeft = num_points % 8; wordsLeft > 0; --wordsLeft) {
        const uint16_t word = *cursor;
        *cursor++ = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
    }
}
163 #endif /* LV_HAVE_SSE2 */
164 
165 
166 #endif /* INCLUDED_volk_16u_byteswap_u_H */
167 #ifndef INCLUDED_volk_16u_byteswap_a_H
168 #define INCLUDED_volk_16u_byteswap_a_H
169 
170 #include <inttypes.h>
171 
172 #ifdef LV_HAVE_SSE2
173 #include <emmintrin.h>
174 
/*
 * In-place byte swap of 16-bit words using SSE2 with aligned memory access.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 *             It is accessed with _mm_load_si128 / _mm_store_si128, which
 *             require a 16-byte aligned address.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int vectorCount = num_points / 8;

    /* Eight words (one 128-bit vector) are swapped per iteration. */
    __m128i* vecPtr = (__m128i*)intsToSwap;
    for (unsigned int v = 0; v < vectorCount; ++v, ++vecPtr) {
        const __m128i words = _mm_load_si128(vecPtr);
        /* Per 16-bit lane: low byte shifted up OR high byte shifted down. */
        const __m128i lowToHigh = _mm_slli_epi16(words, 8);
        const __m128i highToLow = _mm_srli_epi16(words, 8);
        _mm_store_si128(vecPtr, _mm_or_si128(lowToHigh, highToLow));
    }

    /* The generic kernel finishes the words that do not fill a vector. */
    volk_16u_byteswap_generic((uint16_t*)vecPtr, num_points - vectorCount * 8);
}
197 #endif /* LV_HAVE_SSE2 */
198 
199 #ifdef LV_HAVE_NEON
200 #include <arm_neon.h>
201 
/*
 * In-place byte swap of 16-bit words using NEON shift-and-insert.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int vectorCount = num_points / 8;
    uint16_t* cursor = intsToSwap;

    /* Accumulator for the insert operations. It is carried across
     * iterations, but the two inserts below together rewrite all 16 bits of
     * every lane, so no earlier value survives into a stored result. */
    uint16x8_t swapped = vdupq_n_u16(0);

    for (unsigned int v = 0; v < vectorCount; ++v) {
        const uint16x8_t words = vld1q_u16(cursor);
        /* Shift-right-insert puts the source high byte in the low byte... */
        swapped = vsriq_n_u16(swapped, words, 8);
        /* ...and shift-left-insert puts the source low byte in the high byte. */
        swapped = vsliq_n_u16(swapped, words, 8);
        vst1q_u16(cursor, swapped);
        cursor += 8;
    }

    /* The generic kernel finishes the words that do not fill a vector. */
    volk_16u_byteswap_generic(cursor, num_points - vectorCount * 8);
}
219 #endif /* LV_HAVE_NEON */
220 
221 #ifdef LV_HAVE_NEON
222 #include <arm_neon.h>
223 
/*
 * In-place byte swap of 16-bit words using a NEON table lookup.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    uint16_t* cursor = intsToSwap;

    /* Byte indices into the 32-byte table produced by vld4_u8, packed
     * little-endian into 64-bit constants. vld4_u8 de-interleaves with a
     * stride of four, so source byte b sits at table index (b % 4) * 8 + b / 4.
     * The first constant therefore holds {8, 0, 24, 16, 9, 1, 25, 17}, which
     * selects source bytes 1, 0, 3, 2, 5, 4, 7, 6. Each following constant
     * adds 2 to every index to cover the next eight source bytes. */
    const uint8x8_t pick0 = vcreate_u8(0x1119010910180008ULL);
    const uint8x8_t pick1 = vcreate_u8(0x131B030B121A020AULL);
    const uint8x8_t pick2 = vcreate_u8(0x151D050D141C040CULL);
    const uint8x8_t pick3 = vcreate_u8(0x171F070F161E060EULL);

    /* Sixteen words (32 bytes) are swapped per iteration. */
    for (unsigned int blk = 0; blk < blockCount; ++blk) {
        uint8_t* const bytes = (uint8_t*)cursor;
        const uint8x8x4_t table = vld4_u8(bytes);

        /* All four lookups read the table loaded above, so storing over
         * the source bytes afterwards is safe. */
        const uint8x8_t out0 = vtbl4_u8(table, pick0);
        const uint8x8_t out1 = vtbl4_u8(table, pick1);
        const uint8x8_t out2 = vtbl4_u8(table, pick2);
        const uint8x8_t out3 = vtbl4_u8(table, pick3);

        vst1_u8(bytes, out0);
        vst1_u8((uint8_t*)(cursor + 4), out1);
        vst1_u8((uint8_t*)(cursor + 8), out2);
        vst1_u8((uint8_t*)(cursor + 12), out3);

        cursor += 16;
    }

    /* The generic kernel finishes the words that do not fill a block. */
    volk_16u_byteswap_generic(cursor, num_points - blockCount * 16);
}
263 #endif /* LV_HAVE_NEON */
264 
265 #ifdef LV_HAVE_NEONV8
266 #include <arm_neon.h>
267 
/*
 * In-place byte swap of 16-bit words using the NEON vrev16q_u8 byte reversal.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_neonv8(uint16_t* intsToSwap, unsigned int num_points)
{
    uint16_t* cursor = intsToSwap;

    /* Sixteen words (two 128-bit vectors) are swapped per iteration. */
    for (unsigned int blocksLeft = num_points / 16; blocksLeft > 0; --blocksLeft) {
        uint8_t* const firstHalf = (uint8_t*)cursor;
        uint8_t* const secondHalf = (uint8_t*)(cursor + 8);

        const uint8x16_t lower = vld1q_u8((const uint8_t*)firstHalf);
        const uint8x16_t upper = vld1q_u8((const uint8_t*)secondHalf);
        /* Prefetch hint for data two iterations (32 words) ahead. */
        __VOLK_PREFETCH(cursor + 32);

        /* vrev16q_u8 reverses the bytes inside each 16-bit element. */
        vst1q_u8(firstHalf, vrev16q_u8(lower));
        vst1q_u8(secondHalf, vrev16q_u8(upper));

        cursor += 16;
    }

    /* Scalar swap for the words that do not fill a whole block. */
    for (unsigned int wordsLeft = num_points % 16; wordsLeft > 0; --wordsLeft) {
        const uint16_t word = *cursor;
        *cursor++ = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
    }
}
291 #endif /* LV_HAVE_NEONV8 */
292 
293 #ifdef LV_HAVE_ORC
294 
/* Implemented outside this header; presumably generated from an ORC
 * program -- confirm against the build. Note that it takes a signed count. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, int num_points);

/*
 * In-place byte swap of 16-bit words, delegated to the ORC implementation.
 *
 * intsToSwap  buffer of 16-bit words, passed through unchanged.
 * num_points  number of 16-bit words to process; converted to int for the
 *             callee.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
{
    const int pointCount = (int)num_points;
    volk_16u_byteswap_a_orc_impl(intsToSwap, pointCount);
}
300 #endif /* LV_HAVE_ORC */
301 
302 #ifdef LV_HAVE_RVV
303 #include <riscv_vector.h>
305 
/*
 * In-place byte swap of 16-bit words using RISC-V vector gathers.
 *
 * A gather index vector that exchanges every adjacent byte pair is built
 * once, then applied to each strip of data through RISCV_PERM8.
 * NOTE(review): RISCV_PERM8 is defined elsewhere; presumably it applies the
 * given gather to each single-register part of the m8 group -- confirm.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_rvv(uint16_t* intsToSwap, unsigned int num_points)
{
    size_t n = num_points;
    const size_t vlmax = __riscv_vsetvlmax_e8m1();

    if (vlmax <= 256) {
        /* Byte indices 0,1,2,3,... viewed as 16-bit lanes hold
         * ((2k+1) << 8) | 2k; subtracting 0xFF yields (2k << 8) | (2k+1),
         * i.e. the index pair swapped. 8-bit indices are enough here because
         * one register holds at most 256 bytes. */
        vuint16m1_t pairIds = __riscv_vreinterpret_u16m1(__riscv_vid_v_u8m1(vlmax));
        vuint8m1_t vidx =
            __riscv_vreinterpret_u8m1(__riscv_vsub(pairIds, 0x100 - 0x1, vlmax / 2));

        while (n > 0) {
            const size_t vl = __riscv_vsetvl_e16m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgather, v, vidx);
            __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
            intsToSwap += vl;
            n -= vl;
        }
    } else {
        /* Same construction with 16-bit indices for registers holding more
         * than 256 bytes: 32-bit lanes hold ((2k+1) << 16) | 2k, and
         * subtracting 0xFFFF swaps each index pair. */
        vuint32m2_t pairIds = __riscv_vreinterpret_u32m2(__riscv_vid_v_u16m2(vlmax));
        vuint16m2_t vidx =
            __riscv_vreinterpret_u16m2(__riscv_vsub(pairIds, 0x10000 - 0x1, vlmax / 2));

        while (n > 0) {
            const size_t vl = __riscv_vsetvl_e16m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
            __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
            intsToSwap += vl;
            n -= vl;
        }
    }
}
336 #endif /* LV_HAVE_RVV */
337 
338 #ifdef LV_HAVE_RVA23
339 #include <riscv_vector.h>
340 
/*
 * In-place byte swap of 16-bit words using the RISC-V vector vrev8
 * instruction, which reverses the bytes inside each element.
 *
 * intsToSwap  buffer of 16-bit words, overwritten with the swapped values.
 * num_points  number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_rva23(uint16_t* intsToSwap, unsigned int num_points)
{
    size_t remaining = num_points;

    /* Strip-mine: each pass handles as many words as the hardware grants. */
    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e16m8(remaining);
        vuint16m8_t words = __riscv_vle16_v_u16m8(intsToSwap, vl);
        __riscv_vse16(intsToSwap, __riscv_vrev8(words, vl), vl);
        intsToSwap += vl;
        remaining -= vl;
    }
}
350 #endif /* LV_HAVE_RVA23 */
351 
352 #endif /* INCLUDED_volk_16u_byteswap_a_H */
RISCV_PERM8
#define RISCV_PERM8(f, v, vidx)
Definition: volk_rvv_intrinsics.h:64
volk_16u_byteswap_neon_table
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:224
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_16u_byteswap_neon
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:202
volk_16u_byteswap_a_sse2
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:175
volk_rvv_intrinsics.h
volk_16u_byteswap_u_sse2
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:136
volk_16u_byteswap_generic
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:47