Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_64u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
53 #ifndef INCLUDED_volk_64u_byteswap_u_H
54 #define INCLUDED_volk_64u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_SSE2
60 #include <emmintrin.h>
61 
/*!
 * \brief Reverses the byte order of every 64-bit value in place using SSE2
 *        with unaligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit values; overwritten with the swapped values.
 * \param num_points Number of 64-bit values in the buffer.
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    unsigned int number;

    /* Each 128-bit register holds two 64-bit values (four 32-bit words). */
    for (number = 0; number < halfPoints; number++) {
        /* The pointer is advanced only after the store: the swap is in place. */
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);

        /* Byteswap each 32-bit word: the outer bytes need only a shift, the
         * two inner bytes need a shift followed by a mask. */
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        /* Exchange the two 32-bit words of each 64-bit value. */
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    /* At most one value is left. It is swapped as a whole 64-bit integer so
     * the uint64_t buffer is never accessed through a uint32_t lvalue. */
    for (number = halfPoints * 2; number < num_points; number++) {
        const uint64_t v = *dataPtr;
        *dataPtr++ = (v << 56) | ((v & 0x000000000000ff00ULL) << 40) |
                     ((v & 0x0000000000ff0000ULL) << 24) |
                     ((v & 0x00000000ff000000ULL) << 8) |
                     ((v >> 8) & 0x00000000ff000000ULL) |
                     ((v >> 24) & 0x0000000000ff0000ULL) |
                     ((v >> 40) & 0x000000000000ff00ULL) | (v >> 56);
    }
}
110 #endif /* LV_HAVE_SSE2 */
111 
112 
113 #ifdef LV_HAVE_NEON
114 #include <arm_neon.h>
115 
116 static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
117 {
118  uint8_t* inputPtr = (uint8_t*)intsToSwap;
119  unsigned int number = 0;
120  const unsigned int eighth_points = num_points / 8;
121 
122  for (; number < eighth_points; number++) {
123  uint8x16_t input0 = vld1q_u8(inputPtr);
124  uint8x16_t input1 = vld1q_u8(inputPtr + 16);
125  uint8x16_t input2 = vld1q_u8(inputPtr + 32);
126  uint8x16_t input3 = vld1q_u8(inputPtr + 48);
127 
128  // Reverse bytes within each 64-bit element
129  uint8x16_t output0 = vrev64q_u8(input0);
130  uint8x16_t output1 = vrev64q_u8(input1);
131  uint8x16_t output2 = vrev64q_u8(input2);
132  uint8x16_t output3 = vrev64q_u8(input3);
133 
134  vst1q_u8(inputPtr, output0);
135  vst1q_u8(inputPtr + 16, output1);
136  vst1q_u8(inputPtr + 32, output2);
137  vst1q_u8(inputPtr + 48, output3);
138 
139  inputPtr += 64;
140  }
141 
142  // Handle remaining points
143  number = eighth_points * 8;
144  uint32_t* intPtr = (uint32_t*)(intsToSwap + number);
145  for (; number < num_points; number++) {
146  uint32_t output1 = *intPtr;
147  uint32_t output2 = intPtr[1];
148 
149  output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
150  ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
151 
152  output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
153  ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
154 
155  *intPtr++ = output2;
156  *intPtr++ = output1;
157  }
158 }
159 #endif /* LV_HAVE_NEON */
160 
161 
162 #ifdef LV_HAVE_GENERIC
163 
/*!
 * \brief Reverses the byte order of every 64-bit value in place (portable C).
 *
 * Each value is swapped as a whole 64-bit integer, so the result does not
 * depend on host endianness and the buffer is only ever accessed as uint64_t.
 *
 * \param intsToSwap Buffer of 64-bit values; overwritten with the swapped values.
 * \param num_points Number of 64-bit values in the buffer.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        const uint64_t v = intsToSwap[point];
        /* Byte k moves to position 7 - k. */
        intsToSwap[point] = (v << 56) | ((v & 0x000000000000ff00ULL) << 40) |
                            ((v & 0x0000000000ff0000ULL) << 24) |
                            ((v & 0x00000000ff000000ULL) << 8) |
                            ((v >> 8) & 0x00000000ff000000ULL) |
                            ((v >> 24) & 0x0000000000ff0000ULL) |
                            ((v >> 40) & 0x000000000000ff00ULL) | (v >> 56);
    }
}
183 #endif /* LV_HAVE_GENERIC */
184 
185 #if LV_HAVE_AVX2
186 #include <immintrin.h>
187 static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
188 {
189  unsigned int number = 0;
190 
191  const unsigned int nPerSet = 4;
192  const uint64_t nSets = num_points / nPerSet;
193 
194  uint32_t* inputPtr = (uint32_t*)intsToSwap;
195 
196  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
197  12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
198  17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
199 
200  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
201 
202  for (; number < nSets; number++) {
203 
204  // Load the 32t values, increment inputPtr later since we're doing it in-place.
205  const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
206  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
207 
208  // Store the results
209  _mm256_store_si256((__m256i*)inputPtr, output);
210 
211  /* inputPtr is 32bit so increment twice */
212  inputPtr += 2 * nPerSet;
213  }
214 
215  // Byteswap any remaining points:
216  for (number = nSets * nPerSet; number < num_points; ++number) {
217  uint32_t output1 = *inputPtr;
218  uint32_t output2 = inputPtr[1];
219  uint32_t out1 =
220  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
221  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
222 
223  uint32_t out2 =
224  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
225  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
226  *inputPtr++ = out2;
227  *inputPtr++ = out1;
228  }
229 }
230 
231 #endif /* LV_HAVE_AVX2 */
232 
233 
234 #if LV_HAVE_SSSE3
235 #include <tmmintrin.h>
236 static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
237  unsigned int num_points)
238 {
239  unsigned int number = 0;
240 
241  const unsigned int nPerSet = 2;
242  const uint64_t nSets = num_points / nPerSet;
243 
244  uint32_t* inputPtr = (uint32_t*)intsToSwap;
245 
246  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
247 
248  const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
249 
250  for (; number < nSets; number++) {
251 
252  // Load the 32t values, increment inputPtr later since we're doing it in-place.
253  const __m128i input = _mm_load_si128((__m128i*)inputPtr);
254  const __m128i output = _mm_shuffle_epi8(input, myShuffle);
255 
256  // Store the results
257  _mm_store_si128((__m128i*)inputPtr, output);
258 
259  /* inputPtr is 32bit so increment twice */
260  inputPtr += 2 * nPerSet;
261  }
262 
263  // Byteswap any remaining points:
264  for (number = nSets * nPerSet; number < num_points; ++number) {
265  uint32_t output1 = *inputPtr;
266  uint32_t output2 = inputPtr[1];
267  uint32_t out1 =
268  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
269  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
270 
271  uint32_t out2 =
272  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
273  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
274  *inputPtr++ = out2;
275  *inputPtr++ = out1;
276  }
277 }
278 #endif /* LV_HAVE_SSSE3 */
279 #endif /* INCLUDED_volk_64u_byteswap_u_H */
280 
281 
282 #ifndef INCLUDED_volk_64u_byteswap_a_H
283 #define INCLUDED_volk_64u_byteswap_a_H
284 
285 #include <inttypes.h>
286 #include <stdio.h>
287 
288 #ifdef LV_HAVE_SSE2
289 #include <emmintrin.h>
290 
/*!
 * \brief Reverses the byte order of every 64-bit value in place using SSE2
 *        with aligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit values; the aligned load/store intrinsics
 *                   used here require it to be 16-byte aligned.
 * \param num_points Number of 64-bit values in the buffer.
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    unsigned int number;

    /* Each 128-bit register holds two 64-bit values (four 32-bit words). */
    for (number = 0; number < halfPoints; number++) {
        /* The pointer is advanced only after the store: the swap is in place. */
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);

        /* Byteswap each 32-bit word: the outer bytes need only a shift, the
         * two inner bytes need a shift followed by a mask. */
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        /* Exchange the two 32-bit words of each 64-bit value. */
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    /* At most one value is left. It is swapped as a whole 64-bit integer so
     * the uint64_t buffer is never accessed through a uint32_t lvalue. */
    for (number = halfPoints * 2; number < num_points; number++) {
        const uint64_t v = *dataPtr;
        *dataPtr++ = (v << 56) | ((v & 0x000000000000ff00ULL) << 40) |
                     ((v & 0x0000000000ff0000ULL) << 24) |
                     ((v & 0x00000000ff000000ULL) << 8) |
                     ((v >> 8) & 0x00000000ff000000ULL) |
                     ((v >> 24) & 0x0000000000ff0000ULL) |
                     ((v >> 40) & 0x000000000000ff00ULL) | (v >> 56);
    }
}
339 #endif /* LV_HAVE_SSE2 */
340 
341 #if LV_HAVE_AVX2
342 #include <immintrin.h>
343 static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
344 {
345  unsigned int number = 0;
346 
347  const unsigned int nPerSet = 4;
348  const uint64_t nSets = num_points / nPerSet;
349 
350  uint32_t* inputPtr = (uint32_t*)intsToSwap;
351 
352  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
353  12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
354  17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
355 
356  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
357 
358  for (; number < nSets; number++) {
359  // Load the 32t values, increment inputPtr later since we're doing it in-place.
360  const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
361  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
362 
363  // Store the results
364  _mm256_storeu_si256((__m256i*)inputPtr, output);
365 
366  /* inputPtr is 32bit so increment twice */
367  inputPtr += 2 * nPerSet;
368  }
369 
370  // Byteswap any remaining points:
371  for (number = nSets * nPerSet; number < num_points; ++number) {
372  uint32_t output1 = *inputPtr;
373  uint32_t output2 = inputPtr[1];
374  uint32_t out1 =
375  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
376  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
377 
378  uint32_t out2 =
379  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
380  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
381  *inputPtr++ = out2;
382  *inputPtr++ = out1;
383  }
384 }
385 
386 #endif /* LV_HAVE_AVX2 */
387 
388 
389 #if LV_HAVE_SSSE3
390 #include <tmmintrin.h>
391 static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
392  unsigned int num_points)
393 {
394  unsigned int number = 0;
395 
396  const unsigned int nPerSet = 2;
397  const uint64_t nSets = num_points / nPerSet;
398 
399  uint32_t* inputPtr = (uint32_t*)intsToSwap;
400 
401  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
402 
403  const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
404 
405  for (; number < nSets; number++) {
406  // Load the 32t values, increment inputPtr later since we're doing it in-place.
407  const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
408  const __m128i output = _mm_shuffle_epi8(input, myShuffle);
409 
410  // Store the results
411  _mm_storeu_si128((__m128i*)inputPtr, output);
412 
413  /* inputPtr is 32bit so increment twice */
414  inputPtr += 2 * nPerSet;
415  }
416 
417  // Byteswap any remaining points:
418  for (number = nSets * nPerSet; number < num_points; ++number) {
419  uint32_t output1 = *inputPtr;
420  uint32_t output2 = inputPtr[1];
421  uint32_t out1 =
422  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
423  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
424 
425  uint32_t out2 =
426  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
427  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
428  *inputPtr++ = out2;
429  *inputPtr++ = out1;
430  }
431 }
432 #endif /* LV_HAVE_SSSE3 */
433 
434 
435 #ifdef LV_HAVE_RVV
436 #include <riscv_vector.h>
437 
438 static inline void volk_64u_byteswap_rvv(uint64_t* intsToSwap, unsigned int num_points)
439 {
440  size_t n = num_points;
441  size_t vlmax = __riscv_vsetvlmax_e8m1();
442  if (vlmax <= 256) {
443  vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
444  __riscv_vsub(__riscv_vreinterpret_u64m1(__riscv_vid_v_u8m1(vlmax)),
445  0x0706050403020100 - 0x1020304050607,
446  vlmax / 8));
447  for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
448  vl = __riscv_vsetvl_e64m8(n);
449  vuint8m8_t v =
450  __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
451  v = RISCV_PERM8(__riscv_vrgather, v, vidx);
452  __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
453  }
454  } else {
455  vuint16m2_t vid = __riscv_vid_v_u16m2(vlmax);
456  vuint16m2_t voff1 = __riscv_vand(vid, 0x7, vlmax);
457  vuint16m2_t voff2 = __riscv_vrsub(voff1, 0x7, vlmax);
458  vuint16m2_t vidx = __riscv_vadd(__riscv_vsub(vid, voff1, vlmax), voff2, vlmax);
459  for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
460  vl = __riscv_vsetvl_e64m8(n);
461  vuint8m8_t v =
462  __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
463  v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
464  __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
465  }
466  }
467 }
468 #endif /* LV_HAVE_RVV */
469 
470 #ifdef LV_HAVE_RVA23
471 #include <riscv_vector.h>
472 
473 static inline void volk_64u_byteswap_rva23(uint64_t* intsToSwap, unsigned int num_points)
474 {
475  size_t n = num_points;
476  for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
477  vl = __riscv_vsetvl_e64m8(n);
478  vuint64m8_t v = __riscv_vle64_v_u64m8(intsToSwap, vl);
479  __riscv_vse64(intsToSwap, __riscv_vrev8(v, vl), vl);
480  }
481 }
482 #endif /* LV_HAVE_RVA23 */
483 
484 #endif /* INCLUDED_volk_64u_byteswap_a_H */
volk_64u_byteswap_generic
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:164
RISCV_PERM8
#define RISCV_PERM8(f, v, vidx)
Definition: volk_rvv_intrinsics.h:64
volk_64u_byteswap_u_sse2
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:62
volk_64u_byteswap_neon
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:116
volk_64u_byteswap_a_ssse3
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:236
volk_64u_byteswap_u_ssse3
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:391
volk_64u_byteswap_a_sse2
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:291