Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_8u_x2_add_saturated_8u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_8u_x2_add_saturated_8u_u_H
53 #define INCLUDED_volk_8u_x2_add_saturated_8u_u_H
54 
55 #include <inttypes.h>
56 
57 #ifdef LV_HAVE_GENERIC
58 
/*
 * Element-wise saturating addition of two unsigned 8-bit vectors (portable C).
 *
 * For every i in [0, num_points):
 *     outVector[i] = min(inVectorA[i] + inVectorB[i], 255)
 *
 * outVector  - destination; num_points bytes are written
 * inVectorA  - first operand; num_points bytes are read
 * inVectorB  - second operand; num_points bytes are read
 * num_points - number of elements to process; 0 leaves outVector untouched
 */
static inline void volk_8u_x2_add_saturated_8u_generic(uint8_t* outVector,
                                                       const uint8_t* inVectorA,
                                                       const uint8_t* inVectorB,
                                                       unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        /* Sum reduced modulo 256. It is smaller than an operand exactly when
         * the true sum did not fit into 8 bits. */
        const uint8_t wrapped = (uint8_t)(inVectorA[i] + inVectorB[i]);

        /* Branch-free clamp: 0x00 when the sum fit, 0xFF when it wrapped.
         * The comparison yields 0 or 1; negating and narrowing gives the mask. */
        const uint8_t clamp = (uint8_t)-(wrapped < inVectorA[i]);

        outVector[i] = (uint8_t)(wrapped | clamp);
    }
}
69 
70 #endif /* LV_HAVE_GENERIC */
71 
72 
73 #ifdef LV_HAVE_SSE2
74 #include <emmintrin.h>
75 
76 static inline void volk_8u_x2_add_saturated_8u_u_sse2(uint8_t* outVector,
77  const uint8_t* inVectorA,
78  const uint8_t* inVectorB,
79  unsigned int num_points)
80 {
81  const unsigned int sixteenthPoints = num_points / 16;
82  unsigned int number = 0;
83 
84  for (; number < sixteenthPoints; number++) {
85  __m128i a = _mm_loadu_si128((const __m128i*)(inVectorA + 16 * number));
86  __m128i b = _mm_loadu_si128((const __m128i*)(inVectorB + 16 * number));
87  __m128i result = _mm_adds_epu8(a, b);
88  _mm_storeu_si128((__m128i*)(outVector + 16 * number), result);
89  }
90 
91  for (number = sixteenthPoints * 16; number < num_points; number++) {
92  uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
93  if (sum > 255)
94  sum = 255;
95  outVector[number] = (uint8_t)sum;
96  }
97 }
98 
99 #endif /* LV_HAVE_SSE2 */
100 
101 
102 #ifdef LV_HAVE_AVX2
103 #include <immintrin.h>
104 
105 static inline void volk_8u_x2_add_saturated_8u_u_avx2(uint8_t* outVector,
106  const uint8_t* inVectorA,
107  const uint8_t* inVectorB,
108  unsigned int num_points)
109 {
110  const unsigned int thirtysecondPoints = num_points / 32;
111  unsigned int number = 0;
112 
113  for (; number < thirtysecondPoints; number++) {
114  __m256i a = _mm256_loadu_si256((const __m256i*)(inVectorA + 32 * number));
115  __m256i b = _mm256_loadu_si256((const __m256i*)(inVectorB + 32 * number));
116  __m256i result = _mm256_adds_epu8(a, b);
117  _mm256_storeu_si256((__m256i*)(outVector + 32 * number), result);
118  }
119 
120  for (number = thirtysecondPoints * 32; number < num_points; number++) {
121  uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
122  if (sum > 255)
123  sum = 255;
124  outVector[number] = (uint8_t)sum;
125  }
126 }
127 
128 #endif /* LV_HAVE_AVX2 */
129 
130 
131 #ifdef LV_HAVE_AVX512BW
132 #include <immintrin.h>
133 
134 static inline void volk_8u_x2_add_saturated_8u_u_avx512bw(uint8_t* outVector,
135  const uint8_t* inVectorA,
136  const uint8_t* inVectorB,
137  unsigned int num_points)
138 {
139  const unsigned int sixtyfourthPoints = num_points / 64;
140  unsigned int number = 0;
141 
142  for (; number < sixtyfourthPoints; number++) {
143  __m512i a = _mm512_loadu_si512((const __m512i*)(inVectorA + 64 * number));
144  __m512i b = _mm512_loadu_si512((const __m512i*)(inVectorB + 64 * number));
145  __m512i result = _mm512_adds_epu8(a, b);
146  _mm512_storeu_si512((__m512i*)(outVector + 64 * number), result);
147  }
148 
149  for (number = sixtyfourthPoints * 64; number < num_points; number++) {
150  uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
151  if (sum > 255)
152  sum = 255;
153  outVector[number] = (uint8_t)sum;
154  }
155 }
156 
157 #endif /* LV_HAVE_AVX512BW */
158 
159 
160 #endif /* INCLUDED_volk_8u_x2_add_saturated_8u_u_H */
161 
162 
163 #ifndef INCLUDED_volk_8u_x2_add_saturated_8u_a_H
164 #define INCLUDED_volk_8u_x2_add_saturated_8u_a_H
165 
166 #include <inttypes.h>
167 
168 #ifdef LV_HAVE_SSE2
169 #include <emmintrin.h>
170 
171 static inline void volk_8u_x2_add_saturated_8u_a_sse2(uint8_t* outVector,
172  const uint8_t* inVectorA,
173  const uint8_t* inVectorB,
174  unsigned int num_points)
175 {
176  const unsigned int sixteenthPoints = num_points / 16;
177  unsigned int number = 0;
178 
179  for (; number < sixteenthPoints; number++) {
180  __m128i a = _mm_load_si128((const __m128i*)(inVectorA + 16 * number));
181  __m128i b = _mm_load_si128((const __m128i*)(inVectorB + 16 * number));
182  __m128i result = _mm_adds_epu8(a, b);
183  _mm_store_si128((__m128i*)(outVector + 16 * number), result);
184  }
185 
186  for (number = sixteenthPoints * 16; number < num_points; number++) {
187  uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
188  if (sum > 255)
189  sum = 255;
190  outVector[number] = (uint8_t)sum;
191  }
192 }
193 
194 #endif /* LV_HAVE_SSE2 */
195 
196 
197 #ifdef LV_HAVE_AVX2
198 #include <immintrin.h>
199 
200 static inline void volk_8u_x2_add_saturated_8u_a_avx2(uint8_t* outVector,
201  const uint8_t* inVectorA,
202  const uint8_t* inVectorB,
203  unsigned int num_points)
204 {
205  const unsigned int thirtysecondPoints = num_points / 32;
206  unsigned int number = 0;
207 
208  for (; number < thirtysecondPoints; number++) {
209  __m256i a = _mm256_load_si256((const __m256i*)(inVectorA + 32 * number));
210  __m256i b = _mm256_load_si256((const __m256i*)(inVectorB + 32 * number));
211  __m256i result = _mm256_adds_epu8(a, b);
212  _mm256_store_si256((__m256i*)(outVector + 32 * number), result);
213  }
214 
215  for (number = thirtysecondPoints * 32; number < num_points; number++) {
216  uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
217  if (sum > 255)
218  sum = 255;
219  outVector[number] = (uint8_t)sum;
220  }
221 }
222 
223 #endif /* LV_HAVE_AVX2 */
224 
225 
226 #ifdef LV_HAVE_AVX512BW
227 #include <immintrin.h>
228 
229 static inline void volk_8u_x2_add_saturated_8u_a_avx512bw(uint8_t* outVector,
230  const uint8_t* inVectorA,
231  const uint8_t* inVectorB,
232  unsigned int num_points)
233 {
234  const unsigned int sixtyfourthPoints = num_points / 64;
235  unsigned int number = 0;
236 
237  for (; number < sixtyfourthPoints; number++) {
238  __m512i a = _mm512_load_si512((const __m512i*)(inVectorA + 64 * number));
239  __m512i b = _mm512_load_si512((const __m512i*)(inVectorB + 64 * number));
240  __m512i result = _mm512_adds_epu8(a, b);
241  _mm512_store_si512((__m512i*)(outVector + 64 * number), result);
242  }
243 
244  for (number = sixtyfourthPoints * 64; number < num_points; number++) {
245  uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
246  if (sum > 255)
247  sum = 255;
248  outVector[number] = (uint8_t)sum;
249  }
250 }
251 
252 #endif /* LV_HAVE_AVX512BW */
253 
254 
255 #ifdef LV_HAVE_NEON
256 #include <arm_neon.h>
257 
258 static inline void volk_8u_x2_add_saturated_8u_neon(uint8_t* outVector,
259  const uint8_t* inVectorA,
260  const uint8_t* inVectorB,
261  unsigned int num_points)
262 {
263  const unsigned int sixteenthPoints = num_points / 16;
264  unsigned int number = 0;
265 
266  for (; number < sixteenthPoints; number++) {
267  uint8x16_t a = vld1q_u8(inVectorA + 16 * number);
268  uint8x16_t b = vld1q_u8(inVectorB + 16 * number);
269  vst1q_u8(outVector + 16 * number, vqaddq_u8(a, b));
270  }
271 
272  for (number = sixteenthPoints * 16; number < num_points; number++) {
273  uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
274  if (sum > 255)
275  sum = 255;
276  outVector[number] = (uint8_t)sum;
277  }
278 }
279 
280 #endif /* LV_HAVE_NEON */
281 
282 
283 #ifdef LV_HAVE_NEONV8
284 #include <arm_neon.h>
285 #include <volk/volk_common.h>
286 
287 static inline void volk_8u_x2_add_saturated_8u_neonv8(uint8_t* outVector,
288  const uint8_t* inVectorA,
289  const uint8_t* inVectorB,
290  unsigned int num_points)
291 {
292  const unsigned int thirtysecondPoints = num_points / 32;
293  unsigned int number = 0;
294 
295  for (; number < thirtysecondPoints; number++) {
296  __VOLK_PREFETCH(inVectorA + 64);
297  __VOLK_PREFETCH(inVectorB + 64);
298  uint8x16_t a0 = vld1q_u8(inVectorA);
299  uint8x16_t b0 = vld1q_u8(inVectorB);
300  uint8x16_t a1 = vld1q_u8(inVectorA + 16);
301  uint8x16_t b1 = vld1q_u8(inVectorB + 16);
302  vst1q_u8(outVector, vqaddq_u8(a0, b0));
303  vst1q_u8(outVector + 16, vqaddq_u8(a1, b1));
304  inVectorA += 32;
305  inVectorB += 32;
306  outVector += 32;
307  }
308 
309  for (number = thirtysecondPoints * 32; number < num_points; number++) {
310  uint16_t sum = (uint16_t)(*inVectorA++) + (uint16_t)(*inVectorB++);
311  if (sum > 255)
312  sum = 255;
313  *outVector++ = (uint8_t)sum;
314  }
315 }
316 
317 #endif /* LV_HAVE_NEONV8 */
318 
319 
320 #ifdef LV_HAVE_RVV
321 #include <riscv_vector.h>
322 
323 static inline void volk_8u_x2_add_saturated_8u_rvv(uint8_t* outVector,
324  const uint8_t* inVectorA,
325  const uint8_t* inVectorB,
326  unsigned int num_points)
327 {
328  size_t n = num_points;
329  for (size_t vl; n > 0; n -= vl, inVectorA += vl, inVectorB += vl, outVector += vl) {
330  vl = __riscv_vsetvl_e8m8(n);
331  vuint8m8_t a = __riscv_vle8_v_u8m8(inVectorA, vl);
332  vuint8m8_t b = __riscv_vle8_v_u8m8(inVectorB, vl);
333  __riscv_vse8(outVector, __riscv_vsaddu(a, b, vl), vl);
334  }
335 }
336 
337 #endif /* LV_HAVE_RVV */
338 
339 
340 #endif /* INCLUDED_volk_8u_x2_add_saturated_8u_a_H */
volk_8u_x2_add_saturated_8u_generic
static void volk_8u_x2_add_saturated_8u_generic(uint8_t *outVector, const uint8_t *inVectorA, const uint8_t *inVectorB, unsigned int num_points)
Definition: volk_8u_x2_add_saturated_8u.h:59
volk_8u_x2_add_saturated_8u_a_sse2
static void volk_8u_x2_add_saturated_8u_a_sse2(uint8_t *outVector, const uint8_t *inVectorA, const uint8_t *inVectorB, unsigned int num_points)
Definition: volk_8u_x2_add_saturated_8u.h:171
volk_8u_x2_add_saturated_8u_u_sse2
static void volk_8u_x2_add_saturated_8u_u_sse2(uint8_t *outVector, const uint8_t *inVectorA, const uint8_t *inVectorB, unsigned int num_points)
Definition: volk_8u_x2_add_saturated_8u.h:76
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_8u_x2_add_saturated_8u_neon
static void volk_8u_x2_add_saturated_8u_neon(uint8_t *outVector, const uint8_t *inVectorA, const uint8_t *inVectorB, unsigned int num_points)
Definition: volk_8u_x2_add_saturated_8u.h:258