Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_8i_x2_add_saturated_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_8i_x2_add_saturated_8i_u_H
53 #define INCLUDED_volk_8i_x2_add_saturated_8i_u_H
54 
55 #include <inttypes.h>
56 
57 #ifdef LV_HAVE_GENERIC
58 
/*
 * Portable reference kernel: element-wise saturating add of two int8 vectors.
 *
 *   outVector[i] = clamp(inVectorA[i] + inVectorB[i], -128, 127)
 *
 * for every i in [0, num_points).
 *
 * outVector  - destination buffer, receives num_points results
 * inVectorA  - first addend vector
 * inVectorB  - second addend vector
 * num_points - number of elements to process; 0 writes nothing
 */
static inline void volk_8i_x2_add_saturated_8i_generic(int8_t* outVector,
                                                       const int8_t* inVectorA,
                                                       const int8_t* inVectorB,
                                                       unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        // Add in int: the exact sum lies in [-256, 254] and is always
        // representable, so no wrap-around has to be detected afterwards.
        // This avoids depending on implementation-defined narrowing to int8_t
        // and on right-shifting negative values.
        const int sum = (int)inVectorA[i] + (int)inVectorB[i];
        // Clamp to the int8_t range; same rule as the scalar tails of the
        // SIMD kernels in this file.
        const int clamped = (sum > 127) ? 127 : ((sum < -128) ? -128 : sum);
        outVector[i] = (int8_t)clamped;
    }
}
75 
76 #endif /* LV_HAVE_GENERIC */
77 
78 
79 #ifdef LV_HAVE_SSE2
80 #include <emmintrin.h>
81 
/*
 * SSE2 kernel for unaligned buffers: element-wise saturating add of two
 * int8 vectors, 16 elements per iteration.
 *
 * outVector  - destination buffer, receives num_points results
 * inVectorA  - first addend vector
 * inVectorB  - second addend vector
 * num_points - number of elements to process
 *
 * Elements that do not fill a whole 16-byte vector are handled by a scalar
 * tail that clamps the widened sum to [-128, 127].
 */
static inline void volk_8i_x2_add_saturated_8i_u_sse2(int8_t* outVector,
                                                      const int8_t* inVectorA,
                                                      const int8_t* inVectorB,
                                                      unsigned int num_points)
{
    /* Count of elements covered by complete 16-byte vectors. */
    const unsigned int vectorEnd = (num_points / 16) * 16;
    unsigned int idx = 0;

    while (idx < vectorEnd) {
        const __m128i lhs = _mm_loadu_si128((const __m128i*)&inVectorA[idx]);
        const __m128i rhs = _mm_loadu_si128((const __m128i*)&inVectorB[idx]);
        /* _mm_adds_epi8 performs the signed 8-bit saturating add. */
        _mm_storeu_si128((__m128i*)&outVector[idx], _mm_adds_epi8(lhs, rhs));
        idx += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) elements. */
    while (idx < num_points) {
        int wide = inVectorA[idx] + inVectorB[idx];
        if (wide < -128) {
            wide = -128;
        } else if (wide > 127) {
            wide = 127;
        }
        outVector[idx] = (int8_t)wide;
        idx++;
    }
}
106 
107 #endif /* LV_HAVE_SSE2 */
108 
109 
110 #ifdef LV_HAVE_AVX2
111 #include <immintrin.h>
112 
113 static inline void volk_8i_x2_add_saturated_8i_u_avx2(int8_t* outVector,
114  const int8_t* inVectorA,
115  const int8_t* inVectorB,
116  unsigned int num_points)
117 {
118  const unsigned int thirtysecondPoints = num_points / 32;
119  unsigned int number = 0;
120 
121  for (; number < thirtysecondPoints; number++) {
122  __m256i a = _mm256_loadu_si256((const __m256i*)(inVectorA + 32 * number));
123  __m256i b = _mm256_loadu_si256((const __m256i*)(inVectorB + 32 * number));
124  __m256i result = _mm256_adds_epi8(a, b);
125  _mm256_storeu_si256((__m256i*)(outVector + 32 * number), result);
126  }
127 
128  for (number = thirtysecondPoints * 32; number < num_points; number++) {
129  int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
130  if (sum > 127)
131  sum = 127;
132  else if (sum < -128)
133  sum = -128;
134  outVector[number] = (int8_t)sum;
135  }
136 }
137 
138 #endif /* LV_HAVE_AVX2 */
139 
140 
141 #ifdef LV_HAVE_AVX512BW
142 #include <immintrin.h>
143 
144 static inline void volk_8i_x2_add_saturated_8i_u_avx512bw(int8_t* outVector,
145  const int8_t* inVectorA,
146  const int8_t* inVectorB,
147  unsigned int num_points)
148 {
149  const unsigned int sixtyfourthPoints = num_points / 64;
150  unsigned int number = 0;
151 
152  for (; number < sixtyfourthPoints; number++) {
153  __m512i a = _mm512_loadu_si512((const __m512i*)(inVectorA + 64 * number));
154  __m512i b = _mm512_loadu_si512((const __m512i*)(inVectorB + 64 * number));
155  __m512i result = _mm512_adds_epi8(a, b);
156  _mm512_storeu_si512((__m512i*)(outVector + 64 * number), result);
157  }
158 
159  for (number = sixtyfourthPoints * 64; number < num_points; number++) {
160  int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
161  if (sum > 127)
162  sum = 127;
163  else if (sum < -128)
164  sum = -128;
165  outVector[number] = (int8_t)sum;
166  }
167 }
168 
169 #endif /* LV_HAVE_AVX512BW */
170 
171 
172 #endif /* INCLUDED_volk_8i_x2_add_saturated_8i_u_H */
173 
174 
175 #ifndef INCLUDED_volk_8i_x2_add_saturated_8i_a_H
176 #define INCLUDED_volk_8i_x2_add_saturated_8i_a_H
177 
178 #include <inttypes.h>
179 
180 #ifdef LV_HAVE_SSE2
181 #include <emmintrin.h>
182 
/*
 * SSE2 kernel for aligned buffers: element-wise saturating add of two int8
 * vectors, 16 elements per iteration.
 *
 * outVector  - destination buffer, receives num_points results
 * inVectorA  - first addend vector
 * inVectorB  - second addend vector
 * num_points - number of elements to process
 *
 * The vector loop uses the aligned load/store intrinsics, so all three
 * buffers must be 16-byte aligned. Leftover elements are handled by a scalar
 * tail that clamps the widened sum to [-128, 127].
 */
static inline void volk_8i_x2_add_saturated_8i_a_sse2(int8_t* outVector,
                                                      const int8_t* inVectorA,
                                                      const int8_t* inVectorB,
                                                      unsigned int num_points)
{
    const __m128i* srcA = (const __m128i*)inVectorA;
    const __m128i* srcB = (const __m128i*)inVectorB;
    __m128i* dst = (__m128i*)outVector;

    const unsigned int blockCount = num_points / 16;

    /* Full 128-bit vectors, one per block. */
    for (unsigned int blk = 0; blk < blockCount; ++blk) {
        const __m128i lhs = _mm_load_si128(srcA + blk);
        const __m128i rhs = _mm_load_si128(srcB + blk);
        _mm_store_si128(dst + blk, _mm_adds_epi8(lhs, rhs));
    }

    /* Scalar tail starting right after the last full block. */
    for (unsigned int idx = blockCount * 16; idx < num_points; ++idx) {
        const int wide = inVectorA[idx] + inVectorB[idx];
        outVector[idx] = (int8_t)((wide > 127) ? 127 : ((wide < -128) ? -128 : wide));
    }
}
207 
208 #endif /* LV_HAVE_SSE2 */
209 
210 
211 #ifdef LV_HAVE_AVX2
212 #include <immintrin.h>
213 
214 static inline void volk_8i_x2_add_saturated_8i_a_avx2(int8_t* outVector,
215  const int8_t* inVectorA,
216  const int8_t* inVectorB,
217  unsigned int num_points)
218 {
219  const unsigned int thirtysecondPoints = num_points / 32;
220  unsigned int number = 0;
221 
222  for (; number < thirtysecondPoints; number++) {
223  __m256i a = _mm256_load_si256((const __m256i*)(inVectorA + 32 * number));
224  __m256i b = _mm256_load_si256((const __m256i*)(inVectorB + 32 * number));
225  __m256i result = _mm256_adds_epi8(a, b);
226  _mm256_store_si256((__m256i*)(outVector + 32 * number), result);
227  }
228 
229  for (number = thirtysecondPoints * 32; number < num_points; number++) {
230  int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
231  if (sum > 127)
232  sum = 127;
233  else if (sum < -128)
234  sum = -128;
235  outVector[number] = (int8_t)sum;
236  }
237 }
238 
239 #endif /* LV_HAVE_AVX2 */
240 
241 
242 #ifdef LV_HAVE_AVX512BW
243 #include <immintrin.h>
244 
245 static inline void volk_8i_x2_add_saturated_8i_a_avx512bw(int8_t* outVector,
246  const int8_t* inVectorA,
247  const int8_t* inVectorB,
248  unsigned int num_points)
249 {
250  const unsigned int sixtyfourthPoints = num_points / 64;
251  unsigned int number = 0;
252 
253  for (; number < sixtyfourthPoints; number++) {
254  __m512i a = _mm512_load_si512((const __m512i*)(inVectorA + 64 * number));
255  __m512i b = _mm512_load_si512((const __m512i*)(inVectorB + 64 * number));
256  __m512i result = _mm512_adds_epi8(a, b);
257  _mm512_store_si512((__m512i*)(outVector + 64 * number), result);
258  }
259 
260  for (number = sixtyfourthPoints * 64; number < num_points; number++) {
261  int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
262  if (sum > 127)
263  sum = 127;
264  else if (sum < -128)
265  sum = -128;
266  outVector[number] = (int8_t)sum;
267  }
268 }
269 
270 #endif /* LV_HAVE_AVX512BW */
271 
272 
273 #ifdef LV_HAVE_NEON
274 #include <arm_neon.h>
275 
276 static inline void volk_8i_x2_add_saturated_8i_neon(int8_t* outVector,
277  const int8_t* inVectorA,
278  const int8_t* inVectorB,
279  unsigned int num_points)
280 {
281  const unsigned int sixteenthPoints = num_points / 16;
282  unsigned int number = 0;
283 
284  for (; number < sixteenthPoints; number++) {
285  int8x16_t a = vld1q_s8(inVectorA + 16 * number);
286  int8x16_t b = vld1q_s8(inVectorB + 16 * number);
287  vst1q_s8(outVector + 16 * number, vqaddq_s8(a, b));
288  }
289 
290  for (number = sixteenthPoints * 16; number < num_points; number++) {
291  int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
292  if (sum > 127)
293  sum = 127;
294  else if (sum < -128)
295  sum = -128;
296  outVector[number] = (int8_t)sum;
297  }
298 }
299 
300 #endif /* LV_HAVE_NEON */
301 
302 
303 #ifdef LV_HAVE_NEONV8
304 #include <arm_neon.h>
305 #include <volk/volk_common.h>
306 
307 static inline void volk_8i_x2_add_saturated_8i_neonv8(int8_t* outVector,
308  const int8_t* inVectorA,
309  const int8_t* inVectorB,
310  unsigned int num_points)
311 {
312  const unsigned int thirtysecondPoints = num_points / 32;
313  unsigned int number = 0;
314 
315  for (; number < thirtysecondPoints; number++) {
316  __VOLK_PREFETCH(inVectorA + 64);
317  __VOLK_PREFETCH(inVectorB + 64);
318  int8x16_t a0 = vld1q_s8(inVectorA);
319  int8x16_t b0 = vld1q_s8(inVectorB);
320  int8x16_t a1 = vld1q_s8(inVectorA + 16);
321  int8x16_t b1 = vld1q_s8(inVectorB + 16);
322  vst1q_s8(outVector, vqaddq_s8(a0, b0));
323  vst1q_s8(outVector + 16, vqaddq_s8(a1, b1));
324  inVectorA += 32;
325  inVectorB += 32;
326  outVector += 32;
327  }
328 
329  for (number = thirtysecondPoints * 32; number < num_points; number++) {
330  int16_t sum = (int16_t)(*inVectorA++) + (int16_t)(*inVectorB++);
331  if (sum > 127)
332  sum = 127;
333  else if (sum < -128)
334  sum = -128;
335  *outVector++ = (int8_t)sum;
336  }
337 }
338 
339 #endif /* LV_HAVE_NEONV8 */
340 
341 
342 #ifdef LV_HAVE_RVV
343 #include <riscv_vector.h>
344 
345 static inline void volk_8i_x2_add_saturated_8i_rvv(int8_t* outVector,
346  const int8_t* inVectorA,
347  const int8_t* inVectorB,
348  unsigned int num_points)
349 {
350  size_t n = num_points;
351  for (size_t vl; n > 0; n -= vl, inVectorA += vl, inVectorB += vl, outVector += vl) {
352  vl = __riscv_vsetvl_e8m8(n);
353  vint8m8_t a = __riscv_vle8_v_i8m8(inVectorA, vl);
354  vint8m8_t b = __riscv_vle8_v_i8m8(inVectorB, vl);
355  __riscv_vse8(outVector, __riscv_vsadd(a, b, vl), vl);
356  }
357 }
358 
359 #endif /* LV_HAVE_RVV */
360 
361 
362 #endif /* INCLUDED_volk_8i_x2_add_saturated_8i_a_H */
volk_8i_x2_add_saturated_8i_u_sse2
static void volk_8i_x2_add_saturated_8i_u_sse2(int8_t *outVector, const int8_t *inVectorA, const int8_t *inVectorB, unsigned int num_points)
Definition: volk_8i_x2_add_saturated_8i.h:82
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_8i_x2_add_saturated_8i_a_sse2
static void volk_8i_x2_add_saturated_8i_a_sse2(int8_t *outVector, const int8_t *inVectorA, const int8_t *inVectorB, unsigned int num_points)
Definition: volk_8i_x2_add_saturated_8i.h:183
volk_8i_x2_add_saturated_8i_neon
static void volk_8i_x2_add_saturated_8i_neon(int8_t *outVector, const int8_t *inVectorA, const int8_t *inVectorB, unsigned int num_points)
Definition: volk_8i_x2_add_saturated_8i.h:276
volk_8i_x2_add_saturated_8i_generic
static void volk_8i_x2_add_saturated_8i_generic(int8_t *outVector, const int8_t *inVectorA, const int8_t *inVectorB, unsigned int num_points)
Definition: volk_8i_x2_add_saturated_8i.h:59