Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32i_x2_or_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
68 #ifndef INCLUDED_volk_32i_x2_or_32i_a_H
69 #define INCLUDED_volk_32i_x2_or_32i_a_H
70 
71 #include <inttypes.h>
72 #include <stdio.h>
73 
74 #ifdef LV_HAVE_AVX512F
75 #include <immintrin.h>
76 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * AVX-512F kernel that handles 16 elements per iteration using the aligned
 * load/store intrinsics (_mm512_load_si512 / _mm512_store_si512), so all three
 * buffers are accessed as aligned 512-bit vectors. Elements left over after
 * the last full group of 16 are processed one at a time.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    /* Largest multiple of 16 that does not exceed num_points. */
    const unsigned int simdCount = (num_points / 16) * 16;
    unsigned int idx;

    for (idx = 0; idx < simdCount; idx += 16) {
        const __m512i lhs = _mm512_load_si512(aVector + idx);
        const __m512i rhs = _mm512_load_si512(bVector + idx);

        _mm512_store_si512(cVector + idx, _mm512_or_si512(lhs, rhs));
    }

    /* Scalar tail: idx == simdCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
109 #endif /* LV_HAVE_AVX512F */
110 
111 #ifdef LV_HAVE_AVX2
112 #include <immintrin.h>
113 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * AVX2 kernel that handles 8 elements per iteration using the aligned
 * load/store intrinsics (_mm256_load_si256 / _mm256_store_si256). Elements
 * left over after the last full group of 8 are processed one at a time.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    /* Largest multiple of 8 that does not exceed num_points. */
    const unsigned int simdCount = (num_points / 8) * 8;
    unsigned int idx = 0;

    while (idx < simdCount) {
        const __m256i lhs = _mm256_load_si256((const __m256i*)(aVector + idx));
        const __m256i rhs = _mm256_load_si256((const __m256i*)(bVector + idx));

        _mm256_store_si256((__m256i*)(cVector + idx), _mm256_or_si256(lhs, rhs));
        idx += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] | bVector[idx];
        idx++;
    }
}
147 #endif /* LV_HAVE_AVX2 */
148 
149 
150 #ifdef LV_HAVE_SSE
151 #include <xmmintrin.h>
152 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * SSE kernel that handles 4 elements per iteration. The integer data is moved
 * through the single-precision register type (_mm_load_ps / _mm_or_ps /
 * _mm_store_ps); the OR is a pure bit operation, so the integer bit patterns
 * are combined unchanged. The aligned load/store forms are used. Elements left
 * over after the last full group of 4 are processed one at a time.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int simdCount = (num_points / 4) * 4;
    unsigned int idx;

    for (idx = 0; idx < simdCount; idx += 4) {
        const __m128 lhs = _mm_load_ps((const float*)(aVector + idx));
        const __m128 rhs = _mm_load_ps((const float*)(bVector + idx));

        _mm_store_ps((float*)(cVector + idx), _mm_or_ps(lhs, rhs));
    }

    /* Scalar tail: idx == simdCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
184 #endif /* LV_HAVE_SSE */
185 
186 
187 #ifdef LV_HAVE_NEON
188 #include <arm_neon.h>
189 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * NEON kernel that handles 4 elements per iteration with vld1q_s32 /
 * vorrq_s32 / vst1q_s32. Elements left over after the last full group of 4
 * are processed one at a time.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points)
{
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int simdCount = (num_points / 4) * 4;
    unsigned int idx = 0;

    while (idx < simdCount) {
        const int32x4_t lhs = vld1q_s32(aVector + idx);
        const int32x4_t rhs = vld1q_s32(bVector + idx);

        vst1q_s32(cVector + idx, vorrq_s32(lhs, rhs));
        idx += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] | bVector[idx];
        idx++;
    }
}
217 #endif /* LV_HAVE_NEON */
218 
219 #ifdef LV_HAVE_NEONV8
220 #include <arm_neon.h>
221 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * NEON kernel unrolled to 8 elements (two 128-bit vectors) per iteration.
 * Each iteration also issues __VOLK_PREFETCH on both inputs 16 elements past
 * the current position. Elements left over after the last full group of 8
 * are processed one at a time.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_neonv8(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    /* Largest multiple of 8 that does not exceed num_points. */
    const unsigned int simdCount = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < simdCount; idx += 8) {
        const int32_t* srcA = aVector + idx;
        const int32_t* srcB = bVector + idx;
        int32_t* dst = cVector + idx;

        const int32x4_t aLow = vld1q_s32(srcA);
        const int32x4_t aHigh = vld1q_s32(srcA + 4);
        const int32x4_t bLow = vld1q_s32(srcB);
        const int32x4_t bHigh = vld1q_s32(srcB + 4);
        __VOLK_PREFETCH(srcA + 16);
        __VOLK_PREFETCH(srcB + 16);

        vst1q_s32(dst, vorrq_s32(aLow, bLow));
        vst1q_s32(dst + 4, vorrq_s32(aHigh, bHigh));
    }

    /* Scalar tail: idx == simdCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
253 #endif /* LV_HAVE_NEONV8 */
254 
255 
256 #ifdef LV_HAVE_GENERIC
257 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Portable scalar implementation with no SIMD intrinsics.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
272 #endif /* LV_HAVE_GENERIC */
273 
274 
275 #ifdef LV_HAVE_ORC
/* OR kernel implemented outside this file; note that it takes a signed count. */
extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
                                          const int32_t* aVector,
                                          const int32_t* bVector,
                                          int num_points);

/*
 * Thin wrapper that forwards all arguments to volk_32i_x2_or_32i_a_orc_impl.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements; converted to int for the callee
 */
static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    /* The impl's prototype takes int, so spell the conversion out. */
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, (int)num_points);
}
288 #endif /* LV_HAVE_ORC */
289 
290 
291 #endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
292 
293 
294 #ifndef INCLUDED_volk_32i_x2_or_32i_u_H
295 #define INCLUDED_volk_32i_x2_or_32i_u_H
296 
297 #include <inttypes.h>
298 #include <stdio.h>
299 
300 #ifdef LV_HAVE_AVX512F
301 #include <immintrin.h>
302 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * AVX-512F kernel that handles 16 elements per iteration using the unaligned
 * load/store intrinsics (_mm512_loadu_si512 / _mm512_storeu_si512). Elements
 * left over after the last full group of 16 are processed one at a time.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    /* Largest multiple of 16 that does not exceed num_points. */
    const unsigned int simdCount = (num_points / 16) * 16;
    unsigned int idx;

    for (idx = 0; idx < simdCount; idx += 16) {
        const __m512i lhs = _mm512_loadu_si512(aVector + idx);
        const __m512i rhs = _mm512_loadu_si512(bVector + idx);

        _mm512_storeu_si512(cVector + idx, _mm512_or_si512(lhs, rhs));
    }

    /* Scalar tail: idx == simdCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
335 #endif /* LV_HAVE_AVX512F */
336 
337 #ifdef LV_HAVE_AVX2
338 #include <immintrin.h>
339 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * AVX2 kernel that handles 8 elements per iteration using the unaligned
 * load/store intrinsics (_mm256_loadu_si256 / _mm256_storeu_si256). Elements
 * left over after the last full group of 8 are processed one at a time.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    /* Largest multiple of 8 that does not exceed num_points. */
    const unsigned int simdCount = (num_points / 8) * 8;
    unsigned int idx = 0;

    while (idx < simdCount) {
        const __m256i lhs = _mm256_loadu_si256((const __m256i*)(aVector + idx));
        const __m256i rhs = _mm256_loadu_si256((const __m256i*)(bVector + idx));

        _mm256_storeu_si256((__m256i*)(cVector + idx), _mm256_or_si256(lhs, rhs));
        idx += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] | bVector[idx];
        idx++;
    }
}
373 #endif /* LV_HAVE_AVX2 */
374 
375 #ifdef LV_HAVE_RVV
376 #include <riscv_vector.h>
377 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * RISC-V Vector kernel. Each pass asks __riscv_vsetvl_e32m8 how many elements
 * it may process (vl), ORs that many elements, and advances by vl until
 * nothing remains, so no separate scalar tail loop is needed.
 *
 * cVector    - output buffer, receives num_points results
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_rvv(int32_t* cVector,
                                          const int32_t* aVector,
                                          const int32_t* bVector,
                                          unsigned int num_points)
{
    const int32_t* srcA = aVector;
    const int32_t* srcB = bVector;
    int32_t* dst = cVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Number of elements handled in this pass. */
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vint32m8_t va = __riscv_vle32_v_i32m8(srcA, vl);
        const vint32m8_t vb = __riscv_vle32_v_i32m8(srcB, vl);

        __riscv_vse32(dst, __riscv_vor(va, vb, vl), vl);

        srcA += vl;
        srcB += vl;
        dst += vl;
        remaining -= vl;
    }
}
391 #endif /*LV_HAVE_RVV*/
392 
393 #endif /* INCLUDED_volk_32i_x2_or_32i_u_H */
volk_32i_x2_or_32i_generic
static void volk_32i_x2_or_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:258
volk_32i_x2_or_32i_neon
static void volk_32i_x2_or_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:190
volk_32i_x2_or_32i_a_sse
static void volk_32i_x2_or_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:153
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68