Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32i_x2_and_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
68 #ifndef INCLUDED_volk_32i_x2_and_32i_a_H
69 #define INCLUDED_volk_32i_x2_and_32i_a_H
70 
71 #include <inttypes.h>
72 #include <stdio.h>
73 
74 #ifdef LV_HAVE_AVX512F
75 #include <immintrin.h>
76 
/*!
 * \brief Element-wise bitwise AND of two int32 buffers (AVX-512F, aligned access).
 *
 * Stores aVector[i] & bVector[i] into cVector[i] for every i in
 * [0, num_points). Complete groups of 16 points are processed with
 * _mm512_load_si512 / _mm512_store_si512, the aligned load/store intrinsics,
 * so the three buffers are expected to satisfy their alignment requirement.
 * Points left over after the last complete group are processed one by one.
 *
 * \param cVector    destination buffer
 * \param aVector    first source buffer
 * \param bVector    second source buffer
 * \param num_points number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    /* Number of points covered by whole 512-bit (16 x int32) registers. */
    const unsigned int vectorizedPoints = (num_points / 16) * 16;

    for (unsigned int offset = 0; offset < vectorizedPoints; offset += 16) {
        const __m512i lhs = _mm512_load_si512(aVector + offset);
        const __m512i rhs = _mm512_load_si512(bVector + offset);
        _mm512_store_si512(cVector + offset, _mm512_and_si512(lhs, rhs));
    }

    /* Scalar clean-up for the final (num_points % 16) points. */
    for (unsigned int i = vectorizedPoints; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
109 #endif /* LV_HAVE_AVX512F */
110 
111 #ifdef LV_HAVE_AVX2
112 #include <immintrin.h>
113 
/*!
 * \brief Element-wise bitwise AND of two int32 buffers (AVX2, aligned access).
 *
 * Stores aVector[i] & bVector[i] into cVector[i] for every i in
 * [0, num_points). Complete groups of 8 points are processed with
 * _mm256_load_si256 / _mm256_store_si256, the aligned load/store intrinsics,
 * so the three buffers are expected to satisfy their alignment requirement.
 * Points left over after the last complete group are processed one by one.
 *
 * \param cVector    destination buffer
 * \param aVector    first source buffer
 * \param bVector    second source buffer
 * \param num_points number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;

    /* View the buffers as arrays of 256-bit (8 x int32) registers. */
    const __m256i* aBlocks = (const __m256i*)aVector;
    const __m256i* bBlocks = (const __m256i*)bVector;
    __m256i* cBlocks = (__m256i*)cVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        const __m256i lhs = _mm256_load_si256(aBlocks + block);
        const __m256i rhs = _mm256_load_si256(bBlocks + block);
        _mm256_store_si256(cBlocks + block, _mm256_and_si256(lhs, rhs));
    }

    /* Scalar clean-up for the final (num_points % 8) points. */
    for (unsigned int i = blockCount * 8; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
147 #endif /* LV_HAVE_AVX2 */
148 
149 
150 #ifdef LV_HAVE_SSE
151 #include <xmmintrin.h>
152 
/*!
 * \brief Element-wise bitwise AND of two int32 buffers (SSE, aligned access).
 *
 * Stores aVector[i] & bVector[i] into cVector[i] for every i in
 * [0, num_points). Complete groups of 4 points are processed with
 * _mm_load_ps / _mm_and_ps / _mm_store_ps: the data is moved through
 * float-typed registers, but the AND itself is a plain bitwise operation, so
 * the integer bit patterns are combined unchanged. _mm_load_ps / _mm_store_ps
 * are the aligned variants, so the buffers are expected to be 16-byte aligned.
 * Points left over after the last complete group are processed one by one.
 *
 * \param cVector    destination buffer
 * \param aVector    first source buffer
 * \param bVector    second source buffer
 * \param num_points number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    /* Number of points covered by whole 128-bit (4 x 32-bit) registers. */
    const unsigned int vectorizedPoints = (num_points / 4) * 4;

    /* The SSE load/store intrinsics used here are typed on float. */
    const float* aLanes = (const float*)aVector;
    const float* bLanes = (const float*)bVector;
    float* cLanes = (float*)cVector;

    for (unsigned int offset = 0; offset < vectorizedPoints; offset += 4) {
        const __m128 lhs = _mm_load_ps(aLanes + offset);
        const __m128 rhs = _mm_load_ps(bLanes + offset);
        _mm_store_ps(cLanes + offset, _mm_and_ps(lhs, rhs));
    }

    /* Scalar clean-up for the final (num_points % 4) points. */
    for (unsigned int i = vectorizedPoints; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
185 #endif /* LV_HAVE_SSE */
186 
187 
188 #ifdef LV_HAVE_NEON
189 #include <arm_neon.h>
190 
191 static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
192  const int32_t* aVector,
193  const int32_t* bVector,
194  unsigned int num_points)
195 {
196  int32_t* cPtr = cVector;
197  const int32_t* aPtr = aVector;
198  const int32_t* bPtr = bVector;
199  unsigned int number = 0;
200  unsigned int quarter_points = num_points / 4;
201 
202  int32x4_t a_val, b_val, c_val;
203 
204  for (number = 0; number < quarter_points; number++) {
205  a_val = vld1q_s32(aPtr);
206  b_val = vld1q_s32(bPtr);
207  c_val = vandq_s32(a_val, b_val);
208  vst1q_s32(cPtr, c_val);
209  aPtr += 4;
210  bPtr += 4;
211  cPtr += 4;
212  }
213 
214  for (number = quarter_points * 4; number < num_points; number++) {
215  *cPtr++ = (*aPtr++) & (*bPtr++);
216  }
217 }
218 #endif /* LV_HAVE_NEON */
219 
220 #ifdef LV_HAVE_NEONV8
221 #include <arm_neon.h>
222 
223 static inline void volk_32i_x2_and_32i_neonv8(int32_t* cVector,
224  const int32_t* aVector,
225  const int32_t* bVector,
226  unsigned int num_points)
227 {
228  const unsigned int eighthPoints = num_points / 8;
229 
230  const int32_t* aPtr = aVector;
231  const int32_t* bPtr = bVector;
232  int32_t* cPtr = cVector;
233 
234  for (unsigned int number = 0; number < eighthPoints; number++) {
235  int32x4_t a0 = vld1q_s32(aPtr);
236  int32x4_t a1 = vld1q_s32(aPtr + 4);
237  int32x4_t b0 = vld1q_s32(bPtr);
238  int32x4_t b1 = vld1q_s32(bPtr + 4);
239  __VOLK_PREFETCH(aPtr + 16);
240  __VOLK_PREFETCH(bPtr + 16);
241 
242  vst1q_s32(cPtr, vandq_s32(a0, b0));
243  vst1q_s32(cPtr + 4, vandq_s32(a1, b1));
244 
245  aPtr += 8;
246  bPtr += 8;
247  cPtr += 8;
248  }
249 
250  for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
251  *cPtr++ = (*aPtr++) & (*bPtr++);
252  }
253 }
254 #endif /* LV_HAVE_NEONV8 */
255 
256 
257 #ifdef LV_HAVE_GENERIC
258 
/*!
 * \brief Element-wise bitwise AND of two int32 buffers (portable C).
 *
 * Stores aVector[i] & bVector[i] into cVector[i] for every i in
 * [0, num_points), visiting the elements in increasing index order.
 * Nothing is read or written when num_points is 0.
 *
 * \param cVector    destination buffer
 * \param aVector    first source buffer
 * \param bVector    second source buffer
 * \param num_points number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
                                               const int32_t* aVector,
                                               const int32_t* bVector,
                                               unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
273 #endif /* LV_HAVE_GENERIC */
274 
275 
276 #ifdef LV_HAVE_ORC
/* ORC implementation of the kernel; declared here, defined outside this file. */
extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           int num_points);

/*!
 * \brief Bitwise AND of two int32 buffers, delegated to the ORC implementation.
 *
 * Passes all arguments straight through to volk_32i_x2_and_32i_a_orc_impl.
 * That function takes its element count as a signed int, so num_points is
 * converted from unsigned int to int on the way in.
 *
 * \param cVector    destination buffer
 * \param aVector    first source buffer
 * \param bVector    second source buffer
 * \param num_points number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    const int orcPoints = (int)num_points;

    volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, orcPoints);
}
289 #endif /* LV_HAVE_ORC */
290 
291 
292 #endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
293 
294 
295 #ifndef INCLUDED_volk_32i_x2_and_32i_u_H
296 #define INCLUDED_volk_32i_x2_and_32i_u_H
297 
298 #include <inttypes.h>
299 #include <stdio.h>
300 
301 #ifdef LV_HAVE_AVX512F
302 #include <immintrin.h>
303 
/*!
 * \brief Element-wise bitwise AND of two int32 buffers (AVX-512F, unaligned access).
 *
 * Stores aVector[i] & bVector[i] into cVector[i] for every i in
 * [0, num_points). Complete groups of 16 points are processed with
 * _mm512_loadu_si512 / _mm512_storeu_si512, the unaligned load/store
 * intrinsics. Points left over after the last complete group are processed
 * one by one.
 *
 * \param cVector    destination buffer
 * \param aVector    first source buffer
 * \param bVector    second source buffer
 * \param num_points number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;

    for (unsigned int block = 0; block < blockCount; block++) {
        /* Each block is one 512-bit register, i.e. 16 int32 elements. */
        const unsigned int offset = block * 16;
        const __m512i lhs = _mm512_loadu_si512(aVector + offset);
        const __m512i rhs = _mm512_loadu_si512(bVector + offset);
        _mm512_storeu_si512(cVector + offset, _mm512_and_si512(lhs, rhs));
    }

    /* Scalar clean-up for the final (num_points % 16) points. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
336 #endif /* LV_HAVE_AVX512F */
337 
338 #ifdef LV_HAVE_AVX2
339 #include <immintrin.h>
340 
/*!
 * \brief Element-wise bitwise AND of two int32 buffers (AVX2, unaligned access).
 *
 * Stores aVector[i] & bVector[i] into cVector[i] for every i in
 * [0, num_points). Complete groups of 8 points are processed with
 * _mm256_loadu_si256 / _mm256_storeu_si256, the unaligned load/store
 * intrinsics. Points left over after the last complete group are processed
 * one by one.
 *
 * \param cVector    destination buffer
 * \param aVector    first source buffer
 * \param bVector    second source buffer
 * \param num_points number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;

    /* View the buffers as arrays of 256-bit (8 x int32) registers. */
    const __m256i* aBlocks = (const __m256i*)aVector;
    const __m256i* bBlocks = (const __m256i*)bVector;
    __m256i* cBlocks = (__m256i*)cVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        const __m256i lhs = _mm256_loadu_si256(aBlocks + block);
        const __m256i rhs = _mm256_loadu_si256(bBlocks + block);
        _mm256_storeu_si256(cBlocks + block, _mm256_and_si256(lhs, rhs));
    }

    /* Scalar clean-up for the final (num_points % 8) points. */
    for (unsigned int i = blockCount * 8; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
374 #endif /* LV_HAVE_AVX2 */
375 
376 #ifdef LV_HAVE_RVV
377 #include <riscv_vector.h>
378 
379 static inline void volk_32i_x2_and_32i_rvv(int32_t* cVector,
380  const int32_t* aVector,
381  const int32_t* bVector,
382  unsigned int num_points)
383 {
384  size_t n = num_points;
385  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
386  vl = __riscv_vsetvl_e32m8(n);
387  vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl);
388  vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl);
389  __riscv_vse32(cVector, __riscv_vand(va, vb, vl), vl);
390  }
391 }
392 #endif /*LV_HAVE_RVV*/
393 
394 #endif /* INCLUDED_volk_32i_x2_and_32i_u_H */
volk_32i_x2_and_32i_generic
static void volk_32i_x2_and_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:259
volk_32i_x2_and_32i_neon
static void volk_32i_x2_and_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:191
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32i_x2_and_32i_a_sse
static void volk_32i_x2_and_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:153