Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_x2_subtract_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
59 #define INCLUDED_volk_32f_x2_subtract_32f_a_H
60 
61 #include <inttypes.h>
62 #include <stdio.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
/*
 * Portable scalar kernel: element-wise single-precision subtraction.
 *
 * Writes cVector[i] = aVector[i] - bVector[i] for every i in
 * [0, num_points), in ascending index order.
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process; 0 leaves cVector untouched
 */
static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx != num_points; ++idx) {
        const float minuend = aVector[idx];
        const float subtrahend = bVector[idx];

        cVector[idx] = minuend - subtrahend;
    }
}
76 #endif /* LV_HAVE_GENERIC */
77 
78 
79 #ifdef LV_HAVE_AVX512F
80 #include <immintrin.h>
81 
/*
 * AVX-512F kernel (aligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * The main loop handles 16 floats per iteration with the aligned
 * _mm512_load_ps / _mm512_store_ps intrinsics, so all three buffers are
 * expected to satisfy the 64-byte alignment those intrinsics require.
 * The remaining num_points % 16 elements are delegated to the scalar
 * generic kernel (available when LV_HAVE_GENERIC is defined).
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_load_ps(aVector);
        __m512 bVal = _mm512_load_ps(bVector);

        __m512 cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 16;
        bVector += 16;
        cVector += 16;
    }

    /* The pointers now sit just past the vectorized region; finish the
     * leftover (< 16) elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - sixteenthPoints * 16);
}
105 #endif /* LV_HAVE_AVX512F */
106 
107 #ifdef LV_HAVE_AVX
108 #include <immintrin.h>
109 
/*
 * AVX kernel (aligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * The main loop handles 8 floats per iteration with the aligned
 * _mm256_load_ps / _mm256_store_ps intrinsics, so all three buffers are
 * expected to satisfy the 32-byte alignment those intrinsics require.
 * The remaining num_points % 8 elements are delegated to the scalar
 * generic kernel (available when LV_HAVE_GENERIC is defined).
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        __m256 aVal = _mm256_load_ps(aVector);
        __m256 bVal = _mm256_load_ps(bVector);

        __m256 cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 8;
        bVector += 8;
        cVector += 8;
    }

    /* The pointers now sit just past the vectorized region; finish the
     * leftover (< 8) elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - eighthPoints * 8);
}
133 #endif /* LV_HAVE_AVX */
134 
135 #ifdef LV_HAVE_SSE
136 #include <xmmintrin.h>
137 
/*
 * SSE kernel (aligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * The main loop handles 4 floats per iteration with the aligned
 * _mm_load_ps / _mm_store_ps intrinsics, so all three buffers are
 * expected to satisfy the 16-byte alignment those intrinsics require.
 * The remaining num_points % 4 elements are delegated to the scalar
 * generic kernel (available when LV_HAVE_GENERIC is defined).
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        __m128 aVal = _mm_load_ps(aVector);
        __m128 bVal = _mm_load_ps(bVector);

        __m128 cVal = _mm_sub_ps(aVal, bVal);

        _mm_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* The pointers now sit just past the vectorized region; finish the
     * leftover (< 4) elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - quarterPoints * 4);
}
161 #endif /* LV_HAVE_SSE */
162 
163 
164 #ifdef LV_HAVE_NEON
165 #include <arm_neon.h>
166 
/*
 * NEON kernel: cVector[i] = aVector[i] - bVector[i].
 *
 * The main loop handles 4 floats per iteration using vld1q_f32,
 * vsubq_f32 and vst1q_f32. The remaining num_points % 4 elements are
 * delegated to the scalar generic kernel (available when
 * LV_HAVE_GENERIC is defined).
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        float32x4_t a_vec = vld1q_f32(aVector);
        float32x4_t b_vec = vld1q_f32(bVector);

        float32x4_t c_vec = vsubq_f32(a_vec, b_vec);

        vst1q_f32(cVector, c_vec);

        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* The pointers now sit just past the vectorized region; finish the
     * leftover (< 4) elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - quarterPoints * 4);
}
190 #endif /* LV_HAVE_NEON */
191 
192 
193 #ifdef LV_HAVE_NEONV8
194 #include <arm_neon.h>
195 
/*
 * NEON (ARMv8) kernel: cVector[i] = aVector[i] - bVector[i].
 *
 * Work is done in three stages: blocks of 8 floats (two 4-lane
 * vectors per pass), then at most one block of 4 floats, then a scalar
 * loop for the final 0..3 elements.
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_neonv8(float* cVector,
                                                   const float* aVector,
                                                   const float* bVector,
                                                   unsigned int num_points)
{
    const float* lhs = aVector;
    const float* rhs = bVector;
    float* out = cVector;
    unsigned int remaining = num_points;

    /* Stage 1: 8 floats per pass (2x unrolled). */
    for (; remaining >= 8; remaining -= 8, lhs += 8, rhs += 8, out += 8) {
        float32x4_t lhs_lo = vld1q_f32(lhs);
        float32x4_t lhs_hi = vld1q_f32(lhs + 4);
        float32x4_t rhs_lo = vld1q_f32(rhs);
        float32x4_t rhs_hi = vld1q_f32(rhs + 4);

        /* Prefetch hints for data two passes ahead of the current position. */
        __VOLK_PREFETCH(lhs + 16);
        __VOLK_PREFETCH(rhs + 16);

        vst1q_f32(out, vsubq_f32(lhs_lo, rhs_lo));
        vst1q_f32(out + 4, vsubq_f32(lhs_hi, rhs_hi));
    }

    /* Stage 2: one more 4-float vector if enough elements are left. */
    if (remaining >= 4) {
        float32x4_t lhs_vec = vld1q_f32(lhs);
        float32x4_t rhs_vec = vld1q_f32(rhs);

        vst1q_f32(out, vsubq_f32(lhs_vec, rhs_vec));

        lhs += 4;
        rhs += 4;
        out += 4;
        remaining -= 4;
    }

    /* Stage 3: scalar clean-up of the last 0..3 elements. */
    for (unsigned int i = 0; i < remaining; i++) {
        out[i] = lhs[i] - rhs[i];
    }
}
239 
240 #endif /* LV_HAVE_NEONV8 */
241 
242 
243 #ifdef LV_HAVE_ORC
/*
 * Implementation defined outside this header (presumably generated by
 * ORC, judging by the name). Note that it takes the element count as a
 * signed int.
 */
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                int num_points);

/*
 * ORC kernel wrapper: forwards all arguments unchanged to
 * volk_32f_x2_subtract_32f_a_orc_impl.
 *
 * cVector    - destination buffer
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process; converted to int for the
 *              external implementation
 */
static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Same unsigned -> int conversion the call would perform implicitly,
     * spelled out so the narrowing is visible. */
    const int count = (int)num_points;

    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, count);
}
256 #endif /* LV_HAVE_ORC */
257 
258 
259 #endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
260 
261 
262 #ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
263 #define INCLUDED_volk_32f_x2_subtract_32f_u_H
264 
265 #include <inttypes.h>
266 #include <stdio.h>
267 
268 #ifdef LV_HAVE_AVX512F
269 #include <immintrin.h>
270 
/*
 * AVX-512F kernel (unaligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * The main loop handles 16 floats per iteration with the unaligned
 * _mm512_loadu_ps / _mm512_storeu_ps intrinsics. The remaining
 * num_points % 16 elements are delegated to the scalar generic kernel
 * (available when LV_HAVE_GENERIC is defined).
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_loadu_ps(aVector);
        __m512 bVal = _mm512_loadu_ps(bVector);

        __m512 cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_storeu_ps(cVector, cVal); // Store the results back into the C container

        aVector += 16;
        bVector += 16;
        cVector += 16;
    }

    /* The pointers now sit just past the vectorized region; finish the
     * leftover (< 16) elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - sixteenthPoints * 16);
}
294 #endif /* LV_HAVE_AVX512F */
295 
296 
297 #ifdef LV_HAVE_AVX
298 #include <immintrin.h>
299 
/*
 * AVX kernel (unaligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * The main loop handles 8 floats per iteration with the unaligned
 * _mm256_loadu_ps / _mm256_storeu_ps intrinsics. The remaining
 * num_points % 8 elements are delegated to the scalar generic kernel
 * (available when LV_HAVE_GENERIC is defined).
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        __m256 aVal = _mm256_loadu_ps(aVector);
        __m256 bVal = _mm256_loadu_ps(bVector);

        __m256 cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_storeu_ps(cVector, cVal); // Store the results back into the C container

        aVector += 8;
        bVector += 8;
        cVector += 8;
    }

    /* The pointers now sit just past the vectorized region; finish the
     * leftover (< 8) elements with the scalar kernel. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - eighthPoints * 8);
}
323 #endif /* LV_HAVE_AVX */
324 
325 #ifdef LV_HAVE_RVV
326 #include <riscv_vector.h>
327 
/*
 * RISC-V Vector kernel: cVector[i] = aVector[i] - bVector[i].
 *
 * Each pass asks __riscv_vsetvl_e32m8 how many elements to handle given
 * the count still outstanding, processes exactly that many, and
 * advances. The loop ends when nothing is left, so there is no separate
 * scalar tail.
 *
 * cVector    - destination buffer, at least num_points floats
 * aVector    - minuend samples
 * bVector    - subtrahend samples
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_subtract_32f_rvv(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Element count for this pass, as returned by vsetvl. */
        size_t vl = __riscv_vsetvl_e32m8(remaining);

        vfloat32m8_t lhs = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t rhs = __riscv_vle32_v_f32m8(bVector, vl);
        vfloat32m8_t diff = __riscv_vfsub(lhs, rhs, vl);

        __riscv_vse32(cVector, diff, vl);

        aVector += vl;
        bVector += vl;
        cVector += vl;
        remaining -= vl;
    }
}
341 #endif /*LV_HAVE_RVV*/
342 
343 #endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */
volk_32f_x2_subtract_32f_a_avx
static void volk_32f_x2_subtract_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:110
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_x2_subtract_32f_generic
static void volk_32f_x2_subtract_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:67
volk_32f_x2_subtract_32f_neon
static void volk_32f_x2_subtract_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:167
volk_32f_x2_subtract_32f_a_sse
static void volk_32f_x2_subtract_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:138
volk_32f_x2_subtract_32f_u_avx
static void volk_32f_x2_subtract_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:300