Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_64f_x2_max_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_64f_x2_max_64f_a_H
59 #define INCLUDED_volk_64f_x2_max_64f_a_H
60 
61 #include <inttypes.h>
62 #include <stdio.h>
63 
64 #ifdef LV_HAVE_AVX512F
65 #include <immintrin.h>
66 
/*
 * Element-wise maximum of two double vectors, AVX-512F, aligned variant.
 *
 * cVector    - output; element i receives the larger of aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Full groups of eight doubles go through aligned 512-bit loads and stores
 * (_mm512_load_pd / _mm512_store_pd); the remaining 0..7 elements are
 * handled one at a time with a plain comparison.
 */
static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector,
                                                 const double* aVector,
                                                 const double* bVector,
                                                 unsigned int num_points)
{
    /* Index of the first element that does not fit into a full 8-wide group. */
    const unsigned int vec_end = (num_points / 8) * 8;
    unsigned int i = 0;

    /* Vector part: eight doubles per iteration. */
    while (i < vec_end) {
        const __m512d lhs = _mm512_load_pd(aVector + i);
        const __m512d rhs = _mm512_load_pd(bVector + i);
        _mm512_store_pd(cVector + i, _mm512_max_pd(lhs, rhs));
        i += 8;
    }

    /* Scalar tail: keeps a only when a > b, otherwise takes b. */
    while (i < num_points) {
        const double a = aVector[i];
        const double b = bVector[i];
        if (a > b) {
            cVector[i] = a;
        } else {
            cVector[i] = b;
        }
        i++;
    }
}
101 #endif /* LV_HAVE_AVX512F */
102 
103 
104 #ifdef LV_HAVE_AVX
105 #include <immintrin.h>
106 
/*
 * Element-wise maximum of two double vectors, AVX, aligned variant.
 *
 * cVector    - output; element i receives the larger of aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Full groups of four doubles go through aligned 256-bit loads and stores
 * (_mm256_load_pd / _mm256_store_pd); the remaining 0..3 elements are
 * handled one at a time with a plain comparison.
 */
static inline void volk_64f_x2_max_64f_a_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    /* Index of the first element that does not fit into a full 4-wide group. */
    const unsigned int vec_end = (num_points / 4) * 4;
    unsigned int i = 0;

    /* Vector part: four doubles per iteration. */
    while (i < vec_end) {
        const __m256d lhs = _mm256_load_pd(aVector + i);
        const __m256d rhs = _mm256_load_pd(bVector + i);
        _mm256_store_pd(cVector + i, _mm256_max_pd(lhs, rhs));
        i += 4;
    }

    /* Scalar tail: keeps a only when a > b, otherwise takes b. */
    while (i < num_points) {
        const double a = aVector[i];
        const double b = bVector[i];
        if (a > b) {
            cVector[i] = a;
        } else {
            cVector[i] = b;
        }
        i++;
    }
}
141 #endif /* LV_HAVE_AVX */
142 
143 
144 #ifdef LV_HAVE_SSE2
145 #include <emmintrin.h>
146 
/*
 * Element-wise maximum of two double vectors, SSE2, aligned variant.
 *
 * cVector    - output; element i receives the larger of aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Pairs of doubles go through aligned 128-bit loads and stores
 * (_mm_load_pd / _mm_store_pd); an odd trailing element is handled with a
 * plain comparison.
 */
static inline void volk_64f_x2_max_64f_a_sse2(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    /* Index of the first element that does not fit into a full pair. */
    const unsigned int vec_end = (num_points / 2) * 2;
    unsigned int i = 0;

    /* Vector part: two doubles per iteration. */
    while (i < vec_end) {
        const __m128d lhs = _mm_load_pd(aVector + i);
        const __m128d rhs = _mm_load_pd(bVector + i);
        _mm_store_pd(cVector + i, _mm_max_pd(lhs, rhs));
        i += 2;
    }

    /* Scalar tail: keeps a only when a > b, otherwise takes b. */
    while (i < num_points) {
        const double a = aVector[i];
        const double b = bVector[i];
        if (a > b) {
            cVector[i] = a;
        } else {
            cVector[i] = b;
        }
        i++;
    }
}
181 #endif /* LV_HAVE_SSE2 */
182 
183 
184 #ifdef LV_HAVE_GENERIC
185 
/*
 * Element-wise maximum of two double vectors, portable C implementation.
 *
 * cVector    - output; element i receives the larger of aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * For each index the result is a when a > b holds and b in every other case.
 */
static inline void volk_64f_x2_max_64f_generic(double* cVector,
                                               const double* aVector,
                                               const double* bVector,
                                               unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const double a = aVector[i];
        const double b = bVector[i];

        if (a > b) {
            cVector[i] = a;
        } else {
            cVector[i] = b;
        }
    }
}
202 #endif /* LV_HAVE_GENERIC */
203 
204 
205 #endif /* INCLUDED_volk_64f_x2_max_64f_a_H */
206 
207 
208 #ifndef INCLUDED_volk_64f_x2_max_64f_u_H
209 #define INCLUDED_volk_64f_x2_max_64f_u_H
210 
211 #include <inttypes.h>
212 #include <stdio.h>
213 
214 #ifdef LV_HAVE_AVX512F
215 #include <immintrin.h>
216 
/*
 * Element-wise maximum of two double vectors, AVX-512F, unaligned variant.
 *
 * cVector    - output; element i receives the larger of aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Full groups of eight doubles go through unaligned 512-bit loads and stores
 * (_mm512_loadu_pd / _mm512_storeu_pd); the remaining 0..7 elements are
 * handled one at a time with a plain comparison.
 */
static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector,
                                                 const double* aVector,
                                                 const double* bVector,
                                                 unsigned int num_points)
{
    /* Index of the first element that does not fit into a full 8-wide group. */
    const unsigned int vec_end = (num_points / 8) * 8;
    unsigned int i = 0;

    /* Vector part: eight doubles per iteration. */
    while (i < vec_end) {
        const __m512d lhs = _mm512_loadu_pd(aVector + i);
        const __m512d rhs = _mm512_loadu_pd(bVector + i);
        _mm512_storeu_pd(cVector + i, _mm512_max_pd(lhs, rhs));
        i += 8;
    }

    /* Scalar tail: keeps a only when a > b, otherwise takes b. */
    while (i < num_points) {
        const double a = aVector[i];
        const double b = bVector[i];
        if (a > b) {
            cVector[i] = a;
        } else {
            cVector[i] = b;
        }
        i++;
    }
}
251 #endif /* LV_HAVE_AVX512F */
252 
253 
254 #ifdef LV_HAVE_AVX
255 #include <immintrin.h>
256 
/*
 * Element-wise maximum of two double vectors, AVX, unaligned variant.
 *
 * cVector    - output; element i receives the larger of aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Full groups of four doubles go through unaligned 256-bit loads and stores
 * (_mm256_loadu_pd / _mm256_storeu_pd); the remaining 0..3 elements are
 * handled one at a time with a plain comparison.
 */
static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    /* Index of the first element that does not fit into a full 4-wide group. */
    const unsigned int vec_end = (num_points / 4) * 4;
    unsigned int i = 0;

    /* Vector part: four doubles per iteration. */
    while (i < vec_end) {
        const __m256d lhs = _mm256_loadu_pd(aVector + i);
        const __m256d rhs = _mm256_loadu_pd(bVector + i);
        _mm256_storeu_pd(cVector + i, _mm256_max_pd(lhs, rhs));
        i += 4;
    }

    /* Scalar tail: keeps a only when a > b, otherwise takes b. */
    while (i < num_points) {
        const double a = aVector[i];
        const double b = bVector[i];
        if (a > b) {
            cVector[i] = a;
        } else {
            cVector[i] = b;
        }
        i++;
    }
}
291 #endif /* LV_HAVE_AVX */
292 
293 #ifdef LV_HAVE_NEONV8
294 #include <arm_neon.h>
295 
/*
 * Element-wise maximum of two double vectors, ARMv8 NEON implementation.
 *
 * cVector    - output; element i receives the larger of aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * Each main-loop iteration handles four doubles as two 128-bit halves and
 * issues __VOLK_PREFETCH on the inputs of the following iteration. The
 * remaining 0..3 elements are handled one at a time with a plain comparison.
 */
static inline void volk_64f_x2_max_64f_neonv8(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    /* Index of the first element that does not fit into a full 4-wide group. */
    const unsigned int vec_end = (num_points / 4) * 4;
    unsigned int i = 0;

    while (i < vec_end) {
        const double* a_in = aVector + i;
        const double* b_in = bVector + i;
        double* c_out = cVector + i;

        /* All four loads are issued before any store. */
        float64x2_t a_lo = vld1q_f64(a_in);
        float64x2_t a_hi = vld1q_f64(a_in + 2);
        float64x2_t b_lo = vld1q_f64(b_in);
        float64x2_t b_hi = vld1q_f64(b_in + 2);
        __VOLK_PREFETCH(a_in + 4);
        __VOLK_PREFETCH(b_in + 4);

        float64x2_t max_lo = vmaxq_f64(a_lo, b_lo);
        float64x2_t max_hi = vmaxq_f64(a_hi, b_hi);

        vst1q_f64(c_out, max_lo);
        vst1q_f64(c_out + 2, max_hi);

        i += 4;
    }

    /* Scalar tail: keeps a only when a > b, otherwise takes b. */
    while (i < num_points) {
        const double a = aVector[i];
        const double b = bVector[i];
        if (a > b) {
            cVector[i] = a;
        } else {
            cVector[i] = b;
        }
        i++;
    }
}
334 #endif /* LV_HAVE_NEONV8 */
335 
336 #ifdef LV_HAVE_RVV
337 #include <riscv_vector.h>
338 
/*
 * Element-wise maximum of two double vectors, RISC-V Vector implementation.
 *
 * cVector    - output; element i receives the result for aVector[i], bVector[i]
 * aVector    - first input vector
 * bVector    - second input vector
 * num_points - number of elements to process
 *
 * The loop is strip-mined: each pass asks __riscv_vsetvl_e64m8 how many
 * elements it may process, handles that many, and advances all three
 * pointers by the same amount, so no separate scalar tail is needed.
 */
static inline void volk_64f_x2_max_64f_rvv(double* cVector,
                                           const double* aVector,
                                           const double* bVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Number of elements handled in this pass. */
        size_t vl = __riscv_vsetvl_e64m8(remaining);

        vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        vfloat64m8_t vmax = __riscv_vfmax(va, vb, vl);
        __riscv_vse64(cVector, vmax, vl);

        remaining -= vl;
        aVector += vl;
        bVector += vl;
        cVector += vl;
    }
}
352 #endif /*LV_HAVE_RVV*/
353 
354 #endif /* INCLUDED_volk_64f_x2_max_64f_u_H */
volk_64f_x2_max_64f_a_sse2
static void volk_64f_x2_max_64f_a_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:147
volk_64f_x2_max_64f_u_avx
static void volk_64f_x2_max_64f_u_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:257
volk_64f_x2_max_64f_a_avx
static void volk_64f_x2_max_64f_a_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:107
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_64f_x2_max_64f_generic
static void volk_64f_x2_max_64f_generic(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:186