Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_x3_sum_of_poly_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
71 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
72 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 #include <volk/volk_complex.h>
77 
78 #ifndef MAX
79 #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
80 #endif
81 
82 #ifdef LV_HAVE_SSE3
83 #include <pmmintrin.h>
84 #include <xmmintrin.h>
85 
86 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
87  float* src0,
88  float* center_point_array,
89  float* cutoff,
90  unsigned int num_points)
91 {
92  float result = 0.0f;
93  float fst = 0.0f;
94  float sq = 0.0f;
95  float thrd = 0.0f;
96  float frth = 0.0f;
97 
98  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
99 
100  xmm9 = _mm_setzero_ps();
101  xmm1 = _mm_setzero_ps();
102  xmm0 = _mm_load1_ps(&center_point_array[0]);
103  xmm6 = _mm_load1_ps(&center_point_array[1]);
104  xmm7 = _mm_load1_ps(&center_point_array[2]);
105  xmm8 = _mm_load1_ps(&center_point_array[3]);
106  xmm10 = _mm_load1_ps(cutoff);
107 
108  int bound = num_points / 8;
109  int leftovers = num_points - 8 * bound;
110  int i = 0;
111  for (; i < bound; ++i) {
112  // 1st
113  xmm2 = _mm_load_ps(src0);
114  xmm2 = _mm_max_ps(xmm10, xmm2);
115  xmm3 = _mm_mul_ps(xmm2, xmm2);
116  xmm4 = _mm_mul_ps(xmm2, xmm3);
117  xmm5 = _mm_mul_ps(xmm3, xmm3);
118 
119  xmm2 = _mm_mul_ps(xmm2, xmm0);
120  xmm3 = _mm_mul_ps(xmm3, xmm6);
121  xmm4 = _mm_mul_ps(xmm4, xmm7);
122  xmm5 = _mm_mul_ps(xmm5, xmm8);
123 
124  xmm2 = _mm_add_ps(xmm2, xmm3);
125  xmm3 = _mm_add_ps(xmm4, xmm5);
126 
127  src0 += 4;
128 
129  xmm9 = _mm_add_ps(xmm2, xmm9);
130  xmm9 = _mm_add_ps(xmm3, xmm9);
131 
132  // 2nd
133  xmm2 = _mm_load_ps(src0);
134  xmm2 = _mm_max_ps(xmm10, xmm2);
135  xmm3 = _mm_mul_ps(xmm2, xmm2);
136  xmm4 = _mm_mul_ps(xmm2, xmm3);
137  xmm5 = _mm_mul_ps(xmm3, xmm3);
138 
139  xmm2 = _mm_mul_ps(xmm2, xmm0);
140  xmm3 = _mm_mul_ps(xmm3, xmm6);
141  xmm4 = _mm_mul_ps(xmm4, xmm7);
142  xmm5 = _mm_mul_ps(xmm5, xmm8);
143 
144  xmm2 = _mm_add_ps(xmm2, xmm3);
145  xmm3 = _mm_add_ps(xmm4, xmm5);
146 
147  src0 += 4;
148 
149  xmm1 = _mm_add_ps(xmm2, xmm1);
150  xmm1 = _mm_add_ps(xmm3, xmm1);
151  }
152  xmm2 = _mm_hadd_ps(xmm9, xmm1);
153  xmm3 = _mm_hadd_ps(xmm2, xmm2);
154  xmm4 = _mm_hadd_ps(xmm3, xmm3);
155  _mm_store_ss(&result, xmm4);
156 
157  for (i = 0; i < leftovers; ++i) {
158  fst = *src0++;
159  fst = MAX(fst, *cutoff);
160  sq = fst * fst;
161  thrd = fst * sq;
162  frth = sq * sq;
163  result += (center_point_array[0] * fst + center_point_array[1] * sq +
164  center_point_array[2] * thrd + center_point_array[3] * frth);
165  }
166 
167  result += (float)(num_points)*center_point_array[4];
168  *target = result;
169 }
170 
171 
172 #endif /*LV_HAVE_SSE3*/
173 
174 #if LV_HAVE_AVX && LV_HAVE_FMA
175 #include <immintrin.h>
176 
177 static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
178  float* src0,
179  float* center_point_array,
180  float* cutoff,
181  unsigned int num_points)
182 {
183  const unsigned int eighth_points = num_points / 8;
184  float fst = 0.0;
185  float sq = 0.0;
186  float thrd = 0.0;
187  float frth = 0.0;
188 
189  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
190  __m256 target_vec;
191  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
192 
193  cpa0 = _mm256_set1_ps(center_point_array[0]);
194  cpa1 = _mm256_set1_ps(center_point_array[1]);
195  cpa2 = _mm256_set1_ps(center_point_array[2]);
196  cpa3 = _mm256_set1_ps(center_point_array[3]);
197  cutoff_vec = _mm256_set1_ps(*cutoff);
198  target_vec = _mm256_setzero_ps();
199 
200  unsigned int i;
201 
202  for (i = 0; i < eighth_points; ++i) {
203  x_to_1 = _mm256_load_ps(src0);
204  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
205  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
206  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
207  // x^1 * x^3 is slightly faster than x^2 * x^2
208  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
209 
210  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
211  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
212 
213  x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
214  x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
215  // this is slightly faster than result += (x_to_1 + x_to_3)
216  target_vec = _mm256_add_ps(x_to_1, target_vec);
217  target_vec = _mm256_add_ps(x_to_3, target_vec);
218 
219  src0 += 8;
220  }
221 
222  // the hadd for vector reduction has very very slight impact @ 50k iters
223  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
224  target_vec = _mm256_hadd_ps(
225  target_vec,
226  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
227  _mm256_store_ps(temp_results, target_vec);
228  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
229 
230  for (i = eighth_points * 8; i < num_points; ++i) {
231  fst = *src0++;
232  fst = MAX(fst, *cutoff);
233  sq = fst * fst;
234  thrd = fst * sq;
235  frth = sq * sq;
236  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
237  center_point_array[2] * thrd + center_point_array[3] * frth);
238  }
239  *target += (float)(num_points)*center_point_array[4];
240 }
241 #endif // LV_HAVE_AVX && LV_HAVE_FMA
242 
243 #ifdef LV_HAVE_AVX
244 #include <immintrin.h>
245 
246 static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
247  float* src0,
248  float* center_point_array,
249  float* cutoff,
250  unsigned int num_points)
251 {
252  const unsigned int eighth_points = num_points / 8;
253  float fst = 0.0;
254  float sq = 0.0;
255  float thrd = 0.0;
256  float frth = 0.0;
257 
258  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
259  __m256 target_vec;
260  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
261 
262  cpa0 = _mm256_set1_ps(center_point_array[0]);
263  cpa1 = _mm256_set1_ps(center_point_array[1]);
264  cpa2 = _mm256_set1_ps(center_point_array[2]);
265  cpa3 = _mm256_set1_ps(center_point_array[3]);
266  cutoff_vec = _mm256_set1_ps(*cutoff);
267  target_vec = _mm256_setzero_ps();
268 
269  unsigned int i;
270 
271  for (i = 0; i < eighth_points; ++i) {
272  x_to_1 = _mm256_load_ps(src0);
273  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
274  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
275  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
276  // x^1 * x^3 is slightly faster than x^2 * x^2
277  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
278 
279  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
280  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
281  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
282  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
283 
284  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
285  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
286  // this is slightly faster than result += (x_to_1 + x_to_3)
287  target_vec = _mm256_add_ps(x_to_1, target_vec);
288  target_vec = _mm256_add_ps(x_to_3, target_vec);
289 
290  src0 += 8;
291  }
292 
293  // the hadd for vector reduction has very very slight impact @ 50k iters
294  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
295  target_vec = _mm256_hadd_ps(
296  target_vec,
297  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
298  _mm256_store_ps(temp_results, target_vec);
299  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
300 
301  for (i = eighth_points * 8; i < num_points; ++i) {
302  fst = *src0++;
303  fst = MAX(fst, *cutoff);
304  sq = fst * fst;
305  thrd = fst * sq;
306  frth = sq * sq;
307  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
308  center_point_array[2] * thrd + center_point_array[3] * frth);
309  }
310  *target += (float)(num_points)*center_point_array[4];
311 }
312 #endif // LV_HAVE_AVX
313 
314 
315 #ifdef LV_HAVE_GENERIC
316 
317 static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
318  float* src0,
319  float* center_point_array,
320  float* cutoff,
321  unsigned int num_points)
322 {
323  const unsigned int eighth_points = num_points / 8;
324 
325  float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
326  float fst = 0.0f;
327  float sq = 0.0f;
328  float thrd = 0.0f;
329  float frth = 0.0f;
330 
331  unsigned int i = 0;
332  unsigned int k = 0;
333  for (i = 0; i < eighth_points; ++i) {
334  for (k = 0; k < 8; ++k) {
335  fst = *src0++;
336  fst = MAX(fst, *cutoff);
337  sq = fst * fst;
338  thrd = fst * sq;
339  frth = fst * thrd;
340  result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
341  result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
342  }
343  }
344  for (k = 0; k < 8; k += 2) {
345  result[k] = result[k] + result[k + 1];
346  }
347 
348  *target = result[0] + result[2] + result[4] + result[6];
349 
350  for (i = eighth_points * 8; i < num_points; ++i) {
351  fst = *src0++;
352  fst = MAX(fst, *cutoff);
353  sq = fst * fst;
354  thrd = fst * sq;
355  frth = fst * thrd;
356  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
357  center_point_array[2] * thrd + center_point_array[3] * frth);
358  }
359  *target += (float)(num_points)*center_point_array[4];
360 }
361 
362 #endif /*LV_HAVE_GENERIC*/
363 
364 #ifdef LV_HAVE_NEON
365 #include <arm_neon.h>
366 
367 static inline void volk_32f_x3_sum_of_poly_32f_neon(float* __restrict target,
368  float* __restrict src0,
369  float* __restrict center_point_array,
370  float* __restrict cutoff,
371  unsigned int num_points)
372 {
373  unsigned int i;
374  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
375 
376  float accumulator;
377 
378  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
379  accumulator1_vec = vld1q_f32(zero);
380  accumulator2_vec = vld1q_f32(zero);
381  accumulator3_vec = vld1q_f32(zero);
382  accumulator4_vec = vld1q_f32(zero);
383  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
384  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
385 
386  // load the cutoff in to a vector
387  cutoff_vector = vdupq_n_f32(*cutoff);
388  // ... center point array
389  cpa_0 = vdupq_n_f32(center_point_array[0]);
390  cpa_1 = vdupq_n_f32(center_point_array[1]);
391  cpa_2 = vdupq_n_f32(center_point_array[2]);
392  cpa_3 = vdupq_n_f32(center_point_array[3]);
393 
394  for (i = 0; i < num_points / 4; ++i) {
395  // load x
396  x_to_1 = vld1q_f32(src0);
397 
398  // Get a vector of max(src0, cutoff)
399  x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
400  x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
401  x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
402  x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
403  x_to_1 = vmulq_f32(x_to_1, cpa_0);
404  x_to_2 = vmulq_f32(x_to_2, cpa_1);
405  x_to_3 = vmulq_f32(x_to_3, cpa_2);
406  x_to_4 = vmulq_f32(x_to_4, cpa_3);
407  accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
408  accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
409  accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
410  accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
411 
412  src0 += 4;
413  }
414  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
415  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
416  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
417 
418  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
419  vst1q_f32(res_accumulators, accumulator1_vec);
420  accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
421  res_accumulators[3];
422 
423  float fst = 0.0;
424  float sq = 0.0;
425  float thrd = 0.0;
426  float frth = 0.0;
427 
428  for (i = 4 * (num_points / 4); i < num_points; ++i) {
429  fst = *src0++;
430  fst = MAX(fst, *cutoff);
431 
432  sq = fst * fst;
433  thrd = fst * sq;
434  frth = sq * sq;
435  // fith = sq * thrd;
436 
437  accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
438  center_point_array[2] * thrd + center_point_array[3] * frth); //+
439  }
440 
441  *target = accumulator + (float)num_points * center_point_array[4];
442 }
443 
444 #endif /* LV_HAVE_NEON */
445 
446 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
447 
448 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
449 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
450 
451 #include <inttypes.h>
452 #include <stdio.h>
453 #include <volk/volk_complex.h>
454 
455 #ifndef MAX
456 #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
457 #endif
458 
459 #if LV_HAVE_AVX && LV_HAVE_FMA
460 #include <immintrin.h>
461 
462 static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
463  float* src0,
464  float* center_point_array,
465  float* cutoff,
466  unsigned int num_points)
467 {
468  const unsigned int eighth_points = num_points / 8;
469  float fst = 0.0;
470  float sq = 0.0;
471  float thrd = 0.0;
472  float frth = 0.0;
473 
474  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
475  __m256 target_vec;
476  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
477 
478  cpa0 = _mm256_set1_ps(center_point_array[0]);
479  cpa1 = _mm256_set1_ps(center_point_array[1]);
480  cpa2 = _mm256_set1_ps(center_point_array[2]);
481  cpa3 = _mm256_set1_ps(center_point_array[3]);
482  cutoff_vec = _mm256_set1_ps(*cutoff);
483  target_vec = _mm256_setzero_ps();
484 
485  unsigned int i;
486 
487  for (i = 0; i < eighth_points; ++i) {
488  x_to_1 = _mm256_loadu_ps(src0);
489  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
490  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
491  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
492  // x^1 * x^3 is slightly faster than x^2 * x^2
493  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
494 
495  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
496  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
497 
498  x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
499  x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
500  // this is slightly faster than result += (x_to_1 + x_to_3)
501  target_vec = _mm256_add_ps(x_to_1, target_vec);
502  target_vec = _mm256_add_ps(x_to_3, target_vec);
503 
504  src0 += 8;
505  }
506 
507  // the hadd for vector reduction has very very slight impact @ 50k iters
508  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
509  target_vec = _mm256_hadd_ps(
510  target_vec,
511  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
512  _mm256_storeu_ps(temp_results, target_vec);
513  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
514 
515  for (i = eighth_points * 8; i < num_points; ++i) {
516  fst = *src0++;
517  fst = MAX(fst, *cutoff);
518  sq = fst * fst;
519  thrd = fst * sq;
520  frth = sq * sq;
521  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
522  center_point_array[2] * thrd + center_point_array[3] * frth);
523  }
524 
525  *target += (float)(num_points)*center_point_array[4];
526 }
527 #endif // LV_HAVE_AVX && LV_HAVE_FMA
528 
529 #ifdef LV_HAVE_AVX
530 #include <immintrin.h>
531 
532 static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
533  float* src0,
534  float* center_point_array,
535  float* cutoff,
536  unsigned int num_points)
537 {
538  const unsigned int eighth_points = num_points / 8;
539  float fst = 0.0;
540  float sq = 0.0;
541  float thrd = 0.0;
542  float frth = 0.0;
543 
544  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
545  __m256 target_vec;
546  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
547 
548  cpa0 = _mm256_set1_ps(center_point_array[0]);
549  cpa1 = _mm256_set1_ps(center_point_array[1]);
550  cpa2 = _mm256_set1_ps(center_point_array[2]);
551  cpa3 = _mm256_set1_ps(center_point_array[3]);
552  cutoff_vec = _mm256_set1_ps(*cutoff);
553  target_vec = _mm256_setzero_ps();
554 
555  unsigned int i;
556 
557  for (i = 0; i < eighth_points; ++i) {
558  x_to_1 = _mm256_loadu_ps(src0);
559  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
560  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
561  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
562  // x^1 * x^3 is slightly faster than x^2 * x^2
563  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
564 
565  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
566  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
567  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
568  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
569 
570  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
571  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
572  // this is slightly faster than result += (x_to_1 + x_to_3)
573  target_vec = _mm256_add_ps(x_to_1, target_vec);
574  target_vec = _mm256_add_ps(x_to_3, target_vec);
575 
576  src0 += 8;
577  }
578 
579  // the hadd for vector reduction has very very slight impact @ 50k iters
580  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
581  target_vec = _mm256_hadd_ps(
582  target_vec,
583  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
584  _mm256_storeu_ps(temp_results, target_vec);
585  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
586 
587  for (i = eighth_points * 8; i < num_points; ++i) {
588  fst = *src0++;
589  fst = MAX(fst, *cutoff);
590  sq = fst * fst;
591  thrd = fst * sq;
592  frth = sq * sq;
593 
594  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
595  center_point_array[2] * thrd + center_point_array[3] * frth);
596  }
597 
598  *target += (float)(num_points)*center_point_array[4];
599 }
600 #endif // LV_HAVE_AVX
601 
602 #ifdef LV_HAVE_RVV
603 #include <riscv_vector.h>
605 
606 static inline void volk_32f_x3_sum_of_poly_32f_rvv(float* target,
607  float* src0,
608  float* center_point_array,
609  float* cutoff,
610  unsigned int num_points)
611 {
612  size_t vlmax = __riscv_vsetvlmax_e32m4();
613  vfloat32m4_t vsum = __riscv_vfmv_v_f_f32m4(0, vlmax);
614  float mul1 = center_point_array[0]; // scalar to avoid register spills
615  float mul2 = center_point_array[1];
616  vfloat32m4_t vmul3 = __riscv_vfmv_v_f_f32m4(center_point_array[2], vlmax);
617  vfloat32m4_t vmul4 = __riscv_vfmv_v_f_f32m4(center_point_array[3], vlmax);
618  vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(*cutoff, vlmax);
619 
620  size_t n = num_points;
621  for (size_t vl; n > 0; n -= vl, src0 += vl) {
622  vl = __riscv_vsetvl_e32m4(n);
623  vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
624  vfloat32m4_t v1 = __riscv_vfmax(v, vmax, vl);
625  vfloat32m4_t v2 = __riscv_vfmul(v1, v1, vl);
626  vfloat32m4_t v3 = __riscv_vfmul(v1, v2, vl);
627  vfloat32m4_t v4 = __riscv_vfmul(v2, v2, vl);
628  v2 = __riscv_vfmul(v2, mul2, vl);
629  v4 = __riscv_vfmul(v4, vmul4, vl);
630  v1 = __riscv_vfmadd(v1, mul1, v2, vl);
631  v3 = __riscv_vfmadd(v3, vmul3, v4, vl);
632  v1 = __riscv_vfadd(v1, v3, vl);
633  vsum = __riscv_vfadd_tu(vsum, vsum, v1, vl);
634  }
635  size_t vl = __riscv_vsetvlmax_e32m1();
636  vfloat32m1_t v = RISCV_SHRINK4(vfadd, f, 32, vsum);
637  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
638  float sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
639  *target = sum + num_points * center_point_array[4];
640 }
641 #endif /*LV_HAVE_RVV*/
642 
643 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
volk_32f_x3_sum_of_poly_32f_neon
static void volk_32f_x3_sum_of_poly_32f_neon(float *__restrict target, float *__restrict src0, float *__restrict center_point_array, float *__restrict cutoff, unsigned int num_points)
Definition: volk_32f_x3_sum_of_poly_32f.h:367
volk_32f_x3_sum_of_poly_32f_u_avx
static void volk_32f_x3_sum_of_poly_32f_u_avx(float *target, float *src0, float *center_point_array, float *cutoff, unsigned int num_points)
Definition: volk_32f_x3_sum_of_poly_32f.h:532
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
MAX
#define MAX(X, Y)
Definition: volk_32f_x3_sum_of_poly_32f.h:79
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_32f_x3_sum_of_poly_32f_generic
static void volk_32f_x3_sum_of_poly_32f_generic(float *target, float *src0, float *center_point_array, float *cutoff, unsigned int num_points)
Definition: volk_32f_x3_sum_of_poly_32f.h:317
volk_32f_x3_sum_of_poly_32f_a_avx
static void volk_32f_x3_sum_of_poly_32f_a_avx(float *target, float *src0, float *center_point_array, float *cutoff, unsigned int num_points)
Definition: volk_32f_x3_sum_of_poly_32f.h:246
volk_complex.h
volk_rvv_intrinsics.h
volk_32f_x3_sum_of_poly_32f_a_sse3
static void volk_32f_x3_sum_of_poly_32f_a_sse3(float *target, float *src0, float *center_point_array, float *cutoff, unsigned int num_points)
Definition: volk_32f_x3_sum_of_poly_32f.h:86