Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
53 #define INCLUDED_volk_32f_index_max_32u_a_H
54 
55 #include <inttypes.h>
56 #include <stdio.h>
57 #include <volk/volk_common.h>
58 
59 #ifdef LV_HAVE_SSE4_1
60 #include <smmintrin.h>
61 
62 static inline void
63 volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
64 {
65  if (num_points > 0) {
66  uint32_t number = 0;
67  const uint32_t quarterPoints = num_points / 4;
68 
69  float* inputPtr = (float*)src0;
70 
71  __m128 indexIncrementValues = _mm_set1_ps(4);
72  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
73 
74  float max = src0[0];
75  float index = 0;
76  __m128 maxValues = _mm_set1_ps(max);
77  __m128 maxValuesIndex = _mm_setzero_ps();
78  __m128 compareResults;
79  __m128 currentValues;
80 
81  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
82  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
83 
84  for (; number < quarterPoints; number++) {
85 
86  currentValues = _mm_load_ps(inputPtr);
87  inputPtr += 4;
88  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
89 
90  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
91 
92  maxValuesIndex =
93  _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
94  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
95  }
96 
97  // Calculate the largest value from the remaining 4 points
98  _mm_store_ps(maxValuesBuffer, maxValues);
99  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
100 
101  for (number = 0; number < 4; number++) {
102  if (maxValuesBuffer[number] > max) {
103  index = maxIndexesBuffer[number];
104  max = maxValuesBuffer[number];
105  } else if (maxValuesBuffer[number] == max) {
106  if (index > maxIndexesBuffer[number])
107  index = maxIndexesBuffer[number];
108  }
109  }
110 
111  number = quarterPoints * 4;
112  for (; number < num_points; number++) {
113  if (src0[number] > max) {
114  index = number;
115  max = src0[number];
116  }
117  }
118  target[0] = (uint32_t)index;
119  }
120 }
121 
122 #endif /*LV_HAVE_SSE4_1*/
123 
124 
125 #ifdef LV_HAVE_SSE
126 
127 #include <xmmintrin.h>
128 
129 static inline void
130 volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
131 {
132  if (num_points > 0) {
133  uint32_t number = 0;
134  const uint32_t quarterPoints = num_points / 4;
135 
136  float* inputPtr = (float*)src0;
137 
138  __m128 indexIncrementValues = _mm_set1_ps(4);
139  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
140 
141  float max = src0[0];
142  float index = 0;
143  __m128 maxValues = _mm_set1_ps(max);
144  __m128 maxValuesIndex = _mm_setzero_ps();
145  __m128 compareResults;
146  __m128 currentValues;
147 
148  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
149  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
150 
151  for (; number < quarterPoints; number++) {
152 
153  currentValues = _mm_load_ps(inputPtr);
154  inputPtr += 4;
155  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
156 
157  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
158 
159  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
160  _mm_andnot_ps(compareResults, maxValuesIndex));
161 
162  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
163  _mm_andnot_ps(compareResults, maxValues));
164  }
165 
166  // Calculate the largest value from the remaining 4 points
167  _mm_store_ps(maxValuesBuffer, maxValues);
168  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
169 
170  for (number = 0; number < 4; number++) {
171  if (maxValuesBuffer[number] > max) {
172  index = maxIndexesBuffer[number];
173  max = maxValuesBuffer[number];
174  } else if (maxValuesBuffer[number] == max) {
175  if (index > maxIndexesBuffer[number])
176  index = maxIndexesBuffer[number];
177  }
178  }
179 
180  number = quarterPoints * 4;
181  for (; number < num_points; number++) {
182  if (src0[number] > max) {
183  index = number;
184  max = src0[number];
185  }
186  }
187  target[0] = (uint32_t)index;
188  }
189 }
190 
191 #endif /*LV_HAVE_SSE*/
192 
193 
194 #ifdef LV_HAVE_AVX
195 #include <immintrin.h>
196 
197 static inline void
198 volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
199 {
200  if (num_points > 0) {
201  uint32_t number = 0;
202  const uint32_t quarterPoints = num_points / 8;
203 
204  float* inputPtr = (float*)src0;
205 
206  __m256 indexIncrementValues = _mm256_set1_ps(8);
207  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
208 
209  float max = src0[0];
210  float index = 0;
211  __m256 maxValues = _mm256_set1_ps(max);
212  __m256 maxValuesIndex = _mm256_setzero_ps();
213  __m256 compareResults;
214  __m256 currentValues;
215 
216  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
217  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
218 
219  for (; number < quarterPoints; number++) {
220  currentValues = _mm256_load_ps(inputPtr);
221  inputPtr += 8;
222  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
223  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
224  maxValuesIndex =
225  _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
226  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
227  }
228 
229  // Calculate the largest value from the remaining 8 points
230  _mm256_store_ps(maxValuesBuffer, maxValues);
231  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
232 
233  for (number = 0; number < 8; number++) {
234  if (maxValuesBuffer[number] > max) {
235  index = maxIndexesBuffer[number];
236  max = maxValuesBuffer[number];
237  } else if (maxValuesBuffer[number] == max) {
238  if (index > maxIndexesBuffer[number])
239  index = maxIndexesBuffer[number];
240  }
241  }
242 
243  number = quarterPoints * 8;
244  for (; number < num_points; number++) {
245  if (src0[number] > max) {
246  index = number;
247  max = src0[number];
248  }
249  }
250  target[0] = (uint32_t)index;
251  }
252 }
253 
254 #endif /*LV_HAVE_AVX*/
255 
256 
257 #ifdef LV_HAVE_NEON
258 #include <arm_neon.h>
259 
/*
 * Finds the index of the largest value in src0 (NEON).
 *
 * target     - receives the index of the first occurrence of the maximum;
 *              left untouched when num_points is 0.
 * src0       - input samples.
 * num_points - number of samples in src0.
 *
 * Lane indices are kept as 32-bit unsigned integers. Counting them in float
 * lanes and converting afterwards is exact only up to 2^24 and reports wrong
 * positions on larger inputs.
 */
static inline void
volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0) {
        return;
    }

    const uint32_t quarterPoints = num_points / 4;
    const float* inputPtr = src0;

    const uint32x4_t indexIncrementValues = vdupq_n_u32(4);
    const uint32_t firstIndexes[4] = { 0, 1, 2, 3 };
    uint32x4_t currentIndexes = vld1q_u32(firstIndexes);

    float max = src0[0];
    uint32_t index = 0;
    float32x4_t maxValues = vdupq_n_f32(max);
    uint32x4_t maxValuesIndex = vmovq_n_u32(0);

    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) uint32_t maxIndexesBuffer[4];

    for (uint32_t number = 0; number < quarterPoints; number++) {
        const float32x4_t currentValues = vld1q_f32(inputPtr);
        inputPtr += 4;

        /* Mask is set where the stored maximum still holds (new <= old): keep
         * the stored index there, otherwise take the current index. */
        const uint32x4_t compareResults = vcleq_f32(currentValues, maxValues);
        maxValuesIndex = vbslq_u32(compareResults, maxValuesIndex, currentIndexes);
        maxValues = vmaxq_f32(currentValues, maxValues);

        currentIndexes = vaddq_u32(currentIndexes, indexIncrementValues);
    }

    /* Reduce the four per-lane maxima; on equal values keep the lowest index. */
    vst1q_f32(maxValuesBuffer, maxValues);
    vst1q_u32(maxIndexesBuffer, maxValuesIndex);

    for (uint32_t lane = 0; lane < 4; lane++) {
        if (maxValuesBuffer[lane] > max) {
            index = maxIndexesBuffer[lane];
            max = maxValuesBuffer[lane];
        } else if (maxValuesBuffer[lane] == max) {
            if (index > maxIndexesBuffer[lane]) {
                index = maxIndexesBuffer[lane];
            }
        }
    }

    /* Scalar pass over the samples that did not fill a whole vector. */
    for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
        if (src0[number] > max) {
            index = number;
            max = src0[number];
        }
    }
    target[0] = index;
}
318 
319 #endif /*LV_HAVE_NEON*/
320 
321 
322 #ifdef LV_HAVE_NEONV8
323 #include <arm_neon.h>
324 #include <float.h>
325 
/*
 * Finds the index of the largest value in src0 (ARMv8 NEON).
 *
 * target     - receives the index of the first occurrence of the maximum;
 *              left untouched when num_points is 0.
 * src0       - input samples.
 * num_points - number of samples in src0.
 *
 * The running maximum is seeded from src0[0] (index 0) rather than from
 * -FLT_MAX. With a -FLT_MAX seed the strict ">" comparison never selects a
 * -FLT_MAX sample, so an input made of -inf and -FLT_MAX values would report
 * index 0 instead of the first -FLT_MAX.
 */
static inline void
volk_32f_index_max_32u_neonv8(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;

    float max_val = src0[0];
    uint32_t result_idx = 0;

    /* Only reduce across lanes when at least one full vector was processed. */
    if (quarter_points > 0) {
        const float* inputPtr = src0;

        // Use integer indices directly (no float conversion overhead)
        uint32x4_t vec_indices = { 0, 1, 2, 3 };
        const uint32x4_t vec_incr = vdupq_n_u32(4);

        /* A lane that never sees a larger sample keeps (src0[0], 0), which is
         * a valid value/index pair. */
        float32x4_t vec_max = vdupq_n_f32(max_val);
        uint32x4_t vec_max_idx = vdupq_n_u32(0);

        for (uint32_t i = 0; i < quarter_points; i++) {
            float32x4_t vec_val = vld1q_f32(inputPtr);
            inputPtr += 4;

            // Compare BEFORE max update to know which lanes change
            uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
            vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);

            // vmaxq_f32 is single-cycle, no dependency on comparison result
            vec_max = vmaxq_f32(vec_val, vec_max);

            vec_indices = vaddq_u32(vec_indices, vec_incr);
        }

        // ARMv8 horizontal reduction - find max value across all lanes
        max_val = vmaxvq_f32(vec_max);

        // Find which lane(s) have the max value, get minimum index among them
        // NOTE(review): if max_val is NaN no lane compares equal and the
        // result is UINT32_MAX - confirm how NaN input should be handled.
        uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
        uint32x4_t idx_masked =
            vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
        result_idx = vminvq_u32(idx_masked);
    }

    // Handle tail elements
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        if (src0[i] > max_val) {
            max_val = src0[i];
            result_idx = i;
        }
    }

    *target = result_idx;
}
374 
375 #endif /*LV_HAVE_NEONV8*/
376 
377 
378 #ifdef LV_HAVE_GENERIC
379 
/*
 * Portable reference kernel: finds the index of the largest value in src0.
 *
 * target     - receives the index of the first occurrence of the maximum;
 *              left untouched when num_points is 0.
 * src0       - input samples.
 * num_points - number of samples in src0.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0) {
        return;
    }

    uint32_t best = 0;
    float largest = src0[0];

    /* Strict comparison: an equal later sample never displaces an earlier one. */
    for (uint32_t k = 1; k < num_points; ++k) {
        const float candidate = src0[k];
        if (candidate > largest) {
            largest = candidate;
            best = k;
        }
    }

    *target = best;
}
398 
399 #endif /*LV_HAVE_GENERIC*/
400 
401 #ifdef LV_HAVE_AVX512F
402 #include <immintrin.h>
403 
404 static inline void
405 volk_32f_index_max_32u_a_avx512f(uint32_t* target, const float* src0, uint32_t num_points)
406 {
407  if (num_points > 0) {
408  uint32_t number = 0;
409  const uint32_t sixteenthPoints = num_points / 16;
410 
411  const float* inputPtr = src0;
412 
413  __m512 indexIncrementValues = _mm512_set1_ps(16);
414  __m512 currentIndexes = _mm512_set_ps(
415  -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
416 
417  float max = src0[0];
418  float index = 0;
419  __m512 maxValues = _mm512_set1_ps(max);
420  __m512 maxValuesIndex = _mm512_setzero_ps();
421  __mmask16 compareResults;
422  __m512 currentValues;
423 
424  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
425  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
426 
427  for (; number < sixteenthPoints; number++) {
428  currentValues = _mm512_load_ps(inputPtr);
429  inputPtr += 16;
430  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
431  compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
432  maxValuesIndex =
433  _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
434  maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
435  }
436 
437  // Calculate the largest value from the remaining 16 points
438  _mm512_store_ps(maxValuesBuffer, maxValues);
439  _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
440 
441  for (number = 0; number < 16; number++) {
442  if (maxValuesBuffer[number] > max) {
443  index = maxIndexesBuffer[number];
444  max = maxValuesBuffer[number];
445  } else if (maxValuesBuffer[number] == max) {
446  if (index > maxIndexesBuffer[number])
447  index = maxIndexesBuffer[number];
448  }
449  }
450 
451  number = sixteenthPoints * 16;
452  for (; number < num_points; number++) {
453  if (src0[number] > max) {
454  index = number;
455  max = src0[number];
456  }
457  }
458  target[0] = (uint32_t)index;
459  }
460 }
461 
462 #endif /*LV_HAVE_AVX512F*/
463 
464 #endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
465 
466 
467 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
468 #define INCLUDED_volk_32f_index_max_32u_u_H
469 
470 #include <inttypes.h>
471 #include <stdio.h>
472 #include <volk/volk_common.h>
473 
474 
475 #ifdef LV_HAVE_AVX
476 #include <immintrin.h>
477 
478 static inline void
479 volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
480 {
481  if (num_points > 0) {
482  uint32_t number = 0;
483  const uint32_t quarterPoints = num_points / 8;
484 
485  float* inputPtr = (float*)src0;
486 
487  __m256 indexIncrementValues = _mm256_set1_ps(8);
488  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
489 
490  float max = src0[0];
491  float index = 0;
492  __m256 maxValues = _mm256_set1_ps(max);
493  __m256 maxValuesIndex = _mm256_setzero_ps();
494  __m256 compareResults;
495  __m256 currentValues;
496 
497  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
498  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
499 
500  for (; number < quarterPoints; number++) {
501  currentValues = _mm256_loadu_ps(inputPtr);
502  inputPtr += 8;
503  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
504  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
505  maxValuesIndex =
506  _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
507  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
508  }
509 
510  // Calculate the largest value from the remaining 8 points
511  _mm256_store_ps(maxValuesBuffer, maxValues);
512  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
513 
514  for (number = 0; number < 8; number++) {
515  if (maxValuesBuffer[number] > max) {
516  index = maxIndexesBuffer[number];
517  max = maxValuesBuffer[number];
518  } else if (maxValuesBuffer[number] == max) {
519  if (index > maxIndexesBuffer[number])
520  index = maxIndexesBuffer[number];
521  }
522  }
523 
524  number = quarterPoints * 8;
525  for (; number < num_points; number++) {
526  if (src0[number] > max) {
527  index = number;
528  max = src0[number];
529  }
530  }
531  target[0] = (uint32_t)index;
532  }
533 }
534 
535 #endif /*LV_HAVE_AVX*/
536 
537 
538 #ifdef LV_HAVE_SSE4_1
539 #include <smmintrin.h>
540 
541 static inline void
542 volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
543 {
544  if (num_points > 0) {
545  uint32_t number = 0;
546  const uint32_t quarterPoints = num_points / 4;
547 
548  float* inputPtr = (float*)src0;
549 
550  __m128 indexIncrementValues = _mm_set1_ps(4);
551  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
552 
553  float max = src0[0];
554  float index = 0;
555  __m128 maxValues = _mm_set1_ps(max);
556  __m128 maxValuesIndex = _mm_setzero_ps();
557  __m128 compareResults;
558  __m128 currentValues;
559 
560  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
561  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
562 
563  for (; number < quarterPoints; number++) {
564  currentValues = _mm_loadu_ps(inputPtr);
565  inputPtr += 4;
566  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
567  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
568  maxValuesIndex =
569  _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
570  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
571  }
572 
573  // Calculate the largest value from the remaining 4 points
574  _mm_store_ps(maxValuesBuffer, maxValues);
575  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
576 
577  for (number = 0; number < 4; number++) {
578  if (maxValuesBuffer[number] > max) {
579  index = maxIndexesBuffer[number];
580  max = maxValuesBuffer[number];
581  } else if (maxValuesBuffer[number] == max) {
582  if (index > maxIndexesBuffer[number])
583  index = maxIndexesBuffer[number];
584  }
585  }
586 
587  number = quarterPoints * 4;
588  for (; number < num_points; number++) {
589  if (src0[number] > max) {
590  index = number;
591  max = src0[number];
592  }
593  }
594  target[0] = (uint32_t)index;
595  }
596 }
597 
598 #endif /*LV_HAVE_SSE4_1*/
599 
600 #ifdef LV_HAVE_SSE
601 #include <xmmintrin.h>
602 
603 static inline void
604 volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
605 {
606  if (num_points > 0) {
607  uint32_t number = 0;
608  const uint32_t quarterPoints = num_points / 4;
609 
610  float* inputPtr = (float*)src0;
611 
612  __m128 indexIncrementValues = _mm_set1_ps(4);
613  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
614 
615  float max = src0[0];
616  float index = 0;
617  __m128 maxValues = _mm_set1_ps(max);
618  __m128 maxValuesIndex = _mm_setzero_ps();
619  __m128 compareResults;
620  __m128 currentValues;
621 
622  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
623  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
624 
625  for (; number < quarterPoints; number++) {
626  currentValues = _mm_loadu_ps(inputPtr);
627  inputPtr += 4;
628  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
629  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
630  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
631  _mm_andnot_ps(compareResults, maxValuesIndex));
632  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
633  _mm_andnot_ps(compareResults, maxValues));
634  }
635 
636  // Calculate the largest value from the remaining 4 points
637  _mm_store_ps(maxValuesBuffer, maxValues);
638  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
639 
640  for (number = 0; number < 4; number++) {
641  if (maxValuesBuffer[number] > max) {
642  index = maxIndexesBuffer[number];
643  max = maxValuesBuffer[number];
644  } else if (maxValuesBuffer[number] == max) {
645  if (index > maxIndexesBuffer[number])
646  index = maxIndexesBuffer[number];
647  }
648  }
649 
650  number = quarterPoints * 4;
651  for (; number < num_points; number++) {
652  if (src0[number] > max) {
653  index = number;
654  max = src0[number];
655  }
656  }
657  target[0] = (uint32_t)index;
658  }
659 }
660 
661 #endif /*LV_HAVE_SSE*/
662 
663 #ifdef LV_HAVE_AVX512F
664 #include <immintrin.h>
665 
666 static inline void
667 volk_32f_index_max_32u_u_avx512f(uint32_t* target, const float* src0, uint32_t num_points)
668 {
669  if (num_points > 0) {
670  uint32_t number = 0;
671  const uint32_t sixteenthPoints = num_points / 16;
672 
673  const float* inputPtr = src0;
674 
675  __m512 indexIncrementValues = _mm512_set1_ps(16);
676  __m512 currentIndexes = _mm512_set_ps(
677  -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
678 
679  float max = src0[0];
680  float index = 0;
681  __m512 maxValues = _mm512_set1_ps(max);
682  __m512 maxValuesIndex = _mm512_setzero_ps();
683  __mmask16 compareResults;
684  __m512 currentValues;
685 
686  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
687  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
688 
689  for (; number < sixteenthPoints; number++) {
690  currentValues = _mm512_loadu_ps(inputPtr);
691  inputPtr += 16;
692  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
693  compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
694  maxValuesIndex =
695  _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
696  maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
697  }
698 
699  // Calculate the largest value from the remaining 16 points
700  _mm512_store_ps(maxValuesBuffer, maxValues);
701  _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
702 
703  for (number = 0; number < 16; number++) {
704  if (maxValuesBuffer[number] > max) {
705  index = maxIndexesBuffer[number];
706  max = maxValuesBuffer[number];
707  } else if (maxValuesBuffer[number] == max) {
708  if (index > maxIndexesBuffer[number])
709  index = maxIndexesBuffer[number];
710  }
711  }
712 
713  number = sixteenthPoints * 16;
714  for (; number < num_points; number++) {
715  if (src0[number] > max) {
716  index = number;
717  max = src0[number];
718  }
719  }
720  target[0] = (uint32_t)index;
721  }
722 }
723 
724 #endif /*LV_HAVE_AVX512F*/
725 
726 #ifdef LV_HAVE_RVV
727 #include <float.h>
728 #include <riscv_vector.h>
729 
/*
 * Finds the index of the largest value in src0 (RISC-V Vector).
 *
 * target     - receives the index of the first occurrence of the maximum
 *              (0 is written when num_points is 0).
 * src0       - input samples.
 * num_points - number of samples in src0.
 *
 * The running maximum is seeded with negative infinity rather than -FLT_MAX.
 * With a -FLT_MAX seed the strict ">" comparison never selects a -FLT_MAX
 * sample, so an input made of -inf and -FLT_MAX values would report index 0
 * instead of the first -FLT_MAX.
 */
static inline void
volk_32f_index_max_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points)
{
    /* Compiler builtin, so no extra header is needed for the infinity value. */
    const float seed = -__builtin_inff();

    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(seed, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
        /* Lanes whose new sample is strictly larger than the lane maximum. */
        vbool8_t m = __riscv_vmfgt(v, vmax, vl);
        /* _tu variants leave lanes at or beyond vl untouched on a short
         * final pass. */
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(seed, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find lanes with max value, set others to UINT32_MAX
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    vuint32m4_t idx_masked =
        __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmaxi, m, vl);
    // Find minimum index among lanes with max value
    *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
                                             __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
                                             __riscv_vsetvlmax_e32m1()));
}
758 #endif /*LV_HAVE_RVV*/
759 
760 #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
volk_32f_index_max_32u_u_avx
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:479
volk_32f_index_max_32u_u_sse
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:604
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
volk_32f_index_max_32u_a_sse
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:130
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_32f_index_max_32u_a_avx
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:198
volk_32f_index_max_32u_generic
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:381
volk_32f_index_max_32u_neon
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:261