Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_index_min_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_32f_index_min_32u_a_H
53 #define INCLUDED_volk_32f_index_min_32u_a_H
54 
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <volk/volk_common.h>
58 
59 #ifdef LV_HAVE_SSE4_1
60 #include <smmintrin.h>
61 
62 static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target,
63  const float* source,
64  uint32_t num_points)
65 {
66  const uint32_t quarterPoints = num_points / 4;
67 
68  float* inputPtr = (float*)source;
69 
70  __m128 indexIncrementValues = _mm_set1_ps(4);
71  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
72 
73  float min = source[0];
74  float index = 0;
75  __m128 minValues = _mm_set1_ps(min);
76  __m128 minValuesIndex = _mm_setzero_ps();
77  __m128 compareResults;
78  __m128 currentValues;
79 
80  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
81  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
82 
83  for (uint32_t number = 0; number < quarterPoints; number++) {
84 
85  currentValues = _mm_load_ps(inputPtr);
86  inputPtr += 4;
87  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
88 
89  compareResults = _mm_cmplt_ps(currentValues, minValues);
90 
91  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
92  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
93  }
94 
95  // Calculate the smallest value from the remaining 4 points
96  _mm_store_ps(minValuesBuffer, minValues);
97  _mm_store_ps(minIndexesBuffer, minValuesIndex);
98 
99  for (uint32_t number = 0; number < 4; number++) {
100  if (minValuesBuffer[number] < min) {
101  index = minIndexesBuffer[number];
102  min = minValuesBuffer[number];
103  } else if (minValuesBuffer[number] == min) {
104  if (index > minIndexesBuffer[number])
105  index = minIndexesBuffer[number];
106  }
107  }
108 
109  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
110  if (source[number] < min) {
111  index = number;
112  min = source[number];
113  }
114  }
115  target[0] = (uint32_t)index;
116 }
117 
118 #endif /*LV_HAVE_SSE4_1*/
119 
120 
121 #ifdef LV_HAVE_SSE
122 
123 #include <xmmintrin.h>
124 
125 static inline void
126 volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points)
127 {
128  const uint32_t quarterPoints = num_points / 4;
129 
130  float* inputPtr = (float*)source;
131 
132  __m128 indexIncrementValues = _mm_set1_ps(4);
133  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
134 
135  float min = source[0];
136  float index = 0;
137  __m128 minValues = _mm_set1_ps(min);
138  __m128 minValuesIndex = _mm_setzero_ps();
139  __m128 compareResults;
140  __m128 currentValues;
141 
142  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
143  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
144 
145  for (uint32_t number = 0; number < quarterPoints; number++) {
146 
147  currentValues = _mm_load_ps(inputPtr);
148  inputPtr += 4;
149  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
150 
151  compareResults = _mm_cmplt_ps(currentValues, minValues);
152 
153  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
154  _mm_andnot_ps(compareResults, minValuesIndex));
155 
156  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
157  _mm_andnot_ps(compareResults, minValues));
158  }
159 
160  // Calculate the smallest value from the remaining 4 points
161  _mm_store_ps(minValuesBuffer, minValues);
162  _mm_store_ps(minIndexesBuffer, minValuesIndex);
163 
164  for (uint32_t number = 0; number < 4; number++) {
165  if (minValuesBuffer[number] < min) {
166  index = minIndexesBuffer[number];
167  min = minValuesBuffer[number];
168  } else if (minValuesBuffer[number] == min) {
169  if (index > minIndexesBuffer[number])
170  index = minIndexesBuffer[number];
171  }
172  }
173 
174  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
175  if (source[number] < min) {
176  index = number;
177  min = source[number];
178  }
179  }
180  target[0] = (uint32_t)index;
181 }
182 
183 #endif /*LV_HAVE_SSE*/
184 
185 
186 #ifdef LV_HAVE_AVX
187 #include <immintrin.h>
188 
189 static inline void
190 volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points)
191 {
192  const uint32_t quarterPoints = num_points / 8;
193 
194  float* inputPtr = (float*)source;
195 
196  __m256 indexIncrementValues = _mm256_set1_ps(8);
197  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
198 
199  float min = source[0];
200  float index = 0;
201  __m256 minValues = _mm256_set1_ps(min);
202  __m256 minValuesIndex = _mm256_setzero_ps();
203  __m256 compareResults;
204  __m256 currentValues;
205 
206  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
207  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
208 
209  for (uint32_t number = 0; number < quarterPoints; number++) {
210  currentValues = _mm256_load_ps(inputPtr);
211  inputPtr += 8;
212  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
213  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
214  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
215  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
216  }
217 
218  // Calculate the smallest value from the remaining 8 points
219  _mm256_store_ps(minValuesBuffer, minValues);
220  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
221 
222  for (uint32_t number = 0; number < 8; number++) {
223  if (minValuesBuffer[number] < min) {
224  index = minIndexesBuffer[number];
225  min = minValuesBuffer[number];
226  } else if (minValuesBuffer[number] == min) {
227  if (index > minIndexesBuffer[number])
228  index = minIndexesBuffer[number];
229  }
230  }
231 
232  for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
233  if (source[number] < min) {
234  index = number;
235  min = source[number];
236  }
237  }
238  target[0] = (uint32_t)index;
239 }
240 
241 #endif /*LV_HAVE_AVX*/
242 
243 
244 #ifdef LV_HAVE_NEON
245 #include <arm_neon.h>
246 
247 static inline void
248 volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points)
249 {
250  const uint32_t quarterPoints = num_points / 4;
251 
252  float* inputPtr = (float*)source;
253  float32x4_t indexIncrementValues = vdupq_n_f32(4);
255  float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
256  float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
257 
258  float min = source[0];
259  float index = 0;
260  float32x4_t minValues = vdupq_n_f32(min);
261  uint32x4_t minValuesIndex = vmovq_n_u32(0);
262  uint32x4_t compareResults;
263  uint32x4_t currentIndexes_u;
264  float32x4_t currentValues;
265 
266  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
267  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
268 
269  for (uint32_t number = 0; number < quarterPoints; number++) {
270  currentValues = vld1q_f32(inputPtr);
271  inputPtr += 4;
272  currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
273  currentIndexes_u = vcvtq_u32_f32(currentIndexes);
274  compareResults = vcgeq_f32(currentValues, minValues);
275  minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
276  vbicq_u32(currentIndexes_u, compareResults));
277  minValues = vminq_f32(currentValues, minValues);
278  }
279 
280  // Calculate the smallest value from the remaining 4 points
281  vst1q_f32(minValuesBuffer, minValues);
282  vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
283  for (uint32_t number = 0; number < 4; number++) {
284  if (minValuesBuffer[number] < min) {
285  index = minIndexesBuffer[number];
286  min = minValuesBuffer[number];
287  } else if (minValuesBuffer[number] == min) {
288  if (index > minIndexesBuffer[number])
289  index = minIndexesBuffer[number];
290  }
291  }
292 
293  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
294  if (source[number] < min) {
295  index = number;
296  min = source[number];
297  }
298  }
299  target[0] = (uint32_t)index;
300 }
301 
302 #endif /*LV_HAVE_NEON*/
303 
304 
305 #ifdef LV_HAVE_NEONV8
306 #include <arm_neon.h>
307 #include <float.h>
308 
309 static inline void
310 volk_32f_index_min_32u_neonv8(uint32_t* target, const float* source, uint32_t num_points)
311 {
312  if (num_points == 0)
313  return;
314 
315  const uint32_t quarter_points = num_points / 4;
316  const float* inputPtr = source;
317 
318  // Use integer indices directly (no float conversion overhead)
319  uint32x4_t vec_indices = { 0, 1, 2, 3 };
320  const uint32x4_t vec_incr = vdupq_n_u32(4);
321 
322  float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
323  uint32x4_t vec_min_idx = vdupq_n_u32(0);
324 
325  for (uint32_t i = 0; i < quarter_points; i++) {
326  float32x4_t vec_val = vld1q_f32(inputPtr);
327  inputPtr += 4;
328 
329  // Compare BEFORE min update to know which lanes change
330  uint32x4_t lt_mask = vcltq_f32(vec_val, vec_min);
331  vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);
332 
333  // vminq_f32 is single-cycle, no dependency on comparison result
334  vec_min = vminq_f32(vec_val, vec_min);
335 
336  vec_indices = vaddq_u32(vec_indices, vec_incr);
337  }
338 
339  // ARMv8 horizontal reduction - find min value across all lanes
340  float min_val = vminvq_f32(vec_min);
341 
342  // Find which lane(s) have the min value, get minimum index among them
343  uint32x4_t min_mask = vceqq_f32(vec_min, vdupq_n_f32(min_val));
344  uint32x4_t idx_masked = vbslq_u32(min_mask, vec_min_idx, vdupq_n_u32(UINT32_MAX));
345  uint32_t result_idx = vminvq_u32(idx_masked);
346 
347  // Handle tail elements
348  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
349  if (source[i] < min_val) {
350  min_val = source[i];
351  result_idx = i;
352  }
353  }
354 
355  *target = result_idx;
356 }
357 
358 #endif /*LV_HAVE_NEONV8*/
359 
360 
361 #ifdef LV_HAVE_GENERIC
362 
/*
 * Finds the index of the smallest value in a float buffer (portable C).
 *
 * target      receives the index of the minimum in target[0]
 * source      input samples
 * num_points  number of floats in source
 *
 * When several elements share the minimum value the lowest index is reported.
 * When num_points is 0 the function returns without touching target.
 */
static inline void
volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points)
{
    // An empty buffer has no minimum; leave before source[0] is read.
    if (num_points == 0)
        return;

    float min = source[0];
    uint32_t index = 0;

    // Strict less-than keeps the first occurrence when values tie.
    for (uint32_t i = 1; i < num_points; ++i) {
        if (source[i] < min) {
            index = i;
            min = source[i];
        }
    }
    target[0] = index;
}
377 
378 #endif /*LV_HAVE_GENERIC*/
379 
380 #ifdef LV_HAVE_AVX512F
381 #include <immintrin.h>
382 
383 static inline void volk_32f_index_min_32u_a_avx512f(uint32_t* target,
384  const float* source,
385  uint32_t num_points)
386 {
387  if (num_points > 0) {
388  uint32_t number = 0;
389  const uint32_t sixteenthPoints = num_points / 16;
390 
391  const float* inputPtr = source;
392 
393  __m512 indexIncrementValues = _mm512_set1_ps(16);
394  __m512 currentIndexes = _mm512_set_ps(
395  -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
396 
397  float min = source[0];
398  float index = 0;
399  __m512 minValues = _mm512_set1_ps(min);
400  __m512 minValuesIndex = _mm512_setzero_ps();
401  __mmask16 compareResults;
402  __m512 currentValues;
403 
404  __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
405  __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
406 
407  for (; number < sixteenthPoints; number++) {
408  currentValues = _mm512_load_ps(inputPtr);
409  inputPtr += 16;
410  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
411  compareResults = _mm512_cmp_ps_mask(currentValues, minValues, _CMP_LT_OS);
412  minValuesIndex =
413  _mm512_mask_blend_ps(compareResults, minValuesIndex, currentIndexes);
414  minValues = _mm512_mask_blend_ps(compareResults, minValues, currentValues);
415  }
416 
417  // Calculate the smallest value from the remaining 16 points
418  _mm512_store_ps(minValuesBuffer, minValues);
419  _mm512_store_ps(minIndexesBuffer, minValuesIndex);
420 
421  for (number = 0; number < 16; number++) {
422  if (minValuesBuffer[number] < min) {
423  index = minIndexesBuffer[number];
424  min = minValuesBuffer[number];
425  } else if (minValuesBuffer[number] == min) {
426  if (index > minIndexesBuffer[number])
427  index = minIndexesBuffer[number];
428  }
429  }
430 
431  number = sixteenthPoints * 16;
432  for (; number < num_points; number++) {
433  if (source[number] < min) {
434  index = number;
435  min = source[number];
436  }
437  }
438  target[0] = (uint32_t)index;
439  }
440 }
441 
442 #endif /*LV_HAVE_AVX512F*/
443 
444 #endif /*INCLUDED_volk_32f_index_min_32u_a_H*/
445 
446 
447 #ifndef INCLUDED_volk_32f_index_min_32u_u_H
448 #define INCLUDED_volk_32f_index_min_32u_u_H
449 
450 #include <inttypes.h>
451 #include <stdio.h>
452 #include <volk/volk_common.h>
453 
454 
455 #ifdef LV_HAVE_AVX
456 #include <immintrin.h>
457 
458 static inline void
459 volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points)
460 {
461  const uint32_t quarterPoints = num_points / 8;
462 
463  float* inputPtr = (float*)source;
464 
465  __m256 indexIncrementValues = _mm256_set1_ps(8);
466  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
467 
468  float min = source[0];
469  float index = 0;
470  __m256 minValues = _mm256_set1_ps(min);
471  __m256 minValuesIndex = _mm256_setzero_ps();
472  __m256 compareResults;
473  __m256 currentValues;
474 
475  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
476  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
477 
478  for (uint32_t number = 0; number < quarterPoints; number++) {
479  currentValues = _mm256_loadu_ps(inputPtr);
480  inputPtr += 8;
481  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
482  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
483  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
484  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
485  }
486 
487  // Calculate the smalles value from the remaining 8 points
488  _mm256_store_ps(minValuesBuffer, minValues);
489  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
490 
491  for (uint32_t number = 0; number < 8; number++) {
492  if (minValuesBuffer[number] < min) {
493  index = minIndexesBuffer[number];
494  min = minValuesBuffer[number];
495  } else if (minValuesBuffer[number] == min) {
496  if (index > minIndexesBuffer[number])
497  index = minIndexesBuffer[number];
498  }
499  }
500 
501  for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
502  if (source[number] < min) {
503  index = number;
504  min = source[number];
505  }
506  }
507  target[0] = (uint32_t)index;
508 }
509 
510 #endif /*LV_HAVE_AVX*/
511 
512 
513 #ifdef LV_HAVE_SSE4_1
514 #include <smmintrin.h>
515 
516 static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target,
517  const float* source,
518  uint32_t num_points)
519 {
520  const uint32_t quarterPoints = num_points / 4;
521 
522  float* inputPtr = (float*)source;
523 
524  __m128 indexIncrementValues = _mm_set1_ps(4);
525  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
526 
527  float min = source[0];
528  float index = 0;
529  __m128 minValues = _mm_set1_ps(min);
530  __m128 minValuesIndex = _mm_setzero_ps();
531  __m128 compareResults;
532  __m128 currentValues;
533 
534  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
535  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
536 
537  for (uint32_t number = 0; number < quarterPoints; number++) {
538  currentValues = _mm_loadu_ps(inputPtr);
539  inputPtr += 4;
540  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
541  compareResults = _mm_cmplt_ps(currentValues, minValues);
542  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
543  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
544  }
545 
546  // Calculate the smallest value from the remaining 4 points
547  _mm_store_ps(minValuesBuffer, minValues);
548  _mm_store_ps(minIndexesBuffer, minValuesIndex);
549 
550  for (uint32_t number = 0; number < 4; number++) {
551  if (minValuesBuffer[number] < min) {
552  index = minIndexesBuffer[number];
553  min = minValuesBuffer[number];
554  } else if (minValuesBuffer[number] == min) {
555  if (index > minIndexesBuffer[number])
556  index = minIndexesBuffer[number];
557  }
558  }
559 
560  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
561  if (source[number] < min) {
562  index = number;
563  min = source[number];
564  }
565  }
566  target[0] = (uint32_t)index;
567 }
568 
569 #endif /*LV_HAVE_SSE4_1*/
570 
571 #ifdef LV_HAVE_SSE
572 #include <xmmintrin.h>
573 
574 static inline void
575 volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points)
576 {
577  const uint32_t quarterPoints = num_points / 4;
578 
579  float* inputPtr = (float*)source;
580 
581  __m128 indexIncrementValues = _mm_set1_ps(4);
582  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
583 
584  float min = source[0];
585  float index = 0;
586  __m128 minValues = _mm_set1_ps(min);
587  __m128 minValuesIndex = _mm_setzero_ps();
588  __m128 compareResults;
589  __m128 currentValues;
590 
591  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
592  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
593 
594  for (uint32_t number = 0; number < quarterPoints; number++) {
595  currentValues = _mm_loadu_ps(inputPtr);
596  inputPtr += 4;
597  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
598  compareResults = _mm_cmplt_ps(currentValues, minValues);
599  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
600  _mm_andnot_ps(compareResults, minValuesIndex));
601  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
602  _mm_andnot_ps(compareResults, minValues));
603  }
604 
605  // Calculate the smallest value from the remaining 4 points
606  _mm_store_ps(minValuesBuffer, minValues);
607  _mm_store_ps(minIndexesBuffer, minValuesIndex);
608 
609  for (uint32_t number = 0; number < 4; number++) {
610  if (minValuesBuffer[number] < min) {
611  index = minIndexesBuffer[number];
612  min = minValuesBuffer[number];
613  } else if (minValuesBuffer[number] == min) {
614  if (index > minIndexesBuffer[number])
615  index = minIndexesBuffer[number];
616  }
617  }
618 
619  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
620  if (source[number] < min) {
621  index = number;
622  min = source[number];
623  }
624  }
625  target[0] = (uint32_t)index;
626 }
627 
628 #endif /*LV_HAVE_SSE*/
629 
630 #ifdef LV_HAVE_AVX512F
631 #include <immintrin.h>
632 
633 static inline void volk_32f_index_min_32u_u_avx512f(uint32_t* target,
634  const float* source,
635  uint32_t num_points)
636 {
637  if (num_points > 0) {
638  uint32_t number = 0;
639  const uint32_t sixteenthPoints = num_points / 16;
640 
641  const float* inputPtr = source;
642 
643  __m512 indexIncrementValues = _mm512_set1_ps(16);
644  __m512 currentIndexes = _mm512_set_ps(
645  -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
646 
647  float min = source[0];
648  float index = 0;
649  __m512 minValues = _mm512_set1_ps(min);
650  __m512 minValuesIndex = _mm512_setzero_ps();
651  __mmask16 compareResults;
652  __m512 currentValues;
653 
654  __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
655  __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
656 
657  for (; number < sixteenthPoints; number++) {
658  currentValues = _mm512_loadu_ps(inputPtr);
659  inputPtr += 16;
660  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
661  compareResults = _mm512_cmp_ps_mask(currentValues, minValues, _CMP_LT_OS);
662  minValuesIndex =
663  _mm512_mask_blend_ps(compareResults, minValuesIndex, currentIndexes);
664  minValues = _mm512_mask_blend_ps(compareResults, minValues, currentValues);
665  }
666 
667  // Calculate the smallest value from the remaining 16 points
668  _mm512_store_ps(minValuesBuffer, minValues);
669  _mm512_store_ps(minIndexesBuffer, minValuesIndex);
670 
671  for (number = 0; number < 16; number++) {
672  if (minValuesBuffer[number] < min) {
673  index = minIndexesBuffer[number];
674  min = minValuesBuffer[number];
675  } else if (minValuesBuffer[number] == min) {
676  if (index > minIndexesBuffer[number])
677  index = minIndexesBuffer[number];
678  }
679  }
680 
681  number = sixteenthPoints * 16;
682  for (; number < num_points; number++) {
683  if (source[number] < min) {
684  index = number;
685  min = source[number];
686  }
687  }
688  target[0] = (uint32_t)index;
689  }
690 }
691 
692 #endif /*LV_HAVE_AVX512F*/
693 
694 #ifdef LV_HAVE_RVV
695 #include <float.h>
696 #include <riscv_vector.h>
697 
698 static inline void
699 volk_32f_index_min_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points)
700 {
701  vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
702  vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
703  vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
704  size_t n = num_points;
705  for (size_t vl; n > 0; n -= vl, src0 += vl) {
706  vl = __riscv_vsetvl_e32m4(n);
707  vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
708  vbool8_t m = __riscv_vmflt(v, vmin, vl);
709  vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
710  vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
711  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
712  }
713  size_t vl = __riscv_vsetvlmax_e32m4();
714  float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
715  __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
716  __riscv_vsetvlmax_e32m1()));
717  // Find lanes with min value, set others to UINT32_MAX
718  vbool8_t m = __riscv_vmfeq(vmin, min, vl);
719  vuint32m4_t idx_masked =
720  __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmini, m, vl);
721  // Find minimum index among lanes with min value
722  *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
723  __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
724  __riscv_vsetvlmax_e32m1()));
725 }
726 #endif /*LV_HAVE_RVV*/
727 
728 #endif /*INCLUDED_volk_32f_index_min_32u_u_H*/
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
volk_32f_index_min_32u_neon
static void volk_32f_index_min_32u_neon(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:248
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_32f_index_min_32u_a_sse
static void volk_32f_index_min_32u_a_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:126
volk_32f_index_min_32u_u_avx
static void volk_32f_index_min_32u_u_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:459
volk_32f_index_min_32u_u_sse
static void volk_32f_index_min_32u_u_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:575
volk_32f_index_min_32u_generic
static void volk_32f_index_min_32u_generic(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:364
volk_32f_index_min_32u_a_avx
static void volk_32f_index_min_32u_a_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:190