Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_index_max_16u_a_H
59 #define INCLUDED_volk_32f_index_max_16u_a_H
60 
61 #include <inttypes.h>
62 #include <limits.h>
63 #include <stdio.h>
64 #include <volk/volk_common.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 
69 static inline void
70 volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
71 {
72  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
73 
74  uint32_t number = 0;
75  const uint32_t eighthPoints = num_points / 8;
76 
77  float* inputPtr = (float*)src0;
78 
79  __m256 indexIncrementValues = _mm256_set1_ps(8);
80  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
81 
82  float max = src0[0];
83  float index = 0;
84  __m256 maxValues = _mm256_set1_ps(max);
85  __m256 maxValuesIndex = _mm256_setzero_ps();
86  __m256 compareResults;
87  __m256 currentValues;
88 
89  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
90  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
91 
92  for (; number < eighthPoints; number++) {
93 
94  currentValues = _mm256_load_ps(inputPtr);
95  inputPtr += 8;
96  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
97 
98  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
99 
100  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
101  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
102  }
103 
104  // Calculate the largest value from the remaining 4 points
105  _mm256_store_ps(maxValuesBuffer, maxValues);
106  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
107 
108  for (number = 0; number < 8; number++) {
109  if (maxValuesBuffer[number] > max) {
110  index = maxIndexesBuffer[number];
111  max = maxValuesBuffer[number];
112  } else if (maxValuesBuffer[number] == max) {
113  if (index > maxIndexesBuffer[number])
114  index = maxIndexesBuffer[number];
115  }
116  }
117 
118  number = eighthPoints * 8;
119  for (; number < num_points; number++) {
120  if (src0[number] > max) {
121  index = number;
122  max = src0[number];
123  }
124  }
125  target[0] = (uint16_t)index;
126 }
127 
128 #endif /*LV_HAVE_AVX*/
129 
130 #ifdef LV_HAVE_SSE4_1
131 #include <smmintrin.h>
132 
133 static inline void
134 volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
135 {
136  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
137 
138  uint32_t number = 0;
139  const uint32_t quarterPoints = num_points / 4;
140 
141  float* inputPtr = (float*)src0;
142 
143  __m128 indexIncrementValues = _mm_set1_ps(4);
144  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
145 
146  float max = src0[0];
147  float index = 0;
148  __m128 maxValues = _mm_set1_ps(max);
149  __m128 maxValuesIndex = _mm_setzero_ps();
150  __m128 compareResults;
151  __m128 currentValues;
152 
153  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
154  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
155 
156  for (; number < quarterPoints; number++) {
157 
158  currentValues = _mm_load_ps(inputPtr);
159  inputPtr += 4;
160  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
161 
162  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
163 
164  maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
165  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
166  }
167 
168  // Calculate the largest value from the remaining 4 points
169  _mm_store_ps(maxValuesBuffer, maxValues);
170  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
171 
172  for (number = 0; number < 4; number++) {
173  if (maxValuesBuffer[number] > max) {
174  index = maxIndexesBuffer[number];
175  max = maxValuesBuffer[number];
176  } else if (maxValuesBuffer[number] == max) {
177  if (index > maxIndexesBuffer[number])
178  index = maxIndexesBuffer[number];
179  }
180  }
181 
182  number = quarterPoints * 4;
183  for (; number < num_points; number++) {
184  if (src0[number] > max) {
185  index = number;
186  max = src0[number];
187  }
188  }
189  target[0] = (uint16_t)index;
190 }
191 
192 #endif /*LV_HAVE_SSE4_1*/
193 
194 
195 #ifdef LV_HAVE_SSE
196 
197 #include <xmmintrin.h>
198 
199 static inline void
200 volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
201 {
202  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
203 
204  uint32_t number = 0;
205  const uint32_t quarterPoints = num_points / 4;
206 
207  float* inputPtr = (float*)src0;
208 
209  __m128 indexIncrementValues = _mm_set1_ps(4);
210  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
211 
212  float max = src0[0];
213  float index = 0;
214  __m128 maxValues = _mm_set1_ps(max);
215  __m128 maxValuesIndex = _mm_setzero_ps();
216  __m128 compareResults;
217  __m128 currentValues;
218 
219  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
220  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
221 
222  for (; number < quarterPoints; number++) {
223 
224  currentValues = _mm_load_ps(inputPtr);
225  inputPtr += 4;
226  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
227 
228  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
229 
230  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
231  _mm_andnot_ps(compareResults, maxValuesIndex));
232  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
233  _mm_andnot_ps(compareResults, maxValues));
234  }
235 
236  // Calculate the largest value from the remaining 4 points
237  _mm_store_ps(maxValuesBuffer, maxValues);
238  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
239 
240  for (number = 0; number < 4; number++) {
241  if (maxValuesBuffer[number] > max) {
242  index = maxIndexesBuffer[number];
243  max = maxValuesBuffer[number];
244  } else if (maxValuesBuffer[number] == max) {
245  if (index > maxIndexesBuffer[number])
246  index = maxIndexesBuffer[number];
247  }
248  }
249 
250  number = quarterPoints * 4;
251  for (; number < num_points; number++) {
252  if (src0[number] > max) {
253  index = number;
254  max = src0[number];
255  }
256  }
257  target[0] = (uint16_t)index;
258 }
259 
260 #endif /*LV_HAVE_SSE*/
261 
262 
263 #ifdef LV_HAVE_NEON
264 #include <arm_neon.h>
265 #include <float.h>
266 #include <limits.h>
267 
268 static inline void
269 volk_32f_index_max_16u_neon(uint16_t* target, const float* src0, uint32_t num_points)
270 {
271  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
272 
273  if (num_points == 0)
274  return;
275 
276  const uint32_t quarter_points = num_points / 4;
277  const float* inputPtr = src0;
278 
279  // Use integer indices directly
280  uint32x4_t vec_indices = { 0, 1, 2, 3 };
281  const uint32x4_t vec_incr = vdupq_n_u32(4);
282 
283  float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
284  uint32x4_t vec_max_idx = vdupq_n_u32(0);
285 
286  for (uint32_t i = 0; i < quarter_points; i++) {
287  float32x4_t vec_val = vld1q_f32(inputPtr);
288  inputPtr += 4;
289 
290  // Compare BEFORE max update to know which lanes change
291  uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
292  vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
293 
294  // vmaxq_f32 is single-cycle, no dependency on comparison result
295  vec_max = vmaxq_f32(vec_val, vec_max);
296 
297  vec_indices = vaddq_u32(vec_indices, vec_incr);
298  }
299 
300  // Scalar reduction
301  __VOLK_ATTR_ALIGNED(16) float max_buf[4];
302  __VOLK_ATTR_ALIGNED(16) uint32_t idx_buf[4];
303  vst1q_f32(max_buf, vec_max);
304  vst1q_u32(idx_buf, vec_max_idx);
305 
306  float max_val = max_buf[0];
307  uint32_t result_idx = idx_buf[0];
308  for (int i = 1; i < 4; i++) {
309  if (max_buf[i] > max_val) {
310  max_val = max_buf[i];
311  result_idx = idx_buf[i];
312  } else if (max_buf[i] == max_val && idx_buf[i] < result_idx) {
313  result_idx = idx_buf[i];
314  }
315  }
316 
317  // Handle tail
318  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
319  if (src0[i] > max_val) {
320  max_val = src0[i];
321  result_idx = i;
322  }
323  }
324 
325  *target = (uint16_t)result_idx;
326 }
327 
328 #endif /*LV_HAVE_NEON*/
329 
330 
331 #ifdef LV_HAVE_NEONV8
332 #include <arm_neon.h>
333 #include <float.h>
334 #include <limits.h>
335 
336 static inline void
337 volk_32f_index_max_16u_neonv8(uint16_t* target, const float* src0, uint32_t num_points)
338 {
339  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
340 
341  if (num_points == 0)
342  return;
343 
344  const uint32_t quarter_points = num_points / 4;
345  const float* inputPtr = src0;
346 
347  // Use integer indices directly (no float conversion overhead)
348  uint32x4_t vec_indices = { 0, 1, 2, 3 };
349  const uint32x4_t vec_incr = vdupq_n_u32(4);
350 
351  float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
352  uint32x4_t vec_max_idx = vdupq_n_u32(0);
353 
354  for (uint32_t i = 0; i < quarter_points; i++) {
355  float32x4_t vec_val = vld1q_f32(inputPtr);
356  inputPtr += 4;
357 
358  // Compare BEFORE max update to know which lanes change
359  uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
360  vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
361 
362  // vmaxq_f32 is single-cycle, no dependency on comparison result
363  vec_max = vmaxq_f32(vec_val, vec_max);
364 
365  vec_indices = vaddq_u32(vec_indices, vec_incr);
366  }
367 
368  // ARMv8 horizontal reduction
369  float max_val = vmaxvq_f32(vec_max);
370  uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
371  uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
372  uint32_t result_idx = vminvq_u32(idx_masked);
373 
374  // Handle tail
375  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
376  if (src0[i] > max_val) {
377  max_val = src0[i];
378  result_idx = i;
379  }
380  }
381 
382  *target = (uint16_t)result_idx;
383 }
384 
385 #endif /*LV_HAVE_NEONV8*/
386 
387 
388 #ifdef LV_HAVE_GENERIC
389 
/*!
 * \brief Portable kernel: index of the largest value in a float buffer.
 *
 * Only the first USHRT_MAX points are examined because the result is a 16-bit
 * index. When several elements share the maximum, the lowest index is reported
 * (later elements must be strictly greater to replace the current maximum).
 *
 * \param target     Receives the index of the maximum. Left untouched when
 *                   num_points is 0.
 * \param src0       Input values.
 * \param num_points Number of floats available in src0.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    // Empty input: there is no src0[0] to seed the search with.
    if (num_points == 0)
        return;

    float max = src0[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (src0[i] > max) {
            // i < USHRT_MAX here because of the clamp above, so it fits.
            index = (uint16_t)i;
            max = src0[i];
        }
    }
    target[0] = index;
}
408 
409 #endif /*LV_HAVE_GENERIC*/
410 
411 #ifdef LV_HAVE_AVX512F
412 #include <immintrin.h>
413 #include <limits.h>
414 
415 static inline void
416 volk_32f_index_max_16u_a_avx512f(uint16_t* target, const float* src0, uint32_t num_points)
417 {
418  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
419 
420  uint32_t number = 0;
421  const uint32_t sixteenthPoints = num_points / 16;
422 
423  const float* inputPtr = src0;
424 
425  __m512 indexIncrementValues = _mm512_set1_ps(16);
426  __m512 currentIndexes = _mm512_set_ps(
427  -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
428 
429  float max = src0[0];
430  float index = 0;
431  __m512 maxValues = _mm512_set1_ps(max);
432  __m512 maxValuesIndex = _mm512_setzero_ps();
433  __mmask16 compareResults;
434  __m512 currentValues;
435 
436  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
437  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
438 
439  for (; number < sixteenthPoints; number++) {
440  currentValues = _mm512_load_ps(inputPtr);
441  inputPtr += 16;
442  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
443  compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
444  maxValuesIndex =
445  _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
446  maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
447  }
448 
449  // Calculate the largest value from the remaining 16 points
450  _mm512_store_ps(maxValuesBuffer, maxValues);
451  _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
452 
453  for (number = 0; number < 16; number++) {
454  if (maxValuesBuffer[number] > max) {
455  index = maxIndexesBuffer[number];
456  max = maxValuesBuffer[number];
457  } else if (maxValuesBuffer[number] == max) {
458  if (index > maxIndexesBuffer[number])
459  index = maxIndexesBuffer[number];
460  }
461  }
462 
463  number = sixteenthPoints * 16;
464  for (; number < num_points; number++) {
465  if (src0[number] > max) {
466  index = number;
467  max = src0[number];
468  }
469  }
470  target[0] = (uint16_t)index;
471 }
472 
473 #endif /*LV_HAVE_AVX512F*/
474 
475 #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
476 
477 
478 #ifndef INCLUDED_volk_32f_index_max_16u_u_H
479 #define INCLUDED_volk_32f_index_max_16u_u_H
480 
481 #include <inttypes.h>
482 #include <limits.h>
483 #include <stdio.h>
484 #include <volk/volk_common.h>
485 
486 #ifdef LV_HAVE_AVX
487 #include <immintrin.h>
488 
489 static inline void
490 volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
491 {
492  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
493 
494  uint32_t number = 0;
495  const uint32_t eighthPoints = num_points / 8;
496 
497  float* inputPtr = (float*)src0;
498 
499  __m256 indexIncrementValues = _mm256_set1_ps(8);
500  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
501 
502  float max = src0[0];
503  float index = 0;
504  __m256 maxValues = _mm256_set1_ps(max);
505  __m256 maxValuesIndex = _mm256_setzero_ps();
506  __m256 compareResults;
507  __m256 currentValues;
508 
509  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
510  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
511 
512  for (; number < eighthPoints; number++) {
513 
514  currentValues = _mm256_loadu_ps(inputPtr);
515  inputPtr += 8;
516  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
517 
518  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
519 
520  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
521  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
522  }
523 
524  // Calculate the largest value from the remaining 4 points
525  _mm256_storeu_ps(maxValuesBuffer, maxValues);
526  _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
527 
528  for (number = 0; number < 8; number++) {
529  if (maxValuesBuffer[number] > max) {
530  index = maxIndexesBuffer[number];
531  max = maxValuesBuffer[number];
532  } else if (maxValuesBuffer[number] == max) {
533  if (index > maxIndexesBuffer[number])
534  index = maxIndexesBuffer[number];
535  }
536  }
537 
538  number = eighthPoints * 8;
539  for (; number < num_points; number++) {
540  if (src0[number] > max) {
541  index = number;
542  max = src0[number];
543  }
544  }
545  target[0] = (uint16_t)index;
546 }
547 
548 #endif /*LV_HAVE_AVX*/
549 
550 #ifdef LV_HAVE_AVX512F
551 #include <immintrin.h>
552 #include <limits.h>
553 
554 static inline void
555 volk_32f_index_max_16u_u_avx512f(uint16_t* target, const float* src0, uint32_t num_points)
556 {
557  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
558 
559  uint32_t number = 0;
560  const uint32_t sixteenthPoints = num_points / 16;
561 
562  const float* inputPtr = src0;
563 
564  __m512 indexIncrementValues = _mm512_set1_ps(16);
565  __m512 currentIndexes = _mm512_set_ps(
566  -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
567 
568  float max = src0[0];
569  float index = 0;
570  __m512 maxValues = _mm512_set1_ps(max);
571  __m512 maxValuesIndex = _mm512_setzero_ps();
572  __mmask16 compareResults;
573  __m512 currentValues;
574 
575  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
576  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
577 
578  for (; number < sixteenthPoints; number++) {
579  currentValues = _mm512_loadu_ps(inputPtr);
580  inputPtr += 16;
581  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
582  compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
583  maxValuesIndex =
584  _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
585  maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
586  }
587 
588  // Calculate the largest value from the remaining 16 points
589  _mm512_store_ps(maxValuesBuffer, maxValues);
590  _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
591 
592  for (number = 0; number < 16; number++) {
593  if (maxValuesBuffer[number] > max) {
594  index = maxIndexesBuffer[number];
595  max = maxValuesBuffer[number];
596  } else if (maxValuesBuffer[number] == max) {
597  if (index > maxIndexesBuffer[number])
598  index = maxIndexesBuffer[number];
599  }
600  }
601 
602  number = sixteenthPoints * 16;
603  for (; number < num_points; number++) {
604  if (src0[number] > max) {
605  index = number;
606  max = src0[number];
607  }
608  }
609  target[0] = (uint16_t)index;
610 }
611 
612 #endif /*LV_HAVE_AVX512F*/
613 
614 #ifdef LV_HAVE_RVV
615 #include <float.h>
616 #include <riscv_vector.h>
617 
618 static inline void
619 volk_32f_index_max_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points)
620 {
621  vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(-FLT_MAX, __riscv_vsetvlmax_e32m8());
622  vuint16m4_t vmaxi = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4());
623  vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4());
624  size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
625  for (size_t vl; n > 0; n -= vl, src0 += vl) {
626  vl = __riscv_vsetvl_e32m8(n);
627  vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl);
628  vbool4_t m = __riscv_vmfgt(v, vmax, vl);
629  vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
630  vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
631  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
632  }
633  size_t vl = __riscv_vsetvlmax_e32m8();
634  float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK8(vfmax, f, 32, vmax),
635  __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1),
636  __riscv_vsetvlmax_e32m1()));
637  // Find lanes with max value, set others to UINT16_MAX
638  vbool4_t m = __riscv_vmfeq(vmax, max, vl);
639  vuint16m4_t idx_masked = __riscv_vmerge(
640  __riscv_vmv_v_x_u16m4(UINT16_MAX, __riscv_vsetvlmax_e16m4()), vmaxi, m, vl);
641  // Find minimum index among lanes with max value
642  *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 16, idx_masked),
643  __riscv_vmv_v_x_u16m1(UINT16_MAX, 1),
644  __riscv_vsetvlmax_e16m1()));
645 }
646 #endif /*LV_HAVE_RVV*/
647 
648 #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
volk_32f_index_max_16u_u_avx
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:490
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
volk_32f_index_max_16u_generic
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:391
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
volk_32f_index_max_16u_neon
static void volk_32f_index_max_16u_neon(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:269
volk_32f_index_max_16u_a_avx
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:70
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_32f_index_max_16u_a_sse
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:200
RISCV_SHRINK8
#define RISCV_SHRINK8(op, T, S, v)
Definition: volk_rvv_intrinsics.h:33