Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
58 #define INCLUDED_volk_32fc_index_max_32u_a_H
59 
60 #include <inttypes.h>
61 #include <volk/volk_common.h>
62 #include <volk/volk_complex.h>
63 
64 #ifdef LV_HAVE_AVX2
65 #include <immintrin.h>
67 
68 static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
69  const lv_32fc_t* src0,
70  uint32_t num_points)
71 {
72  const __m256i indices_increment = _mm256_set1_epi32(8);
73  /*
74  * At the start of each loop iteration current_indices holds the indices of
75  * the complex numbers loaded from memory. Explanation for odd order is given
76  * in implementation of vector_32fc_index_max_variant0().
77  */
78  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
79 
80  __m256 max_values = _mm256_setzero_ps();
81  __m256i max_indices = _mm256_setzero_si256();
82 
83  for (unsigned i = 0; i < num_points / 8u; ++i) {
84  __m256 in0 = _mm256_load_ps((float*)src0);
85  __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
87  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
88  src0 += 8;
89  }
90 
91  // determine maximum value and index in the result of the vectorized loop
92  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
93  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
94  _mm256_store_ps(max_values_buffer, max_values);
95  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
96 
97  float max = 0.f;
98  uint32_t index = 0;
99  for (unsigned i = 0; i < 8; i++) {
100  if (max_values_buffer[i] > max) {
101  max = max_values_buffer[i];
102  index = max_indices_buffer[i];
103  }
104  }
105 
106  // handle tail not processed by the vectorized loop
107  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
108  const float abs_squared =
109  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
110  if (abs_squared > max) {
111  max = abs_squared;
112  index = i;
113  }
114  ++src0;
115  }
116 
117  *target = index;
118 }
119 
120 #endif /*LV_HAVE_AVX2*/
121 
122 #ifdef LV_HAVE_AVX2
123 #include <immintrin.h>
125 
126 static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
127  const lv_32fc_t* src0,
128  uint32_t num_points)
129 {
130  const __m256i indices_increment = _mm256_set1_epi32(8);
131  /*
132  * At the start of each loop iteration current_indices holds the indices of
133  * the complex numbers loaded from memory. Explanation for odd order is given
134  * in implementation of vector_32fc_index_max_variant0().
135  */
136  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
137 
138  __m256 max_values = _mm256_setzero_ps();
139  __m256i max_indices = _mm256_setzero_si256();
140 
141  for (unsigned i = 0; i < num_points / 8u; ++i) {
142  __m256 in0 = _mm256_load_ps((float*)src0);
143  __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
145  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
146  src0 += 8;
147  }
148 
149  // determine maximum value and index in the result of the vectorized loop
150  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
151  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
152  _mm256_store_ps(max_values_buffer, max_values);
153  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
154 
155  float max = 0.f;
156  uint32_t index = 0;
157  for (unsigned i = 0; i < 8; i++) {
158  if (max_values_buffer[i] > max) {
159  max = max_values_buffer[i];
160  index = max_indices_buffer[i];
161  }
162  }
163 
164  // handle tail not processed by the vectorized loop
165  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
166  const float abs_squared =
167  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
168  if (abs_squared > max) {
169  max = abs_squared;
170  index = i;
171  }
172  ++src0;
173  }
174 
175  *target = index;
176 }
177 
178 #endif /*LV_HAVE_AVX2*/
179 
180 #ifdef LV_HAVE_SSE3
181 #include <pmmintrin.h>
182 #include <xmmintrin.h>
183 
184 static inline void volk_32fc_index_max_32u_a_sse3(uint32_t* target,
185  const lv_32fc_t* src0,
186  uint32_t num_points)
187 {
188  const uint32_t num_bytes = num_points * 8;
189 
190  union bit128 holderf;
191  union bit128 holderi;
192  float sq_dist = 0.0;
193 
194  union bit128 xmm5, xmm4;
195  __m128 xmm1, xmm2, xmm3;
196  __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
197 
198  xmm5.int_vec = _mm_setzero_si128();
199  xmm4.int_vec = _mm_setzero_si128();
200  holderf.int_vec = _mm_setzero_si128();
201  holderi.int_vec = _mm_setzero_si128();
202 
203  int bound = num_bytes >> 5;
204  int i = 0;
205 
206  xmm8 = _mm_setr_epi32(0, 1, 2, 3);
207  xmm9 = _mm_setzero_si128();
208  xmm10 = _mm_setr_epi32(4, 4, 4, 4);
209  xmm3 = _mm_setzero_ps();
210 
211  for (; i < bound; ++i) {
212  xmm1 = _mm_load_ps((float*)src0);
213  xmm2 = _mm_load_ps((float*)&src0[2]);
214 
215  src0 += 4;
216 
217  xmm1 = _mm_mul_ps(xmm1, xmm1);
218  xmm2 = _mm_mul_ps(xmm2, xmm2);
219 
220  xmm1 = _mm_hadd_ps(xmm1, xmm2);
221 
222  xmm3 = _mm_max_ps(xmm1, xmm3);
223 
224  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
225  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
226 
227  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
228  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
229 
230  xmm9 = _mm_add_epi32(xmm11, xmm12);
231 
232  xmm8 = _mm_add_epi32(xmm8, xmm10);
233  }
234 
235  if (num_bytes >> 4 & 1) {
236  xmm2 = _mm_load_ps((float*)src0);
237 
238  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
239  xmm8 = bit128_p(&xmm1)->int_vec;
240 
241  xmm2 = _mm_mul_ps(xmm2, xmm2);
242 
243  src0 += 2;
244 
245  xmm1 = _mm_hadd_ps(xmm2, xmm2);
246 
247  xmm3 = _mm_max_ps(xmm1, xmm3);
248 
249  xmm10 = _mm_setr_epi32(2, 2, 2, 2);
250 
251  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
252  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
253 
254  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
255  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
256 
257  xmm9 = _mm_add_epi32(xmm11, xmm12);
258 
259  xmm8 = _mm_add_epi32(xmm8, xmm10);
260  }
261 
262  if (num_bytes >> 3 & 1) {
263  sq_dist =
264  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
265 
266  xmm2 = _mm_load1_ps(&sq_dist);
267 
268  xmm1 = xmm3;
269 
270  xmm3 = _mm_max_ss(xmm3, xmm2);
271 
272  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
273  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
274 
275  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
276 
277  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
278  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
279 
280  xmm9 = _mm_add_epi32(xmm11, xmm12);
281  }
282 
283  _mm_store_ps((float*)&(holderf.f), xmm3);
284  _mm_store_si128(&(holderi.int_vec), xmm9);
285 
286  target[0] = holderi.i[0];
287  sq_dist = holderf.f[0];
288  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
289  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
290  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
291  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
292  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
293  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
294 }
295 
296 #endif /*LV_HAVE_SSE3*/
297 
298 #ifdef LV_HAVE_GENERIC
299 static inline void volk_32fc_index_max_32u_generic(uint32_t* target,
300  const lv_32fc_t* src0,
301  uint32_t num_points)
302 {
303  const uint32_t num_bytes = num_points * 8;
304 
305  float sq_dist = 0.0;
306  float max = 0.0;
307  uint32_t index = 0;
308 
309  uint32_t i = 0;
310 
311  for (; i < (num_bytes >> 3); ++i) {
312  sq_dist =
313  lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
314 
315  if (sq_dist > max) {
316  index = i;
317  max = sq_dist;
318  }
319  }
320  target[0] = index;
321 }
322 
323 #endif /*LV_HAVE_GENERIC*/
324 
325 #ifdef LV_HAVE_AVX512F
326 #include <immintrin.h>
327 
328 static inline void volk_32fc_index_max_32u_a_avx512f(uint32_t* target,
329  const lv_32fc_t* src0,
330  uint32_t num_points)
331 {
332  const lv_32fc_t* src0Ptr = src0;
333  const uint32_t sixteenthPoints = num_points / 16;
334 
335  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
336  __m512 currentIndexes =
337  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
338  const __m512 indexIncrement = _mm512_set1_ps(16);
339 
340  __m512 maxValues = _mm512_setzero_ps();
341  __m512 maxIndices = _mm512_setzero_ps();
342 
343  for (uint32_t number = 0; number < sixteenthPoints; number++) {
344  // Load 16 complex values (32 floats)
345  __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
346  __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
347  src0Ptr += 16;
348 
349  // Square all values
350  in0 = _mm512_mul_ps(in0, in0);
351  in1 = _mm512_mul_ps(in1, in1);
352 
353  // Add adjacent pairs (re² + im²) using within-lane shuffle
354  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
355  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
356  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
357  __m512 sum0 = _mm512_add_ps(in0, sw0);
358  __m512 sum1 = _mm512_add_ps(in1, sw1);
359 
360  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
361  // 0x88 = _MM_SHUFFLE(2,0,2,0)
362  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
363 
364  // Compare and update maximums
365  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
366  maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
367  maxValues = _mm512_max_ps(mag_sq, maxValues);
368 
369  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
370  }
371 
372  // Reduce 16 values to find maximum
373  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
374  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
375  _mm512_store_ps(maxValuesBuffer, maxValues);
376  _mm512_store_ps(maxIndexesBuffer, maxIndices);
377 
378  float max = 0.0f;
379  uint32_t index = 0;
380  for (uint32_t i = 0; i < 16; i++) {
381  if (maxValuesBuffer[i] > max) {
382  max = maxValuesBuffer[i];
383  index = (uint32_t)maxIndexesBuffer[i];
384  } else if (maxValuesBuffer[i] == max) {
385  if ((uint32_t)maxIndexesBuffer[i] < index)
386  index = (uint32_t)maxIndexesBuffer[i];
387  }
388  }
389 
390  // Handle tail
391  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
392  const float re = lv_creal(*src0Ptr);
393  const float im = lv_cimag(*src0Ptr);
394  const float sq_dist = re * re + im * im;
395  if (sq_dist > max) {
396  max = sq_dist;
397  index = number;
398  }
399  src0Ptr++;
400  }
401  *target = index;
402 }
403 
404 #endif /*LV_HAVE_AVX512F*/
405 
406 #endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
407 
408 #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
409 #define INCLUDED_volk_32fc_index_max_32u_u_H
410 
411 #include <inttypes.h>
412 #include <volk/volk_common.h>
413 #include <volk/volk_complex.h>
414 
415 #ifdef LV_HAVE_AVX2
416 #include <immintrin.h>
418 
419 static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
420  const lv_32fc_t* src0,
421  uint32_t num_points)
422 {
423  const __m256i indices_increment = _mm256_set1_epi32(8);
424  /*
425  * At the start of each loop iteration current_indices holds the indices of
426  * the complex numbers loaded from memory. Explanation for odd order is given
427  * in implementation of vector_32fc_index_max_variant0().
428  */
429  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
430 
431  __m256 max_values = _mm256_setzero_ps();
432  __m256i max_indices = _mm256_setzero_si256();
433 
434  for (unsigned i = 0; i < num_points / 8u; ++i) {
435  __m256 in0 = _mm256_loadu_ps((float*)src0);
436  __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
438  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
439  src0 += 8;
440  }
441 
442  // determine maximum value and index in the result of the vectorized loop
443  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
444  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
445  _mm256_store_ps(max_values_buffer, max_values);
446  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
447 
448  float max = 0.f;
449  uint32_t index = 0;
450  for (unsigned i = 0; i < 8; i++) {
451  if (max_values_buffer[i] > max) {
452  max = max_values_buffer[i];
453  index = max_indices_buffer[i];
454  }
455  }
456 
457  // handle tail not processed by the vectorized loop
458  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
459  const float abs_squared =
460  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
461  if (abs_squared > max) {
462  max = abs_squared;
463  index = i;
464  }
465  ++src0;
466  }
467 
468  *target = index;
469 }
470 
471 #endif /*LV_HAVE_AVX2*/
472 
473 #ifdef LV_HAVE_AVX2
474 #include <immintrin.h>
476 
477 static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
478  const lv_32fc_t* src0,
479  uint32_t num_points)
480 {
481  const __m256i indices_increment = _mm256_set1_epi32(8);
482  /*
483  * At the start of each loop iteration current_indices holds the indices of
484  * the complex numbers loaded from memory. Explanation for odd order is given
485  * in implementation of vector_32fc_index_max_variant0().
486  */
487  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
488 
489  __m256 max_values = _mm256_setzero_ps();
490  __m256i max_indices = _mm256_setzero_si256();
491 
492  for (unsigned i = 0; i < num_points / 8u; ++i) {
493  __m256 in0 = _mm256_loadu_ps((float*)src0);
494  __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
496  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
497  src0 += 8;
498  }
499 
500  // determine maximum value and index in the result of the vectorized loop
501  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
502  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
503  _mm256_store_ps(max_values_buffer, max_values);
504  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
505 
506  float max = 0.f;
507  uint32_t index = 0;
508  for (unsigned i = 0; i < 8; i++) {
509  if (max_values_buffer[i] > max) {
510  max = max_values_buffer[i];
511  index = max_indices_buffer[i];
512  }
513  }
514 
515  // handle tail not processed by the vectorized loop
516  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
517  const float abs_squared =
518  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
519  if (abs_squared > max) {
520  max = abs_squared;
521  index = i;
522  }
523  ++src0;
524  }
525 
526  *target = index;
527 }
528 
529 #endif /*LV_HAVE_AVX2*/
530 
531 #ifdef LV_HAVE_NEON
532 #include <arm_neon.h>
534 
535 static inline void
536 volk_32fc_index_max_32u_neon(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points)
537 {
538  unsigned int number = 0;
539  const uint32_t quarter_points = num_points / 4;
540  const lv_32fc_t* src0Ptr = src0;
541 
542  uint32_t indices[4] = { 0, 1, 2, 3 };
543  const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
544  uint32x4_t vec_indices = vld1q_u32(indices);
545  uint32x4_t vec_max_indices = vec_indices;
546 
547  if (num_points) {
548  float max = FLT_MIN;
549  uint32_t index = 0;
550 
551  float32x4_t vec_max = vdupq_n_f32(FLT_MIN);
552 
553  for (; number < quarter_points; number++) {
554  // Load complex and compute magnitude squared
555  const float32x4_t vec_mag2 =
556  _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
557  __VOLK_PREFETCH(src0Ptr += 4);
558  // a > b?
559  const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
560  vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
561  vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
562  vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
563  }
564  uint32_t tmp_max_indices[4];
565  float tmp_max[4];
566  vst1q_u32(tmp_max_indices, vec_max_indices);
567  vst1q_f32(tmp_max, vec_max);
568 
569  for (int i = 0; i < 4; i++) {
570  if (tmp_max[i] > max) {
571  max = tmp_max[i];
572  index = tmp_max_indices[i];
573  } else if (tmp_max[i] == max) {
574  if (tmp_max_indices[i] < index)
575  index = tmp_max_indices[i];
576  }
577  }
578 
579  // Deal with the rest
580  for (number = quarter_points * 4; number < num_points; number++) {
581  const float re = lv_creal(*src0Ptr);
582  const float im = lv_cimag(*src0Ptr);
583  const float sq_dist = re * re + im * im;
584  if (sq_dist > max) {
585  max = sq_dist;
586  index = number;
587  }
588  src0Ptr++;
589  }
590  *target = index;
591  }
592 }
593 
594 #endif /*LV_HAVE_NEON*/
595 
596 
597 #ifdef LV_HAVE_NEONV8
598 #include <arm_neon.h>
599 #include <float.h>
600 
601 static inline void volk_32fc_index_max_32u_neonv8(uint32_t* target,
602  const lv_32fc_t* src0,
603  uint32_t num_points)
604 {
605  if (num_points == 0)
606  return;
607 
608  const uint32_t quarter_points = num_points / 4;
609  const lv_32fc_t* inputPtr = src0;
610 
611  // Use integer indices directly (no float conversion overhead)
612  uint32x4_t vec_indices = { 0, 1, 2, 3 };
613  const uint32x4_t vec_incr = vdupq_n_u32(4);
614 
615  float32x4_t vec_max = vdupq_n_f32(0.0f);
616  uint32x4_t vec_max_idx = vdupq_n_u32(0);
617 
618  for (uint32_t i = 0; i < quarter_points; i++) {
619  // Load and deinterleave complex values
620  float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
621  inputPtr += 4;
622 
623  // Magnitude squared using FMA: re*re + im*im
624  float32x4_t mag2 =
625  vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);
626 
627  // Compare BEFORE max update to know which lanes change
628  uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
629  vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
630 
631  // vmaxq_f32 is single-cycle, no dependency on comparison result
632  vec_max = vmaxq_f32(mag2, vec_max);
633 
634  vec_indices = vaddq_u32(vec_indices, vec_incr);
635  }
636 
637  // ARMv8 horizontal reduction - find max value across all lanes
638  float max_val = vmaxvq_f32(vec_max);
639 
640  // Find which lane(s) have the max value, get minimum index among them
641  uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
642  uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
643  uint32_t result_idx = vminvq_u32(idx_masked);
644 
645  // Handle tail elements
646  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
647  float re = lv_creal(src0[i]);
648  float im = lv_cimag(src0[i]);
649  float mag2 = re * re + im * im;
650  if (mag2 > max_val) {
651  max_val = mag2;
652  result_idx = i;
653  }
654  }
655 
656  *target = result_idx;
657 }
658 
659 #endif /*LV_HAVE_NEONV8*/
660 
661 
662 #ifdef LV_HAVE_AVX512F
663 #include <immintrin.h>
664 
665 static inline void volk_32fc_index_max_32u_u_avx512f(uint32_t* target,
666  const lv_32fc_t* src0,
667  uint32_t num_points)
668 {
669  const lv_32fc_t* src0Ptr = src0;
670  const uint32_t sixteenthPoints = num_points / 16;
671 
672  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
673  __m512 currentIndexes =
674  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
675  const __m512 indexIncrement = _mm512_set1_ps(16);
676 
677  __m512 maxValues = _mm512_setzero_ps();
678  __m512 maxIndices = _mm512_setzero_ps();
679 
680  for (uint32_t number = 0; number < sixteenthPoints; number++) {
681  // Load 16 complex values (32 floats)
682  __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
683  __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
684  src0Ptr += 16;
685 
686  // Square all values
687  in0 = _mm512_mul_ps(in0, in0);
688  in1 = _mm512_mul_ps(in1, in1);
689 
690  // Add adjacent pairs (re² + im²) using within-lane shuffle
691  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
692  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
693  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
694  __m512 sum0 = _mm512_add_ps(in0, sw0);
695  __m512 sum1 = _mm512_add_ps(in1, sw1);
696 
697  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
698  // 0x88 = _MM_SHUFFLE(2,0,2,0)
699  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
700 
701  // Compare and update maximums
702  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
703  maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
704  maxValues = _mm512_max_ps(mag_sq, maxValues);
705 
706  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
707  }
708 
709  // Reduce 16 values to find maximum
710  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
711  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
712  _mm512_store_ps(maxValuesBuffer, maxValues);
713  _mm512_store_ps(maxIndexesBuffer, maxIndices);
714 
715  float max = 0.0f;
716  uint32_t index = 0;
717  for (uint32_t i = 0; i < 16; i++) {
718  if (maxValuesBuffer[i] > max) {
719  max = maxValuesBuffer[i];
720  index = (uint32_t)maxIndexesBuffer[i];
721  } else if (maxValuesBuffer[i] == max) {
722  if ((uint32_t)maxIndexesBuffer[i] < index)
723  index = (uint32_t)maxIndexesBuffer[i];
724  }
725  }
726 
727  // Handle tail
728  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
729  const float re = lv_creal(*src0Ptr);
730  const float im = lv_cimag(*src0Ptr);
731  const float sq_dist = re * re + im * im;
732  if (sq_dist > max) {
733  max = sq_dist;
734  index = number;
735  }
736  src0Ptr++;
737  }
738  *target = index;
739 }
740 
741 #endif /*LV_HAVE_AVX512F*/
742 
743 #ifdef LV_HAVE_RVV
744 #include <float.h>
745 #include <riscv_vector.h>
746 
747 static inline void
748 volk_32fc_index_max_32u_rvv(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points)
749 {
750  vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
751  vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
752  vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
753  size_t n = num_points;
754  for (size_t vl; n > 0; n -= vl, src0 += vl) {
755  vl = __riscv_vsetvl_e32m4(n);
756  vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
757  vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
758  vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
759  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
760  vbool8_t m = __riscv_vmflt(vmax, v, vl);
761  vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
762  vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
763  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
764  }
765  size_t vl = __riscv_vsetvlmax_e32m4();
766  float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
767  __riscv_vfmv_v_f_f32m1(0, 1),
768  __riscv_vsetvlmax_e32m1()));
769  // Find lanes with max value, set others to UINT32_MAX
770  vbool8_t m = __riscv_vmfeq(vmax, max, vl);
771  vuint32m4_t idx_masked =
772  __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmaxi, m, vl);
773  // Find minimum index among lanes with max value
774  *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
775  __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
776  __riscv_vsetvlmax_e32m1()));
777 }
778 #endif /*LV_HAVE_RVV*/
779 
780 #ifdef LV_HAVE_RVVSEG
781 #include <float.h>
782 #include <riscv_vector.h>
783 
784 static inline void volk_32fc_index_max_32u_rvvseg(uint32_t* target,
785  const lv_32fc_t* src0,
786  uint32_t num_points)
787 {
788  vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
789  vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
790  vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
791  size_t n = num_points;
792  for (size_t vl; n > 0; n -= vl, src0 += vl) {
793  vl = __riscv_vsetvl_e32m4(n);
794  vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
795  vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
796  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
797  vbool8_t m = __riscv_vmflt(vmax, v, vl);
798  vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
799  vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
800  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
801  }
802  size_t vl = __riscv_vsetvlmax_e32m4();
803  float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
804  __riscv_vfmv_v_f_f32m1(0, 1),
805  __riscv_vsetvlmax_e32m1()));
806  // Find lanes with max value, set others to UINT32_MAX
807  vbool8_t m = __riscv_vmfeq(vmax, max, vl);
808  vuint32m4_t idx_masked =
809  __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmaxi, m, vl);
810  // Find minimum index among lanes with max value
811  *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
812  __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
813  __riscv_vsetvlmax_e32m1()));
814 }
815 #endif /*LV_HAVE_RVVSEG*/
816 
817 #endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
bit128_p
#define bit128_p(x)
Definition: volk_common.h:147
_vmagnitudesquaredq_f32
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:74
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
vector_32fc_index_max_variant1
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:230
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
vector_32fc_index_max_variant0
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:168
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_32fc_index_max_32u_a_sse3
static void volk_32fc_index_max_32u_a_sse3(uint32_t *target, const lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:184
bit128::int_vec
__m128i int_vec
Definition: volk_common.h:128
volk_32fc_index_max_32u_generic
static void volk_32fc_index_max_32u_generic(uint32_t *target, const lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:299
bit128::i
uint32_t i[4]
Definition: volk_common.h:119
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_32fc_index_max_32u_neon
static void volk_32fc_index_max_32u_neon(uint32_t *target, const lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:536
volk_complex.h
bit128::f
float f[4]
Definition: volk_common.h:120
volk_neon_intrinsics.h
bit128
Definition: volk_common.h:116
volk_avx2_intrinsics.h
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96
bit128::float_vec
__m128 float_vec
Definition: volk_common.h:124