Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
63 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
64 #define INCLUDED_volk_32fc_index_max_16u_a_H
65 
66 #include <inttypes.h>
67 #include <limits.h>
68 #include <stdio.h>
69 #include <volk/volk_common.h>
70 #include <volk/volk_complex.h>
71 
72 #ifdef LV_HAVE_AVX2
73 #include <immintrin.h>
75 
76 static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
77  const lv_32fc_t* src0,
78  uint32_t num_points)
79 {
80  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
81 
82  const __m256i indices_increment = _mm256_set1_epi32(8);
83  /*
84  * At the start of each loop iteration current_indices holds the indices of
85  * the complex numbers loaded from memory. Explanation for odd order is given
86  * in implementation of vector_32fc_index_max_variant0().
87  */
88  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
89 
90  __m256 max_values = _mm256_setzero_ps();
91  __m256i max_indices = _mm256_setzero_si256();
92 
93  for (unsigned i = 0; i < num_points / 8u; ++i) {
94  __m256 in0 = _mm256_load_ps((float*)src0);
95  __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
97  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
98  src0 += 8;
99  }
100 
101  // determine maximum value and index in the result of the vectorized loop
102  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
103  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
104  _mm256_store_ps(max_values_buffer, max_values);
105  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
106 
107  float max = 0.f;
108  uint32_t index = 0;
109  for (unsigned i = 0; i < 8; i++) {
110  if (max_values_buffer[i] > max) {
111  max = max_values_buffer[i];
112  index = max_indices_buffer[i];
113  }
114  }
115 
116  // handle tail not processed by the vectorized loop
117  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
118  const float abs_squared =
119  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
120  if (abs_squared > max) {
121  max = abs_squared;
122  index = i;
123  }
124  ++src0;
125  }
126 
127  *target = index;
128 }
129 
130 #endif /*LV_HAVE_AVX2*/
131 
132 #ifdef LV_HAVE_AVX2
133 #include <immintrin.h>
135 
136 static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
137  const lv_32fc_t* src0,
138  uint32_t num_points)
139 {
140  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
141 
142  const __m256i indices_increment = _mm256_set1_epi32(8);
143  /*
144  * At the start of each loop iteration current_indices holds the indices of
145  * the complex numbers loaded from memory. Explanation for odd order is given
146  * in implementation of vector_32fc_index_max_variant0().
147  */
148  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
149 
150  __m256 max_values = _mm256_setzero_ps();
151  __m256i max_indices = _mm256_setzero_si256();
152 
153  for (unsigned i = 0; i < num_points / 8u; ++i) {
154  __m256 in0 = _mm256_load_ps((float*)src0);
155  __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
157  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
158  src0 += 8;
159  }
160 
161  // determine maximum value and index in the result of the vectorized loop
162  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
163  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
164  _mm256_store_ps(max_values_buffer, max_values);
165  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
166 
167  float max = 0.f;
168  uint32_t index = 0;
169  for (unsigned i = 0; i < 8; i++) {
170  if (max_values_buffer[i] > max) {
171  max = max_values_buffer[i];
172  index = max_indices_buffer[i];
173  }
174  }
175 
176  // handle tail not processed by the vectorized loop
177  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
178  const float abs_squared =
179  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
180  if (abs_squared > max) {
181  max = abs_squared;
182  index = i;
183  }
184  ++src0;
185  }
186 
187  *target = index;
188 }
189 
190 #endif /*LV_HAVE_AVX2*/
191 
192 #ifdef LV_HAVE_SSE3
193 #include <pmmintrin.h>
194 #include <xmmintrin.h>
195 
196 static inline void volk_32fc_index_max_16u_a_sse3(uint16_t* target,
197  const lv_32fc_t* src0,
198  uint32_t num_points)
199 {
200  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
201  const uint32_t num_bytes = num_points * 8;
202 
203  union bit128 holderf;
204  union bit128 holderi;
205  float sq_dist = 0.0;
206 
207  union bit128 xmm5, xmm4;
208  __m128 xmm1, xmm2, xmm3;
209  __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
210 
211  xmm5.int_vec = _mm_setzero_si128();
212  xmm4.int_vec = _mm_setzero_si128();
213  holderf.int_vec = _mm_setzero_si128();
214  holderi.int_vec = _mm_setzero_si128();
215 
216  int bound = num_bytes >> 5;
217  int i = 0;
218 
219  xmm8 = _mm_setr_epi32(0, 1, 2, 3);
220  xmm9 = _mm_setzero_si128();
221  xmm10 = _mm_setr_epi32(4, 4, 4, 4);
222  xmm3 = _mm_setzero_ps();
223 
224  for (; i < bound; ++i) {
225  xmm1 = _mm_load_ps((float*)src0);
226  xmm2 = _mm_load_ps((float*)&src0[2]);
227 
228  src0 += 4;
229 
230  xmm1 = _mm_mul_ps(xmm1, xmm1);
231  xmm2 = _mm_mul_ps(xmm2, xmm2);
232 
233  xmm1 = _mm_hadd_ps(xmm1, xmm2);
234 
235  xmm3 = _mm_max_ps(xmm1, xmm3);
236 
237  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
238  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
239 
240  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
241  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
242 
243  xmm9 = _mm_add_epi32(xmm11, xmm12);
244 
245  xmm8 = _mm_add_epi32(xmm8, xmm10);
246  }
247 
248  if (num_bytes >> 4 & 1) {
249  xmm2 = _mm_load_ps((float*)src0);
250 
251  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
252  xmm8 = bit128_p(&xmm1)->int_vec;
253 
254  xmm2 = _mm_mul_ps(xmm2, xmm2);
255 
256  src0 += 2;
257 
258  xmm1 = _mm_hadd_ps(xmm2, xmm2);
259 
260  xmm3 = _mm_max_ps(xmm1, xmm3);
261 
262  xmm10 = _mm_setr_epi32(2, 2, 2, 2);
263 
264  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
265  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
266 
267  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
268  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
269 
270  xmm9 = _mm_add_epi32(xmm11, xmm12);
271 
272  xmm8 = _mm_add_epi32(xmm8, xmm10);
273  }
274 
275  if (num_bytes >> 3 & 1) {
276  sq_dist =
277  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
278 
279  xmm2 = _mm_load1_ps(&sq_dist);
280 
281  xmm1 = xmm3;
282 
283  xmm3 = _mm_max_ss(xmm3, xmm2);
284 
285  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
286  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
287 
288  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
289 
290  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
291  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
292 
293  xmm9 = _mm_add_epi32(xmm11, xmm12);
294  }
295 
296  _mm_store_ps((float*)&(holderf.f), xmm3);
297  _mm_store_si128(&(holderi.int_vec), xmm9);
298 
299  target[0] = holderi.i[0];
300  sq_dist = holderf.f[0];
301  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
302  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
303  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
304  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
305  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
306  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
307 }
308 
309 #endif /*LV_HAVE_SSE3*/
310 
311 
312 #ifdef LV_HAVE_NEON
313 #include <arm_neon.h>
314 #include <float.h>
315 #include <limits.h>
317 
318 static inline void
319 volk_32fc_index_max_16u_neon(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points)
320 {
321  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
322 
323  if (num_points == 0)
324  return;
325 
326  const uint32_t quarter_points = num_points / 4;
327  const lv_32fc_t* inputPtr = src0;
328 
329  // Use integer indices directly
330  uint32x4_t vec_indices = { 0, 1, 2, 3 };
331  const uint32x4_t vec_incr = vdupq_n_u32(4);
332 
333  float32x4_t vec_max = vdupq_n_f32(0.0f);
334  uint32x4_t vec_max_idx = vdupq_n_u32(0);
335 
336  for (uint32_t i = 0; i < quarter_points; i++) {
337  // Load and deinterleave complex values
338  float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
339  inputPtr += 4;
340 
341  // Magnitude squared: re*re + im*im
342  float32x4_t mag2 =
343  vmlaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);
344 
345  // Compare BEFORE max update to know which lanes change
346  uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
347  vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
348 
349  // vmaxq_f32 is single-cycle, no dependency on comparison result
350  vec_max = vmaxq_f32(mag2, vec_max);
351 
352  vec_indices = vaddq_u32(vec_indices, vec_incr);
353  }
354 
355  // Scalar reduction
356  __VOLK_ATTR_ALIGNED(16) float max_buf[4];
357  __VOLK_ATTR_ALIGNED(16) uint32_t idx_buf[4];
358  vst1q_f32(max_buf, vec_max);
359  vst1q_u32(idx_buf, vec_max_idx);
360 
361  float max_val = max_buf[0];
362  uint32_t result_idx = idx_buf[0];
363  for (int i = 1; i < 4; i++) {
364  if (max_buf[i] > max_val) {
365  max_val = max_buf[i];
366  result_idx = idx_buf[i];
367  } else if (max_buf[i] == max_val && idx_buf[i] < result_idx) {
368  result_idx = idx_buf[i];
369  }
370  }
371 
372  // Handle tail
373  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
374  float re = lv_creal(src0[i]);
375  float im = lv_cimag(src0[i]);
376  float mag2 = re * re + im * im;
377  if (mag2 > max_val) {
378  max_val = mag2;
379  result_idx = i;
380  }
381  }
382 
383  *target = (uint16_t)result_idx;
384 }
385 
386 #endif /*LV_HAVE_NEON*/
387 
388 
389 #ifdef LV_HAVE_NEONV8
390 #include <arm_neon.h>
391 #include <float.h>
392 #include <limits.h>
393 
394 static inline void volk_32fc_index_max_16u_neonv8(uint16_t* target,
395  const lv_32fc_t* src0,
396  uint32_t num_points)
397 {
398  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
399 
400  if (num_points == 0)
401  return;
402 
403  const uint32_t quarter_points = num_points / 4;
404  const lv_32fc_t* inputPtr = src0;
405 
406  // Use integer indices directly (no float conversion overhead)
407  uint32x4_t vec_indices = { 0, 1, 2, 3 };
408  const uint32x4_t vec_incr = vdupq_n_u32(4);
409 
410  float32x4_t vec_max = vdupq_n_f32(0.0f);
411  uint32x4_t vec_max_idx = vdupq_n_u32(0);
412 
413  for (uint32_t i = 0; i < quarter_points; i++) {
414  // Load and deinterleave complex values
415  float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
416  inputPtr += 4;
417 
418  // Magnitude squared using FMA: re*re + im*im
419  float32x4_t mag2 =
420  vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);
421 
422  // Compare BEFORE max update to know which lanes change
423  uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
424  vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
425 
426  // vmaxq_f32 is single-cycle, no dependency on comparison result
427  vec_max = vmaxq_f32(mag2, vec_max);
428 
429  vec_indices = vaddq_u32(vec_indices, vec_incr);
430  }
431 
432  // ARMv8 horizontal reduction
433  float max_val = vmaxvq_f32(vec_max);
434  uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
435  uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
436  uint32_t result_idx = vminvq_u32(idx_masked);
437 
438  // Handle tail
439  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
440  float re = lv_creal(src0[i]);
441  float im = lv_cimag(src0[i]);
442  float mag2 = re * re + im * im;
443  if (mag2 > max_val) {
444  max_val = mag2;
445  result_idx = i;
446  }
447  }
448 
449  *target = (uint16_t)result_idx;
450 }
451 
452 #endif /*LV_HAVE_NEONV8*/
453 
454 
455 #ifdef LV_HAVE_GENERIC
456 static inline void volk_32fc_index_max_16u_generic(uint16_t* target,
457  const lv_32fc_t* src0,
458  uint32_t num_points)
459 {
460  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
461 
462  const uint32_t num_bytes = num_points * 8;
463 
464  float sq_dist = 0.0;
465  float max = 0.0;
466  uint16_t index = 0;
467 
468  uint32_t i = 0;
469 
470  for (; i < (num_bytes >> 3); ++i) {
471  sq_dist =
472  lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
473 
474  if (sq_dist > max) {
475  index = i;
476  max = sq_dist;
477  }
478  }
479  target[0] = index;
480 }
481 
482 #endif /*LV_HAVE_GENERIC*/
483 
484 #ifdef LV_HAVE_AVX512F
485 #include <immintrin.h>
486 #include <limits.h>
487 
488 static inline void volk_32fc_index_max_16u_a_avx512f(uint16_t* target,
489  const lv_32fc_t* src0,
490  uint32_t num_points)
491 {
492  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
493 
494  const lv_32fc_t* src0Ptr = src0;
495  const uint32_t sixteenthPoints = num_points / 16;
496 
497  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
498  __m512 currentIndexes =
499  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
500  const __m512 indexIncrement = _mm512_set1_ps(16);
501 
502  __m512 maxValues = _mm512_setzero_ps();
503  __m512 maxIndices = _mm512_setzero_ps();
504 
505  for (uint32_t number = 0; number < sixteenthPoints; number++) {
506  // Load 16 complex values (32 floats)
507  __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
508  __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
509  src0Ptr += 16;
510 
511  // Square all values
512  in0 = _mm512_mul_ps(in0, in0);
513  in1 = _mm512_mul_ps(in1, in1);
514 
515  // Add adjacent pairs (re² + im²) using within-lane shuffle
516  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
517  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
518  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
519  __m512 sum0 = _mm512_add_ps(in0, sw0);
520  __m512 sum1 = _mm512_add_ps(in1, sw1);
521 
522  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
523  // 0x88 = _MM_SHUFFLE(2,0,2,0)
524  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
525 
526  // Compare and update maximums
527  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
528  maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
529  maxValues = _mm512_max_ps(mag_sq, maxValues);
530 
531  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
532  }
533 
534  // Reduce 16 values to find maximum
535  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
536  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
537  _mm512_store_ps(maxValuesBuffer, maxValues);
538  _mm512_store_ps(maxIndexesBuffer, maxIndices);
539 
540  float max = 0.0f;
541  uint32_t index = 0;
542  for (uint32_t i = 0; i < 16; i++) {
543  if (maxValuesBuffer[i] > max) {
544  max = maxValuesBuffer[i];
545  index = (uint32_t)maxIndexesBuffer[i];
546  } else if (maxValuesBuffer[i] == max) {
547  if ((uint32_t)maxIndexesBuffer[i] < index)
548  index = (uint32_t)maxIndexesBuffer[i];
549  }
550  }
551 
552  // Handle tail
553  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
554  const float re = lv_creal(*src0Ptr);
555  const float im = lv_cimag(*src0Ptr);
556  const float sq_dist = re * re + im * im;
557  if (sq_dist > max) {
558  max = sq_dist;
559  index = number;
560  }
561  src0Ptr++;
562  }
563  *target = (uint16_t)index;
564 }
565 
566 #endif /*LV_HAVE_AVX512F*/
567 
568 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
569 
570 #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
571 #define INCLUDED_volk_32fc_index_max_16u_u_H
572 
573 #include <inttypes.h>
574 #include <limits.h>
575 #include <stdio.h>
576 #include <volk/volk_common.h>
577 #include <volk/volk_complex.h>
578 
579 #ifdef LV_HAVE_AVX2
580 #include <immintrin.h>
582 
583 static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
584  const lv_32fc_t* src0,
585  uint32_t num_points)
586 {
587  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
588 
589  const __m256i indices_increment = _mm256_set1_epi32(8);
590  /*
591  * At the start of each loop iteration current_indices holds the indices of
592  * the complex numbers loaded from memory. Explanation for odd order is given
593  * in implementation of vector_32fc_index_max_variant0().
594  */
595  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
596 
597  __m256 max_values = _mm256_setzero_ps();
598  __m256i max_indices = _mm256_setzero_si256();
599 
600  for (unsigned i = 0; i < num_points / 8u; ++i) {
601  __m256 in0 = _mm256_loadu_ps((float*)src0);
602  __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
604  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
605  src0 += 8;
606  }
607 
608  // determine maximum value and index in the result of the vectorized loop
609  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
610  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
611  _mm256_store_ps(max_values_buffer, max_values);
612  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
613 
614  float max = 0.f;
615  uint32_t index = 0;
616  for (unsigned i = 0; i < 8; i++) {
617  if (max_values_buffer[i] > max) {
618  max = max_values_buffer[i];
619  index = max_indices_buffer[i];
620  }
621  }
622 
623  // handle tail not processed by the vectorized loop
624  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
625  const float abs_squared =
626  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
627  if (abs_squared > max) {
628  max = abs_squared;
629  index = i;
630  }
631  ++src0;
632  }
633 
634  *target = index;
635 }
636 
637 #endif /*LV_HAVE_AVX2*/
638 
639 #ifdef LV_HAVE_AVX2
640 #include <immintrin.h>
642 
643 static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
644  const lv_32fc_t* src0,
645  uint32_t num_points)
646 {
647  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
648 
649  const __m256i indices_increment = _mm256_set1_epi32(8);
650  /*
651  * At the start of each loop iteration current_indices holds the indices of
652  * the complex numbers loaded from memory. Explanation for odd order is given
653  * in implementation of vector_32fc_index_max_variant0().
654  */
655  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
656 
657  __m256 max_values = _mm256_setzero_ps();
658  __m256i max_indices = _mm256_setzero_si256();
659 
660  for (unsigned i = 0; i < num_points / 8u; ++i) {
661  __m256 in0 = _mm256_loadu_ps((float*)src0);
662  __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
664  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
665  src0 += 8;
666  }
667 
668  // determine maximum value and index in the result of the vectorized loop
669  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
670  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
671  _mm256_store_ps(max_values_buffer, max_values);
672  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
673 
674  float max = 0.f;
675  uint32_t index = 0;
676  for (unsigned i = 0; i < 8; i++) {
677  if (max_values_buffer[i] > max) {
678  max = max_values_buffer[i];
679  index = max_indices_buffer[i];
680  }
681  }
682 
683  // handle tail not processed by the vectorized loop
684  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
685  const float abs_squared =
686  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
687  if (abs_squared > max) {
688  max = abs_squared;
689  index = i;
690  }
691  ++src0;
692  }
693 
694  *target = index;
695 }
696 
697 #endif /*LV_HAVE_AVX2*/
698 
699 #ifdef LV_HAVE_AVX512F
700 #include <immintrin.h>
701 #include <limits.h>
702 
703 static inline void volk_32fc_index_max_16u_u_avx512f(uint16_t* target,
704  const lv_32fc_t* src0,
705  uint32_t num_points)
706 {
707  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
708 
709  const lv_32fc_t* src0Ptr = src0;
710  const uint32_t sixteenthPoints = num_points / 16;
711 
712  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
713  __m512 currentIndexes =
714  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
715  const __m512 indexIncrement = _mm512_set1_ps(16);
716 
717  __m512 maxValues = _mm512_setzero_ps();
718  __m512 maxIndices = _mm512_setzero_ps();
719 
720  for (uint32_t number = 0; number < sixteenthPoints; number++) {
721  // Load 16 complex values (32 floats)
722  __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
723  __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
724  src0Ptr += 16;
725 
726  // Square all values
727  in0 = _mm512_mul_ps(in0, in0);
728  in1 = _mm512_mul_ps(in1, in1);
729 
730  // Add adjacent pairs (re² + im²) using within-lane shuffle
731  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
732  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
733  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
734  __m512 sum0 = _mm512_add_ps(in0, sw0);
735  __m512 sum1 = _mm512_add_ps(in1, sw1);
736 
737  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
738  // 0x88 = _MM_SHUFFLE(2,0,2,0)
739  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
740 
741  // Compare and update maximums
742  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
743  maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
744  maxValues = _mm512_max_ps(mag_sq, maxValues);
745 
746  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
747  }
748 
749  // Reduce 16 values to find maximum
750  __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
751  __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
752  _mm512_store_ps(maxValuesBuffer, maxValues);
753  _mm512_store_ps(maxIndexesBuffer, maxIndices);
754 
755  float max = 0.0f;
756  uint32_t index = 0;
757  for (uint32_t i = 0; i < 16; i++) {
758  if (maxValuesBuffer[i] > max) {
759  max = maxValuesBuffer[i];
760  index = (uint32_t)maxIndexesBuffer[i];
761  } else if (maxValuesBuffer[i] == max) {
762  if ((uint32_t)maxIndexesBuffer[i] < index)
763  index = (uint32_t)maxIndexesBuffer[i];
764  }
765  }
766 
767  // Handle tail
768  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
769  const float re = lv_creal(*src0Ptr);
770  const float im = lv_cimag(*src0Ptr);
771  const float sq_dist = re * re + im * im;
772  if (sq_dist > max) {
773  max = sq_dist;
774  index = number;
775  }
776  src0Ptr++;
777  }
778  *target = (uint16_t)index;
779 }
780 
781 #endif /*LV_HAVE_AVX512F*/
782 
783 #ifdef LV_HAVE_RVV
784 #include <float.h>
785 #include <riscv_vector.h>
786 
787 static inline void
788 volk_32fc_index_max_16u_rvv(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points)
789 {
790  vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
791  vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
792  vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
793  size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
794  for (size_t vl; n > 0; n -= vl, src0 += vl) {
795  vl = __riscv_vsetvl_e32m4(n);
796  vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
797  vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
798  vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
799  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
800  vbool8_t m = __riscv_vmflt(vmax, v, vl);
801  vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
802  vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
803  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
804  }
805  size_t vl = __riscv_vsetvlmax_e32m4();
806  float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
807  __riscv_vfmv_v_f_f32m1(0, 1),
808  __riscv_vsetvlmax_e32m1()));
809  // Find minimum index among lanes with max value
810  // Note: mask type mismatch (vbool8_t vs vbool4_t) prevents using vmerge,
811  // so we use scalar comparison
812  __attribute__((aligned(32))) float values[128];
813  __attribute__((aligned(32))) uint16_t indices[128];
814  __riscv_vse32(values, vmax, vl);
815  __riscv_vse16(indices, vmaxi, vl);
816  uint16_t min_idx = UINT16_MAX;
817  for (size_t i = 0; i < vl; i++) {
818  if (values[i] == max && indices[i] < min_idx) {
819  min_idx = indices[i];
820  }
821  }
822  *target = min_idx;
823 }
824 #endif /*LV_HAVE_RVV*/
825 
826 #ifdef LV_HAVE_RVVSEG
827 #include <float.h>
828 #include <riscv_vector.h>
829 
830 static inline void volk_32fc_index_max_16u_rvvseg(uint16_t* target,
831  const lv_32fc_t* src0,
832  uint32_t num_points)
833 {
834  vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
835  vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
836  vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
837  size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
838  for (size_t vl; n > 0; n -= vl, src0 += vl) {
839  vl = __riscv_vsetvl_e32m4(n);
840  vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
841  vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
842  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
843  vbool8_t m = __riscv_vmflt(vmax, v, vl);
844  vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
845  vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
846  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
847  }
848  size_t vl = __riscv_vsetvlmax_e32m4();
849  float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
850  __riscv_vfmv_v_f_f32m1(0, 1),
851  __riscv_vsetvlmax_e32m1()));
852  // Find minimum index among lanes with max value
853  __attribute__((aligned(32))) float values[128];
854  __attribute__((aligned(32))) uint16_t indices[128];
855  __riscv_vse32(values, vmax, vl);
856  __riscv_vse16(indices, vmaxi, vl);
857  uint16_t min_idx = UINT16_MAX;
858  for (size_t i = 0; i < vl; i++) {
859  if (values[i] == max && indices[i] < min_idx) {
860  min_idx = indices[i];
861  }
862  }
863  *target = min_idx;
864 }
865 #endif /*LV_HAVE_RVVSEG*/
866 
867 #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
bit128_p
#define bit128_p(x)
Definition: volk_common.h:147
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
vector_32fc_index_max_variant1
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:230
volk_32fc_index_max_16u_neon
static void volk_32fc_index_max_16u_neon(uint16_t *target, const lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:319
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
volk_32fc_index_max_16u_generic
static void volk_32fc_index_max_16u_generic(uint16_t *target, const lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:456
vector_32fc_index_max_variant0
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:168
volk_32fc_index_max_16u_a_sse3
static void volk_32fc_index_max_16u_a_sse3(uint16_t *target, const lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_16u.h:196
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
bit128::int_vec
__m128i int_vec
Definition: volk_common.h:128
bit128::i
uint32_t i[4]
Definition: volk_common.h:119
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
bit128::f
float f[4]
Definition: volk_common.h:120
volk_neon_intrinsics.h
bit128
Definition: volk_common.h:116
volk_avx2_intrinsics.h
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96
bit128::float_vec
__m128 float_vec
Definition: volk_common.h:124