Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_index_min_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #ifndef INCLUDED_volk_32fc_index_min_32u_a_H
58 #define INCLUDED_volk_32fc_index_min_32u_a_H
59 
60 #include <inttypes.h>
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 #include <volk/volk_complex.h>
64 
65 #ifdef LV_HAVE_AVX2
66 #include <immintrin.h>
68 
69 static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
70  const lv_32fc_t* source,
71  uint32_t num_points)
72 {
73  const __m256i indices_increment = _mm256_set1_epi32(8);
74  /*
75  * At the start of each loop iteration current_indices holds the indices of
76  * the complex numbers loaded from memory. Explanation for odd order is given
77  * in implementation of vector_32fc_index_min_variant0().
78  */
79  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
80 
81  __m256 min_values = _mm256_set1_ps(FLT_MAX);
82  __m256i min_indices = _mm256_setzero_si256();
83 
84  for (unsigned i = 0; i < num_points / 8u; ++i) {
85  __m256 in0 = _mm256_load_ps((float*)source);
86  __m256 in1 = _mm256_load_ps((float*)(source + 4));
88  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
89  source += 8;
90  }
91 
92  // determine minimum value and index in the result of the vectorized loop
93  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
94  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
95  _mm256_store_ps(min_values_buffer, min_values);
96  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
97 
98  float min = FLT_MAX;
99  uint32_t index = 0;
100  for (unsigned i = 0; i < 8; i++) {
101  if (min_values_buffer[i] < min) {
102  min = min_values_buffer[i];
103  index = min_indices_buffer[i];
104  }
105  }
106 
107  // handle tail not processed by the vectorized loop
108  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
109  const float abs_squared =
110  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
111  if (abs_squared < min) {
112  min = abs_squared;
113  index = i;
114  }
115  ++source;
116  }
117 
118  *target = index;
119 }
120 
121 #endif /*LV_HAVE_AVX2*/
122 
123 #ifdef LV_HAVE_AVX2
124 #include <immintrin.h>
126 
127 static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
128  const lv_32fc_t* source,
129  uint32_t num_points)
130 {
131  const __m256i indices_increment = _mm256_set1_epi32(8);
132  /*
133  * At the start of each loop iteration current_indices holds the indices of
134  * the complex numbers loaded from memory. Explanation for odd order is given
135  * in implementation of vector_32fc_index_min_variant0().
136  */
137  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
138 
139  __m256 min_values = _mm256_set1_ps(FLT_MAX);
140  __m256i min_indices = _mm256_setzero_si256();
141 
142  for (unsigned i = 0; i < num_points / 8u; ++i) {
143  __m256 in0 = _mm256_load_ps((float*)source);
144  __m256 in1 = _mm256_load_ps((float*)(source + 4));
146  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
147  source += 8;
148  }
149 
150  // determine minimum value and index in the result of the vectorized loop
151  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
152  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
153  _mm256_store_ps(min_values_buffer, min_values);
154  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
155 
156  float min = FLT_MAX;
157  uint32_t index = 0;
158  for (unsigned i = 0; i < 8; i++) {
159  if (min_values_buffer[i] < min) {
160  min = min_values_buffer[i];
161  index = min_indices_buffer[i];
162  }
163  }
164 
165  // handle tail not processed by the vectorized loop
166  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
167  const float abs_squared =
168  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
169  if (abs_squared < min) {
170  min = abs_squared;
171  index = i;
172  }
173  ++source;
174  }
175 
176  *target = index;
177 }
178 
179 #endif /*LV_HAVE_AVX2*/
180 
181 #ifdef LV_HAVE_SSE3
182 #include <pmmintrin.h>
183 #include <xmmintrin.h>
184 
185 static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
186  const lv_32fc_t* source,
187  uint32_t num_points)
188 {
189  union bit128 holderf;
190  union bit128 holderi;
191  float sq_dist = 0.0;
192 
193  union bit128 xmm5, xmm4;
194  __m128 xmm1, xmm2, xmm3;
195  __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
196 
197  xmm5.int_vec = _mm_setzero_si128();
198  xmm4.int_vec = _mm_setzero_si128();
199  holderf.int_vec = _mm_setzero_si128();
200  holderi.int_vec = _mm_setzero_si128();
201 
202  xmm8 = _mm_setr_epi32(0, 1, 2, 3);
203  xmm9 = _mm_setzero_si128();
204  xmm10 = _mm_setr_epi32(4, 4, 4, 4);
205  xmm3 = _mm_set_ps1(FLT_MAX);
206 
207  int bound = num_points >> 2;
208 
209  for (int i = 0; i < bound; ++i) {
210  xmm1 = _mm_load_ps((float*)source);
211  xmm2 = _mm_load_ps((float*)&source[2]);
212 
213  source += 4;
214 
215  xmm1 = _mm_mul_ps(xmm1, xmm1);
216  xmm2 = _mm_mul_ps(xmm2, xmm2);
217 
218  xmm1 = _mm_hadd_ps(xmm1, xmm2);
219 
220  xmm3 = _mm_min_ps(xmm1, xmm3);
221 
222  xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
223  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
224 
225  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
226  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
227 
228  xmm9 = _mm_add_epi32(xmm11, xmm12);
229 
230  xmm8 = _mm_add_epi32(xmm8, xmm10);
231  }
232 
233  if (num_points >> 1 & 1) {
234  xmm2 = _mm_load_ps((float*)source);
235 
236  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
237  xmm8 = bit128_p(&xmm1)->int_vec;
238 
239  xmm2 = _mm_mul_ps(xmm2, xmm2);
240 
241  source += 2;
242 
243  xmm1 = _mm_hadd_ps(xmm2, xmm2);
244 
245  xmm3 = _mm_min_ps(xmm1, xmm3);
246 
247  xmm10 = _mm_setr_epi32(2, 2, 2, 2);
248 
249  xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
250  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
251 
252  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
253  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
254 
255  xmm9 = _mm_add_epi32(xmm11, xmm12);
256 
257  xmm8 = _mm_add_epi32(xmm8, xmm10);
258  }
259 
260  if (num_points & 1) {
261  sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
262  lv_cimag(source[0]) * lv_cimag(source[0]);
263 
264  xmm2 = _mm_load1_ps(&sq_dist);
265 
266  xmm1 = xmm3;
267 
268  xmm3 = _mm_min_ss(xmm3, xmm2);
269 
270  xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
271  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
272 
273  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
274 
275  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
276  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
277 
278  xmm9 = _mm_add_epi32(xmm11, xmm12);
279  }
280 
281  _mm_store_ps((float*)&(holderf.f), xmm3);
282  _mm_store_si128(&(holderi.int_vec), xmm9);
283 
284  target[0] = holderi.i[0];
285  sq_dist = holderf.f[0];
286  target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
287  sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
288  target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
289  sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
290  target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
291  sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
292 }
293 
294 #endif /*LV_HAVE_SSE3*/
295 
296 #ifdef LV_HAVE_GENERIC
297 static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
298  const lv_32fc_t* source,
299  uint32_t num_points)
300 {
301  float sq_dist = 0.0;
302  float min = FLT_MAX;
303  uint32_t index = 0;
304 
305  for (uint32_t i = 0; i < num_points; ++i) {
306  sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
307  lv_cimag(source[i]) * lv_cimag(source[i]);
308 
309  if (sq_dist < min) {
310  index = i;
311  min = sq_dist;
312  }
313  }
314  target[0] = index;
315 }
316 
317 #endif /*LV_HAVE_GENERIC*/
318 
319 #ifdef LV_HAVE_AVX512F
320 #include <float.h>
321 #include <immintrin.h>
322 
323 static inline void volk_32fc_index_min_32u_a_avx512f(uint32_t* target,
324  const lv_32fc_t* src0,
325  uint32_t num_points)
326 {
327  const lv_32fc_t* src0Ptr = src0;
328  const uint32_t sixteenthPoints = num_points / 16;
329 
330  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
331  __m512 currentIndexes =
332  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
333  const __m512 indexIncrement = _mm512_set1_ps(16);
334 
335  __m512 minValues = _mm512_set1_ps(FLT_MAX);
336  __m512 minIndices = _mm512_setzero_ps();
337 
338  for (uint32_t number = 0; number < sixteenthPoints; number++) {
339  // Load 16 complex values (32 floats)
340  __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
341  __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
342  src0Ptr += 16;
343 
344  // Square all values
345  in0 = _mm512_mul_ps(in0, in0);
346  in1 = _mm512_mul_ps(in1, in1);
347 
348  // Add adjacent pairs (re² + im²) using within-lane shuffle
349  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
350  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
351  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
352  __m512 sum0 = _mm512_add_ps(in0, sw0);
353  __m512 sum1 = _mm512_add_ps(in1, sw1);
354 
355  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
356  // 0x88 = _MM_SHUFFLE(2,0,2,0)
357  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
358 
359  // Compare and update minimums
360  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
361  minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
362  minValues = _mm512_min_ps(mag_sq, minValues);
363 
364  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
365  }
366 
367  // Reduce 16 values to find minimum
368  __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
369  __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
370  _mm512_store_ps(minValuesBuffer, minValues);
371  _mm512_store_ps(minIndexesBuffer, minIndices);
372 
373  float min = FLT_MAX;
374  uint32_t index = 0;
375  for (uint32_t i = 0; i < 16; i++) {
376  if (minValuesBuffer[i] < min) {
377  min = minValuesBuffer[i];
378  index = (uint32_t)minIndexesBuffer[i];
379  } else if (minValuesBuffer[i] == min) {
380  if ((uint32_t)minIndexesBuffer[i] < index)
381  index = (uint32_t)minIndexesBuffer[i];
382  }
383  }
384 
385  // Handle tail
386  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
387  const float re = lv_creal(*src0Ptr);
388  const float im = lv_cimag(*src0Ptr);
389  const float sq_dist = re * re + im * im;
390  if (sq_dist < min) {
391  min = sq_dist;
392  index = number;
393  }
394  src0Ptr++;
395  }
396  *target = index;
397 }
398 
399 #endif /*LV_HAVE_AVX512F*/
400 
401 #endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/
402 
403 #ifndef INCLUDED_volk_32fc_index_min_32u_u_H
404 #define INCLUDED_volk_32fc_index_min_32u_u_H
405 
406 #include <inttypes.h>
407 #include <stdio.h>
408 #include <volk/volk_common.h>
409 #include <volk/volk_complex.h>
410 
411 #ifdef LV_HAVE_AVX2
412 #include <immintrin.h>
414 
415 static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
416  const lv_32fc_t* source,
417  uint32_t num_points)
418 {
419  const __m256i indices_increment = _mm256_set1_epi32(8);
420  /*
421  * At the start of each loop iteration current_indices holds the indices of
422  * the complex numbers loaded from memory. Explanation for odd order is given
423  * in implementation of vector_32fc_index_min_variant0().
424  */
425  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
426 
427  __m256 min_values = _mm256_set1_ps(FLT_MAX);
428  __m256i min_indices = _mm256_setzero_si256();
429 
430  for (unsigned i = 0; i < num_points / 8u; ++i) {
431  __m256 in0 = _mm256_loadu_ps((float*)source);
432  __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
434  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
435  source += 8;
436  }
437 
438  // determine minimum value and index in the result of the vectorized loop
439  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
440  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
441  _mm256_store_ps(min_values_buffer, min_values);
442  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
443 
444  float min = FLT_MAX;
445  uint32_t index = 0;
446  for (unsigned i = 0; i < 8; i++) {
447  if (min_values_buffer[i] < min) {
448  min = min_values_buffer[i];
449  index = min_indices_buffer[i];
450  }
451  }
452 
453  // handle tail not processed by the vectorized loop
454  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
455  const float abs_squared =
456  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
457  if (abs_squared < min) {
458  min = abs_squared;
459  index = i;
460  }
461  ++source;
462  }
463 
464  *target = index;
465 }
466 
467 #endif /*LV_HAVE_AVX2*/
468 
469 #ifdef LV_HAVE_AVX2
470 #include <immintrin.h>
472 
473 static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
474  const lv_32fc_t* source,
475  uint32_t num_points)
476 {
477  const __m256i indices_increment = _mm256_set1_epi32(8);
478  /*
479  * At the start of each loop iteration current_indices holds the indices of
480  * the complex numbers loaded from memory. Explanation for odd order is given
481  * in implementation of vector_32fc_index_min_variant0().
482  */
483  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
484 
485  __m256 min_values = _mm256_set1_ps(FLT_MAX);
486  __m256i min_indices = _mm256_setzero_si256();
487 
488  for (unsigned i = 0; i < num_points / 8u; ++i) {
489  __m256 in0 = _mm256_loadu_ps((float*)source);
490  __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
492  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
493  source += 8;
494  }
495 
496  // determine minimum value and index in the result of the vectorized loop
497  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
498  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
499  _mm256_store_ps(min_values_buffer, min_values);
500  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
501 
502  float min = FLT_MAX;
503  uint32_t index = 0;
504  for (unsigned i = 0; i < 8; i++) {
505  if (min_values_buffer[i] < min) {
506  min = min_values_buffer[i];
507  index = min_indices_buffer[i];
508  }
509  }
510 
511  // handle tail not processed by the vectorized loop
512  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
513  const float abs_squared =
514  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
515  if (abs_squared < min) {
516  min = abs_squared;
517  index = i;
518  }
519  ++source;
520  }
521 
522  *target = index;
523 }
524 
525 #endif /*LV_HAVE_AVX2*/
526 
527 #ifdef LV_HAVE_NEON
528 #include <arm_neon.h>
530 
531 static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
532  const lv_32fc_t* source,
533  uint32_t num_points)
534 {
535  const uint32_t quarter_points = num_points / 4;
536  const lv_32fc_t* sourcePtr = source;
537 
538  uint32_t indices[4] = { 0, 1, 2, 3 };
539  const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
540  uint32x4_t vec_indices = vld1q_u32(indices);
541  uint32x4_t vec_min_indices = vec_indices;
542 
543  if (num_points) {
544  float min = FLT_MAX;
545  uint32_t index = 0;
546 
547  float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
548 
549  for (uint32_t number = 0; number < quarter_points; number++) {
550  // Load complex and compute magnitude squared
551  const float32x4_t vec_mag2 =
552  _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
553  __VOLK_PREFETCH(sourcePtr += 4);
554  // a < b?
555  const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
556  vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
557  vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
558  vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
559  }
560  uint32_t tmp_min_indices[4];
561  float tmp_min[4];
562  vst1q_u32(tmp_min_indices, vec_min_indices);
563  vst1q_f32(tmp_min, vec_min);
564 
565  for (int i = 0; i < 4; i++) {
566  if (tmp_min[i] < min) {
567  min = tmp_min[i];
568  index = tmp_min_indices[i];
569  } else if (tmp_min[i] == min) {
570  if (tmp_min_indices[i] < index)
571  index = tmp_min_indices[i];
572  }
573  }
574 
575  // Deal with the rest
576  for (uint32_t number = quarter_points * 4; number < num_points; number++) {
577  const float re = lv_creal(*sourcePtr);
578  const float im = lv_cimag(*sourcePtr);
579  const float sq_dist = re * re + im * im;
580  if (sq_dist < min) {
581  min = sq_dist;
582  index = number;
583  }
584  sourcePtr++;
585  }
586  *target = index;
587  }
588 }
589 
590 #endif /*LV_HAVE_NEON*/
591 
592 
593 #ifdef LV_HAVE_NEONV8
594 #include <arm_neon.h>
595 #include <float.h>
596 
597 static inline void volk_32fc_index_min_32u_neonv8(uint32_t* target,
598  const lv_32fc_t* source,
599  uint32_t num_points)
600 {
601  if (num_points == 0)
602  return;
603 
604  const uint32_t quarter_points = num_points / 4;
605  const lv_32fc_t* inputPtr = source;
606 
607  // Use integer indices directly (no float conversion overhead)
608  uint32x4_t vec_indices = { 0, 1, 2, 3 };
609  const uint32x4_t vec_incr = vdupq_n_u32(4);
610 
611  float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
612  uint32x4_t vec_min_idx = vdupq_n_u32(0);
613 
614  for (uint32_t i = 0; i < quarter_points; i++) {
615  // Load and deinterleave complex values
616  float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
617  inputPtr += 4;
618 
619  // Magnitude squared using FMA: re*re + im*im
620  float32x4_t mag2 =
621  vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);
622 
623  // Compare BEFORE min update to know which lanes change
624  uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
625  vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);
626 
627  // vminq_f32 is single-cycle, no dependency on comparison result
628  vec_min = vminq_f32(mag2, vec_min);
629 
630  vec_indices = vaddq_u32(vec_indices, vec_incr);
631  }
632 
633  // ARMv8 horizontal reduction - find min value across all lanes
634  float min_val = vminvq_f32(vec_min);
635 
636  // Find which lane(s) have the min value, get minimum index among them
637  uint32x4_t min_mask = vceqq_f32(vec_min, vdupq_n_f32(min_val));
638  uint32x4_t idx_masked = vbslq_u32(min_mask, vec_min_idx, vdupq_n_u32(UINT32_MAX));
639  uint32_t result_idx = vminvq_u32(idx_masked);
640 
641  // Handle tail elements
642  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
643  float re = lv_creal(source[i]);
644  float im = lv_cimag(source[i]);
645  float mag2 = re * re + im * im;
646  if (mag2 < min_val) {
647  min_val = mag2;
648  result_idx = i;
649  }
650  }
651 
652  *target = result_idx;
653 }
654 
655 #endif /*LV_HAVE_NEONV8*/
656 
657 
658 #ifdef LV_HAVE_AVX512F
659 #include <float.h>
660 #include <immintrin.h>
661 
662 static inline void volk_32fc_index_min_32u_u_avx512f(uint32_t* target,
663  const lv_32fc_t* src0,
664  uint32_t num_points)
665 {
666  const lv_32fc_t* src0Ptr = src0;
667  const uint32_t sixteenthPoints = num_points / 16;
668 
669  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
670  __m512 currentIndexes =
671  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
672  const __m512 indexIncrement = _mm512_set1_ps(16);
673 
674  __m512 minValues = _mm512_set1_ps(FLT_MAX);
675  __m512 minIndices = _mm512_setzero_ps();
676 
677  for (uint32_t number = 0; number < sixteenthPoints; number++) {
678  // Load 16 complex values (32 floats)
679  __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
680  __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
681  src0Ptr += 16;
682 
683  // Square all values
684  in0 = _mm512_mul_ps(in0, in0);
685  in1 = _mm512_mul_ps(in1, in1);
686 
687  // Add adjacent pairs (re² + im²) using within-lane shuffle
688  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
689  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
690  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
691  __m512 sum0 = _mm512_add_ps(in0, sw0);
692  __m512 sum1 = _mm512_add_ps(in1, sw1);
693 
694  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
695  // 0x88 = _MM_SHUFFLE(2,0,2,0)
696  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
697 
698  // Compare and update minimums
699  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
700  minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
701  minValues = _mm512_min_ps(mag_sq, minValues);
702 
703  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
704  }
705 
706  // Reduce 16 values to find minimum
707  __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
708  __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
709  _mm512_store_ps(minValuesBuffer, minValues);
710  _mm512_store_ps(minIndexesBuffer, minIndices);
711 
712  float min = FLT_MAX;
713  uint32_t index = 0;
714  for (uint32_t i = 0; i < 16; i++) {
715  if (minValuesBuffer[i] < min) {
716  min = minValuesBuffer[i];
717  index = (uint32_t)minIndexesBuffer[i];
718  } else if (minValuesBuffer[i] == min) {
719  if ((uint32_t)minIndexesBuffer[i] < index)
720  index = (uint32_t)minIndexesBuffer[i];
721  }
722  }
723 
724  // Handle tail
725  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
726  const float re = lv_creal(*src0Ptr);
727  const float im = lv_cimag(*src0Ptr);
728  const float sq_dist = re * re + im * im;
729  if (sq_dist < min) {
730  min = sq_dist;
731  index = number;
732  }
733  src0Ptr++;
734  }
735  *target = index;
736 }
737 
738 #endif /*LV_HAVE_AVX512F*/
739 
740 #ifdef LV_HAVE_RVV
741 #include <float.h>
742 #include <riscv_vector.h>
743 
744 static inline void volk_32fc_index_min_32u_rvv(uint32_t* target,
745  const lv_32fc_t* source,
746  uint32_t num_points)
747 {
748  vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
749  vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
750  vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
751  size_t n = num_points;
752  for (size_t vl; n > 0; n -= vl, source += vl) {
753  vl = __riscv_vsetvl_e32m4(n);
754  vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
755  vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
756  vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
757  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
758  vbool8_t m = __riscv_vmfgt(vmin, v, vl);
759  vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
760  vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
761  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
762  }
763  size_t vl = __riscv_vsetvlmax_e32m4();
764  float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
765  __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
766  __riscv_vsetvlmax_e32m1()));
767  // Find lanes with min value, set others to UINT32_MAX
768  vbool8_t m = __riscv_vmfeq(vmin, min, vl);
769  vuint32m4_t idx_masked =
770  __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmini, m, vl);
771  // Find minimum index among lanes with min value
772  *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
773  __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
774  __riscv_vsetvlmax_e32m1()));
775 }
776 #endif /*LV_HAVE_RVV*/
777 
778 #ifdef LV_HAVE_RVVSEG
779 #include <float.h>
780 #include <riscv_vector.h>
781 
782 static inline void volk_32fc_index_min_32u_rvvseg(uint32_t* target,
783  const lv_32fc_t* source,
784  uint32_t num_points)
785 {
786  vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
787  vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
788  vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
789  size_t n = num_points;
790  for (size_t vl; n > 0; n -= vl, source += vl) {
791  vl = __riscv_vsetvl_e32m4(n);
792  vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
793  vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
794  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
795  vbool8_t m = __riscv_vmfgt(vmin, v, vl);
796  vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
797  vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
798  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
799  }
800  size_t vl = __riscv_vsetvlmax_e32m4();
801  float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
802  __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
803  __riscv_vsetvlmax_e32m1()));
804  // Find lanes with min value, set others to UINT32_MAX
805  vbool8_t m = __riscv_vmfeq(vmin, min, vl);
806  vuint32m4_t idx_masked =
807  __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmini, m, vl);
808  // Find minimum index among lanes with min value
809  *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
810  __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
811  __riscv_vsetvlmax_e32m1()));
812 }
813 #endif /*LV_HAVE_RVVSEG*/
814 
815 #endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
bit128_p
#define bit128_p(x)
Definition: volk_common.h:147
_vmagnitudesquaredq_f32
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:74
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
volk_32fc_index_min_32u_generic
static void volk_32fc_index_min_32u_generic(uint32_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_32u.h:297
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32fc_index_min_32u_a_sse3
static void volk_32fc_index_min_32u_a_sse3(uint32_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_32u.h:185
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
bit128::int_vec
__m128i int_vec
Definition: volk_common.h:128
vector_32fc_index_min_variant0
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:280
vector_32fc_index_min_variant1
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:342
bit128::i
uint32_t i[4]
Definition: volk_common.h:119
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
bit128::f
float f[4]
Definition: volk_common.h:120
volk_neon_intrinsics.h
volk_32fc_index_min_32u_neon
static void volk_32fc_index_min_32u_neon(uint32_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_32u.h:531
bit128
Definition: volk_common.h:116
volk_avx2_intrinsics.h
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96
bit128::float_vec
__m128 float_vec
Definition: volk_common.h:124