Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_index_min_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
63 #ifndef INCLUDED_volk_32fc_index_min_16u_a_H
64 #define INCLUDED_volk_32fc_index_min_16u_a_H
65 
66 #include <inttypes.h>
67 #include <limits.h>
68 #include <stdio.h>
69 #include <volk/volk_common.h>
70 #include <volk/volk_complex.h>
71 
72 #ifdef LV_HAVE_AVX2
73 #include <immintrin.h>
75 
76 static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
77  const lv_32fc_t* source,
78  uint32_t num_points)
79 {
80  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
81 
82  const __m256i indices_increment = _mm256_set1_epi32(8);
83  /*
84  * At the start of each loop iteration current_indices holds the indices of
85  * the complex numbers loaded from memory. Explanation for odd order is given
86  * in implementation of vector_32fc_index_min_variant0().
87  */
88  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
89 
90  __m256 min_values = _mm256_set1_ps(FLT_MAX);
91  __m256i min_indices = _mm256_setzero_si256();
92 
93  for (unsigned i = 0; i < num_points / 8u; ++i) {
94  __m256 in0 = _mm256_load_ps((float*)source);
95  __m256 in1 = _mm256_load_ps((float*)(source + 4));
97  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
98  source += 8;
99  }
100 
101  // determine minimum value and index in the result of the vectorized loop
102  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
103  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
104  _mm256_store_ps(min_values_buffer, min_values);
105  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
106 
107  float min = FLT_MAX;
108  uint32_t index = 0;
109  for (unsigned i = 0; i < 8; i++) {
110  if (min_values_buffer[i] < min) {
111  min = min_values_buffer[i];
112  index = min_indices_buffer[i];
113  }
114  }
115 
116  // handle tail not processed by the vectorized loop
117  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
118  const float abs_squared =
119  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
120  if (abs_squared < min) {
121  min = abs_squared;
122  index = i;
123  }
124  ++source;
125  }
126 
127  *target = index;
128 }
129 
130 #endif /*LV_HAVE_AVX2*/
131 
132 #ifdef LV_HAVE_AVX2
133 #include <immintrin.h>
135 
136 static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
137  const lv_32fc_t* source,
138  uint32_t num_points)
139 {
140  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
141 
142  const __m256i indices_increment = _mm256_set1_epi32(8);
143  /*
144  * At the start of each loop iteration current_indices holds the indices of
145  * the complex numbers loaded from memory. Explanation for odd order is given
146  * in implementation of vector_32fc_index_min_variant0().
147  */
148  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
149 
150  __m256 min_values = _mm256_set1_ps(FLT_MAX);
151  __m256i min_indices = _mm256_setzero_si256();
152 
153  for (unsigned i = 0; i < num_points / 8u; ++i) {
154  __m256 in0 = _mm256_load_ps((float*)source);
155  __m256 in1 = _mm256_load_ps((float*)(source + 4));
157  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
158  source += 8;
159  }
160 
161  // determine minimum value and index in the result of the vectorized loop
162  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
163  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
164  _mm256_store_ps(min_values_buffer, min_values);
165  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
166 
167  float min = FLT_MAX;
168  uint32_t index = 0;
169  for (unsigned i = 0; i < 8; i++) {
170  if (min_values_buffer[i] < min) {
171  min = min_values_buffer[i];
172  index = min_indices_buffer[i];
173  }
174  }
175 
176  // handle tail not processed by the vectorized loop
177  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
178  const float abs_squared =
179  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
180  if (abs_squared < min) {
181  min = abs_squared;
182  index = i;
183  }
184  ++source;
185  }
186 
187  *target = index;
188 }
189 
190 #endif /*LV_HAVE_AVX2*/
191 
192 #ifdef LV_HAVE_SSE3
193 #include <pmmintrin.h>
194 #include <xmmintrin.h>
195 
196 static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
197  const lv_32fc_t* source,
198  uint32_t num_points)
199 {
200  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
201 
202  union bit128 holderf;
203  union bit128 holderi;
204  float sq_dist = 0.0;
205 
206  union bit128 xmm5, xmm4;
207  __m128 xmm1, xmm2, xmm3;
208  __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
209 
210  xmm5.int_vec = _mm_setzero_si128();
211  xmm4.int_vec = _mm_setzero_si128();
212  holderf.int_vec = _mm_setzero_si128();
213  holderi.int_vec = _mm_setzero_si128();
214 
215  xmm8 = _mm_setr_epi32(0, 1, 2, 3);
216  xmm9 = _mm_setzero_si128();
217  xmm10 = _mm_setr_epi32(4, 4, 4, 4);
218  xmm3 = _mm_set_ps1(FLT_MAX);
219 
220  int bound = num_points >> 2;
221 
222  for (int i = 0; i < bound; ++i) {
223  xmm1 = _mm_load_ps((float*)source);
224  xmm2 = _mm_load_ps((float*)&source[2]);
225 
226  source += 4;
227 
228  xmm1 = _mm_mul_ps(xmm1, xmm1);
229  xmm2 = _mm_mul_ps(xmm2, xmm2);
230 
231  xmm1 = _mm_hadd_ps(xmm1, xmm2);
232 
233  xmm3 = _mm_min_ps(xmm1, xmm3);
234 
235  xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
236  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
237 
238  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
239  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
240 
241  xmm9 = _mm_add_epi32(xmm11, xmm12);
242 
243  xmm8 = _mm_add_epi32(xmm8, xmm10);
244  }
245 
246  if (num_points >> 1 & 1) {
247  xmm2 = _mm_load_ps((float*)source);
248 
249  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
250  xmm8 = bit128_p(&xmm1)->int_vec;
251 
252  xmm2 = _mm_mul_ps(xmm2, xmm2);
253 
254  source += 2;
255 
256  xmm1 = _mm_hadd_ps(xmm2, xmm2);
257 
258  xmm3 = _mm_min_ps(xmm1, xmm3);
259 
260  xmm10 = _mm_setr_epi32(2, 2, 2, 2);
261 
262  xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
263  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
264 
265  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
266  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
267 
268  xmm9 = _mm_add_epi32(xmm11, xmm12);
269 
270  xmm8 = _mm_add_epi32(xmm8, xmm10);
271  }
272 
273  if (num_points & 1) {
274  sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
275  lv_cimag(source[0]) * lv_cimag(source[0]);
276 
277  xmm2 = _mm_load1_ps(&sq_dist);
278 
279  xmm1 = xmm3;
280 
281  xmm3 = _mm_min_ss(xmm3, xmm2);
282 
283  xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
284  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
285 
286  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
287 
288  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
289  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
290 
291  xmm9 = _mm_add_epi32(xmm11, xmm12);
292  }
293 
294  _mm_store_ps((float*)&(holderf.f), xmm3);
295  _mm_store_si128(&(holderi.int_vec), xmm9);
296 
297  target[0] = holderi.i[0];
298  sq_dist = holderf.f[0];
299  target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
300  sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
301  target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
302  sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
303  target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
304  sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
305 }
306 
307 #endif /*LV_HAVE_SSE3*/
308 
309 
310 #ifdef LV_HAVE_NEON
311 #include <arm_neon.h>
312 #include <float.h>
313 #include <limits.h>
315 
316 static inline void volk_32fc_index_min_16u_neon(uint16_t* target,
317  const lv_32fc_t* source,
318  uint32_t num_points)
319 {
320  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
321 
322  if (num_points == 0)
323  return;
324 
325  const uint32_t quarter_points = num_points / 4;
326  const lv_32fc_t* inputPtr = source;
327 
328  // Use integer indices directly
329  uint32x4_t vec_indices = { 0, 1, 2, 3 };
330  const uint32x4_t vec_incr = vdupq_n_u32(4);
331 
332  float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
333  uint32x4_t vec_min_idx = vdupq_n_u32(0);
334 
335  for (uint32_t i = 0; i < quarter_points; i++) {
336  // Load and deinterleave complex values
337  float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
338  inputPtr += 4;
339 
340  // Magnitude squared: re*re + im*im
341  float32x4_t mag2 =
342  vmlaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);
343 
344  // Compare BEFORE min update to know which lanes change
345  uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
346  vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);
347 
348  // vminq_f32 is single-cycle, no dependency on comparison result
349  vec_min = vminq_f32(mag2, vec_min);
350 
351  vec_indices = vaddq_u32(vec_indices, vec_incr);
352  }
353 
354  // Scalar reduction
355  __VOLK_ATTR_ALIGNED(16) float min_buf[4];
356  __VOLK_ATTR_ALIGNED(16) uint32_t idx_buf[4];
357  vst1q_f32(min_buf, vec_min);
358  vst1q_u32(idx_buf, vec_min_idx);
359 
360  float min_val = min_buf[0];
361  uint32_t result_idx = idx_buf[0];
362  for (int i = 1; i < 4; i++) {
363  if (min_buf[i] < min_val) {
364  min_val = min_buf[i];
365  result_idx = idx_buf[i];
366  } else if (min_buf[i] == min_val && idx_buf[i] < result_idx) {
367  result_idx = idx_buf[i];
368  }
369  }
370 
371  // Handle tail
372  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
373  float re = lv_creal(source[i]);
374  float im = lv_cimag(source[i]);
375  float mag2 = re * re + im * im;
376  if (mag2 < min_val) {
377  min_val = mag2;
378  result_idx = i;
379  }
380  }
381 
382  *target = (uint16_t)result_idx;
383 }
384 
385 #endif /*LV_HAVE_NEON*/
386 
387 
388 #ifdef LV_HAVE_NEONV8
389 #include <arm_neon.h>
390 #include <float.h>
391 #include <limits.h>
392 
393 static inline void volk_32fc_index_min_16u_neonv8(uint16_t* target,
394  const lv_32fc_t* source,
395  uint32_t num_points)
396 {
397  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
398 
399  if (num_points == 0)
400  return;
401 
402  const uint32_t quarter_points = num_points / 4;
403  const lv_32fc_t* inputPtr = source;
404 
405  // Use integer indices directly (no float conversion overhead)
406  uint32x4_t vec_indices = { 0, 1, 2, 3 };
407  const uint32x4_t vec_incr = vdupq_n_u32(4);
408 
409  float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
410  uint32x4_t vec_min_idx = vdupq_n_u32(0);
411 
412  for (uint32_t i = 0; i < quarter_points; i++) {
413  // Load and deinterleave complex values
414  float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
415  inputPtr += 4;
416 
417  // Magnitude squared using FMA: re*re + im*im
418  float32x4_t mag2 =
419  vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);
420 
421  // Compare BEFORE min update to know which lanes change
422  uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
423  vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);
424 
425  // vminq_f32 is single-cycle, no dependency on comparison result
426  vec_min = vminq_f32(mag2, vec_min);
427 
428  vec_indices = vaddq_u32(vec_indices, vec_incr);
429  }
430 
431  // ARMv8 horizontal reduction
432  float min_val = vminvq_f32(vec_min);
433  uint32x4_t min_mask = vceqq_f32(vec_min, vdupq_n_f32(min_val));
434  uint32x4_t idx_masked = vbslq_u32(min_mask, vec_min_idx, vdupq_n_u32(UINT32_MAX));
435  uint32_t result_idx = vminvq_u32(idx_masked);
436 
437  // Handle tail
438  for (uint32_t i = quarter_points * 4; i < num_points; i++) {
439  float re = lv_creal(source[i]);
440  float im = lv_cimag(source[i]);
441  float mag2 = re * re + im * im;
442  if (mag2 < min_val) {
443  min_val = mag2;
444  result_idx = i;
445  }
446  }
447 
448  *target = (uint16_t)result_idx;
449 }
450 
451 #endif /*LV_HAVE_NEONV8*/
452 
453 
454 #ifdef LV_HAVE_GENERIC
455 static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
456  const lv_32fc_t* source,
457  uint32_t num_points)
458 {
459  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
460 
461  float sq_dist = 0.0;
462  float min = FLT_MAX;
463  uint16_t index = 0;
464 
465  for (uint32_t i = 0; i < num_points; ++i) {
466  sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
467  lv_cimag(source[i]) * lv_cimag(source[i]);
468 
469  if (sq_dist < min) {
470  index = i;
471  min = sq_dist;
472  }
473  }
474  target[0] = index;
475 }
476 
477 #endif /*LV_HAVE_GENERIC*/
478 
479 #ifdef LV_HAVE_AVX512F
480 #include <float.h>
481 #include <immintrin.h>
482 #include <limits.h>
483 
484 static inline void volk_32fc_index_min_16u_a_avx512f(uint16_t* target,
485  const lv_32fc_t* src0,
486  uint32_t num_points)
487 {
488  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
489 
490  const lv_32fc_t* src0Ptr = src0;
491  const uint32_t sixteenthPoints = num_points / 16;
492 
493  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
494  __m512 currentIndexes =
495  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
496  const __m512 indexIncrement = _mm512_set1_ps(16);
497 
498  __m512 minValues = _mm512_set1_ps(FLT_MAX);
499  __m512 minIndices = _mm512_setzero_ps();
500 
501  for (uint32_t number = 0; number < sixteenthPoints; number++) {
502  // Load 16 complex values (32 floats)
503  __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
504  __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
505  src0Ptr += 16;
506 
507  // Square all values
508  in0 = _mm512_mul_ps(in0, in0);
509  in1 = _mm512_mul_ps(in1, in1);
510 
511  // Add adjacent pairs (re² + im²) using within-lane shuffle
512  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
513  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
514  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
515  __m512 sum0 = _mm512_add_ps(in0, sw0);
516  __m512 sum1 = _mm512_add_ps(in1, sw1);
517 
518  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
519  // 0x88 = _MM_SHUFFLE(2,0,2,0)
520  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
521 
522  // Compare and update minimums
523  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
524  minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
525  minValues = _mm512_min_ps(mag_sq, minValues);
526 
527  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
528  }
529 
530  // Reduce 16 values to find minimum
531  __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
532  __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
533  _mm512_store_ps(minValuesBuffer, minValues);
534  _mm512_store_ps(minIndexesBuffer, minIndices);
535 
536  float min = FLT_MAX;
537  uint32_t index = 0;
538  for (uint32_t i = 0; i < 16; i++) {
539  if (minValuesBuffer[i] < min) {
540  min = minValuesBuffer[i];
541  index = (uint32_t)minIndexesBuffer[i];
542  } else if (minValuesBuffer[i] == min) {
543  if ((uint32_t)minIndexesBuffer[i] < index)
544  index = (uint32_t)minIndexesBuffer[i];
545  }
546  }
547 
548  // Handle tail
549  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
550  const float re = lv_creal(*src0Ptr);
551  const float im = lv_cimag(*src0Ptr);
552  const float sq_dist = re * re + im * im;
553  if (sq_dist < min) {
554  min = sq_dist;
555  index = number;
556  }
557  src0Ptr++;
558  }
559  *target = (uint16_t)index;
560 }
561 
562 #endif /*LV_HAVE_AVX512F*/
563 
564 #endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
565 
566 #ifndef INCLUDED_volk_32fc_index_min_16u_u_H
567 #define INCLUDED_volk_32fc_index_min_16u_u_H
568 
569 #include <inttypes.h>
570 #include <limits.h>
571 #include <stdio.h>
572 #include <volk/volk_common.h>
573 #include <volk/volk_complex.h>
574 
575 #ifdef LV_HAVE_AVX2
576 #include <immintrin.h>
578 
579 static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
580  const lv_32fc_t* source,
581  uint32_t num_points)
582 {
583  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
584 
585  const __m256i indices_increment = _mm256_set1_epi32(8);
586  /*
587  * At the start of each loop iteration current_indices holds the indices of
588  * the complex numbers loaded from memory. Explanation for odd order is given
589  * in implementation of vector_32fc_index_min_variant0().
590  */
591  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
592 
593  __m256 min_values = _mm256_set1_ps(FLT_MAX);
594  __m256i min_indices = _mm256_setzero_si256();
595 
596  for (unsigned i = 0; i < num_points / 8u; ++i) {
597  __m256 in0 = _mm256_loadu_ps((float*)source);
598  __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
600  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
601  source += 8;
602  }
603 
604  // determine minimum value and index in the result of the vectorized loop
605  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
606  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
607  _mm256_store_ps(min_values_buffer, min_values);
608  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
609 
610  float min = FLT_MAX;
611  uint32_t index = 0;
612  for (unsigned i = 0; i < 8; i++) {
613  if (min_values_buffer[i] < min) {
614  min = min_values_buffer[i];
615  index = min_indices_buffer[i];
616  }
617  }
618 
619  // handle tail not processed by the vectorized loop
620  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
621  const float abs_squared =
622  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
623  if (abs_squared < min) {
624  min = abs_squared;
625  index = i;
626  }
627  ++source;
628  }
629 
630  *target = index;
631 }
632 
633 #endif /*LV_HAVE_AVX2*/
634 
635 #ifdef LV_HAVE_AVX2
636 #include <immintrin.h>
638 
639 static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
640  const lv_32fc_t* source,
641  uint32_t num_points)
642 {
643  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
644 
645  const __m256i indices_increment = _mm256_set1_epi32(8);
646  /*
647  * At the start of each loop iteration current_indices holds the indices of
648  * the complex numbers loaded from memory. Explanation for odd order is given
649  * in implementation of vector_32fc_index_min_variant0().
650  */
651  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
652 
653  __m256 min_values = _mm256_set1_ps(FLT_MAX);
654  __m256i min_indices = _mm256_setzero_si256();
655 
656  for (unsigned i = 0; i < num_points / 8u; ++i) {
657  __m256 in0 = _mm256_loadu_ps((float*)source);
658  __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
660  in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
661  source += 8;
662  }
663 
664  // determine minimum value and index in the result of the vectorized loop
665  __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
666  __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
667  _mm256_store_ps(min_values_buffer, min_values);
668  _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
669 
670  float min = FLT_MAX;
671  uint32_t index = 0;
672  for (unsigned i = 0; i < 8; i++) {
673  if (min_values_buffer[i] < min) {
674  min = min_values_buffer[i];
675  index = min_indices_buffer[i];
676  }
677  }
678 
679  // handle tail not processed by the vectorized loop
680  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
681  const float abs_squared =
682  lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
683  if (abs_squared < min) {
684  min = abs_squared;
685  index = i;
686  }
687  ++source;
688  }
689 
690  *target = index;
691 }
692 
693 #endif /*LV_HAVE_AVX2*/
694 
695 #ifdef LV_HAVE_AVX512F
696 #include <float.h>
697 #include <immintrin.h>
698 #include <limits.h>
699 
700 static inline void volk_32fc_index_min_16u_u_avx512f(uint16_t* target,
701  const lv_32fc_t* src0,
702  uint32_t num_points)
703 {
704  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
705 
706  const lv_32fc_t* src0Ptr = src0;
707  const uint32_t sixteenthPoints = num_points / 16;
708 
709  // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
710  __m512 currentIndexes =
711  _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
712  const __m512 indexIncrement = _mm512_set1_ps(16);
713 
714  __m512 minValues = _mm512_set1_ps(FLT_MAX);
715  __m512 minIndices = _mm512_setzero_ps();
716 
717  for (uint32_t number = 0; number < sixteenthPoints; number++) {
718  // Load 16 complex values (32 floats)
719  __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
720  __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
721  src0Ptr += 16;
722 
723  // Square all values
724  in0 = _mm512_mul_ps(in0, in0);
725  in1 = _mm512_mul_ps(in1, in1);
726 
727  // Add adjacent pairs (re² + im²) using within-lane shuffle
728  // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
729  __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
730  __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
731  __m512 sum0 = _mm512_add_ps(in0, sw0);
732  __m512 sum1 = _mm512_add_ps(in1, sw1);
733 
734  // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
735  // 0x88 = _MM_SHUFFLE(2,0,2,0)
736  __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
737 
738  // Compare and update minimums
739  __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
740  minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
741  minValues = _mm512_min_ps(mag_sq, minValues);
742 
743  currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
744  }
745 
746  // Reduce 16 values to find minimum
747  __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
748  __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
749  _mm512_store_ps(minValuesBuffer, minValues);
750  _mm512_store_ps(minIndexesBuffer, minIndices);
751 
752  float min = FLT_MAX;
753  uint32_t index = 0;
754  for (uint32_t i = 0; i < 16; i++) {
755  if (minValuesBuffer[i] < min) {
756  min = minValuesBuffer[i];
757  index = (uint32_t)minIndexesBuffer[i];
758  } else if (minValuesBuffer[i] == min) {
759  if ((uint32_t)minIndexesBuffer[i] < index)
760  index = (uint32_t)minIndexesBuffer[i];
761  }
762  }
763 
764  // Handle tail
765  for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
766  const float re = lv_creal(*src0Ptr);
767  const float im = lv_cimag(*src0Ptr);
768  const float sq_dist = re * re + im * im;
769  if (sq_dist < min) {
770  min = sq_dist;
771  index = number;
772  }
773  src0Ptr++;
774  }
775  *target = (uint16_t)index;
776 }
777 
778 #endif /*LV_HAVE_AVX512F*/
779 
780 #ifdef LV_HAVE_RVV
781 #include <float.h>
782 #include <riscv_vector.h>
783 
784 static inline void volk_32fc_index_min_16u_rvv(uint16_t* target,
785  const lv_32fc_t* source,
786  uint32_t num_points)
787 {
788  vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
789  vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
790  vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
791  size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
792  for (size_t vl; n > 0; n -= vl, source += vl) {
793  vl = __riscv_vsetvl_e32m4(n);
794  vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
795  vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
796  vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
797  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
798  vbool8_t m = __riscv_vmfgt(vmin, v, vl);
799  vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
800  vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
801  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
802  }
803  size_t vl = __riscv_vsetvlmax_e32m4();
804  float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
805  __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
806  __riscv_vsetvlmax_e32m1()));
807  // Find minimum index among lanes with min value
808  __attribute__((aligned(32))) float values[128];
809  __attribute__((aligned(32))) uint16_t indices[128];
810  __riscv_vse32(values, vmin, vl);
811  __riscv_vse16(indices, vmini, vl);
812  uint16_t min_idx = UINT16_MAX;
813  for (size_t i = 0; i < vl; i++) {
814  if (values[i] == min && indices[i] < min_idx) {
815  min_idx = indices[i];
816  }
817  }
818  *target = min_idx;
819 }
820 #endif /*LV_HAVE_RVV*/
821 
822 #ifdef LV_HAVE_RVVSEG
823 #include <float.h>
824 #include <riscv_vector.h>
825 
826 static inline void volk_32fc_index_min_16u_rvvseg(uint16_t* target,
827  const lv_32fc_t* source,
828  uint32_t num_points)
829 {
830  vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
831  vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
832  vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
833  size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
834  for (size_t vl; n > 0; n -= vl, source += vl) {
835  vl = __riscv_vsetvl_e32m4(n);
836  vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
837  vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
838  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
839  vbool8_t m = __riscv_vmfgt(vmin, v, vl);
840  vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
841  vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
842  vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
843  }
844  size_t vl = __riscv_vsetvlmax_e32m4();
845  float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
846  __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
847  __riscv_vsetvlmax_e32m1()));
848  // Find minimum index among lanes with min value
849  __attribute__((aligned(32))) float values[128];
850  __attribute__((aligned(32))) uint16_t indices[128];
851  __riscv_vse32(values, vmin, vl);
852  __riscv_vse16(indices, vmini, vl);
853  uint16_t min_idx = UINT16_MAX;
854  for (size_t i = 0; i < vl; i++) {
855  if (values[i] == min && indices[i] < min_idx) {
856  min_idx = indices[i];
857  }
858  }
859  *target = min_idx;
860 }
861 #endif /*LV_HAVE_RVVSEG*/
862 
863 #endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
bit128_p
#define bit128_p(x)
Definition: volk_common.h:147
RISCV_SHRINK4
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24
volk_32fc_index_min_16u_generic
static void volk_32fc_index_min_16u_generic(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:455
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
bit128::int_vec
__m128i int_vec
Definition: volk_common.h:128
vector_32fc_index_min_variant0
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:280
vector_32fc_index_min_variant1
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:342
volk_32fc_index_min_16u_a_sse3
static void volk_32fc_index_min_16u_a_sse3(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:196
bit128::i
uint32_t i[4]
Definition: volk_common.h:119
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
volk_32fc_index_min_16u_neon
static void volk_32fc_index_min_16u_neon(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:316
bit128::f
float f[4]
Definition: volk_common.h:120
volk_neon_intrinsics.h
bit128
Definition: volk_common.h:116
volk_avx2_intrinsics.h
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96
bit128::float_vec
__m128 float_vec
Definition: volk_common.h:124