Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  * Copyright 2025-2026 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
80 #ifndef INCLUDED_volk_32f_log2_32f_a_H
81 #define INCLUDED_volk_32f_log2_32f_a_H
82 
83 #include <inttypes.h>
84 #include <math.h>
85 #include <stdio.h>
86 #include <stdlib.h>
87 
88 #ifdef LV_HAVE_GENERIC
89 
90 static inline void
91 volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
92 {
93  float* bPtr = bVector;
94  const float* aPtr = aVector;
95  unsigned int number = 0;
96 
97  for (number = 0; number < num_points; number++) {
98  *bPtr++ = log2f_non_ieee(*aPtr++);
99  }
100 }
101 #endif /* LV_HAVE_GENERIC */
102 
103 #ifdef LV_HAVE_SSE4_1
104 #include <smmintrin.h>
106 
107 static inline void
108 volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
109 {
110  float* bPtr = bVector;
111  const float* aPtr = aVector;
112 
113  unsigned int number = 0;
114  const unsigned int quarterPoints = num_points / 4;
115 
116  const __m128i exp_mask = _mm_set1_epi32(0x7f800000);
117  const __m128i mant_mask = _mm_set1_epi32(0x007fffff);
118  const __m128i one_bits = _mm_set1_epi32(0x3f800000);
119  const __m128i exp_bias = _mm_set1_epi32(127);
120  const __m128 one = _mm_set1_ps(1.0f);
121 
122  for (; number < quarterPoints; number++) {
123  __m128 aVal = _mm_loadu_ps(aPtr);
124 
125  // Check for special values
126  __m128 zero_mask = _mm_cmpeq_ps(aVal, _mm_setzero_ps());
127  __m128 neg_mask = _mm_cmplt_ps(aVal, _mm_setzero_ps());
128  __m128 inf_mask = _mm_cmpeq_ps(aVal, _mm_set1_ps(INFINITY));
129  __m128 nan_mask = _mm_cmpunord_ps(aVal, aVal);
130  __m128 invalid_mask = _mm_or_ps(neg_mask, nan_mask);
131 
132  __m128i aVal_i = _mm_castps_si128(aVal);
133 
134  // Extract exponent: (aVal_i & exp_mask) >> 23 - bias
135  __m128i exp_i = _mm_srli_epi32(_mm_and_si128(aVal_i, exp_mask), 23);
136  exp_i = _mm_sub_epi32(exp_i, exp_bias);
137  __m128 exp_f = _mm_cvtepi32_ps(exp_i);
138 
139  // Extract mantissa as float in [1, 2)
140  __m128 frac =
141  _mm_castsi128_ps(_mm_or_si128(_mm_and_si128(aVal_i, mant_mask), one_bits));
142 
143  // Evaluate degree-6 polynomial
144  __m128 poly = _mm_log2_poly_sse(frac);
145 
146  // result = exp + poly * (frac - 1)
147  __m128 bVal = _mm_add_ps(exp_f, _mm_mul_ps(poly, _mm_sub_ps(frac, one)));
148 
149  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
150  bVal = _mm_blendv_ps(bVal, _mm_set1_ps(-127.0f), zero_mask);
151  bVal = _mm_blendv_ps(bVal, _mm_set1_ps(127.0f), inf_mask);
152  bVal = _mm_blendv_ps(bVal, _mm_set1_ps(NAN), invalid_mask);
153 
154  _mm_storeu_ps(bPtr, bVal);
155 
156  aPtr += 4;
157  bPtr += 4;
158  }
159 
160  number = quarterPoints * 4;
161  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
162 }
163 
164 #endif /* LV_HAVE_SSE4_1 for unaligned */
165 
166 #ifdef LV_HAVE_SSE4_1
167 
168 static inline void
169 volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
170 {
171  float* bPtr = bVector;
172  const float* aPtr = aVector;
173 
174  unsigned int number = 0;
175  const unsigned int quarterPoints = num_points / 4;
176 
177  const __m128i exp_mask = _mm_set1_epi32(0x7f800000);
178  const __m128i mant_mask = _mm_set1_epi32(0x007fffff);
179  const __m128i one_bits = _mm_set1_epi32(0x3f800000);
180  const __m128i exp_bias = _mm_set1_epi32(127);
181  const __m128 one = _mm_set1_ps(1.0f);
182 
183  for (; number < quarterPoints; number++) {
184  __m128 aVal = _mm_load_ps(aPtr);
185 
186  // Check for special values
187  __m128 zero_mask = _mm_cmpeq_ps(aVal, _mm_setzero_ps());
188  __m128 neg_mask = _mm_cmplt_ps(aVal, _mm_setzero_ps());
189  __m128 inf_mask = _mm_cmpeq_ps(aVal, _mm_set1_ps(INFINITY));
190  __m128 nan_mask = _mm_cmpunord_ps(aVal, aVal);
191  __m128 invalid_mask = _mm_or_ps(neg_mask, nan_mask);
192 
193  __m128i aVal_i = _mm_castps_si128(aVal);
194 
195  // Extract exponent: (aVal_i & exp_mask) >> 23 - bias
196  __m128i exp_i = _mm_srli_epi32(_mm_and_si128(aVal_i, exp_mask), 23);
197  exp_i = _mm_sub_epi32(exp_i, exp_bias);
198  __m128 exp_f = _mm_cvtepi32_ps(exp_i);
199 
200  // Extract mantissa as float in [1, 2)
201  __m128 frac =
202  _mm_castsi128_ps(_mm_or_si128(_mm_and_si128(aVal_i, mant_mask), one_bits));
203 
204  // Evaluate degree-6 polynomial
205  __m128 poly = _mm_log2_poly_sse(frac);
206 
207  // result = exp + poly * (frac - 1)
208  __m128 bVal = _mm_add_ps(exp_f, _mm_mul_ps(poly, _mm_sub_ps(frac, one)));
209 
210  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
211  bVal = _mm_blendv_ps(bVal, _mm_set1_ps(-127.0f), zero_mask);
212  bVal = _mm_blendv_ps(bVal, _mm_set1_ps(127.0f), inf_mask);
213  bVal = _mm_blendv_ps(bVal, _mm_set1_ps(NAN), invalid_mask);
214 
215  _mm_store_ps(bPtr, bVal);
216 
217  aPtr += 4;
218  bPtr += 4;
219  }
220 
221  number = quarterPoints * 4;
222  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
223 }
224 
225 #endif /* LV_HAVE_SSE4_1 */
226 
227 #ifdef LV_HAVE_AVX2
228 #include <immintrin.h>
230 
231 static inline void
232 volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
233 {
234  float* bPtr = bVector;
235  const float* aPtr = aVector;
236 
237  unsigned int number = 0;
238  const unsigned int eighthPoints = num_points / 8;
239 
240  const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
241  const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
242  const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
243  const __m256i exp_bias = _mm256_set1_epi32(127);
244  const __m256 one = _mm256_set1_ps(1.0f);
245 
246  for (; number < eighthPoints; number++) {
247  __m256 aVal = _mm256_loadu_ps(aPtr);
248 
249  // Check for special values
250  __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
251  __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
252  __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
253  __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
254  __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
255 
256  __m256i aVal_i = _mm256_castps_si256(aVal);
257 
258  // Extract exponent
259  __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
260  exp_i = _mm256_sub_epi32(exp_i, exp_bias);
261  __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
262 
263  // Extract mantissa as float in [1, 2)
264  __m256 frac = _mm256_castsi256_ps(
265  _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
266 
267  // Evaluate degree-6 polynomial
268  __m256 poly = _mm256_log2_poly_avx2(frac);
269 
270  // result = exp + poly * (frac - 1)
271  __m256 bVal = _mm256_add_ps(exp_f, _mm256_mul_ps(poly, _mm256_sub_ps(frac, one)));
272 
273  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
274  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
275  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
276  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
277 
278  _mm256_storeu_ps(bPtr, bVal);
279 
280  aPtr += 8;
281  bPtr += 8;
282  }
283 
284  number = eighthPoints * 8;
285  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
286 }
287 
288 #endif /* LV_HAVE_AVX2 for unaligned */
289 
290 #ifdef LV_HAVE_AVX2
291 
292 static inline void
293 volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
294 {
295  float* bPtr = bVector;
296  const float* aPtr = aVector;
297 
298  unsigned int number = 0;
299  const unsigned int eighthPoints = num_points / 8;
300 
301  const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
302  const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
303  const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
304  const __m256i exp_bias = _mm256_set1_epi32(127);
305  const __m256 one = _mm256_set1_ps(1.0f);
306 
307  for (; number < eighthPoints; number++) {
308  __m256 aVal = _mm256_load_ps(aPtr);
309 
310  // Check for special values
311  __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
312  __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
313  __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
314  __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
315  __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
316 
317  __m256i aVal_i = _mm256_castps_si256(aVal);
318 
319  // Extract exponent
320  __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
321  exp_i = _mm256_sub_epi32(exp_i, exp_bias);
322  __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
323 
324  // Extract mantissa as float in [1, 2)
325  __m256 frac = _mm256_castsi256_ps(
326  _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
327 
328  // Evaluate degree-6 polynomial
329  __m256 poly = _mm256_log2_poly_avx2(frac);
330 
331  // result = exp + poly * (frac - 1)
332  __m256 bVal = _mm256_add_ps(exp_f, _mm256_mul_ps(poly, _mm256_sub_ps(frac, one)));
333 
334  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
335  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
336  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
337  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
338 
339  _mm256_store_ps(bPtr, bVal);
340 
341  aPtr += 8;
342  bPtr += 8;
343  }
344 
345  number = eighthPoints * 8;
346  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
347 }
348 
349 #endif /* LV_HAVE_AVX2 */
350 
351 #if LV_HAVE_AVX2 && LV_HAVE_FMA
352 #include <immintrin.h>
354 
355 static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
356  const float* aVector,
357  unsigned int num_points)
358 {
359  float* bPtr = bVector;
360  const float* aPtr = aVector;
361 
362  unsigned int number = 0;
363  const unsigned int eighthPoints = num_points / 8;
364 
365  const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
366  const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
367  const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
368  const __m256i exp_bias = _mm256_set1_epi32(127);
369  const __m256 one = _mm256_set1_ps(1.0f);
370 
371  for (; number < eighthPoints; number++) {
372  __m256 aVal = _mm256_loadu_ps(aPtr);
373 
374  // Check for special values
375  __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
376  __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
377  __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
378  __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
379  __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
380 
381  __m256i aVal_i = _mm256_castps_si256(aVal);
382 
383  // Extract exponent
384  __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
385  exp_i = _mm256_sub_epi32(exp_i, exp_bias);
386  __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
387 
388  // Extract mantissa as float in [1, 2)
389  __m256 frac = _mm256_castsi256_ps(
390  _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
391 
392  // Evaluate degree-6 polynomial with FMA
393  __m256 poly = _mm256_log2_poly_avx2_fma(frac);
394 
395  // result = exp + poly * (frac - 1)
396  __m256 bVal = _mm256_fmadd_ps(poly, _mm256_sub_ps(frac, one), exp_f);
397 
398  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
399  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
400  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
401  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
402 
403  _mm256_storeu_ps(bPtr, bVal);
404 
405  aPtr += 8;
406  bPtr += 8;
407  }
408 
409  number = eighthPoints * 8;
410  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
411 }
412 
413 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
414 
415 #if LV_HAVE_AVX2 && LV_HAVE_FMA
416 
417 static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
418  const float* aVector,
419  unsigned int num_points)
420 {
421  float* bPtr = bVector;
422  const float* aPtr = aVector;
423 
424  unsigned int number = 0;
425  const unsigned int eighthPoints = num_points / 8;
426 
427  const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
428  const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
429  const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
430  const __m256i exp_bias = _mm256_set1_epi32(127);
431  const __m256 one = _mm256_set1_ps(1.0f);
432 
433  for (; number < eighthPoints; number++) {
434  __m256 aVal = _mm256_load_ps(aPtr);
435 
436  // Check for special values
437  __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
438  __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
439  __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
440  __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
441  __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
442 
443  __m256i aVal_i = _mm256_castps_si256(aVal);
444 
445  // Extract exponent
446  __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
447  exp_i = _mm256_sub_epi32(exp_i, exp_bias);
448  __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
449 
450  // Extract mantissa as float in [1, 2)
451  __m256 frac = _mm256_castsi256_ps(
452  _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
453 
454  // Evaluate degree-6 polynomial with FMA
455  __m256 poly = _mm256_log2_poly_avx2_fma(frac);
456 
457  // result = exp + poly * (frac - 1)
458  __m256 bVal = _mm256_fmadd_ps(poly, _mm256_sub_ps(frac, one), exp_f);
459 
460  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
461  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
462  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
463  bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
464 
465  _mm256_store_ps(bPtr, bVal);
466 
467  aPtr += 8;
468  bPtr += 8;
469  }
470 
471  number = eighthPoints * 8;
472  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
473 }
474 
475 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
476 
477 #ifdef LV_HAVE_AVX512F
478 #include <immintrin.h>
480 
481 static inline void
482 volk_32f_log2_32f_u_avx512(float* bVector, const float* aVector, unsigned int num_points)
483 {
484  float* bPtr = bVector;
485  const float* aPtr = aVector;
486 
487  unsigned int number = 0;
488  const unsigned int sixteenthPoints = num_points / 16;
489 
490  const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
491  const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
492  const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
493  const __m512i exp_bias = _mm512_set1_epi32(127);
494  const __m512 one = _mm512_set1_ps(1.0f);
495 
496  for (; number < sixteenthPoints; number++) {
497  __m512 aVal = _mm512_loadu_ps(aPtr);
498 
499  // Check for special values
500  __mmask16 zero_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_EQ_OQ);
501  __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
502  __mmask16 inf_mask =
503  _mm512_cmp_ps_mask(aVal, _mm512_set1_ps(INFINITY), _CMP_EQ_OQ);
504  __mmask16 nan_mask = _mm512_cmp_ps_mask(aVal, aVal, _CMP_UNORD_Q);
505  __mmask16 invalid_mask = _kor_mask16(neg_mask, nan_mask);
506 
507  __m512i aVal_i = _mm512_castps_si512(aVal);
508 
509  // Extract exponent
510  __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
511  exp_i = _mm512_sub_epi32(exp_i, exp_bias);
512  __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
513 
514  // Extract mantissa as float in [1, 2)
515  __m512 frac = _mm512_castsi512_ps(
516  _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
517 
518  // Evaluate degree-6 polynomial with FMA
519  __m512 poly = _mm512_log2_poly_avx512(frac);
520 
521  // result = exp + poly * (frac - 1)
522  __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
523 
524  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
525  bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
526  bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
527  bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
528 
529  _mm512_storeu_ps(bPtr, bVal);
530 
531  aPtr += 16;
532  bPtr += 16;
533  }
534 
535  number = sixteenthPoints * 16;
536  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
537 }
538 
539 #endif /* LV_HAVE_AVX512F for unaligned */
540 
541 #ifdef LV_HAVE_AVX512F
542 
543 static inline void
544 volk_32f_log2_32f_a_avx512(float* bVector, const float* aVector, unsigned int num_points)
545 {
546  float* bPtr = bVector;
547  const float* aPtr = aVector;
548 
549  unsigned int number = 0;
550  const unsigned int sixteenthPoints = num_points / 16;
551 
552  const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
553  const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
554  const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
555  const __m512i exp_bias = _mm512_set1_epi32(127);
556  const __m512 one = _mm512_set1_ps(1.0f);
557 
558  for (; number < sixteenthPoints; number++) {
559  __m512 aVal = _mm512_load_ps(aPtr);
560 
561  // Check for special values
562  __mmask16 zero_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_EQ_OQ);
563  __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
564  __mmask16 inf_mask =
565  _mm512_cmp_ps_mask(aVal, _mm512_set1_ps(INFINITY), _CMP_EQ_OQ);
566  __mmask16 nan_mask = _mm512_cmp_ps_mask(aVal, aVal, _CMP_UNORD_Q);
567  __mmask16 invalid_mask = _kor_mask16(neg_mask, nan_mask);
568 
569  __m512i aVal_i = _mm512_castps_si512(aVal);
570 
571  // Extract exponent
572  __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
573  exp_i = _mm512_sub_epi32(exp_i, exp_bias);
574  __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
575 
576  // Extract mantissa as float in [1, 2)
577  __m512 frac = _mm512_castsi512_ps(
578  _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
579 
580  // Evaluate degree-6 polynomial with FMA
581  __m512 poly = _mm512_log2_poly_avx512(frac);
582 
583  // result = exp + poly * (frac - 1)
584  __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
585 
586  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
587  bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
588  bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
589  bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
590 
591  _mm512_store_ps(bPtr, bVal);
592 
593  aPtr += 16;
594  bPtr += 16;
595  }
596 
597  number = sixteenthPoints * 16;
598  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
599 }
600 
601 #endif /* LV_HAVE_AVX512F */
602 
603 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
604 
605 static inline void volk_32f_log2_32f_u_avx512dq(float* bVector,
606  const float* aVector,
607  unsigned int num_points)
608 {
609  float* bPtr = bVector;
610  const float* aPtr = aVector;
611 
612  unsigned int number = 0;
613  const unsigned int sixteenthPoints = num_points / 16;
614 
615  const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
616  const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
617  const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
618  const __m512i exp_bias = _mm512_set1_epi32(127);
619  const __m512 one = _mm512_set1_ps(1.0f);
620 
621  for (; number < sixteenthPoints; number++) {
622  __m512 aVal = _mm512_loadu_ps(aPtr);
623 
624  // Use fpclass for special value detection (AVX512DQ feature)
625  // 0x01 = QNaN, 0x02 = +0, 0x04 = -0, 0x08 = +Inf, 0x10 = -Inf, 0x80 = SNaN
626  __mmask16 nan_mask = _mm512_fpclass_ps_mask(aVal, 0x81); // NaN (QNaN | SNaN)
627  __mmask16 zero_mask = _mm512_fpclass_ps_mask(aVal, 0x06); // Zero (+0 | -0)
628  __mmask16 inf_mask = _mm512_fpclass_ps_mask(aVal, 0x08); // +Inf only
629  __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
630  __mmask16 invalid_mask = _kor_mask16(nan_mask, neg_mask); // neg or NaN -> NaN
631 
632  __m512i aVal_i = _mm512_castps_si512(aVal);
633 
634  // Extract exponent
635  __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
636  exp_i = _mm512_sub_epi32(exp_i, exp_bias);
637  __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
638 
639  // Extract mantissa as float in [1, 2)
640  __m512 frac = _mm512_castsi512_ps(
641  _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
642 
643  // Evaluate degree-6 polynomial with FMA
644  __m512 poly = _mm512_log2_poly_avx512(frac);
645 
646  // result = exp + poly * (frac - 1)
647  __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
648 
649  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
650  bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
651  bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
652  bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
653 
654  _mm512_storeu_ps(bPtr, bVal);
655 
656  aPtr += 16;
657  bPtr += 16;
658  }
659 
660  number = sixteenthPoints * 16;
661  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
662 }
663 
664 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
665 
666 #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
667 
668 static inline void volk_32f_log2_32f_a_avx512dq(float* bVector,
669  const float* aVector,
670  unsigned int num_points)
671 {
672  float* bPtr = bVector;
673  const float* aPtr = aVector;
674 
675  unsigned int number = 0;
676  const unsigned int sixteenthPoints = num_points / 16;
677 
678  const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
679  const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
680  const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
681  const __m512i exp_bias = _mm512_set1_epi32(127);
682  const __m512 one = _mm512_set1_ps(1.0f);
683 
684  for (; number < sixteenthPoints; number++) {
685  __m512 aVal = _mm512_load_ps(aPtr);
686 
687  // Use fpclass for special value detection (AVX512DQ feature)
688  // 0x01 = QNaN, 0x02 = +0, 0x04 = -0, 0x08 = +Inf, 0x10 = -Inf, 0x80 = SNaN
689  __mmask16 nan_mask = _mm512_fpclass_ps_mask(aVal, 0x81); // NaN (QNaN | SNaN)
690  __mmask16 zero_mask = _mm512_fpclass_ps_mask(aVal, 0x06); // Zero (+0 | -0)
691  __mmask16 inf_mask = _mm512_fpclass_ps_mask(aVal, 0x08); // +Inf only
692  __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
693  __mmask16 invalid_mask = _kor_mask16(nan_mask, neg_mask); // neg or NaN -> NaN
694 
695  __m512i aVal_i = _mm512_castps_si512(aVal);
696 
697  // Extract exponent
698  __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
699  exp_i = _mm512_sub_epi32(exp_i, exp_bias);
700  __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
701 
702  // Extract mantissa as float in [1, 2)
703  __m512 frac = _mm512_castsi512_ps(
704  _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
705 
706  // Evaluate degree-6 polynomial with FMA
707  __m512 poly = _mm512_log2_poly_avx512(frac);
708 
709  // result = exp + poly * (frac - 1)
710  __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
711 
712  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
713  bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
714  bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
715  bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
716 
717  _mm512_store_ps(bPtr, bVal);
718 
719  aPtr += 16;
720  bPtr += 16;
721  }
722 
723  number = sixteenthPoints * 16;
724  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
725 }
726 
727 #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ */
728 
729 #ifdef LV_HAVE_NEON
730 #include <arm_neon.h>
732 
733 static inline void
734 volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
735 {
736  float* bPtr = bVector;
737  const float* aPtr = aVector;
738  unsigned int number;
739  const unsigned int quarterPoints = num_points / 4;
740 
741  const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
742  const int32x4_t mant_mask = vdupq_n_s32(0x007fffff);
743  const int32x4_t one_bits = vdupq_n_s32(0x3f800000);
744  const int32x4_t exp_bias = vdupq_n_s32(127);
745  const float32x4_t one = vdupq_n_f32(1.0f);
746  const float32x4_t zero = vdupq_n_f32(0.0f);
747  const float32x4_t inf_val = vdupq_n_f32(INFINITY);
748  const float32x4_t nan_val = vdupq_n_f32(NAN);
749  const float32x4_t neg_inf_out = vdupq_n_f32(-127.0f);
750  const float32x4_t pos_inf_out = vdupq_n_f32(127.0f);
751 
752  for (number = 0; number < quarterPoints; ++number) {
753  float32x4_t aVal = vld1q_f32(aPtr);
754 
755  // Check for special values
756  uint32x4_t neg_mask = vcltq_f32(aVal, zero);
757  uint32x4_t zero_mask = vceqq_f32(aVal, zero);
758  uint32x4_t inf_mask = vceqq_f32(aVal, inf_val);
759  uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
760  uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask);
761 
762  int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);
763 
764  // Extract exponent
765  int32x4_t exp_i = vshrq_n_s32(vandq_s32(aVal_i, exp_mask), 23);
766  exp_i = vsubq_s32(exp_i, exp_bias);
767  float32x4_t exp_f = vcvtq_f32_s32(exp_i);
768 
769  // Extract mantissa as float in [1, 2)
770  int32x4_t frac_i = vorrq_s32(vandq_s32(aVal_i, mant_mask), one_bits);
771  float32x4_t frac = vreinterpretq_f32_s32(frac_i);
772 
773  // Evaluate degree-6 polynomial
774  float32x4_t poly = _vlog2_poly_f32(frac);
775 
776  // result = exp + poly * (frac - 1)
777  float32x4_t bVal = vaddq_f32(exp_f, vmulq_f32(poly, vsubq_f32(frac, one)));
778 
779  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
780  bVal = vbslq_f32(zero_mask, neg_inf_out, bVal);
781  bVal = vbslq_f32(inf_mask, pos_inf_out, bVal);
782  bVal = vbslq_f32(invalid_mask, nan_val, bVal);
783 
784  vst1q_f32(bPtr, bVal);
785 
786  aPtr += 4;
787  bPtr += 4;
788  }
789 
790  number = quarterPoints * 4;
791  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
792 }
793 
794 #endif /* LV_HAVE_NEON */
795 
796 #ifdef LV_HAVE_NEONV8
797 
798 static inline void
799 volk_32f_log2_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
800 {
801  float* bPtr = bVector;
802  const float* aPtr = aVector;
803  unsigned int number;
804  const unsigned int quarterPoints = num_points / 4;
805 
806  const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
807  const int32x4_t mant_mask = vdupq_n_s32(0x007fffff);
808  const int32x4_t one_bits = vdupq_n_s32(0x3f800000);
809  const int32x4_t exp_bias = vdupq_n_s32(127);
810  const float32x4_t one = vdupq_n_f32(1.0f);
811  const float32x4_t zero = vdupq_n_f32(0.0f);
812  const float32x4_t inf_val = vdupq_n_f32(INFINITY);
813  const float32x4_t nan_val = vdupq_n_f32(NAN);
814  const float32x4_t neg_inf_out = vdupq_n_f32(-127.0f);
815  const float32x4_t pos_inf_out = vdupq_n_f32(127.0f);
816 
817  for (number = 0; number < quarterPoints; ++number) {
818  float32x4_t aVal = vld1q_f32(aPtr);
819 
820  // Check for special values
821  uint32x4_t neg_mask = vcltq_f32(aVal, zero);
822  uint32x4_t zero_mask = vceqq_f32(aVal, zero);
823  uint32x4_t inf_mask = vceqq_f32(aVal, inf_val);
824  uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
825  uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask);
826 
827  int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);
828 
829  // Extract exponent
830  int32x4_t exp_i = vshrq_n_s32(vandq_s32(aVal_i, exp_mask), 23);
831  exp_i = vsubq_s32(exp_i, exp_bias);
832  float32x4_t exp_f = vcvtq_f32_s32(exp_i);
833 
834  // Extract mantissa as float in [1, 2)
835  int32x4_t frac_i = vorrq_s32(vandq_s32(aVal_i, mant_mask), one_bits);
836  float32x4_t frac = vreinterpretq_f32_s32(frac_i);
837 
838  // Evaluate degree-6 polynomial with FMA
839  float32x4_t poly = _vlog2_poly_neonv8(frac);
840 
841  // result = exp + poly * (frac - 1)
842  float32x4_t bVal = vfmaq_f32(exp_f, poly, vsubq_f32(frac, one));
843 
844  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
845  bVal = vbslq_f32(zero_mask, neg_inf_out, bVal);
846  bVal = vbslq_f32(inf_mask, pos_inf_out, bVal);
847  bVal = vbslq_f32(invalid_mask, nan_val, bVal);
848 
849  vst1q_f32(bPtr, bVal);
850 
851  aPtr += 4;
852  bPtr += 4;
853  }
854 
855  number = quarterPoints * 4;
856  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
857 }
858 
859 #endif /* LV_HAVE_NEONV8 */
860 
861 
862 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
863 
864 #ifndef INCLUDED_volk_32f_log2_32f_u_H
865 #define INCLUDED_volk_32f_log2_32f_u_H
866 
867 #ifdef LV_HAVE_RVV
868 #include <riscv_vector.h>
870 
871 static inline void
872 volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
873 {
874  size_t vlmax = __riscv_vsetvlmax_e32m2();
875 
876  const vfloat32m2_t one = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
877  const vint32m2_t one_bits = __riscv_vreinterpret_i32m2(one);
878  const vint32m2_t mant_mask = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
879  const vint32m2_t exp_bias = __riscv_vmv_v_x_i32m2(127, vlmax);
880 
881  const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
882  const vfloat32m2_t inf_val = __riscv_vfmv_v_f_f32m2(INFINITY, vlmax);
883  const vfloat32m2_t nan_val = __riscv_vfmv_v_f_f32m2(NAN, vlmax);
884  const vfloat32m2_t neg_inf_out = __riscv_vfmv_v_f_f32m2(-127.0f, vlmax);
885  const vfloat32m2_t pos_inf_out = __riscv_vfmv_v_f_f32m2(127.0f, vlmax);
886 
887  size_t n = num_points;
888  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
889  vl = __riscv_vsetvl_e32m2(n);
890  vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
891 
892  // Check for special values
893  vbool16_t zero_mask = __riscv_vmfeq(v, zero, vl);
894  vbool16_t neg_mask = __riscv_vmflt(v, zero, vl);
895  vbool16_t inf_mask = __riscv_vmfeq(v, inf_val, vl);
896  vbool16_t nan_mask = __riscv_vmfne(v, v, vl);
897  vbool16_t invalid_mask = __riscv_vmor(neg_mask, nan_mask, vl);
898 
899  vfloat32m2_t a = __riscv_vfabs(v, vl);
900  vfloat32m2_t exp_f = __riscv_vfcvt_f(
901  __riscv_vsub(
902  __riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), exp_bias, vl),
903  vl);
904  vfloat32m2_t frac = __riscv_vreinterpret_f32m2(__riscv_vor(
905  __riscv_vand(__riscv_vreinterpret_i32m2(v), mant_mask, vl), one_bits, vl));
906 
907  // Evaluate degree-6 polynomial with FMA
908  vfloat32m2_t poly = __riscv_vlog2_poly_f32m2(frac, vl, vlmax);
909 
910  // result = exp + poly * (frac - 1)
911  vfloat32m2_t result =
912  __riscv_vfmacc(exp_f, poly, __riscv_vfsub(frac, one, vl), vl);
913 
914  // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
915  result = __riscv_vmerge(result, neg_inf_out, zero_mask, vl);
916  result = __riscv_vmerge(result, pos_inf_out, inf_mask, vl);
917  result = __riscv_vmerge(result, nan_val, invalid_mask, vl);
918 
919  __riscv_vse32(bVector, result, vl);
920  }
921 }
922 #endif /*LV_HAVE_RVV*/
923 
924 #endif /* INCLUDED_volk_32f_log2_32f_u_H */
_mm256_log2_poly_avx2_fma
static __m256 _mm256_log2_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:121
log2f_non_ieee
static float log2f_non_ieee(float f)
Definition: volk_common.h:156
volk_avx512_intrinsics.h
volk_32f_log2_32f_neon
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:734
_mm256_log2_poly_avx2
static __m256 _mm256_log2_poly_avx2(const __m256 x)
Definition: volk_avx2_intrinsics.h:425
volk_sse_intrinsics.h
volk_32f_log2_32f_generic
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:91
_vlog2_poly_f32
static float32x4_t _vlog2_poly_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:418
volk_avx2_fma_intrinsics.h
INFINITY
#define INFINITY
Definition: config.h:56
volk_rvv_intrinsics.h
__riscv_vlog2_poly_f32m2
static vfloat32m2_t __riscv_vlog2_poly_f32m2(vfloat32m2_t x, size_t vl, size_t vlmax)
Definition: volk_rvv_intrinsics.h:91
volk_neon_intrinsics.h
_mm_log2_poly_sse
static __m128 _mm_log2_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:199
_mm512_log2_poly_avx512
static __m512 _mm512_log2_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:250
volk_avx2_intrinsics.h