Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_asin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 #include <volk/volk_common.h>
62 
63 #ifndef INCLUDED_volk_32f_asin_32f_a_H
64 #define INCLUDED_volk_32f_asin_32f_a_H
65 
66 #ifdef LV_HAVE_GENERIC
67 
68 static inline void
69 volk_32f_asin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
70 {
71  for (unsigned int i = 0; i < num_points; i++) {
72  bVector[i] = volk_arcsin(aVector[i]);
73  }
74 }
75 
76 #endif /* LV_HAVE_GENERIC */
77 
78 #ifdef LV_HAVE_SSE4_1
79 #include <smmintrin.h>
81 
82 static inline void
83 volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
84 {
85  const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
86  const __m128 half = _mm_set1_ps(0.5f);
87  const __m128 one = _mm_set1_ps(1.0f);
88  const __m128 two = _mm_set1_ps(2.0f);
89  const __m128 sign_mask = _mm_set1_ps(-0.0f);
90 
91  unsigned int number = 0;
92  const unsigned int quarterPoints = num_points / 4;
93 
94  for (; number < quarterPoints; number++) {
95  __m128 aVal = _mm_load_ps(aVector);
96 
97  // Get absolute value and sign
98  __m128 sign = _mm_and_ps(aVal, sign_mask);
99  __m128 ax = _mm_andnot_ps(sign_mask, aVal);
100 
101  // Two-range computation
102  // Small: result = arcsin_poly(x)
103  // Large: result = pi/2 - 2*arcsin_poly(sqrt((1-|x|)/2))
104 
105  __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
106  __m128 s = _mm_sqrt_ps(t);
107 
108  // Compute polynomial for both ranges
109  __m128 poly_small = _mm_arcsin_poly_sse(ax);
110  __m128 poly_large = _mm_arcsin_poly_sse(s);
111 
112  // Large range: pi/2 - 2*poly_large
113  __m128 result_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
114 
115  // Blend based on |x| > 0.5
116  __m128 mask = _mm_cmpgt_ps(ax, half);
117  __m128 result = _mm_blendv_ps(poly_small, result_large, mask);
118 
119  // Apply sign
120  result = _mm_or_ps(result, sign);
121 
122  _mm_store_ps(bVector, result);
123 
124  aVector += 4;
125  bVector += 4;
126  }
127 
128  number = quarterPoints * 4;
129  for (; number < num_points; number++) {
130  *bVector++ = volk_arcsin(*aVector++);
131  }
132 }
133 
134 #endif /* LV_HAVE_SSE4_1 */
135 
136 #ifdef LV_HAVE_AVX
137 #include <immintrin.h>
139 
140 static inline void
141 volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
142 {
143  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
144  const __m256 half = _mm256_set1_ps(0.5f);
145  const __m256 one = _mm256_set1_ps(1.0f);
146  const __m256 two = _mm256_set1_ps(2.0f);
147  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
148 
149  unsigned int number = 0;
150  const unsigned int eighthPoints = num_points / 8;
151 
152  for (; number < eighthPoints; number++) {
153  __m256 aVal = _mm256_load_ps(aVector);
154 
155  // Get absolute value and sign
156  __m256 sign = _mm256_and_ps(aVal, sign_mask);
157  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
158 
159  // Two-range computation
160  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
161  __m256 s = _mm256_sqrt_ps(t);
162 
163  // Compute polynomial for both ranges
164  __m256 poly_small = _mm256_arcsin_poly_avx(ax);
165  __m256 poly_large = _mm256_arcsin_poly_avx(s);
166 
167  // Large range: pi/2 - 2*poly_large
168  __m256 result_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
169 
170  // Blend based on |x| > 0.5
171  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
172  __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
173 
174  // Apply sign
175  result = _mm256_or_ps(result, sign);
176 
177  _mm256_store_ps(bVector, result);
178 
179  aVector += 8;
180  bVector += 8;
181  }
182 
183  number = eighthPoints * 8;
184  for (; number < num_points; number++) {
185  *bVector++ = volk_arcsin(*aVector++);
186  }
187 }
188 
189 #endif /* LV_HAVE_AVX */
190 
191 #ifdef LV_HAVE_AVX2
192 #include <immintrin.h>
194 
195 static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
196  const float* aVector,
197  unsigned int num_points)
198 {
199  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
200  const __m256 half = _mm256_set1_ps(0.5f);
201  const __m256 one = _mm256_set1_ps(1.0f);
202  const __m256 two = _mm256_set1_ps(2.0f);
203  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
204 
205  unsigned int number = 0;
206  const unsigned int eighthPoints = num_points / 8;
207 
208  for (; number < eighthPoints; number++) {
209  __m256 aVal = _mm256_load_ps(aVector);
210 
211  // Get absolute value and sign
212  __m256 sign = _mm256_and_ps(aVal, sign_mask);
213  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
214 
215  // Two-range computation
216  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
217  __m256 s = _mm256_sqrt_ps(t);
218 
219  // Compute polynomial for both ranges
220  __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
221  __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
222 
223  // Large range: pi/2 - 2*poly_large
224  __m256 result_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
225 
226  // Blend based on |x| > 0.5
227  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
228  __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
229 
230  // Apply sign
231  result = _mm256_or_ps(result, sign);
232 
233  _mm256_store_ps(bVector, result);
234 
235  aVector += 8;
236  bVector += 8;
237  }
238 
239  number = eighthPoints * 8;
240  for (; number < num_points; number++) {
241  *bVector++ = volk_arcsin(*aVector++);
242  }
243 }
244 
245 #endif /* LV_HAVE_AVX2 */
246 
247 #ifdef LV_HAVE_AVX512F
248 #include <immintrin.h>
250 
251 static inline void
252 volk_32f_asin_32f_a_avx512(float* bVector, const float* aVector, unsigned int num_points)
253 {
254  const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
255  const __m512 half = _mm512_set1_ps(0.5f);
256  const __m512 one = _mm512_set1_ps(1.0f);
257  const __m512 two = _mm512_set1_ps(2.0f);
258  const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
259 
260  unsigned int number = 0;
261  const unsigned int sixteenthPoints = num_points / 16;
262 
263  for (; number < sixteenthPoints; number++) {
264  __m512 aVal = _mm512_load_ps(aVector);
265 
266  // Get absolute value and sign using integer ops (AVX512F compatible)
267  __m512i aVal_i = _mm512_castps_si512(aVal);
268  __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
269  __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
270 
271  // Two-range computation
272  __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
273  __m512 s = _mm512_sqrt_ps(t);
274 
275  // Compute polynomial for both ranges
276  __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
277  __m512 poly_large = _mm512_arcsin_poly_avx512(s);
278 
279  // Large range: pi/2 - 2*poly_large
280  __m512 result_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
281 
282  // Blend based on |x| > 0.5
283  __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
284  __m512 result = _mm512_mask_blend_ps(mask, poly_small, result_large);
285 
286  // Apply sign
287  result = _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(result), sign));
288 
289  _mm512_store_ps(bVector, result);
290 
291  aVector += 16;
292  bVector += 16;
293  }
294 
295  number = sixteenthPoints * 16;
296  for (; number < num_points; number++) {
297  *bVector++ = volk_arcsin(*aVector++);
298  }
299 }
300 
301 #endif /* LV_HAVE_AVX512F */
302 
303 #endif /* INCLUDED_volk_32f_asin_32f_a_H */
304 
305 #ifndef INCLUDED_volk_32f_asin_32f_u_H
306 #define INCLUDED_volk_32f_asin_32f_u_H
307 
308 #ifdef LV_HAVE_SSE4_1
309 #include <smmintrin.h>
311 
312 static inline void
313 volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
314 {
315  const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
316  const __m128 half = _mm_set1_ps(0.5f);
317  const __m128 one = _mm_set1_ps(1.0f);
318  const __m128 two = _mm_set1_ps(2.0f);
319  const __m128 sign_mask = _mm_set1_ps(-0.0f);
320 
321  unsigned int number = 0;
322  const unsigned int quarterPoints = num_points / 4;
323 
324  for (; number < quarterPoints; number++) {
325  __m128 aVal = _mm_loadu_ps(aVector);
326 
327  __m128 sign = _mm_and_ps(aVal, sign_mask);
328  __m128 ax = _mm_andnot_ps(sign_mask, aVal);
329 
330  __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
331  __m128 s = _mm_sqrt_ps(t);
332 
333  __m128 poly_small = _mm_arcsin_poly_sse(ax);
334  __m128 poly_large = _mm_arcsin_poly_sse(s);
335 
336  __m128 result_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
337 
338  __m128 mask = _mm_cmpgt_ps(ax, half);
339  __m128 result = _mm_blendv_ps(poly_small, result_large, mask);
340 
341  result = _mm_or_ps(result, sign);
342 
343  _mm_storeu_ps(bVector, result);
344 
345  aVector += 4;
346  bVector += 4;
347  }
348 
349  number = quarterPoints * 4;
350  for (; number < num_points; number++) {
351  *bVector++ = volk_arcsin(*aVector++);
352  }
353 }
354 
355 #endif /* LV_HAVE_SSE4_1 */
356 
357 #ifdef LV_HAVE_AVX
358 #include <immintrin.h>
360 
361 static inline void
362 volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
363 {
364  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
365  const __m256 half = _mm256_set1_ps(0.5f);
366  const __m256 one = _mm256_set1_ps(1.0f);
367  const __m256 two = _mm256_set1_ps(2.0f);
368  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
369 
370  unsigned int number = 0;
371  const unsigned int eighthPoints = num_points / 8;
372 
373  for (; number < eighthPoints; number++) {
374  __m256 aVal = _mm256_loadu_ps(aVector);
375 
376  __m256 sign = _mm256_and_ps(aVal, sign_mask);
377  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
378 
379  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
380  __m256 s = _mm256_sqrt_ps(t);
381 
382  __m256 poly_small = _mm256_arcsin_poly_avx(ax);
383  __m256 poly_large = _mm256_arcsin_poly_avx(s);
384 
385  __m256 result_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
386 
387  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
388  __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
389 
390  result = _mm256_or_ps(result, sign);
391 
392  _mm256_storeu_ps(bVector, result);
393 
394  aVector += 8;
395  bVector += 8;
396  }
397 
398  number = eighthPoints * 8;
399  for (; number < num_points; number++) {
400  *bVector++ = volk_arcsin(*aVector++);
401  }
402 }
403 
404 #endif /* LV_HAVE_AVX */
405 
406 #ifdef LV_HAVE_AVX2
407 #include <immintrin.h>
409 
410 static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
411  const float* aVector,
412  unsigned int num_points)
413 {
414  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
415  const __m256 half = _mm256_set1_ps(0.5f);
416  const __m256 one = _mm256_set1_ps(1.0f);
417  const __m256 two = _mm256_set1_ps(2.0f);
418  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
419 
420  unsigned int number = 0;
421  const unsigned int eighthPoints = num_points / 8;
422 
423  for (; number < eighthPoints; number++) {
424  __m256 aVal = _mm256_loadu_ps(aVector);
425 
426  __m256 sign = _mm256_and_ps(aVal, sign_mask);
427  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
428 
429  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
430  __m256 s = _mm256_sqrt_ps(t);
431 
432  __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
433  __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
434 
435  __m256 result_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
436 
437  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
438  __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
439 
440  result = _mm256_or_ps(result, sign);
441 
442  _mm256_storeu_ps(bVector, result);
443 
444  aVector += 8;
445  bVector += 8;
446  }
447 
448  number = eighthPoints * 8;
449  for (; number < num_points; number++) {
450  *bVector++ = volk_arcsin(*aVector++);
451  }
452 }
453 
454 #endif /* LV_HAVE_AVX2 */
455 
456 #ifdef LV_HAVE_AVX512F
457 #include <immintrin.h>
459 
460 static inline void
461 volk_32f_asin_32f_u_avx512(float* bVector, const float* aVector, unsigned int num_points)
462 {
463  const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
464  const __m512 half = _mm512_set1_ps(0.5f);
465  const __m512 one = _mm512_set1_ps(1.0f);
466  const __m512 two = _mm512_set1_ps(2.0f);
467  const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
468 
469  unsigned int number = 0;
470  const unsigned int sixteenthPoints = num_points / 16;
471 
472  for (; number < sixteenthPoints; number++) {
473  __m512 aVal = _mm512_loadu_ps(aVector);
474 
475  __m512i aVal_i = _mm512_castps_si512(aVal);
476  __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
477  __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
478 
479  __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
480  __m512 s = _mm512_sqrt_ps(t);
481 
482  __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
483  __m512 poly_large = _mm512_arcsin_poly_avx512(s);
484 
485  __m512 result_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
486 
487  __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
488  __m512 result = _mm512_mask_blend_ps(mask, poly_small, result_large);
489 
490  result = _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(result), sign));
491 
492  _mm512_storeu_ps(bVector, result);
493 
494  aVector += 16;
495  bVector += 16;
496  }
497 
498  number = sixteenthPoints * 16;
499  for (; number < num_points; number++) {
500  *bVector++ = volk_arcsin(*aVector++);
501  }
502 }
503 
504 #endif /* LV_HAVE_AVX512F */
505 
506 #ifdef LV_HAVE_NEON
507 #include <arm_neon.h>
509 
510 static inline void
511 volk_32f_asin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
512 {
513  const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
514  const float32x4_t half = vdupq_n_f32(0.5f);
515  const float32x4_t one = vdupq_n_f32(1.0f);
516  const float32x4_t two = vdupq_n_f32(2.0f);
517 
518  unsigned int number = 0;
519  const unsigned int quarterPoints = num_points / 4;
520 
521  for (; number < quarterPoints; number++) {
522  float32x4_t aVal = vld1q_f32(aVector);
523 
524  // Get absolute value and sign
525  float32x4_t ax = vabsq_f32(aVal);
526  uint32x4_t sign_bits =
527  vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
528 
529  // Two-range computation
530  float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
531  float32x4_t s = _vsqrtq_f32(t);
532 
533  // Compute polynomial for both ranges
534  float32x4_t poly_small = _varcsinq_f32(ax);
535  float32x4_t poly_large = _varcsinq_f32(s);
536 
537  // Large range: pi/2 - 2*poly_large
538  float32x4_t result_large = vmlsq_f32(pi_2, two, poly_large);
539 
540  // Blend based on |x| > 0.5
541  uint32x4_t mask = vcgtq_f32(ax, half);
542  float32x4_t result = vbslq_f32(mask, result_large, poly_small);
543 
544  // Apply sign
545  result =
546  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(result), sign_bits));
547 
548  vst1q_f32(bVector, result);
549 
550  aVector += 4;
551  bVector += 4;
552  }
553 
554  number = quarterPoints * 4;
555  for (; number < num_points; number++) {
556  *bVector++ = volk_arcsin(*aVector++);
557  }
558 }
559 
560 #endif /* LV_HAVE_NEON */
561 
562 #ifdef LV_HAVE_NEONV8
563 #include <arm_neon.h>
565 
566 static inline void
567 volk_32f_asin_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
568 {
569  const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
570  const float32x4_t half = vdupq_n_f32(0.5f);
571  const float32x4_t one = vdupq_n_f32(1.0f);
572  const float32x4_t two = vdupq_n_f32(2.0f);
573 
574  unsigned int number = 0;
575  const unsigned int quarterPoints = num_points / 4;
576 
577  for (; number < quarterPoints; number++) {
578  float32x4_t aVal = vld1q_f32(aVector);
579 
580  float32x4_t ax = vabsq_f32(aVal);
581  uint32x4_t sign_bits =
582  vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
583 
584  float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
585  float32x4_t s = vsqrtq_f32(t);
586 
587  float32x4_t poly_small = _varcsinq_f32_neonv8(ax);
588  float32x4_t poly_large = _varcsinq_f32_neonv8(s);
589 
590  float32x4_t result_large = vfmsq_f32(pi_2, two, poly_large);
591 
592  uint32x4_t mask = vcgtq_f32(ax, half);
593  float32x4_t result = vbslq_f32(mask, result_large, poly_small);
594 
595  result =
596  vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(result), sign_bits));
597 
598  vst1q_f32(bVector, result);
599 
600  aVector += 4;
601  bVector += 4;
602  }
603 
604  number = quarterPoints * 4;
605  for (; number < num_points; number++) {
606  *bVector++ = volk_arcsin(*aVector++);
607  }
608 }
609 
610 #endif /* LV_HAVE_NEONV8 */
611 
612 #endif /* INCLUDED_volk_32f_asin_32f_u_H */
volk_32f_asin_32f_generic
static void volk_32f_asin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:69
_varcsinq_f32
static float32x4_t _varcsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:130
volk_32f_asin_32f_neon
static void volk_32f_asin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:511
_mm_arcsin_poly_sse
static __m128 _mm_arcsin_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:92
volk_avx512_intrinsics.h
volk_arcsin
static float volk_arcsin(const float x)
Definition: volk_common.h:404
_mm512_arcsin_poly_avx512
static __m512 _mm512_arcsin_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:104
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_sse_intrinsics.h
volk_avx2_fma_intrinsics.h
volk_32f_asin_32f_u_avx
static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:362
_vsqrtq_f32
static float32x4_t _vsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:105
volk_neon_intrinsics.h
volk_avx_intrinsics.h
_mm256_arcsin_poly_avx2_fma
static __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:57
volk_32f_asin_32f_a_avx
static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:141
_mm256_arcsin_poly_avx
static __m256 _mm256_arcsin_poly_avx(const __m256 x)
Definition: volk_avx_intrinsics.h:101