Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_acos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 #include <volk/volk_common.h>
62 
63 #ifndef INCLUDED_volk_32f_acos_32f_a_H
64 #define INCLUDED_volk_32f_acos_32f_a_H
65 
66 #ifdef LV_HAVE_GENERIC
67 
68 static inline void
69 volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
70 {
71  for (unsigned int i = 0; i < num_points; i++) {
72  bVector[i] = volk_arccos(aVector[i]);
73  }
74 }
75 
76 #endif /* LV_HAVE_GENERIC */
77 
78 #ifdef LV_HAVE_SSE4_1
79 #include <smmintrin.h>
81 
82 static inline void
83 volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
84 {
85  const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
86  const __m128 half = _mm_set1_ps(0.5f);
87  const __m128 one = _mm_set1_ps(1.0f);
88  const __m128 two = _mm_set1_ps(2.0f);
89  const __m128 sign_mask = _mm_set1_ps(-0.0f);
90 
91  unsigned int number = 0;
92  const unsigned int quarterPoints = num_points / 4;
93 
94  for (; number < quarterPoints; number++) {
95  __m128 aVal = _mm_load_ps(aVector);
96 
97  // acos(x) = pi/2 - asin(x)
98  // Get absolute value and sign
99  __m128 sign = _mm_and_ps(aVal, sign_mask);
100  __m128 ax = _mm_andnot_ps(sign_mask, aVal);
101 
102  // Two-range asin computation
103  __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
104  __m128 s = _mm_sqrt_ps(t);
105 
106  __m128 poly_small = _mm_arcsin_poly_sse(ax);
107  __m128 poly_large = _mm_arcsin_poly_sse(s);
108 
109  __m128 asin_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
110 
111  __m128 mask = _mm_cmpgt_ps(ax, half);
112  __m128 asin_result = _mm_blendv_ps(poly_small, asin_large, mask);
113 
114  // Apply sign to get asin(x)
115  asin_result = _mm_or_ps(asin_result, sign);
116 
117  // acos(x) = pi/2 - asin(x)
118  __m128 result = _mm_sub_ps(pi_2, asin_result);
119 
120  _mm_store_ps(bVector, result);
121 
122  aVector += 4;
123  bVector += 4;
124  }
125 
126  number = quarterPoints * 4;
127  for (; number < num_points; number++) {
128  *bVector++ = volk_arccos(*aVector++);
129  }
130 }
131 
132 #endif /* LV_HAVE_SSE4_1 */
133 
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
137 
138 static inline void
139 volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
140 {
141  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
142  const __m256 half = _mm256_set1_ps(0.5f);
143  const __m256 one = _mm256_set1_ps(1.0f);
144  const __m256 two = _mm256_set1_ps(2.0f);
145  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
146 
147  unsigned int number = 0;
148  const unsigned int eighthPoints = num_points / 8;
149 
150  for (; number < eighthPoints; number++) {
151  __m256 aVal = _mm256_load_ps(aVector);
152 
153  __m256 sign = _mm256_and_ps(aVal, sign_mask);
154  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
155 
156  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
157  __m256 s = _mm256_sqrt_ps(t);
158 
159  __m256 poly_small = _mm256_arcsin_poly_avx(ax);
160  __m256 poly_large = _mm256_arcsin_poly_avx(s);
161 
162  __m256 asin_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
163 
164  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
165  __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
166 
167  asin_result = _mm256_or_ps(asin_result, sign);
168 
169  __m256 result = _mm256_sub_ps(pi_2, asin_result);
170 
171  _mm256_store_ps(bVector, result);
172 
173  aVector += 8;
174  bVector += 8;
175  }
176 
177  number = eighthPoints * 8;
178  for (; number < num_points; number++) {
179  *bVector++ = volk_arccos(*aVector++);
180  }
181 }
182 
183 #endif /* LV_HAVE_AVX */
184 
185 #ifdef LV_HAVE_AVX2
186 #include <immintrin.h>
188 
189 static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
190  const float* aVector,
191  unsigned int num_points)
192 {
193  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
194  const __m256 half = _mm256_set1_ps(0.5f);
195  const __m256 one = _mm256_set1_ps(1.0f);
196  const __m256 two = _mm256_set1_ps(2.0f);
197  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
198 
199  unsigned int number = 0;
200  const unsigned int eighthPoints = num_points / 8;
201 
202  for (; number < eighthPoints; number++) {
203  __m256 aVal = _mm256_load_ps(aVector);
204 
205  __m256 sign = _mm256_and_ps(aVal, sign_mask);
206  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
207 
208  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
209  __m256 s = _mm256_sqrt_ps(t);
210 
211  __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
212  __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
213 
214  __m256 asin_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
215 
216  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
217  __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
218 
219  asin_result = _mm256_or_ps(asin_result, sign);
220 
221  __m256 result = _mm256_sub_ps(pi_2, asin_result);
222 
223  _mm256_store_ps(bVector, result);
224 
225  aVector += 8;
226  bVector += 8;
227  }
228 
229  number = eighthPoints * 8;
230  for (; number < num_points; number++) {
231  *bVector++ = volk_arccos(*aVector++);
232  }
233 }
234 
235 #endif /* LV_HAVE_AVX2 */
236 
237 #ifdef LV_HAVE_AVX512F
238 #include <immintrin.h>
240 
241 static inline void
242 volk_32f_acos_32f_a_avx512(float* bVector, const float* aVector, unsigned int num_points)
243 {
244  const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
245  const __m512 half = _mm512_set1_ps(0.5f);
246  const __m512 one = _mm512_set1_ps(1.0f);
247  const __m512 two = _mm512_set1_ps(2.0f);
248  const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
249 
250  unsigned int number = 0;
251  const unsigned int sixteenthPoints = num_points / 16;
252 
253  for (; number < sixteenthPoints; number++) {
254  __m512 aVal = _mm512_load_ps(aVector);
255 
256  __m512i aVal_i = _mm512_castps_si512(aVal);
257  __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
258  __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
259 
260  __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
261  __m512 s = _mm512_sqrt_ps(t);
262 
263  __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
264  __m512 poly_large = _mm512_arcsin_poly_avx512(s);
265 
266  __m512 asin_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
267 
268  __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
269  __m512 asin_result = _mm512_mask_blend_ps(mask, poly_small, asin_large);
270 
271  asin_result =
272  _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(asin_result), sign));
273 
274  __m512 result = _mm512_sub_ps(pi_2, asin_result);
275 
276  _mm512_store_ps(bVector, result);
277 
278  aVector += 16;
279  bVector += 16;
280  }
281 
282  number = sixteenthPoints * 16;
283  for (; number < num_points; number++) {
284  *bVector++ = volk_arccos(*aVector++);
285  }
286 }
287 
288 #endif /* LV_HAVE_AVX512F */
289 
290 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
291 
292 #ifndef INCLUDED_volk_32f_acos_32f_u_H
293 #define INCLUDED_volk_32f_acos_32f_u_H
294 
295 #ifdef LV_HAVE_SSE4_1
296 #include <smmintrin.h>
298 
299 static inline void
300 volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
301 {
302  const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
303  const __m128 half = _mm_set1_ps(0.5f);
304  const __m128 one = _mm_set1_ps(1.0f);
305  const __m128 two = _mm_set1_ps(2.0f);
306  const __m128 sign_mask = _mm_set1_ps(-0.0f);
307 
308  unsigned int number = 0;
309  const unsigned int quarterPoints = num_points / 4;
310 
311  for (; number < quarterPoints; number++) {
312  __m128 aVal = _mm_loadu_ps(aVector);
313 
314  __m128 sign = _mm_and_ps(aVal, sign_mask);
315  __m128 ax = _mm_andnot_ps(sign_mask, aVal);
316 
317  __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
318  __m128 s = _mm_sqrt_ps(t);
319 
320  __m128 poly_small = _mm_arcsin_poly_sse(ax);
321  __m128 poly_large = _mm_arcsin_poly_sse(s);
322 
323  __m128 asin_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
324 
325  __m128 mask = _mm_cmpgt_ps(ax, half);
326  __m128 asin_result = _mm_blendv_ps(poly_small, asin_large, mask);
327 
328  asin_result = _mm_or_ps(asin_result, sign);
329 
330  __m128 result = _mm_sub_ps(pi_2, asin_result);
331 
332  _mm_storeu_ps(bVector, result);
333 
334  aVector += 4;
335  bVector += 4;
336  }
337 
338  number = quarterPoints * 4;
339  for (; number < num_points; number++) {
340  *bVector++ = volk_arccos(*aVector++);
341  }
342 }
343 
344 #endif /* LV_HAVE_SSE4_1 */
345 
346 #ifdef LV_HAVE_AVX
347 #include <immintrin.h>
349 
350 static inline void
351 volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
352 {
353  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
354  const __m256 half = _mm256_set1_ps(0.5f);
355  const __m256 one = _mm256_set1_ps(1.0f);
356  const __m256 two = _mm256_set1_ps(2.0f);
357  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
358 
359  unsigned int number = 0;
360  const unsigned int eighthPoints = num_points / 8;
361 
362  for (; number < eighthPoints; number++) {
363  __m256 aVal = _mm256_loadu_ps(aVector);
364 
365  __m256 sign = _mm256_and_ps(aVal, sign_mask);
366  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
367 
368  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
369  __m256 s = _mm256_sqrt_ps(t);
370 
371  __m256 poly_small = _mm256_arcsin_poly_avx(ax);
372  __m256 poly_large = _mm256_arcsin_poly_avx(s);
373 
374  __m256 asin_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
375 
376  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
377  __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
378 
379  asin_result = _mm256_or_ps(asin_result, sign);
380 
381  __m256 result = _mm256_sub_ps(pi_2, asin_result);
382 
383  _mm256_storeu_ps(bVector, result);
384 
385  aVector += 8;
386  bVector += 8;
387  }
388 
389  number = eighthPoints * 8;
390  for (; number < num_points; number++) {
391  *bVector++ = volk_arccos(*aVector++);
392  }
393 }
394 
395 #endif /* LV_HAVE_AVX */
396 
397 #ifdef LV_HAVE_AVX2
398 #include <immintrin.h>
400 
401 static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
402  const float* aVector,
403  unsigned int num_points)
404 {
405  const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
406  const __m256 half = _mm256_set1_ps(0.5f);
407  const __m256 one = _mm256_set1_ps(1.0f);
408  const __m256 two = _mm256_set1_ps(2.0f);
409  const __m256 sign_mask = _mm256_set1_ps(-0.0f);
410 
411  unsigned int number = 0;
412  const unsigned int eighthPoints = num_points / 8;
413 
414  for (; number < eighthPoints; number++) {
415  __m256 aVal = _mm256_loadu_ps(aVector);
416 
417  __m256 sign = _mm256_and_ps(aVal, sign_mask);
418  __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
419 
420  __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
421  __m256 s = _mm256_sqrt_ps(t);
422 
423  __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
424  __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
425 
426  __m256 asin_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
427 
428  __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
429  __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
430 
431  asin_result = _mm256_or_ps(asin_result, sign);
432 
433  __m256 result = _mm256_sub_ps(pi_2, asin_result);
434 
435  _mm256_storeu_ps(bVector, result);
436 
437  aVector += 8;
438  bVector += 8;
439  }
440 
441  number = eighthPoints * 8;
442  for (; number < num_points; number++) {
443  *bVector++ = volk_arccos(*aVector++);
444  }
445 }
446 
447 #endif /* LV_HAVE_AVX2 */
448 
449 #ifdef LV_HAVE_AVX512F
450 #include <immintrin.h>
452 
453 static inline void
454 volk_32f_acos_32f_u_avx512(float* bVector, const float* aVector, unsigned int num_points)
455 {
456  const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
457  const __m512 half = _mm512_set1_ps(0.5f);
458  const __m512 one = _mm512_set1_ps(1.0f);
459  const __m512 two = _mm512_set1_ps(2.0f);
460  const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
461 
462  unsigned int number = 0;
463  const unsigned int sixteenthPoints = num_points / 16;
464 
465  for (; number < sixteenthPoints; number++) {
466  __m512 aVal = _mm512_loadu_ps(aVector);
467 
468  __m512i aVal_i = _mm512_castps_si512(aVal);
469  __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
470  __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
471 
472  __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
473  __m512 s = _mm512_sqrt_ps(t);
474 
475  __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
476  __m512 poly_large = _mm512_arcsin_poly_avx512(s);
477 
478  __m512 asin_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
479 
480  __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
481  __m512 asin_result = _mm512_mask_blend_ps(mask, poly_small, asin_large);
482 
483  asin_result =
484  _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(asin_result), sign));
485 
486  __m512 result = _mm512_sub_ps(pi_2, asin_result);
487 
488  _mm512_storeu_ps(bVector, result);
489 
490  aVector += 16;
491  bVector += 16;
492  }
493 
494  number = sixteenthPoints * 16;
495  for (; number < num_points; number++) {
496  *bVector++ = volk_arccos(*aVector++);
497  }
498 }
499 
500 #endif /* LV_HAVE_AVX512F */
501 
502 #ifdef LV_HAVE_NEON
503 #include <arm_neon.h>
505 
506 static inline void
507 volk_32f_acos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
508 {
509  const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
510  const float32x4_t half = vdupq_n_f32(0.5f);
511  const float32x4_t one = vdupq_n_f32(1.0f);
512  const float32x4_t two = vdupq_n_f32(2.0f);
513 
514  unsigned int number = 0;
515  const unsigned int quarterPoints = num_points / 4;
516 
517  for (; number < quarterPoints; number++) {
518  float32x4_t aVal = vld1q_f32(aVector);
519 
520  float32x4_t ax = vabsq_f32(aVal);
521  uint32x4_t sign_bits =
522  vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
523 
524  float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
525  float32x4_t s = _vsqrtq_f32(t);
526 
527  float32x4_t poly_small = _varcsinq_f32(ax);
528  float32x4_t poly_large = _varcsinq_f32(s);
529 
530  float32x4_t asin_large = vmlsq_f32(pi_2, two, poly_large);
531 
532  uint32x4_t mask = vcgtq_f32(ax, half);
533  float32x4_t asin_result = vbslq_f32(mask, asin_large, poly_small);
534 
535  asin_result = vreinterpretq_f32_u32(
536  vorrq_u32(vreinterpretq_u32_f32(asin_result), sign_bits));
537 
538  float32x4_t result = vsubq_f32(pi_2, asin_result);
539 
540  vst1q_f32(bVector, result);
541 
542  aVector += 4;
543  bVector += 4;
544  }
545 
546  number = quarterPoints * 4;
547  for (; number < num_points; number++) {
548  *bVector++ = volk_arccos(*aVector++);
549  }
550 }
551 
552 #endif /* LV_HAVE_NEON */
553 
554 #ifdef LV_HAVE_NEONV8
555 #include <arm_neon.h>
557 
558 static inline void
559 volk_32f_acos_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
560 {
561  const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
562  const float32x4_t half = vdupq_n_f32(0.5f);
563  const float32x4_t one = vdupq_n_f32(1.0f);
564  const float32x4_t two = vdupq_n_f32(2.0f);
565 
566  unsigned int number = 0;
567  const unsigned int quarterPoints = num_points / 4;
568 
569  for (; number < quarterPoints; number++) {
570  float32x4_t aVal = vld1q_f32(aVector);
571 
572  float32x4_t ax = vabsq_f32(aVal);
573  uint32x4_t sign_bits =
574  vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
575 
576  float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
577  float32x4_t s = vsqrtq_f32(t);
578 
579  float32x4_t poly_small = _varcsinq_f32_neonv8(ax);
580  float32x4_t poly_large = _varcsinq_f32_neonv8(s);
581 
582  float32x4_t asin_large = vfmsq_f32(pi_2, two, poly_large);
583 
584  uint32x4_t mask = vcgtq_f32(ax, half);
585  float32x4_t asin_result = vbslq_f32(mask, asin_large, poly_small);
586 
587  asin_result = vreinterpretq_f32_u32(
588  vorrq_u32(vreinterpretq_u32_f32(asin_result), sign_bits));
589 
590  float32x4_t result = vsubq_f32(pi_2, asin_result);
591 
592  vst1q_f32(bVector, result);
593 
594  aVector += 4;
595  bVector += 4;
596  }
597 
598  number = quarterPoints * 4;
599  for (; number < num_points; number++) {
600  *bVector++ = volk_arccos(*aVector++);
601  }
602 }
603 
604 #endif /* LV_HAVE_NEONV8 */
605 
606 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
_varcsinq_f32
static float32x4_t _varcsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:130
_mm_arcsin_poly_sse
static __m128 _mm_arcsin_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:92
volk_avx512_intrinsics.h
volk_arccos
static float volk_arccos(const float x)
Definition: volk_common.h:424
_mm512_arcsin_poly_avx512
static __m512 _mm512_arcsin_poly_avx512(const __m512 x)
Definition: volk_avx512_intrinsics.h:104
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_common.h
volk_sse_intrinsics.h
volk_32f_acos_32f_a_avx
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:139
volk_avx2_fma_intrinsics.h
_vsqrtq_f32
static float32x4_t _vsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:105
volk_32f_acos_32f_u_avx
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:351
volk_32f_acos_32f_generic
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:69
volk_neon_intrinsics.h
volk_avx_intrinsics.h
_mm256_arcsin_poly_avx2_fma
static __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:57
_mm256_arcsin_poly_avx
static __m256 _mm256_arcsin_poly_avx(const __m256 x)
Definition: volk_avx_intrinsics.h:101
volk_32f_acos_32f_neon
static void volk_32f_acos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:507