Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_invsqrt_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2013, 2014 Free Software Foundation, Inc.
4  * Copyright 2026 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
53 #ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
54 #define INCLUDED_volk_32f_invsqrt_32f_a_H
55 
56 #include <math.h>
57 #include <volk/volk_common.h>
58 
59 #ifdef LV_HAVE_GENERIC
60 
61 static inline void volk_32f_invsqrt_32f_generic(float* cVector,
62  const float* aVector,
63  unsigned int num_points)
64 {
65  for (unsigned int number = 0; number < num_points; number++) {
66  cVector[number] = sqrtf(1.0f / aVector[number]);
67  }
68 }
69 #endif /* LV_HAVE_GENERIC */
70 
71 #ifdef LV_HAVE_GENERIC
72 
73 static inline void volk_32f_invsqrt_32f_recip_sqrt(float* cVector,
74  const float* aVector,
75  unsigned int num_points)
76 {
77  for (unsigned int number = 0; number < num_points; number++) {
78  cVector[number] = 1.0f / sqrtf(aVector[number]);
79  }
80 }
81 #endif /* LV_HAVE_GENERIC */
82 
83 #ifdef LV_HAVE_GENERIC
84 
85 static inline void volk_32f_invsqrt_32f_Q_rsqrt(float* cVector,
86  const float* aVector,
87  unsigned int num_points)
88 {
89  // The famous fast inverse square root from Quake III Arena
90  union {
91  float f;
92  uint32_t u;
93  } conv, result;
94 
95  for (unsigned int number = 0; number < num_points; number++) {
96  float a = aVector[number];
97  float xhalf = 0.5f * a;
98  conv.f = a;
99  uint32_t input_bits = conv.u; // Save original bits for edge case detection
100  conv.u = 0x5f3759df - (conv.u >> 1); // The magic (note: use unsigned for shift)
101  float x = conv.f;
102  x = x * (1.5f - xhalf * x * x); // Newton-Raphson iteration 1
103  x = x * (1.5f - xhalf * x * x);
104  x = x * (1.5f - xhalf * x * x);
105 
106  // Branchless special case handling
107  result.f = x;
108  uint32_t is_positive = (uint32_t)(-(int32_t)(a > 0.0f));
109  uint32_t is_zero = (uint32_t)(-(int32_t)(input_bits == 0x00000000));
110  uint32_t is_inf = (uint32_t)(-(int32_t)(input_bits == 0x7F800000));
111  uint32_t is_normal_pos = is_positive & ~is_inf;
112  result.u = (result.u & is_normal_pos) | // Normal positive: keep result
113  (0x7F800000u & is_zero) | // +0 → +Inf
114  (0x7FC00000u & ~is_positive & ~is_zero); // Negative/NaN → NaN
115  // Note: +Inf → 0 is handled implicitly (all terms are 0 when is_inf)
116  cVector[number] = result.f;
117  }
118 }
119 #endif /* LV_HAVE_GENERIC */
120 
121 #ifdef LV_HAVE_SSE
123 #include <xmmintrin.h>
124 
125 static inline void
126 volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
127 {
128  unsigned int number = 0;
129  const unsigned int quarterPoints = num_points / 4;
130 
131  float* cPtr = cVector;
132  const float* aPtr = aVector;
133  for (; number < quarterPoints; number++) {
134  __m128 aVal = _mm_load_ps(aPtr);
135  __m128 cVal = _mm_rsqrt_nr_ps(aVal);
136  _mm_store_ps(cPtr, cVal);
137  aPtr += 4;
138  cPtr += 4;
139  }
140 
141  number = quarterPoints * 4;
142  for (; number < num_points; number++) {
143  *cPtr++ = 1.0f / sqrtf(*aPtr++);
144  }
145 }
146 #endif /* LV_HAVE_SSE */
147 
148 #ifdef LV_HAVE_AVX
149 #include <immintrin.h>
151 
152 static inline void
153 volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
154 {
155  unsigned int number = 0;
156  const unsigned int eighthPoints = num_points / 8;
157 
158  float* cPtr = cVector;
159  const float* aPtr = aVector;
160  for (; number < eighthPoints; number++) {
161  __m256 aVal = _mm256_load_ps(aPtr);
162  __m256 cVal = _mm256_rsqrt_nr_ps(aVal);
163  _mm256_store_ps(cPtr, cVal);
164  aPtr += 8;
165  cPtr += 8;
166  }
167 
168  number = eighthPoints * 8;
169  for (; number < num_points; number++) {
170  *cPtr++ = 1.0f / sqrtf(*aPtr++);
171  }
172 }
173 #endif /* LV_HAVE_AVX */
174 
175 
176 #ifdef LV_HAVE_AVX2
177 #include <immintrin.h>
179 
180 static inline void
181 volk_32f_invsqrt_32f_a_avx2(float* cVector, const float* aVector, unsigned int num_points)
182 {
183  unsigned int number = 0;
184  const unsigned int eighthPoints = num_points / 8;
185 
186  float* cPtr = cVector;
187  const float* aPtr = aVector;
188  for (; number < eighthPoints; number++) {
189  __m256 aVal = _mm256_load_ps(aPtr);
190  __m256 cVal = _mm256_rsqrt_nr_avx2(aVal);
191  _mm256_store_ps(cPtr, cVal);
192  aPtr += 8;
193  cPtr += 8;
194  }
195 
196  number = eighthPoints * 8;
197  for (; number < num_points; number++) {
198  *cPtr++ = 1.0f / sqrtf(*aPtr++);
199  }
200 }
201 #endif /* LV_HAVE_AVX2 */
202 
203 
204 #ifdef LV_HAVE_AVX512F
205 #include <immintrin.h>
207 
208 static inline void volk_32f_invsqrt_32f_a_avx512f(float* cVector,
209  const float* aVector,
210  unsigned int num_points)
211 {
212  unsigned int number = 0;
213  const unsigned int sixteenthPoints = num_points / 16;
214 
215  float* cPtr = cVector;
216  const float* aPtr = aVector;
217  for (; number < sixteenthPoints; number++) {
218  __m512 aVal = _mm512_load_ps(aPtr);
219  __m512 cVal = _mm512_rsqrt_nr_ps(aVal);
220  _mm512_store_ps(cPtr, cVal);
221  aPtr += 16;
222  cPtr += 16;
223  }
224 
225  number = sixteenthPoints * 16;
226  for (; number < num_points; number++) {
227  *cPtr++ = 1.0f / sqrtf(*aPtr++);
228  }
229 }
230 #endif /* LV_HAVE_AVX512F */
231 
232 #ifdef LV_HAVE_SSE
234 #include <xmmintrin.h>
235 
236 static inline void
237 volk_32f_invsqrt_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
238 {
239  unsigned int number = 0;
240  const unsigned int quarterPoints = num_points / 4;
241 
242  float* cPtr = cVector;
243  const float* aPtr = aVector;
244  for (; number < quarterPoints; number++) {
245  __m128 aVal = _mm_loadu_ps(aPtr);
246  __m128 cVal = _mm_rsqrt_nr_ps(aVal);
247  _mm_storeu_ps(cPtr, cVal);
248  aPtr += 4;
249  cPtr += 4;
250  }
251 
252  number = quarterPoints * 4;
253  for (; number < num_points; number++) {
254  *cPtr++ = 1.0f / sqrtf(*aPtr++);
255  }
256 }
257 #endif /* LV_HAVE_SSE */
258 
259 
260 #ifdef LV_HAVE_AVX
261 #include <immintrin.h>
263 
264 static inline void
265 volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
266 {
267  unsigned int number = 0;
268  const unsigned int eighthPoints = num_points / 8;
269 
270  float* cPtr = cVector;
271  const float* aPtr = aVector;
272  for (; number < eighthPoints; number++) {
273  __m256 aVal = _mm256_loadu_ps(aPtr);
274  __m256 cVal = _mm256_rsqrt_nr_ps(aVal);
275  _mm256_storeu_ps(cPtr, cVal);
276  aPtr += 8;
277  cPtr += 8;
278  }
279 
280  number = eighthPoints * 8;
281  for (; number < num_points; number++) {
282  *cPtr++ = 1.0f / sqrtf(*aPtr++);
283  }
284 }
285 #endif /* LV_HAVE_AVX */
286 
287 #ifdef LV_HAVE_AVX2
288 #include <immintrin.h>
290 
291 static inline void
292 volk_32f_invsqrt_32f_u_avx2(float* cVector, const float* aVector, unsigned int num_points)
293 {
294  unsigned int number = 0;
295  const unsigned int eighthPoints = num_points / 8;
296 
297  float* cPtr = cVector;
298  const float* aPtr = aVector;
299  for (; number < eighthPoints; number++) {
300  __m256 aVal = _mm256_loadu_ps(aPtr);
301  __m256 cVal = _mm256_rsqrt_nr_avx2(aVal);
302  _mm256_storeu_ps(cPtr, cVal);
303  aPtr += 8;
304  cPtr += 8;
305  }
306 
307  number = eighthPoints * 8;
308  for (; number < num_points; number++) {
309  *cPtr++ = 1.0f / sqrtf(*aPtr++);
310  }
311 }
312 #endif /* LV_HAVE_AVX2 */
313 
314 #ifdef LV_HAVE_AVX512F
315 #include <immintrin.h>
317 
318 static inline void volk_32f_invsqrt_32f_u_avx512f(float* cVector,
319  const float* aVector,
320  unsigned int num_points)
321 {
322  unsigned int number = 0;
323  const unsigned int sixteenthPoints = num_points / 16;
324 
325  float* cPtr = cVector;
326  const float* aPtr = aVector;
327  for (; number < sixteenthPoints; number++) {
328  __m512 aVal = _mm512_loadu_ps(aPtr);
329  __m512 cVal = _mm512_rsqrt_nr_ps(aVal);
330  _mm512_storeu_ps(cPtr, cVal);
331  aPtr += 16;
332  cPtr += 16;
333  }
334 
335  number = sixteenthPoints * 16;
336  for (; number < num_points; number++) {
337  *cPtr++ = 1.0f / sqrtf(*aPtr++);
338  }
339 }
340 #endif /* LV_HAVE_AVX512F */
341 
342 #ifdef LV_HAVE_NEON
343 #include <arm_neon.h>
345 
346 static inline void
347 volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
348 {
349  unsigned int number;
350  const unsigned int quarter_points = num_points / 4;
351 
352  float* cPtr = cVector;
353  const float* aPtr = aVector;
354  for (number = 0; number < quarter_points; ++number) {
355  float32x4_t a_val = vld1q_f32(aPtr);
356  float32x4_t c_val = _vinvsqrtq_f32(a_val);
357  vst1q_f32(cPtr, c_val);
358  aPtr += 4;
359  cPtr += 4;
360  }
361 
362  for (number = quarter_points * 4; number < num_points; number++) {
363  *cPtr++ = 1.0f / sqrtf(*aPtr++);
364  }
365 }
366 #endif /* LV_HAVE_NEON */
367 
368 #ifdef LV_HAVE_NEONV8
369 #include <arm_neon.h>
370 
371 static inline void
372 volk_32f_invsqrt_32f_neonv8(float* cVector, const float* aVector, unsigned int num_points)
373 {
374  unsigned int number;
375  const unsigned int quarter_points = num_points / 4;
376 
377  float* cPtr = cVector;
378  const float* aPtr = aVector;
379 
380  // Process 4 elements at a time (1 vector)
381  for (number = 0; number < quarter_points; ++number) {
382  float32x4_t a = vld1q_f32(aPtr);
383  float32x4_t x0 = vrsqrteq_f32(a); // +Inf for +0, 0 for +Inf
384 
385  // Two Newton-Raphson iterations for float32 accuracy
386  float32x4_t x1 = vmulq_f32(x0, vrsqrtsq_f32(vmulq_f32(a, x0), x0));
387  x1 = vmulq_f32(x1, vrsqrtsq_f32(vmulq_f32(a, x1), x1));
388 
389  // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
390  // Blend: use x0 where a == +0 or a == +Inf, else use x1
391  uint32x4_t a_bits = vreinterpretq_u32_f32(a);
392  uint32x4_t zero_mask = vceqq_u32(a_bits, vdupq_n_u32(0x00000000));
393  uint32x4_t inf_mask = vceqq_u32(a_bits, vdupq_n_u32(0x7F800000));
394  uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);
395 
396  vst1q_f32(cPtr, vbslq_f32(special_mask, x0, x1));
397  aPtr += 4;
398  cPtr += 4;
399  }
400 
401  for (number = quarter_points * 4; number < num_points; number++) {
402  *cPtr++ = 1.0f / sqrtf(*aPtr++);
403  }
404 }
405 #endif /* LV_HAVE_NEONV8 */
406 
407 #ifdef LV_HAVE_RVV
408 #include <riscv_vector.h>
409 
410 static inline void
411 volk_32f_invsqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
412 {
413  size_t n = num_points;
414  for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
415  vl = __riscv_vsetvl_e32m8(n);
416  vfloat32m8_t a = __riscv_vle32_v_f32m8(aVector, vl);
417  vfloat32m8_t half = __riscv_vfmv_v_f_f32m8(0.5f, vl);
418  vfloat32m8_t three_halfs = __riscv_vfmv_v_f_f32m8(1.5f, vl);
419 
420  // Initial estimate (~7-bit precision): +Inf for +0, 0 for +Inf
421  vfloat32m8_t x0 = __riscv_vfrsqrt7(a, vl);
422 
423  // Two Newton-Raphson iterations: x = x * (1.5 - 0.5 * a * x * x)
424  vfloat32m8_t half_a = __riscv_vfmul(half, a, vl);
425  vfloat32m8_t x1 = __riscv_vfmul(
426  x0, __riscv_vfnmsac(three_halfs, half_a, __riscv_vfmul(x0, x0, vl), vl), vl);
427  x1 = __riscv_vfmul(
428  x1, __riscv_vfnmsac(three_halfs, half_a, __riscv_vfmul(x1, x1, vl), vl), vl);
429 
430  // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
431  // Blend: use x0 where a == +0 or a == +Inf, else use x1
432  vuint32m8_t a_bits = __riscv_vreinterpret_v_f32m8_u32m8(a);
433  vbool4_t zero_mask = __riscv_vmseq_vx_u32m8_b4(a_bits, 0x00000000, vl);
434  vbool4_t inf_mask = __riscv_vmseq_vx_u32m8_b4(a_bits, 0x7F800000, vl);
435  vbool4_t special_mask = __riscv_vmor_mm_b4(zero_mask, inf_mask, vl);
436  vfloat32m8_t result = __riscv_vmerge_vvm_f32m8(x1, x0, special_mask, vl);
437 
438  __riscv_vse32(cVector, result, vl);
439  }
440 }
441 #endif /*LV_HAVE_RVV*/
442 
443 #endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */
volk_avx512_intrinsics.h
volk_32f_invsqrt_32f_u_avx
static void volk_32f_invsqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:265
volk_32f_invsqrt_32f_neon
static void volk_32f_invsqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:347
volk_32f_invsqrt_32f_generic
static void volk_32f_invsqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:61
volk_common.h
volk_32f_invsqrt_32f_recip_sqrt
static void volk_32f_invsqrt_32f_recip_sqrt(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:73
volk_sse_intrinsics.h
_mm256_rsqrt_nr_avx2
static __m256 _mm256_rsqrt_nr_avx2(const __m256 a)
Definition: volk_avx2_intrinsics.h:26
volk_32f_invsqrt_32f_u_sse
static void volk_32f_invsqrt_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:237
_vinvsqrtq_f32
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:85
volk_32f_invsqrt_32f_a_avx
static void volk_32f_invsqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:153
_mm_rsqrt_nr_ps
static __m128 _mm_rsqrt_nr_ps(const __m128 a)
Definition: volk_sse_intrinsics.h:27
_mm256_rsqrt_nr_ps
static __m256 _mm256_rsqrt_nr_ps(const __m256 a)
Definition: volk_avx_intrinsics.h:26
volk_neon_intrinsics.h
_mm512_rsqrt_nr_ps
static __m512 _mm512_rsqrt_nr_ps(const __m512 a)
Definition: volk_avx512_intrinsics.h:26
volk_avx_intrinsics.h
volk_32f_invsqrt_32f_a_sse
static void volk_32f_invsqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:126
volk_avx2_intrinsics.h
volk_32f_invsqrt_32f_Q_rsqrt
static void volk_32f_invsqrt_32f_Q_rsqrt(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:85