Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_32f_fm_detect_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
44 #ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
45 #define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
46 
47 #include <inttypes.h>
48 #include <stdio.h>
49 
50 #ifdef LV_HAVE_AVX
51 #include <immintrin.h>
52 
/*!
 * \brief Wrapped first difference of a float stream (AVX, aligned buffers).
 *
 * Each output is inputVector[i] - inputVector[i - 1]; the very first sample is
 * differenced against *saveValue instead. A result above bound has 2 * bound
 * subtracted, a result below -bound has 2 * bound added.
 *
 * Layout of the work:
 *   - outputs 0 .. min(8, num_points) - 1 are produced by scalar code,
 *   - whole groups of eight after that are produced with 256-bit vectors,
 *   - whatever is left is finished by scalar code again.
 *
 * The vector body uses aligned loads/stores at offsets that are multiples of
 * eight floats, so inputVector and outputVector must both be 32-byte aligned.
 *
 * \note The vector body evaluates both limits on the unwrapped difference and
 *       ORs the two corrections together. The two masks are mutually exclusive
 *       only for bound >= 0; a negative bound can give results that differ
 *       from the scalar path.
 *
 * \param outputVector Receives num_points wrapped differences.
 * \param inputVector  The num_points samples to difference.
 * \param bound        Wrap limit applied to every difference.
 * \param saveValue    In: the sample that precedes inputVector[0].
 *                     Out: inputVector[num_points - 1].
 *                     Left untouched when num_points is 0.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points == 0) {
        return;
    }

    const float wrapSpan = 2 * bound;
    const unsigned int scalarHead = (num_points > 8) ? 8 : num_points;
    // (num_points - 1) is what the original authors used here (noted as a
    // workaround for a Fedora 7 gcc crash); the loop bounds below depend on it.
    const unsigned int blockCount = (num_points - 1) / 8;

    const __m256 hiLimit = _mm256_set1_ps(bound);
    const __m256 loLimit = _mm256_set1_ps(-bound);
    const __m256 pullDown = _mm256_set1_ps(-2 * bound); // applied when above bound
    const __m256 pushUp = _mm256_set1_ps(2 * bound);    // applied when below -bound

    // Sample 0 is the only one that depends on the carried-in state.
    float delta = inputVector[0] - *saveValue;
    if (delta > bound) {
        delta -= wrapSpan;
    }
    if (delta < -bound) {
        delta += wrapSpan;
    }
    outputVector[0] = delta;

    // Rest of the scalar head (up to the first eight samples in total).
    unsigned int idx = 1;
    for (; idx < scalarHead; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Vector body. Block 0 was already covered by the scalar head, so counting
    // starts at 1; idx is 8 whenever this loop executes at least once.
    for (unsigned int block = 1; block < blockCount; block++) {
        const __m256 previous = _mm256_loadu_ps(inputVector + idx - 1);
        const __m256 current = _mm256_load_ps(inputVector + idx);
        const __m256 diff = _mm256_sub_ps(current, previous);

        // All-ones lanes where a limit is exceeded, then keep only the
        // matching correction in those lanes.
        const __m256 tooHigh = _mm256_cmp_ps(diff, hiLimit, _CMP_GT_OS);
        const __m256 tooLow = _mm256_cmp_ps(diff, loLimit, _CMP_LT_OS);
        const __m256 correction = _mm256_or_ps(_mm256_and_ps(tooLow, pushUp),
                                               _mm256_and_ps(tooHigh, pullDown));

        _mm256_store_ps(outputVector + idx, _mm256_add_ps(diff, correction));
        idx += 8;
    }

    // Scalar tail: everything the vector body did not reach.
    for (; idx < num_points; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
127 #endif /* LV_HAVE_AVX */
128 
129 
130 #ifdef LV_HAVE_SSE
131 #include <xmmintrin.h>
132 
/*!
 * \brief Wrapped first difference of a float stream (SSE, aligned buffers).
 *
 * Each output is inputVector[i] - inputVector[i - 1]; the very first sample is
 * differenced against *saveValue instead. A result above bound has 2 * bound
 * subtracted, a result below -bound has 2 * bound added.
 *
 * The first min(4, num_points) outputs and the trailing leftovers are computed
 * by scalar code; the groups of four in between use 128-bit vectors. The
 * vector body uses aligned loads/stores at offsets that are multiples of four
 * floats, so inputVector and outputVector must both be 16-byte aligned.
 *
 * \note The vector body evaluates both limits on the unwrapped difference and
 *       ORs the two corrections together. The two masks are mutually exclusive
 *       only for bound >= 0; a negative bound can give results that differ
 *       from the scalar path.
 *
 * \param outputVector Receives num_points wrapped differences.
 * \param inputVector  The num_points samples to difference.
 * \param bound        Wrap limit applied to every difference.
 * \param saveValue    In: the sample that precedes inputVector[0].
 *                     Out: inputVector[num_points - 1].
 *                     Left untouched when num_points is 0.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points == 0) {
        return;
    }

    const float wrapSpan = 2 * bound;
    const unsigned int scalarHead = (num_points > 4) ? 4 : num_points;
    // (num_points - 1) is what the original authors used here (noted as a
    // workaround for a Fedora 7 gcc crash); the loop bounds below depend on it.
    const unsigned int blockCount = (num_points - 1) / 4;

    const __m128 hiLimit = _mm_set1_ps(bound);
    const __m128 loLimit = _mm_set1_ps(-bound);
    const __m128 pullDown = _mm_set1_ps(-2 * bound); // applied when above bound
    const __m128 pushUp = _mm_set1_ps(2 * bound);    // applied when below -bound

    // Sample 0 is the only one that depends on the carried-in state.
    float delta = inputVector[0] - *saveValue;
    if (delta > bound) {
        delta -= wrapSpan;
    }
    if (delta < -bound) {
        delta += wrapSpan;
    }
    outputVector[0] = delta;

    // Rest of the scalar head (up to the first four samples in total).
    unsigned int idx = 1;
    for (; idx < scalarHead; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Vector body. Block 0 was already covered by the scalar head, so counting
    // starts at 1; idx is 4 whenever this loop executes at least once.
    for (unsigned int block = 1; block < blockCount; block++) {
        const __m128 previous = _mm_loadu_ps(inputVector + idx - 1);
        const __m128 current = _mm_load_ps(inputVector + idx);
        const __m128 diff = _mm_sub_ps(current, previous);

        // All-ones lanes where a limit is exceeded, then keep only the
        // matching correction in those lanes.
        const __m128 tooHigh = _mm_cmpgt_ps(diff, hiLimit);
        const __m128 tooLow = _mm_cmplt_ps(diff, loLimit);
        const __m128 correction =
            _mm_or_ps(_mm_and_ps(tooLow, pushUp), _mm_and_ps(tooHigh, pullDown));

        _mm_store_ps(outputVector + idx, _mm_add_ps(diff, correction));
        idx += 4;
    }

    // Scalar tail: everything the vector body did not reach.
    for (; idx < num_points; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
208 #endif /* LV_HAVE_SSE */
209 
210 #ifdef LV_HAVE_GENERIC
211 
/*!
 * \brief Wrapped first difference of a float stream (portable scalar version).
 *
 * Each output is inputVector[i] - inputVector[i - 1]; the very first sample is
 * differenced against *saveValue instead. A result above bound has 2 * bound
 * subtracted; after that, a result below -bound has 2 * bound added.
 *
 * \param outputVector Receives num_points wrapped differences.
 * \param inputVector  The num_points samples to difference.
 * \param bound        Wrap limit applied to every difference.
 * \param saveValue    In: the sample that precedes inputVector[0].
 *                     Out: inputVector[num_points - 1].
 *                     Left untouched when num_points is 0.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
                                                           const float* inputVector,
                                                           const float bound,
                                                           float* saveValue,
                                                           unsigned int num_points)
{
    if (num_points == 0) {
        return;
    }

    const float wrapSpan = 2 * bound;

    for (unsigned int idx = 0; idx < num_points; idx++) {
        // Only sample 0 uses the carried-in state. Every later sample re-reads
        // its predecessor from the input buffer rather than caching it.
        const float predecessor = (idx == 0) ? *saveValue : inputVector[idx - 1];

        float delta = inputVector[idx] - predecessor;
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
246 #endif /* LV_HAVE_GENERIC */
247 
248 
249 #ifdef LV_HAVE_NEON
250 #include <arm_neon.h>
251 
/*!
 * \brief Wrapped first difference of a float stream (NEON).
 *
 * Each output is inputVector[i] - inputVector[i - 1]; the very first sample is
 * differenced against *saveValue instead. A result above bound has 2 * bound
 * subtracted, a result below -bound has 2 * bound added.
 *
 * The first min(4, num_points) outputs and the trailing leftovers are computed
 * by scalar code; the groups of four in between use 128-bit NEON vectors.
 *
 * \note In the vector body both limits are tested on the unwrapped difference,
 *       and the "below" correction replaces the "above" one when both masks
 *       are set. That can only happen for a negative bound, where results may
 *       differ from the scalar path.
 *
 * \param outputVector Receives num_points wrapped differences.
 * \param inputVector  The num_points samples to difference.
 * \param bound        Wrap limit applied to every difference.
 * \param saveValue    In: the sample that precedes inputVector[0].
 *                     Out: inputVector[num_points - 1].
 *                     Left untouched when num_points is 0.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_neon(float* outputVector,
                                                        const float* inputVector,
                                                        const float bound,
                                                        float* saveValue,
                                                        unsigned int num_points)
{
    if (num_points == 0) {
        return;
    }

    const float wrapSpan = 2 * bound;
    const unsigned int scalarHead = (num_points > 4) ? 4 : num_points;
    const unsigned int blockCount = (num_points - 1) / 4;

    const float32x4_t hiLimit = vdupq_n_f32(bound);
    const float32x4_t loLimit = vdupq_n_f32(-bound);
    const float32x4_t pullDown = vdupq_n_f32(-2.f * bound); // applied when above bound
    const float32x4_t pushUp = vdupq_n_f32(2.f * bound);    // applied when below -bound
    const float32x4_t noChange = vdupq_n_f32(0);

    // Sample 0 is the only one that depends on the carried-in state.
    float delta = inputVector[0] - *saveValue;
    if (delta > bound) {
        delta -= wrapSpan;
    }
    if (delta < -bound) {
        delta += wrapSpan;
    }
    outputVector[0] = delta;

    // Rest of the scalar head (up to the first four samples in total).
    unsigned int idx = 1;
    for (; idx < scalarHead; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Vector body. Block 0 was already covered by the scalar head, so counting
    // starts at 1; idx is 4 whenever this loop executes at least once.
    for (unsigned int block = 1; block < blockCount; block++) {
        const float32x4_t current = vld1q_f32(inputVector + idx);
        const float32x4_t previous = vld1q_f32(inputVector + idx - 1);
        const float32x4_t diff = vsubq_f32(current, previous);

        const uint32x4_t tooHigh = vcgtq_f32(diff, hiLimit);
        const uint32x4_t tooLow = vcltq_f32(diff, loLimit);

        // Pick the per-lane correction: zero by default, pullDown where the
        // upper limit is exceeded, pushUp where the lower limit is exceeded.
        float32x4_t correction = vbslq_f32(tooHigh, pullDown, noChange);
        correction = vbslq_f32(tooLow, pushUp, correction);

        vst1q_f32(outputVector + idx, vaddq_f32(diff, correction));
        idx += 4;
    }

    // Scalar tail: everything the vector body did not reach.
    for (; idx < num_points; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
328 #endif /* LV_HAVE_NEON */
329 
330 
331 #ifdef LV_HAVE_NEONV8
332 #include <arm_neon.h>
333 
/*!
 * \brief Wrapped first difference of a float stream (NEON, two vectors per pass).
 *
 * Each output is inputVector[i] - inputVector[i - 1]; the very first sample is
 * differenced against *saveValue instead. A result above bound has 2 * bound
 * subtracted, a result below -bound has 2 * bound added.
 *
 * The first min(8, num_points) outputs and the trailing leftovers are computed
 * by scalar code; the groups of eight in between are handled as two 4-lane
 * vectors per loop pass.
 *
 * \note In the vector body both limits are tested on the unwrapped difference,
 *       and the "below" correction replaces the "above" one when both masks
 *       are set. That can only happen for a negative bound, where results may
 *       differ from the scalar path.
 *
 * \param outputVector Receives num_points wrapped differences.
 * \param inputVector  The num_points samples to difference.
 * \param bound        Wrap limit applied to every difference.
 * \param saveValue    In: the sample that precedes inputVector[0].
 *                     Out: inputVector[num_points - 1].
 *                     Left untouched when num_points is 0.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_neonv8(float* outputVector,
                                                          const float* inputVector,
                                                          const float bound,
                                                          float* saveValue,
                                                          unsigned int num_points)
{
    if (num_points == 0) {
        return;
    }

    const float wrapSpan = 2 * bound;
    const unsigned int scalarHead = (num_points > 8) ? 8 : num_points;
    const unsigned int blockCount = (num_points - 1) / 8;

    const float32x4_t hiLimit = vdupq_n_f32(bound);
    const float32x4_t loLimit = vdupq_n_f32(-bound);
    const float32x4_t pullDown = vdupq_n_f32(-2.f * bound); /* applied when above bound */
    const float32x4_t pushUp = vdupq_n_f32(2.f * bound);    /* applied when below -bound */
    const float32x4_t noChange = vdupq_n_f32(0);

    /* Sample 0 is the only one that depends on the carried-in state. */
    float delta = inputVector[0] - *saveValue;
    if (delta > bound) {
        delta -= wrapSpan;
    }
    if (delta < -bound) {
        delta += wrapSpan;
    }
    outputVector[0] = delta;

    /* Rest of the scalar head (up to the first eight samples in total). */
    unsigned int idx = 1;
    for (; idx < scalarHead; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    /* Vector body. Block 0 was already covered by the scalar head, so counting
     * starts at 1; idx is 8 whenever this loop executes at least once. */
    for (unsigned int block = 1; block < blockCount; block++) {
        const float* src = inputVector + idx;
        float* dst = outputVector + idx;

        /* Lower and upper half of the group, each with its one-sample-earlier twin. */
        const float32x4_t currentLo = vld1q_f32(src);
        const float32x4_t previousLo = vld1q_f32(src - 1);
        const float32x4_t currentHi = vld1q_f32(src + 4);
        const float32x4_t previousHi = vld1q_f32(src + 3);
        __VOLK_PREFETCH(src + 16);

        float32x4_t diffLo = vsubq_f32(currentLo, previousLo);
        float32x4_t diffHi = vsubq_f32(currentHi, previousHi);

        /* Lower half: choose the correction per lane, then apply it. */
        const uint32x4_t tooHighLo = vcgtq_f32(diffLo, hiLimit);
        const uint32x4_t tooLowLo = vcltq_f32(diffLo, loLimit);
        float32x4_t correctionLo = vbslq_f32(tooHighLo, pullDown, noChange);
        correctionLo = vbslq_f32(tooLowLo, pushUp, correctionLo);
        diffLo = vaddq_f32(diffLo, correctionLo);

        /* Upper half: same treatment. */
        const uint32x4_t tooHighHi = vcgtq_f32(diffHi, hiLimit);
        const uint32x4_t tooLowHi = vcltq_f32(diffHi, loLimit);
        float32x4_t correctionHi = vbslq_f32(tooHighHi, pullDown, noChange);
        correctionHi = vbslq_f32(tooLowHi, pushUp, correctionHi);
        diffHi = vaddq_f32(diffHi, correctionHi);

        vst1q_f32(dst, diffLo);
        vst1q_f32(dst + 4, diffHi);
        idx += 8;
    }

    /* Scalar tail: everything the vector body did not reach. */
    for (; idx < num_points; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    /* Carry the last input sample over to the next call. */
    *saveValue = inputVector[num_points - 1];
}
422 #endif /* LV_HAVE_NEONV8 */
423 
424 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
425 
426 
427 #ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
428 #define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
429 
430 #include <inttypes.h>
431 #include <stdio.h>
432 
433 #ifdef LV_HAVE_AVX
434 #include <immintrin.h>
435 
/*!
 * \brief Wrapped first difference of a float stream (AVX, unaligned buffers).
 *
 * Each output is inputVector[i] - inputVector[i - 1]; the very first sample is
 * differenced against *saveValue instead. A result above bound has 2 * bound
 * subtracted, a result below -bound has 2 * bound added.
 *
 * The first min(8, num_points) outputs and the trailing leftovers are computed
 * by scalar code; the groups of eight in between use 256-bit vectors. Only
 * unaligned loads and stores are used in the vector body.
 *
 * \note The vector body evaluates both limits on the unwrapped difference and
 *       ORs the two corrections together. The two masks are mutually exclusive
 *       only for bound >= 0; a negative bound can give results that differ
 *       from the scalar path.
 *
 * \param outputVector Receives num_points wrapped differences.
 * \param inputVector  The num_points samples to difference.
 * \param bound        Wrap limit applied to every difference.
 * \param saveValue    In: the sample that precedes inputVector[0].
 *                     Out: inputVector[num_points - 1].
 *                     Left untouched when num_points is 0.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points == 0) {
        return;
    }

    const float wrapSpan = 2 * bound;
    const unsigned int scalarHead = (num_points > 8) ? 8 : num_points;
    // (num_points - 1) is what the original authors used here (noted as a
    // workaround for a Fedora 7 gcc crash); the loop bounds below depend on it.
    const unsigned int blockCount = (num_points - 1) / 8;

    const __m256 hiLimit = _mm256_set1_ps(bound);
    const __m256 loLimit = _mm256_set1_ps(-bound);
    const __m256 pullDown = _mm256_set1_ps(-2 * bound); // applied when above bound
    const __m256 pushUp = _mm256_set1_ps(2 * bound);    // applied when below -bound

    // Sample 0 is the only one that depends on the carried-in state.
    float delta = inputVector[0] - *saveValue;
    if (delta > bound) {
        delta -= wrapSpan;
    }
    if (delta < -bound) {
        delta += wrapSpan;
    }
    outputVector[0] = delta;

    // Rest of the scalar head (up to the first eight samples in total).
    unsigned int idx = 1;
    for (; idx < scalarHead; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Vector body. Block 0 was already covered by the scalar head, so counting
    // starts at 1; idx is 8 whenever this loop executes at least once.
    for (unsigned int block = 1; block < blockCount; block++) {
        const __m256 previous = _mm256_loadu_ps(inputVector + idx - 1);
        const __m256 current = _mm256_loadu_ps(inputVector + idx);
        const __m256 diff = _mm256_sub_ps(current, previous);

        // All-ones lanes where a limit is exceeded, then keep only the
        // matching correction in those lanes.
        const __m256 tooHigh = _mm256_cmp_ps(diff, hiLimit, _CMP_GT_OS);
        const __m256 tooLow = _mm256_cmp_ps(diff, loLimit, _CMP_LT_OS);
        const __m256 correction = _mm256_or_ps(_mm256_and_ps(tooLow, pushUp),
                                               _mm256_and_ps(tooHigh, pullDown));

        _mm256_storeu_ps(outputVector + idx, _mm256_add_ps(diff, correction));
        idx += 8;
    }

    // Scalar tail: everything the vector body did not reach.
    for (; idx < num_points; idx++) {
        delta = inputVector[idx] - inputVector[idx - 1];
        if (delta > bound) {
            delta -= wrapSpan;
        }
        if (delta < -bound) {
            delta += wrapSpan;
        }
        outputVector[idx] = delta;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
510 #endif /* LV_HAVE_AVX */
511 
512 
513 #ifdef LV_HAVE_RVV
514 #include <riscv_vector.h>
515 
/*!
 * \brief Wrapped first difference of a float stream (RISC-V Vector).
 *
 * Each output is inputVector[i] - inputVector[i - 1]; the very first sample is
 * differenced against *saveValue instead. A result above bound has 2 * bound
 * subtracted; after that, a result below -bound has 2 * bound added.
 *
 * Sample 0 is handled by scalar code. All remaining samples go through a
 * single strip-mined loop whose step is whatever vector length vsetvl grants
 * for the elements still outstanding, so no separate tail loop is needed.
 *
 * \param outputVector Receives num_points wrapped differences.
 * \param inputVector  The num_points samples to difference.
 * \param bound        Wrap limit applied to every difference.
 * \param saveValue    In: the sample that precedes inputVector[0].
 *                     Out: inputVector[num_points - 1].
 *                     Left untouched when num_points is 0.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_rvv(float* outputVector,
                                                       const float* inputVector,
                                                       const float bound,
                                                       float* saveValue,
                                                       unsigned int num_points)
{
    if (num_points == 0) {
        return;
    }

    // Sample 0 is the only one that depends on the carried-in state.
    float first = inputVector[0] - *saveValue;
    if (first > bound) {
        first -= 2 * bound;
    }
    if (first < -bound) {
        first += 2 * bound;
    }
    outputVector[0] = first;

    const vfloat32m8_t wrapSpan =
        __riscv_vfmv_v_f_f32m8(bound * 2, __riscv_vsetvlmax_e32m8());

    // Cursors start at sample 1; src[-1] is therefore always a valid sample.
    const float* src = inputVector + 1;
    float* dst = outputVector + 1;
    size_t remaining = num_points - 1;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        const vfloat32m8_t current = __riscv_vle32_v_f32m8(src, vl);
        const vfloat32m8_t previous = __riscv_vle32_v_f32m8(src - 1, vl);
        vfloat32m8_t diff = __riscv_vfsub(current, previous, vl);

        // Masked subtract: lanes above bound are pulled down, the others keep
        // their value (diff is also passed as the mask-off operand).
        const vbool4_t tooHigh = __riscv_vmfgt(diff, bound, vl);
        diff = __riscv_vfsub_mu(tooHigh, diff, diff, wrapSpan, vl);

        // The lower limit is tested on the already-adjusted value, in the same
        // order as the scalar code above.
        const vbool4_t tooLow = __riscv_vmflt(diff, -bound, vl);
        diff = __riscv_vfadd_mu(tooLow, diff, diff, wrapSpan, vl);

        __riscv_vse32(dst, diff, vl);

        remaining -= vl;
        src += vl;
        dst += vl;
    }

    // src now points one past the last sample, so src[-1] is the last input.
    *saveValue = src[-1];
}
548 #endif /*LV_HAVE_RVV*/
549 
550 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
volk_32f_s32f_32f_fm_detect_32f_u_avx
static void volk_32f_s32f_32f_fm_detect_32f_u_avx(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:436
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_s32f_32f_fm_detect_32f_a_sse
static void volk_32f_s32f_32f_fm_detect_32f_a_sse(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:133
volk_32f_s32f_32f_fm_detect_32f_a_avx
static void volk_32f_s32f_32f_fm_detect_32f_a_avx(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:53
volk_32f_s32f_32f_fm_detect_32f_generic
static void volk_32f_s32f_32f_fm_detect_32f_generic(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:212
volk_32f_s32f_32f_fm_detect_32f_neon
static void volk_32f_s32f_32f_fm_detect_32f_neon(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:252