Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_x2_clamp_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
44 #ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
45 #define INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
46 
47 #ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: copies num_points floats from `in` to `out`,
 * replacing every value above `max` with `max` and every value below `min`
 * with `min`.
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - lower bound substituted for smaller inputs
 * max        - upper bound substituted for larger inputs
 * num_points - number of elements to process (0 leaves `out` untouched)
 */
static inline void volk_32f_s32f_x2_clamp_32f_generic(float* out,
                                                      const float* in,
                                                      const float min,
                                                      const float max,
                                                      unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; idx++) {
        const float sample = in[idx];
        /* The upper bound is tested first; a value that fails both
         * comparisons is passed through unchanged. */
        out[idx] = (sample > max) ? max : ((sample < min) ? min : sample);
    }
}
67 #endif /* LV_HAVE_GENERIC */
68 
69 #if LV_HAVE_AVX2
70 #include <immintrin.h>
/*
 * AVX2 kernel using the aligned load/store intrinsics: clamps num_points
 * floats from `in` into `out`, eight at a time. Elements left over after the
 * last full group of eight are handed to the generic kernel.
 *
 * out        - destination buffer (accessed with _mm256_store_ps)
 * in         - source buffer (accessed with _mm256_load_ps)
 * min        - lower bound substituted for smaller inputs
 * max        - upper bound substituted for larger inputs
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points - full_blocks * 8;
    const __m256 lower = _mm256_set1_ps(min);
    const __m256 upper = _mm256_set1_ps(max);

    for (unsigned int blk = 0; blk < full_blocks; blk++, in += 8, out += 8) {
        const __m256 x = _mm256_load_ps(in);
        /* Both masks are derived from the unmodified input vector. */
        const __m256 above = _mm256_cmp_ps(upper, x, _CMP_LT_OS);
        const __m256 below = _mm256_cmp_ps(x, lower, _CMP_LT_OS);
        /* Upper bound is blended in first, lower bound second. */
        const __m256 capped = _mm256_blendv_ps(x, upper, above);
        _mm256_store_ps(out, _mm256_blendv_ps(capped, lower, below));
    }

    /* Scalar path for the remaining (num_points % 8) elements. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, leftover);
}
96 #endif /* LV_HAVE_AVX2 */
97 
98 #if LV_HAVE_SSE4_1
99 #include <immintrin.h>
/*
 * SSE4.1 kernel using the aligned load/store intrinsics: clamps num_points
 * floats from `in` into `out`, four at a time. Elements left over after the
 * last full group of four are handed to the generic kernel.
 *
 * out        - destination buffer (accessed with _mm_store_ps)
 * in         - source buffer (accessed with _mm_load_ps)
 * min        - lower bound substituted for smaller inputs
 * max        - upper bound substituted for larger inputs
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 4;
    const unsigned int leftover = num_points - full_blocks * 4;
    const __m128 lower = _mm_set1_ps(min);
    const __m128 upper = _mm_set1_ps(max);

    for (unsigned int blk = 0; blk < full_blocks; blk++, in += 4, out += 4) {
        const __m128 x = _mm_load_ps(in);
        /* Both masks are derived from the unmodified input vector. */
        const __m128 above = _mm_cmplt_ps(upper, x);
        const __m128 below = _mm_cmplt_ps(x, lower);
        /* Upper bound is blended in first, lower bound second. */
        const __m128 capped = _mm_blendv_ps(x, upper, above);
        _mm_store_ps(out, _mm_blendv_ps(capped, lower, below));
    }

    /* Scalar path for the remaining (num_points % 4) elements. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, leftover);
}
125 #endif /* LV_HAVE_SSE4_1 */
126 
127 #endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H */
128 
129 #ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
130 #define INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
131 
132 #if LV_HAVE_AVX2
133 #include <immintrin.h>
/*
 * AVX2 kernel using the unaligned load/store intrinsics: clamps num_points
 * floats from `in` into `out`, eight at a time. Elements left over after the
 * last full group of eight are handed to the generic kernel.
 *
 * out        - destination buffer (accessed with _mm256_storeu_ps)
 * in         - source buffer (accessed with _mm256_loadu_ps)
 * min        - lower bound substituted for smaller inputs
 * max        - upper bound substituted for larger inputs
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points - full_blocks * 8;
    const __m256 lower = _mm256_set1_ps(min);
    const __m256 upper = _mm256_set1_ps(max);

    for (unsigned int blk = 0; blk < full_blocks; blk++, in += 8, out += 8) {
        const __m256 x = _mm256_loadu_ps(in);
        /* Both masks are derived from the unmodified input vector. */
        const __m256 above = _mm256_cmp_ps(upper, x, _CMP_LT_OS);
        const __m256 below = _mm256_cmp_ps(x, lower, _CMP_LT_OS);
        /* Upper bound is blended in first, lower bound second. */
        const __m256 capped = _mm256_blendv_ps(x, upper, above);
        _mm256_storeu_ps(out, _mm256_blendv_ps(capped, lower, below));
    }

    /* Scalar path for the remaining (num_points % 8) elements. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, leftover);
}
159 #endif /* LV_HAVE_AVX2 */
160 
161 #if LV_HAVE_SSE4_1
162 #include <immintrin.h>
/*
 * SSE4.1 kernel using the unaligned load/store intrinsics: clamps num_points
 * floats from `in` into `out`, four at a time. Elements left over after the
 * last full group of four are handed to the generic kernel.
 *
 * out        - destination buffer (accessed with _mm_storeu_ps)
 * in         - source buffer (accessed with _mm_loadu_ps)
 * min        - lower bound substituted for smaller inputs
 * max        - upper bound substituted for larger inputs
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 4;
    const unsigned int leftover = num_points - full_blocks * 4;
    const __m128 lower = _mm_set1_ps(min);
    const __m128 upper = _mm_set1_ps(max);

    for (unsigned int blk = 0; blk < full_blocks; blk++, in += 4, out += 4) {
        const __m128 x = _mm_loadu_ps(in);
        /* Both masks are derived from the unmodified input vector. */
        const __m128 above = _mm_cmplt_ps(upper, x);
        const __m128 below = _mm_cmplt_ps(x, lower);
        /* Upper bound is blended in first, lower bound second. */
        const __m128 capped = _mm_blendv_ps(x, upper, above);
        _mm_storeu_ps(out, _mm_blendv_ps(capped, lower, below));
    }

    /* Scalar path for the remaining (num_points % 4) elements. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, leftover);
}
188 #endif /* LV_HAVE_SSE4_1 */
189 
190 #ifdef LV_HAVE_NEON
191 #include <arm_neon.h>
192 
/*
 * NEON kernel: clamps num_points floats from `in` into `out`, four at a time
 * with vmaxq_f32/vminq_f32, then finishes the remaining elements with a
 * scalar loop.
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_neon(float* out,
                                                   const float* in,
                                                   const float min,
                                                   const float max,
                                                   unsigned int num_points)
{
    const unsigned int quad_count = num_points / 4;
    const float32x4_t lower = vdupq_n_f32(min);
    const float32x4_t upper = vdupq_n_f32(max);

    for (unsigned int q = 0; q < quad_count; q++) {
        /* Raise to the lower bound first, then cap at the upper bound. */
        const float32x4_t raised = vmaxq_f32(vld1q_f32(in), lower);
        vst1q_f32(out, vminq_f32(raised, upper));
        in += 4;
        out += 4;
    }

    /* Scalar tail: here the lower bound is checked before the upper one. */
    for (unsigned int idx = quad_count * 4; idx < num_points; idx++) {
        const float sample = *in++;
        *out++ = (sample < min) ? min : ((sample > max) ? max : sample);
    }
}
224 #endif /* LV_HAVE_NEON */
225 
226 #ifdef LV_HAVE_NEONV8
227 #include <arm_neon.h>
228 
/*
 * NEONv8 kernel: clamps num_points floats from `in` into `out`, processing
 * two 4-float vectors (eight elements) per iteration and issuing
 * __VOLK_PREFETCH on the next group of inputs. Remaining elements are
 * finished with a scalar loop.
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_neonv8(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const unsigned int octet_count = num_points / 8;
    const float32x4_t lower = vdupq_n_f32(min);
    const float32x4_t upper = vdupq_n_f32(max);

    for (unsigned int blk = 0; blk < octet_count; blk++) {
        const float32x4_t first = vld1q_f32(in);
        const float32x4_t second = vld1q_f32(in + 4);
        __VOLK_PREFETCH(in + 8);

        /* Raise both halves to the lower bound, then cap at the upper. */
        const float32x4_t first_raised = vmaxq_f32(first, lower);
        const float32x4_t second_raised = vmaxq_f32(second, lower);
        const float32x4_t first_clamped = vminq_f32(first_raised, upper);
        const float32x4_t second_clamped = vminq_f32(second_raised, upper);

        vst1q_f32(out, first_clamped);
        vst1q_f32(out + 4, second_clamped);
        in += 8;
        out += 8;
    }

    /* Scalar tail: here the lower bound is checked before the upper one. */
    for (unsigned int idx = octet_count * 8; idx < num_points; idx++) {
        const float sample = *in++;
        *out++ = (sample < min) ? min : ((sample > max) ? max : sample);
    }
}
267 #endif /* LV_HAVE_NEONV8 */
268 
269 #ifdef LV_HAVE_RVV
270 #include <riscv_vector.h>
271 
/*
 * RISC-V vector kernel: clamps num_points floats from `in` into `out` using
 * LMUL=8 single-precision vectors. Each pass asks the hardware for a vector
 * length covering the remaining elements, so no separate scalar tail is
 * needed.
 *
 * out        - destination buffer, receives num_points floats
 * in         - source buffer, num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * NOTE(review): vfmax/vfmin are believed to return the non-NaN operand when
 * exactly one input is NaN, which would turn a NaN sample into `min` here,
 * whereas the generic kernel's comparisons pass NaN through - confirm against
 * the RVV specification if NaN inputs matter.
 */
static inline void volk_32f_s32f_x2_clamp_32f_rvv(float* out,
                                                  const float* in,
                                                  const float min,
                                                  const float max,
                                                  unsigned int num_points)
{
    const size_t vlmax = __riscv_vsetvlmax_e32m8();
    const vfloat32m8_t lower = __riscv_vfmv_v_f_f32m8(min, vlmax);
    const vfloat32m8_t upper = __riscv_vfmv_v_f_f32m8(max, vlmax);

    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t chunk = __riscv_vle32_v_f32m8(in, vl);
        /* Raise to the lower bound first, then cap at the upper bound. */
        chunk = __riscv_vfmax(chunk, lower, vl);
        chunk = __riscv_vfmin(chunk, upper, vl);
        __riscv_vse32(out, chunk, vl);

        remaining -= vl;
        in += vl;
        out += vl;
    }
}
288 #endif /*LV_HAVE_RVV*/
289 
290 #endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */
volk_32f_s32f_x2_clamp_32f_neon
static void volk_32f_s32f_x2_clamp_32f_neon(float *out, const float *in, const float min, const float max, unsigned int num_points)
Definition: volk_32f_s32f_x2_clamp_32f.h:193
volk_arch_defs.val
val
Definition: volk_arch_defs.py:57
volk_32f_s32f_x2_clamp_32f_generic
static void volk_32f_s32f_x2_clamp_32f_generic(float *out, const float *in, const float min, const float max, unsigned int num_points)
Definition: volk_32f_s32f_x2_clamp_32f.h:48
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68