Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_multiply_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
56 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
57 #define INCLUDED_volk_32f_s32f_multiply_32f_u_H
58 
59 #include <inttypes.h>
60 #include <stdio.h>
61 
62 #ifdef LV_HAVE_GENERIC
/*
 * Portable C kernel: cVector[i] = aVector[i] * scalar for i in [0, num_points).
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process. With 0 nothing is read or
 *                   written.
 */
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
72 #endif /* LV_HAVE_GENERIC */
73 
74 #ifdef LV_HAVE_SSE
75 #include <xmmintrin.h>
76 
/*
 * SSE kernel using unaligned loads/stores:
 * cVector[i] = aVector[i] * scalar for i in [0, num_points).
 *
 * \param cVector    Output buffer; num_points floats are written. No alignment
 *                   requirement (_mm_storeu_ps is used).
 * \param aVector    Input buffer; num_points floats are read. No alignment
 *                   requirement (_mm_loadu_ps is used).
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of 4 that fits: this many elements go through SIMD. */
    const unsigned int simdPoints = num_points - (num_points % 4);
    const __m128 scalarVec = _mm_set_ps1(scalar);

    unsigned int idx = 0;
    for (; idx < simdPoints; idx += 4) {
        const __m128 in = _mm_loadu_ps(aVector + idx);
        _mm_storeu_ps(cVector + idx, _mm_mul_ps(in, scalarVec));
    }

    /* Up to three leftover elements are handled one at a time. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
103 #endif /* LV_HAVE_SSE */
104 
105 #ifdef LV_HAVE_AVX
106 #include <immintrin.h>
107 
/*
 * AVX kernel using unaligned loads/stores:
 * cVector[i] = aVector[i] * scalar for i in [0, num_points).
 *
 * \param cVector    Output buffer; num_points floats are written. No alignment
 *                   requirement (_mm256_storeu_ps is used).
 * \param aVector    Input buffer; num_points floats are read. No alignment
 *                   requirement (_mm256_loadu_ps is used).
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of 8 that fits: this many elements go through SIMD. */
    const unsigned int simdPoints = num_points - (num_points % 8);
    const __m256 scalarVec = _mm256_set1_ps(scalar);

    unsigned int idx = 0;
    for (; idx < simdPoints; idx += 8) {
        const __m256 in = _mm256_loadu_ps(aVector + idx);
        _mm256_storeu_ps(cVector + idx, _mm256_mul_ps(in, scalarVec));
    }

    /* Up to seven leftover elements are handled one at a time. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
134 #endif /* LV_HAVE_AVX */
135 
136 #ifdef LV_HAVE_RISCV64
/*
 * Declaration only: the definition is not part of this header and must be
 * supplied by another translation unit when LV_HAVE_RISCV64 is defined.
 * NOTE(review): the name suggests an implementation tuned for the SiFive U74
 * core with the same contract as the other kernels in this file
 * (cVector[i] = aVector[i] * scalar) -- confirm against the implementation.
 */
extern void volk_32f_s32f_multiply_32f_sifive_u74(float* cVector,
                                                  const float* aVector,
                                                  const float scalar,
                                                  unsigned int num_points);
141 #endif /* LV_HAVE_RISCV64 */
142 
143 
144 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
145 
146 
147 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
148 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
149 
150 #include <inttypes.h>
151 #include <stdio.h>
152 
153 #ifdef LV_HAVE_SSE
154 #include <xmmintrin.h>
155 
/*
 * SSE kernel using aligned loads/stores:
 * cVector[i] = aVector[i] * scalar for i in [0, num_points).
 *
 * \param cVector    Output buffer; num_points floats are written. Must be
 *                   16-byte aligned because _mm_store_ps is used.
 * \param aVector    Input buffer; num_points floats are read. Must be
 *                   16-byte aligned because _mm_load_ps is used.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of 4 that fits: this many elements go through SIMD. */
    const unsigned int simdPoints = num_points - (num_points % 4);
    const __m128 scalarVec = _mm_set_ps1(scalar);

    unsigned int idx = 0;
    for (; idx < simdPoints; idx += 4) {
        const __m128 in = _mm_load_ps(aVector + idx);
        _mm_store_ps(cVector + idx, _mm_mul_ps(in, scalarVec));
    }

    /* Up to three leftover elements are handled one at a time. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
182 #endif /* LV_HAVE_SSE */
183 
184 #ifdef LV_HAVE_AVX
185 #include <immintrin.h>
186 
/*
 * AVX kernel using aligned loads/stores:
 * cVector[i] = aVector[i] * scalar for i in [0, num_points).
 *
 * \param cVector    Output buffer; num_points floats are written. Must be
 *                   32-byte aligned because _mm256_store_ps is used.
 * \param aVector    Input buffer; num_points floats are read. Must be
 *                   32-byte aligned because _mm256_load_ps is used.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of 8 that fits: this many elements go through SIMD. */
    const unsigned int simdPoints = num_points - (num_points % 8);
    const __m256 scalarVec = _mm256_set1_ps(scalar);

    unsigned int idx = 0;
    for (; idx < simdPoints; idx += 8) {
        const __m256 in = _mm256_load_ps(aVector + idx);
        _mm256_store_ps(cVector + idx, _mm256_mul_ps(in, scalarVec));
    }

    /* Up to seven leftover elements are handled one at a time. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
213 #endif /* LV_HAVE_AVX */
214 
215 #ifdef LV_HAVE_NEON
216 #include <arm_neon.h>
217 
/*
 * NEON kernel: cVector[i] = aVector[i] * scalar for i in [0, num_points).
 * Four floats are processed per iteration with vld1q_f32 / vmulq_n_f32 /
 * vst1q_f32; the remainder is processed in scalar code.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    /* Largest multiple of 4 that fits: this many elements go through SIMD. */
    const unsigned int simdPoints = num_points - (num_points % 4);

    unsigned int idx = 0;
    for (; idx < simdPoints; idx += 4) {
        const float32x4_t in = vld1q_f32(aVector + idx);
        vst1q_f32(cVector + idx, vmulq_n_f32(in, scalar));
    }

    /* Up to three leftover elements are handled one at a time. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
240 #endif /* LV_HAVE_NEON */
241 
242 #ifdef LV_HAVE_NEONV8
243 #include <arm_neon.h>
244 
/*
 * NEON (ARMv8) kernel: cVector[i] = aVector[i] * scalar for i in
 * [0, num_points). Each iteration handles eight floats as two 4-lane
 * vectors; the remainder is processed in scalar code.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_neonv8(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    /* Largest multiple of 8 that fits: this many elements go through SIMD. */
    const unsigned int simdPoints = num_points - (num_points % 8);
    const float32x4_t scalarVec = vdupq_n_f32(scalar);

    unsigned int idx = 0;
    for (; idx < simdPoints; idx += 8) {
        const float32x4_t lo = vld1q_f32(aVector + idx);
        const float32x4_t hi = vld1q_f32(aVector + idx + 4);
        /* Hint for the input two iterations ahead. __VOLK_PREFETCH is defined
         * outside this header (volk_common.h); NOTE(review): the address can
         * lie past the end of aVector near the tail -- confirm the macro never
         * dereferences it. */
        __VOLK_PREFETCH(aVector + idx + 16);

        vst1q_f32(cVector + idx, vmulq_f32(lo, scalarVec));
        vst1q_f32(cVector + idx + 4, vmulq_f32(hi, scalarVec));
    }

    /* Up to seven leftover elements are handled one at a time. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
272 #endif /* LV_HAVE_NEONV8 */
273 
274 
275 #ifdef LV_HAVE_ORC
276 
/*
 * ORC implementation entry point. Declared here, defined outside this header.
 * Note that its element count is a signed int.
 */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
                                                  const float* src,
                                                  const float scalar,
                                                  int num_points);

/*
 * Thin wrapper that forwards all arguments to
 * volk_32f_s32f_multiply_32f_a_orc_impl.
 *
 * \param cVector    Output buffer, passed through as dst.
 * \param aVector    Input buffer, passed through as src.
 * \param scalar     Factor, passed through unchanged.
 * \param num_points Element count; converted from unsigned int to the int the
 *                   implementation expects.
 */
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The cast spells out the unsigned -> int conversion that the call
     * performs anyway; the value passed is unchanged. */
    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, (int)num_points);
}
289 
290 #endif /* LV_HAVE_ORC */
291 
292 #ifdef LV_HAVE_RVV
293 #include <riscv_vector.h>
294 
/*
 * RISC-V Vector kernel: cVector[i] = aVector[i] * scalar for i in
 * [0, num_points). Each pass asks __riscv_vsetvl_e32m8 how many elements to
 * process, so no separate scalar tail loop is needed.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_s32f_multiply_32f_rvv(float* cVector,
                                                  const float* aVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* vl = number of elements handled in this pass. */
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vfloat32m8_t in = __riscv_vle32_v_f32m8(aVector, vl);
        __riscv_vse32(cVector, __riscv_vfmul(in, scalar, vl), vl);

        aVector += vl;
        cVector += vl;
        remaining -= vl;
    }
}
307 #endif /*LV_HAVE_RVV*/
308 
309 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
volk_32f_s32f_multiply_32f_u_sse
static void volk_32f_s32f_multiply_32f_u_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:77
volk_32f_s32f_multiply_32f_generic
static void volk_32f_s32f_multiply_32f_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:63
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
volk_32f_s32f_multiply_32f_u_neon
static void volk_32f_s32f_multiply_32f_u_neon(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:218
volk_32f_s32f_multiply_32f_a_sse
static void volk_32f_s32f_multiply_32f_a_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:156
volk_32f_s32f_multiply_32f_a_avx
static void volk_32f_s32f_multiply_32f_a_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:187
volk_32f_s32f_multiply_32f_u_avx
static void volk_32f_s32f_multiply_32f_u_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:108