Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
65 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
66 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
67 
68 #include <inttypes.h>
69 #include <stdio.h>
70 #include <volk/volk_complex.h>
71 
72 #ifdef LV_HAVE_AVX2
73 #include <immintrin.h>
74 
75 static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
76  const lv_32fc_t* src0,
77  const lv_32fc_t* points,
78  unsigned int num_points)
79 {
80  const unsigned int num_bytes = num_points * 8;
81  __m128 xmm0, xmm9, xmm10;
82  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
83 
84  lv_32fc_t diff;
85  float sq_dist;
86  int bound = num_bytes >> 6;
87  int leftovers0 = (num_bytes >> 5) & 1;
88  int leftovers1 = (num_bytes >> 4) & 1;
89  int leftovers2 = (num_bytes >> 3) & 1;
90  int i = 0;
91 
92  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
93  xmm1 = _mm256_setzero_ps();
94  xmm0 = _mm_load_ps((float*)src0);
95  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
96  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
97  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
98 
99  for (; i < bound; ++i) {
100  xmm2 = _mm256_load_ps((float*)&points[0]);
101  xmm3 = _mm256_load_ps((float*)&points[4]);
102  points += 8;
103 
104  xmm4 = _mm256_sub_ps(xmm1, xmm2);
105  xmm5 = _mm256_sub_ps(xmm1, xmm3);
106  xmm6 = _mm256_mul_ps(xmm4, xmm4);
107  xmm7 = _mm256_mul_ps(xmm5, xmm5);
108 
109  xmm4 = _mm256_hadd_ps(xmm6, xmm7);
110  xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
111 
112  _mm256_store_ps(target, xmm4);
113 
114  target += 8;
115  }
116 
117  for (i = 0; i < leftovers0; ++i) {
118 
119  xmm2 = _mm256_load_ps((float*)&points[0]);
120 
121  xmm4 = _mm256_sub_ps(xmm1, xmm2);
122 
123  points += 4;
124 
125  xmm6 = _mm256_mul_ps(xmm4, xmm4);
126 
127  xmm4 = _mm256_hadd_ps(xmm6, xmm6);
128  xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
129 
130  xmm9 = _mm256_extractf128_ps(xmm4, 1);
131  _mm_store_ps(target, xmm9);
132 
133  target += 4;
134  }
135 
136  for (i = 0; i < leftovers1; ++i) {
137  xmm9 = _mm_load_ps((float*)&points[0]);
138 
139  xmm10 = _mm_sub_ps(xmm0, xmm9);
140 
141  points += 2;
142 
143  xmm9 = _mm_mul_ps(xmm10, xmm10);
144 
145  xmm10 = _mm_hadd_ps(xmm9, xmm9);
146 
147  _mm_storeh_pi((__m64*)target, xmm10);
148 
149  target += 2;
150  }
151 
152  for (i = 0; i < leftovers2; ++i) {
153 
154  diff = src0[0] - points[0];
155 
156  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
157 
158  target[0] = sq_dist;
159  }
160 }
161 
162 #endif /*LV_HAVE_AVX2*/
163 
164 #ifdef LV_HAVE_SSE3
165 #include <pmmintrin.h>
166 #include <xmmintrin.h>
167 
168 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
169  const lv_32fc_t* src0,
170  const lv_32fc_t* points,
171  unsigned int num_points)
172 {
173  const unsigned int num_bytes = num_points * 8;
174 
175  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
176 
177  lv_32fc_t diff;
178  float sq_dist;
179  int bound = num_bytes >> 5;
180  int i = 0;
181 
182  xmm1 = _mm_setzero_ps();
183  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
184  xmm1 = _mm_movelh_ps(xmm1, xmm1);
185 
186  for (; i < bound; ++i) {
187  xmm2 = _mm_load_ps((float*)&points[0]);
188  xmm4 = _mm_sub_ps(xmm1, xmm2);
189  xmm3 = _mm_load_ps((float*)&points[2]);
190  xmm5 = _mm_sub_ps(xmm1, xmm3);
191 
192  xmm6 = _mm_mul_ps(xmm4, xmm4);
193  xmm7 = _mm_mul_ps(xmm5, xmm5);
194 
195  xmm4 = _mm_hadd_ps(xmm6, xmm7);
196 
197  _mm_store_ps(target, xmm4);
198 
199  points += 4;
200  target += 4;
201  }
202 
203  if (num_bytes >> 4 & 1) {
204 
205  xmm2 = _mm_load_ps((float*)&points[0]);
206 
207  xmm4 = _mm_sub_ps(xmm1, xmm2);
208 
209  points += 2;
210 
211  xmm6 = _mm_mul_ps(xmm4, xmm4);
212 
213  xmm4 = _mm_hadd_ps(xmm6, xmm6);
214 
215  _mm_storeh_pi((__m64*)target, xmm4);
216 
217  target += 2;
218  }
219 
220  if (num_bytes >> 3 & 1) {
221 
222  diff = src0[0] - points[0];
223 
224  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
225 
226  target[0] = sq_dist;
227  }
228 }
229 
230 #endif /*LV_HAVE_SSE3*/
231 
232 
233 #ifdef LV_HAVE_NEON
234 #include <arm_neon.h>
235 static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
236  const lv_32fc_t* src0,
237  const lv_32fc_t* points,
238  unsigned int num_points)
239 {
240  const unsigned int quarter_points = num_points / 4;
241  unsigned int number;
242 
243  float32x4x2_t a_vec, b_vec;
244  float32x4x2_t diff_vec;
245  float32x4_t tmp, tmp1, dist_sq;
246  a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
247  a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
248  for (number = 0; number < quarter_points; ++number) {
249  b_vec = vld2q_f32((float*)points);
250  diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
251  diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
252  tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
253  tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
254 
255  dist_sq = vaddq_f32(tmp, tmp1);
256  vst1q_f32(target, dist_sq);
257  points += 4;
258  target += 4;
259  }
260  for (number = quarter_points * 4; number < num_points; ++number) {
261  lv_32fc_t diff = src0[0] - *points++;
262  *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
263  }
264 }
265 #endif /* LV_HAVE_NEON */
266 
267 #ifdef LV_HAVE_NEONV8
268 #include <arm_neon.h>
269 
270 static inline void volk_32fc_x2_square_dist_32f_neonv8(float* target,
271  const lv_32fc_t* src0,
272  const lv_32fc_t* points,
273  unsigned int num_points)
274 {
275  const unsigned int quarter_points = num_points / 4;
276  unsigned int number;
277 
278  float32x4x2_t b_vec;
279  float32x4_t diff_real, diff_imag, dist_sq;
280  float32x4_t a_real = vdupq_n_f32(lv_creal(src0[0]));
281  float32x4_t a_imag = vdupq_n_f32(lv_cimag(src0[0]));
282 
283  for (number = 0; number < quarter_points; ++number) {
284  b_vec = vld2q_f32((float*)points);
285  __VOLK_PREFETCH(points + 8);
286 
287  diff_real = vsubq_f32(a_real, b_vec.val[0]);
288  diff_imag = vsubq_f32(a_imag, b_vec.val[1]);
289 
290  /* dist_sq = diff_real^2 + diff_imag^2 using FMA */
291  dist_sq = vfmaq_f32(vmulq_f32(diff_real, diff_real), diff_imag, diff_imag);
292 
293  vst1q_f32(target, dist_sq);
294  points += 4;
295  target += 4;
296  }
297 
298  for (number = quarter_points * 4; number < num_points; ++number) {
299  lv_32fc_t diff = src0[0] - *points++;
300  *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
301  }
302 }
303 #endif /* LV_HAVE_NEONV8 */
304 
305 
306 #ifdef LV_HAVE_GENERIC
307 static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
308  const lv_32fc_t* src0,
309  const lv_32fc_t* points,
310  unsigned int num_points)
311 {
312  const unsigned int num_bytes = num_points * 8;
313 
314  lv_32fc_t diff;
315  float sq_dist;
316  unsigned int i = 0;
317 
318  for (; i < (num_bytes >> 3); ++i) {
319  diff = src0[0] - points[i];
320 
321  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
322 
323  target[i] = sq_dist;
324  }
325 }
326 
327 #endif /*LV_HAVE_GENERIC*/
328 
329 
330 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
331 
332 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
333 #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
334 
335 #include <inttypes.h>
336 #include <stdio.h>
337 #include <volk/volk_complex.h>
338 
339 #ifdef LV_HAVE_AVX2
340 #include <immintrin.h>
341 
342 static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
343  const lv_32fc_t* src0,
344  const lv_32fc_t* points,
345  unsigned int num_points)
346 {
347  const unsigned int num_bytes = num_points * 8;
348  __m128 xmm0, xmm9;
349  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
350 
351  lv_32fc_t diff;
352  float sq_dist;
353  int bound = num_bytes >> 6;
354  int leftovers1 = (num_bytes >> 3) & 0b11;
355  int i = 0;
356 
357  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
358  xmm1 = _mm256_setzero_ps();
359  xmm0 = _mm_loadu_ps((float*)src0);
360  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
361  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
362  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
363 
364  for (; i < bound; ++i) {
365  xmm2 = _mm256_loadu_ps((float*)&points[0]);
366  xmm3 = _mm256_loadu_ps((float*)&points[4]);
367  points += 8;
368 
369  xmm4 = _mm256_sub_ps(xmm1, xmm2);
370  xmm5 = _mm256_sub_ps(xmm1, xmm3);
371  xmm6 = _mm256_mul_ps(xmm4, xmm4);
372  xmm7 = _mm256_mul_ps(xmm5, xmm5);
373 
374  xmm4 = _mm256_hadd_ps(xmm6, xmm7);
375  xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
376 
377  _mm256_storeu_ps(target, xmm4);
378 
379  target += 8;
380  }
381 
382  if (num_bytes >> 5 & 1) {
383 
384  xmm2 = _mm256_loadu_ps((float*)&points[0]);
385 
386  xmm4 = _mm256_sub_ps(xmm1, xmm2);
387 
388  points += 4;
389 
390  xmm6 = _mm256_mul_ps(xmm4, xmm4);
391 
392  xmm4 = _mm256_hadd_ps(xmm6, xmm6);
393  xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
394 
395  xmm9 = _mm256_extractf128_ps(xmm4, 1);
396  _mm_storeu_ps(target, xmm9);
397 
398  target += 4;
399  }
400 
401  for (i = 0; i < leftovers1; ++i) {
402 
403  diff = src0[0] - points[0];
404  points += 1;
405 
406  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
407 
408  target[0] = sq_dist;
409  target += 1;
410  }
411 }
412 
413 #endif /*LV_HAVE_AVX2*/
414 
415 #ifdef LV_HAVE_RVV
416 #include <riscv_vector.h>
417 
418 static inline void volk_32fc_x2_square_dist_32f_rvv(float* target,
419  const lv_32fc_t* src0,
420  const lv_32fc_t* points,
421  unsigned int num_points)
422 {
423  size_t vlmax = __riscv_vsetvlmax_e32m4();
424  vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
425  vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
426 
427  size_t n = num_points;
428  for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
429  vl = __riscv_vsetvl_e32m4(n);
430  vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
431  vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
432  vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
433  vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
434  vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
435  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
436  __riscv_vse32(target, v, vl);
437  }
438 }
439 #endif /*LV_HAVE_RVV*/
440 
441 #ifdef LV_HAVE_RVVSEG
442 #include <riscv_vector.h>
443 
444 static inline void volk_32fc_x2_square_dist_32f_rvvseg(float* target,
445  const lv_32fc_t* src0,
446  const lv_32fc_t* points,
447  unsigned int num_points)
448 {
449  size_t vlmax = __riscv_vsetvlmax_e32m4();
450  vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
451  vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
452 
453  size_t n = num_points;
454  for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
455  vl = __riscv_vsetvl_e32m4(n);
456  vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
457  vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
458  vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
459  vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
460  vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
461  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
462  __riscv_vse32(target, v, vl);
463  }
464 }
465 #endif /*LV_HAVE_RVVSEG*/
466 
467 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_32fc_x2_square_dist_32f_generic
static void volk_32fc_x2_square_dist_32f_generic(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:307
volk_32fc_x2_square_dist_32f_a_sse3
static void volk_32fc_x2_square_dist_32f_a_sse3(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:168
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
volk_32fc_x2_square_dist_32f_neon
static void volk_32fc_x2_square_dist_32f_neon(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:235
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96