Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
66 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
67 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
68 
69 #include <volk/volk_complex.h>
70 
71 
72 static inline void calculate_scaled_distances(float* target,
73  const lv_32fc_t symbol,
74  const lv_32fc_t* points,
75  const float scalar,
76  const unsigned int num_points)
77 {
78  lv_32fc_t diff;
79  for (unsigned int i = 0; i < num_points; ++i) {
80  /*
81  * Calculate: |y - x|^2 * SNR_lin
82  * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
83  */
84  diff = symbol - *points++;
85  *target++ =
86  scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
87  }
88 }
89 
90 
91 #ifdef LV_HAVE_AVX2
92 #include <immintrin.h>
94 
95 static inline void
96 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
97  const lv_32fc_t* src0,
98  const lv_32fc_t* points,
99  float scalar,
100  unsigned int num_points)
101 {
102  const unsigned int num_bytes = num_points * 8;
103  __m128 xmm9, xmm10;
104  __m256 xmm4, xmm6;
105  __m256 xmm_points0, xmm_points1, xmm_result;
106 
107  const unsigned int bound = num_bytes >> 6;
108 
109  // load complex value into all parts of the register.
110  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
111  const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
112 
113  // Load scalar into all 8 parts of the register
114  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115  const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
116 
117  // Set permutation constant
118  const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
119 
120  for (unsigned int i = 0; i < bound; ++i) {
121  xmm_points0 = _mm256_load_ps((float*)points);
122  xmm_points1 = _mm256_load_ps((float*)(points + 4));
123  points += 8;
124  __VOLK_PREFETCH(points);
125 
126  xmm_result = _mm256_scaled_norm_dist_ps_avx2(
127  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
128 
129  _mm256_store_ps(target, xmm_result);
130  target += 8;
131  }
132 
133  if (num_bytes >> 5 & 1) {
134  xmm_points0 = _mm256_load_ps((float*)points);
135 
136  xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
137 
138  points += 4;
139 
140  xmm6 = _mm256_mul_ps(xmm4, xmm4);
141 
142  xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143  xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
144 
145  xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
146 
147  xmm9 = _mm256_extractf128_ps(xmm_result, 1);
148  _mm_store_ps(target, xmm9);
149  target += 4;
150  }
151 
152  if (num_bytes >> 4 & 1) {
153  xmm9 = _mm_load_ps((float*)points);
154 
155  xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
156 
157  points += 2;
158 
159  xmm9 = _mm_mul_ps(xmm10, xmm10);
160 
161  xmm10 = _mm_hadd_ps(xmm9, xmm9);
162 
163  xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
164 
165  _mm_storeh_pi((__m64*)target, xmm10);
166  target += 2;
167  }
168 
169  calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
170 }
171 
172 #endif /*LV_HAVE_AVX2*/
173 
174 
175 #ifdef LV_HAVE_AVX
176 #include <immintrin.h>
178 
179 static inline void
181  const lv_32fc_t* src0,
182  const lv_32fc_t* points,
183  float scalar,
184  unsigned int num_points)
185 {
186  const int eightsPoints = num_points / 8;
187  const int remainder = num_points - 8 * eightsPoints;
188 
189  __m256 xmm_points0, xmm_points1, xmm_result;
190 
191  // load complex value into all parts of the register.
192  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
193 
194  // Load scalar into all 8 parts of the register
195  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
196 
197  for (int i = 0; i < eightsPoints; ++i) {
198  xmm_points0 = _mm256_load_ps((float*)points);
199  xmm_points1 = _mm256_load_ps((float*)(points + 4));
200  points += 8;
201 
202  xmm_result = _mm256_scaled_norm_dist_ps(
203  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
204 
205  _mm256_store_ps(target, xmm_result);
206  target += 8;
207  }
208 
209  const lv_32fc_t symbol = *src0;
210  calculate_scaled_distances(target, symbol, points, scalar, remainder);
211 }
212 
213 #endif /* LV_HAVE_AVX */
214 
215 
216 #ifdef LV_HAVE_SSE3
217 #include <pmmintrin.h>
219 
220 static inline void
222  const lv_32fc_t* src0,
223  const lv_32fc_t* points,
224  float scalar,
225  unsigned int num_points)
226 {
227  __m128 xmm_points0, xmm_points1, xmm_result;
228 
229  /*
230  * First do 4 values in every loop iteration.
231  * There may be up to 3 values left.
232  * leftovers0 indicates if at least 2 more are available for SSE execution.
233  * leftovers1 indicates if there is a single element left.
234  */
235  const int quarterPoints = num_points / 4;
236  const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237  const int leftovers1 = num_points % 2;
238 
239  // load complex value into both parts of the register.
240  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
241 
242  // Load scalar into all 4 parts of the register
243  const __m128 xmm_scalar = _mm_load1_ps(&scalar);
244 
245  for (int i = 0; i < quarterPoints; ++i) {
246  xmm_points0 = _mm_load_ps((float*)points);
247  xmm_points1 = _mm_load_ps((float*)(points + 2));
248  points += 4;
249  __VOLK_PREFETCH(points);
250  // calculate distances
251  xmm_result = _mm_scaled_norm_dist_ps_sse3(
252  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
253 
254  _mm_store_ps(target, xmm_result);
255  target += 4;
256  }
257 
258  for (int i = 0; i < leftovers0; ++i) {
259  xmm_points0 = _mm_load_ps((float*)points);
260  points += 2;
261 
262  xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
263  xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
264  xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
265  xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
266 
267  _mm_storeh_pi((__m64*)target, xmm_result);
268  target += 2;
269  }
270 
271  calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
272 }
273 
274 #endif /*LV_HAVE_SSE3*/
275 
276 #ifdef LV_HAVE_SSE
278 #include <xmmintrin.h>
279 static inline void
281  const lv_32fc_t* src0,
282  const lv_32fc_t* points,
283  float scalar,
284  unsigned int num_points)
285 {
286  const __m128 xmm_scalar = _mm_set1_ps(scalar);
287  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
288 
289  for (unsigned i = 0; i < num_points / 4; ++i) {
290  __m128 xmm_points0 = _mm_load_ps((float*)points);
291  __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
292  points += 4;
293  __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
294  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
295  _mm_store_ps((float*)target, xmm_result);
296  target += 4;
297  }
298 
299  calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
300 }
301 #endif // LV_HAVE_SSE
302 
303 #ifdef LV_HAVE_GENERIC
304 static inline void
306  const lv_32fc_t* src0,
307  const lv_32fc_t* points,
308  float scalar,
309  unsigned int num_points)
310 {
311  const lv_32fc_t symbol = *src0;
312  calculate_scaled_distances(target, symbol, points, scalar, num_points);
313 }
314 
315 #endif /*LV_HAVE_GENERIC*/
316 
317 #ifdef LV_HAVE_NEON
318 #include <arm_neon.h>
319 
320 static inline void
322  const lv_32fc_t* src0,
323  const lv_32fc_t* points,
324  float scalar,
325  unsigned int num_points)
326 {
327  unsigned int number = 0;
328  const unsigned int quarterPoints = num_points / 4;
329 
330  // Load the reference symbol real and imag into vectors
331  const float32x4_t symbolReal = vdupq_n_f32(lv_creal(*src0));
332  const float32x4_t symbolImag = vdupq_n_f32(lv_cimag(*src0));
333  const float32x4_t vScalar = vdupq_n_f32(scalar);
334 
335  for (; number < quarterPoints; number++) {
336  // Load 4 complex points (8 floats) and deinterleave
337  float32x4x2_t pts = vld2q_f32((const float*)points);
338  points += 4;
339 
340  // Calculate difference
341  float32x4_t diffReal = vsubq_f32(symbolReal, pts.val[0]);
342  float32x4_t diffImag = vsubq_f32(symbolImag, pts.val[1]);
343 
344  // Calculate squared magnitude and scale
345  float32x4_t result = vmulq_f32(diffReal, diffReal);
346  result = vmlaq_f32(result, diffImag, diffImag);
347  result = vmulq_f32(result, vScalar);
348 
349  vst1q_f32(target, result);
350  target += 4;
351  }
352 
353  // Handle remaining points
355  target, *src0, points, scalar, num_points - quarterPoints * 4);
356 }
357 
358 #endif /*LV_HAVE_NEON*/
359 
360 #ifdef LV_HAVE_NEONV8
361 #include <arm_neon.h>
362 
363 static inline void
364 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_neonv8(float* target,
365  const lv_32fc_t* src0,
366  const lv_32fc_t* points,
367  float scalar,
368  unsigned int num_points)
369 {
370  unsigned int number = 0;
371  const unsigned int eighthPoints = num_points / 8;
372 
373  // Load the reference symbol real and imag into vectors
374  const float32x4_t symbolReal = vdupq_n_f32(lv_creal(*src0));
375  const float32x4_t symbolImag = vdupq_n_f32(lv_cimag(*src0));
376  const float32x4_t vScalar = vdupq_n_f32(scalar);
377 
378  for (; number < eighthPoints; number++) {
379  __VOLK_PREFETCH(points + 16);
380 
381  // Load 8 complex points (16 floats) and deinterleave
382  float32x4x2_t pts0 = vld2q_f32((const float*)points);
383  float32x4x2_t pts1 = vld2q_f32((const float*)(points + 4));
384  points += 8;
385 
386  // Calculate difference
387  float32x4_t diffReal0 = vsubq_f32(symbolReal, pts0.val[0]);
388  float32x4_t diffImag0 = vsubq_f32(symbolImag, pts0.val[1]);
389  float32x4_t diffReal1 = vsubq_f32(symbolReal, pts1.val[0]);
390  float32x4_t diffImag1 = vsubq_f32(symbolImag, pts1.val[1]);
391 
392  // Calculate squared magnitude: real^2 + imag^2 using FMA
393  float32x4_t result0 =
394  vfmaq_f32(vmulq_f32(diffReal0, diffReal0), diffImag0, diffImag0);
395  float32x4_t result1 =
396  vfmaq_f32(vmulq_f32(diffReal1, diffReal1), diffImag1, diffImag1);
397 
398  // Scale
399  result0 = vmulq_f32(result0, vScalar);
400  result1 = vmulq_f32(result1, vScalar);
401 
402  vst1q_f32(target, result0);
403  vst1q_f32(target + 4, result1);
404  target += 8;
405  }
406 
407  // Handle remaining points
408  const unsigned int remaining = num_points - eighthPoints * 8;
409  calculate_scaled_distances(target, *src0, points, scalar, remaining);
410 }
411 
412 #endif /*LV_HAVE_NEONV8*/
413 
414 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
415 
416 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
417 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
418 
419 #include <volk/volk_complex.h>
420 
421 
422 #ifdef LV_HAVE_AVX2
423 #include <immintrin.h>
425 
426 static inline void
427 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
428  const lv_32fc_t* src0,
429  const lv_32fc_t* points,
430  float scalar,
431  unsigned int num_points)
432 {
433  const unsigned int num_bytes = num_points * 8;
434  __m128 xmm9, xmm10;
435  __m256 xmm4, xmm6;
436  __m256 xmm_points0, xmm_points1, xmm_result;
437 
438  const unsigned int bound = num_bytes >> 6;
439 
440  // load complex value into all parts of the register.
441  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
442  const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
443 
444  // Load scalar into all 8 parts of the register
445  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
446  const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
447 
448  // Set permutation constant
449  const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
450 
451  for (unsigned int i = 0; i < bound; ++i) {
452  xmm_points0 = _mm256_loadu_ps((float*)points);
453  xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
454  points += 8;
455  __VOLK_PREFETCH(points);
456 
457  xmm_result = _mm256_scaled_norm_dist_ps_avx2(
458  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
459 
460  _mm256_storeu_ps(target, xmm_result);
461  target += 8;
462  }
463 
464  if (num_bytes >> 5 & 1) {
465  xmm_points0 = _mm256_loadu_ps((float*)points);
466 
467  xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
468 
469  points += 4;
470 
471  xmm6 = _mm256_mul_ps(xmm4, xmm4);
472 
473  xmm4 = _mm256_hadd_ps(xmm6, xmm6);
474  xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
475 
476  xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
477 
478  xmm9 = _mm256_extractf128_ps(xmm_result, 1);
479  _mm_storeu_ps(target, xmm9);
480  target += 4;
481  }
482 
483  if (num_bytes >> 4 & 1) {
484  xmm9 = _mm_loadu_ps((float*)points);
485 
486  xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
487 
488  points += 2;
489 
490  xmm9 = _mm_mul_ps(xmm10, xmm10);
491 
492  xmm10 = _mm_hadd_ps(xmm9, xmm9);
493 
494  xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
495 
496  _mm_storeh_pi((__m64*)target, xmm10);
497  target += 2;
498  }
499 
500  calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
501 }
502 
503 #endif /*LV_HAVE_AVX2*/
504 
505 
506 #ifdef LV_HAVE_AVX
507 #include <immintrin.h>
509 
510 static inline void
512  const lv_32fc_t* src0,
513  const lv_32fc_t* points,
514  float scalar,
515  unsigned int num_points)
516 {
517  const int eightsPoints = num_points / 8;
518  const int remainder = num_points - 8 * eightsPoints;
519 
520  __m256 xmm_points0, xmm_points1, xmm_result;
521 
522  // load complex value into all parts of the register.
523  const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
524 
525  // Load scalar into all 8 parts of the register
526  const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
527 
528  for (int i = 0; i < eightsPoints; ++i) {
529  xmm_points0 = _mm256_loadu_ps((float*)points);
530  xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
531  points += 8;
532 
533  xmm_result = _mm256_scaled_norm_dist_ps(
534  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
535 
536  _mm256_storeu_ps(target, xmm_result);
537  target += 8;
538  }
539 
540  const lv_32fc_t symbol = *src0;
541  calculate_scaled_distances(target, symbol, points, scalar, remainder);
542 }
543 
544 #endif /* LV_HAVE_AVX */
545 
546 
547 #ifdef LV_HAVE_SSE3
548 #include <pmmintrin.h>
550 
551 static inline void
553  const lv_32fc_t* src0,
554  const lv_32fc_t* points,
555  float scalar,
556  unsigned int num_points)
557 {
558  __m128 xmm_points0, xmm_points1, xmm_result;
559 
560  /*
561  * First do 4 values in every loop iteration.
562  * There may be up to 3 values left.
563  * leftovers0 indicates if at least 2 more are available for SSE execution.
564  * leftovers1 indicates if there is a single element left.
565  */
566  const int quarterPoints = num_points / 4;
567  const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
568  const int leftovers1 = num_points % 2;
569 
570  // load complex value into both parts of the register.
571  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
572 
573  // Load scalar into all 4 parts of the register
574  const __m128 xmm_scalar = _mm_load1_ps(&scalar);
575 
576  for (int i = 0; i < quarterPoints; ++i) {
577  xmm_points0 = _mm_loadu_ps((float*)points);
578  xmm_points1 = _mm_loadu_ps((float*)(points + 2));
579  points += 4;
580  __VOLK_PREFETCH(points);
581  // calculate distances
582  xmm_result = _mm_scaled_norm_dist_ps_sse3(
583  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
584 
585  _mm_storeu_ps(target, xmm_result);
586  target += 4;
587  }
588 
589  for (int i = 0; i < leftovers0; ++i) {
590  xmm_points0 = _mm_loadu_ps((float*)points);
591  points += 2;
592 
593  xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
594  xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
595  xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
596  xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
597 
598  _mm_storeh_pi((__m64*)target, xmm_result);
599  target += 2;
600  }
601 
602  calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
603 }
604 
605 #endif /*LV_HAVE_SSE3*/
606 
607 #ifdef LV_HAVE_SSE
609 #include <xmmintrin.h>
610 static inline void
612  const lv_32fc_t* src0,
613  const lv_32fc_t* points,
614  float scalar,
615  unsigned int num_points)
616 {
617  const __m128 xmm_scalar = _mm_set1_ps(scalar);
618  const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
619 
620  for (unsigned i = 0; i < num_points / 4; ++i) {
621  __m128 xmm_points0 = _mm_loadu_ps((float*)points);
622  __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
623  points += 4;
624  __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
625  xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
626  _mm_storeu_ps((float*)target, xmm_result);
627  target += 4;
628  }
629 
630  calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
631 }
632 #endif // LV_HAVE_SSE
633 
634 #ifdef LV_HAVE_RVV
635 #include <riscv_vector.h>
636 
637 static inline void
638 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target,
639  const lv_32fc_t* src0,
640  const lv_32fc_t* points,
641  float scalar,
642  unsigned int num_points)
643 {
644  size_t vlmax = __riscv_vsetvlmax_e32m4();
645  vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
646  vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
647  vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
648 
649  size_t n = num_points;
650  for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
651  vl = __riscv_vsetvl_e32m4(n);
652  vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
653  vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
654  vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
655  vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
656  vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
657  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
658  __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
659  }
660 }
661 #endif /*LV_HAVE_RVV*/
662 
663 #ifdef LV_HAVE_RVVSEG
664 #include <riscv_vector.h>
665 
666 static inline void
667 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target,
668  const lv_32fc_t* src0,
669  const lv_32fc_t* points,
670  float scalar,
671  unsigned int num_points)
672 {
673  size_t vlmax = __riscv_vsetvlmax_e32m4();
674  vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
675  vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
676  vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
677 
678  size_t n = num_points;
679  for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
680  vl = __riscv_vsetvl_e32m4(n);
681  vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
682  vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
683  vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
684  vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
685  vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
686  vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
687  __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
688  }
689 }
690 #endif /*LV_HAVE_RVVSEG*/
691 
692 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:98
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:180
calculate_scaled_distances
static void calculate_scaled_distances(float *target, const lv_32fc_t symbol, const lv_32fc_t *points, const float scalar, const unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:72
volk_sse3_intrinsics.h
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:611
_mm256_scaled_norm_dist_ps
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:176
_mm256_scaled_norm_dist_ps_avx2
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:134
_mm_scaled_norm_dist_ps_sse
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse_intrinsics.h:128
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_neon
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_neon(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:321
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
i
for i
Definition: volk_config_fixed.tmpl.h:13
volk_sse_intrinsics.h
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:305
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:74
volk_complex.h
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:280
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:511
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:552
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:221
volk_avx_intrinsics.h
_mm_scaled_norm_dist_ps_sse3
static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse3_intrinsics.h:50
volk_avx2_intrinsics.h
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:96