Vector Optimized Library of Kernels  3.3.0
Architecture-tuned implementations of math kernels
volk_32u_popcnt.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
43 #ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
44 #define INCLUDED_VOLK_32u_POPCNT_A16_H
45 
46 #include <inttypes.h>
47 #include <stdio.h>
48 
49 #ifdef LV_HAVE_GENERIC
50 
/*
 * Portable population count: stores the number of 1 bits of `value` in `*ret`.
 *
 * Uses a branch-free parallel reduction (noted by the original author as
 * faster than a lookup table): neighbouring bit fields are summed in place,
 * doubling the field width at each step.
 *
 * ret   - output location; receives a count in the range 0..32
 * value - 32-bit word whose set bits are counted
 */
static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
{
    uint32_t acc = value;

    /* Single bits -> 2-bit sums, then 2-bit sums -> 4-bit sums. */
    acc = ((acc >> 1) & 0x55555555) + (acc & 0x55555555);
    acc = ((acc >> 2) & 0x33333333) + (acc & 0x33333333);

    /* 4-bit sums -> one sum per byte (each at most 8, so no carry-out). */
    acc += acc >> 4;
    acc &= 0x0F0F0F0F;

    /* Fold the four byte sums down into the lowest byte. */
    acc += acc >> 8;
    acc += acc >> 16;

    /* The total never exceeds 32, so six bits suffice; drop the residue above. */
    *ret = acc & 0x0000003F;
}
64 
65 #endif /*LV_HAVE_GENERIC*/
66 
67 
68 #ifdef LV_HAVE_NEON
69 #include <arm_neon.h>
70 
71 static inline void volk_32u_popcnt_neon(uint32_t* ret, const uint32_t value)
72 {
73  // Load value into a 64-bit vector (as 8 bytes)
74  uint8x8_t input = vreinterpret_u8_u32(vdup_n_u32(value));
75  // Count bits in each byte
76  uint8x8_t counts = vcnt_u8(input);
77  // Sum across all bytes (only first 4 matter for 32-bit value)
78  // Use vpaddl to widen and add: 8x8 -> 4x16 -> 2x32 -> 1x64
79  uint16x4_t sum16 = vpaddl_u8(counts);
80  uint32x2_t sum32 = vpaddl_u16(sum16);
81  // Extract the lower 32-bit element which contains the sum of the lower 4 bytes
82  *ret = vget_lane_u32(sum32, 0);
83 }
84 #endif /* LV_HAVE_NEON */
85 
86 
87 #ifdef LV_HAVE_SSE4_2
88 
89 #include <nmmintrin.h>
90 
/*
 * SSE4.2 population count: stores the number of 1 bits of `value` in `*ret`
 * using the hardware POPCNT intrinsic.
 *
 * ret   - output location for the bit count
 * value - 32-bit word whose set bits are counted
 */
static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
{
    // The intrinsic returns a plain int; the count is non-negative, so the
    // conversion to uint32_t preserves it.
    const uint32_t bit_count = (uint32_t)_mm_popcnt_u32(value);

    *ret = bit_count;
}
95 
96 #endif /*LV_HAVE_SSE4_2*/
97 
98 #ifdef LV_HAVE_RVV
99 #include <riscv_vector.h>
100 
101 static inline void volk_32u_popcnt_rvv(uint32_t* ret, const uint32_t value)
102 {
103  *ret = __riscv_vcpop(__riscv_vreinterpret_b4(__riscv_vmv_s_x_u64m1(value, 1)), 32);
104 }
105 #endif /*LV_HAVE_RVV*/
106 
107 #ifdef LV_HAVE_RVA22V
108 #include <riscv_bitmanip.h>
109 
110 static inline void volk_32u_popcnt_rva22(uint32_t* ret, const uint32_t value)
111 {
112  *ret = __riscv_cpop_32(value);
113 }
114 #endif /*LV_HAVE_RVA22V*/
115 
116 #endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
volk_32u_popcnt_neon
static void volk_32u_popcnt_neon(uint32_t *ret, const uint32_t value)
Definition: volk_32u_popcnt.h:71
volk_32u_popcnt_a_sse4_2
static void volk_32u_popcnt_a_sse4_2(uint32_t *ret, const uint32_t value)
Definition: volk_32u_popcnt.h:91
volk_32u_popcnt_generic
static void volk_32u_popcnt_generic(uint32_t *ret, const uint32_t value)
Definition: volk_32u_popcnt.h:51