Giorgio Gambino - 1 year ago 143
C++ Question

# Bit count : preprocessor magic vs modern C++

Suppose that I want to create a compile time constructed bit count lookup table for 64bit integers in 16 bit chunks. The only way I know to do this is the following code:

``````#define B4(n) n, n + 1, n + 1, n + 2
#define B6(n)   B4(n),   B4(n + 1),   B4(n + 1),  B4(n + 2)
#define B8(n)   B6(n),   B6(n + 1),   B6(n + 1),  B6(n + 2)
#define B10(n)  B8(n),   B8(n + 1),   B8(n + 1),  B8(n + 2)
#define B12(n)  B10(n),  B10(n + 1),  B10(n + 1), B10(n + 2)
#define B14(n)  B12(n),  B12(n + 1),  B12(n + 1), B12(n + 2)
#define B16(n)  B14(n),  B14(n + 1),  B14(n + 1), B14(n + 2)
#define COUNT_BITS B16(0), B16(1), B16(1), B16(2)

unsigned int lookup[65536] = { COUNT_BITS };
``````

Is there a modern (C++11/14) way to obtain the same result?

Why not use the standard library?

``````#include <bitset>

int bits_in(std::uint64_t u)
{
auto bs = std::bitset<64>(u);
return bs.count();
}
``````

resulting assembler (Compiled with -O2):

``````bits_in(unsigned long):
xor     eax, eax
popcnt  rax, rdi
ret
``````

@tambre mentioned that in reality, when it can, the optimiser will go further:

``````volatile int results[3];

int main()
{
results[0] = bits_in(255);
results[1] = bits_in(1023);
results[2] = bits_in(0x8000800080008000);
}
``````

resulting assembler:

``````main:
mov     DWORD PTR results[rip], 8
xor     eax, eax
mov     DWORD PTR results[rip+4], 10
mov     DWORD PTR results[rip+8], 4
ret
``````

Old-school bit-twiddlers like me need to find new problems to solve :)

### Update

Not everyone was happy that the solution relies on cpu help to compute the bitcount. So what if we used an autogenerated table but allowed the developer to configure the size of it? (warning - long compile time for the 16-bit table version)

``````#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <bitset>

template<std::size_t word_size, std::size_t...Is>
constexpr auto generate(std::integral_constant<std::size_t, word_size>, std::index_sequence<Is...>) {
struct popcount_type {
constexpr auto operator()(int i) const {
int bits = 0;
while (i) {
i &= i - 1;
++bits;
}
return bits;
}
};
constexpr auto popcnt = popcount_type();

return std::array<int, sizeof...(Is)>
{
{popcnt(Is)...}
};
}

template<class T>
constexpr auto power2(T x) {
T result = 1;
for (T i = 0; i < x; ++i)
result *= 2;
return result;
}

template<class TableWord>
struct table {
static constexpr auto word_size = std::numeric_limits<TableWord>::digits;
static constexpr auto table_length = power2(word_size);
using array_type = std::array<int, table_length>;
static const array_type& get_data() {
static const array_type data = generate(std::integral_constant<std::size_t, word_size>(),
std::make_index_sequence<table_length>());
return data;
};

};

template<class Word>
struct use_table_word {
};

template<class Word, class TableWord = std::uint8_t>
int bits_in(Word val, use_table_word<TableWord> = use_table_word<TableWord>()) {
constexpr auto table_word_size = std::numeric_limits<TableWord>::digits;

constexpr auto word_size = std::numeric_limits<Word>::digits;
constexpr auto times = word_size / table_word_size;
static_assert(times > 0, "incompatible");

auto reduce = [val](auto times) {
return (val >> (table_word_size * times)) & (power2(table_word_size) - 1);
};

auto const& data = table<TableWord>::get_data();
auto result = 0;
for (int i = 0; i < times; ++i) {
result += data[reduce(i)];
}
return result;
}

volatile int results[3];

#include <iostream>

int main() {
auto input = std::uint64_t(1023);
results[0] = bits_in(input);
results[0] = bits_in(input, use_table_word<std::uint16_t>());

results[1] = bits_in(0x8000800080008000);
results[2] = bits_in(34567890);

for (int i = 0; i < 3; ++i) {
std::cout << results[i] << std::endl;
}
return 0;
}
``````

### Final Update

This version allows the use of any number of bits in the lookup table and supports any input type, even if it's smaller than the number of bits in the lookup table.

It also short-circuits if the high bits are zero.

``````#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <algorithm>

namespace detail {
template<std::size_t bits, typename = void>
struct smallest_word;

template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits <= 8)>>
{
using type = std::uint8_t;
};

template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 8 and bits <= 16)>>
{
using type = std::uint16_t;
};

template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 16 and bits <= 32)>>
{
using type = std::uint32_t;
};

template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 32 and bits <= 64)>>
{
using type = std::uint64_t;
};
}

template<std::size_t bits> using smallest_word = typename detail::smallest_word<bits>::type;

template<class WordType, std::size_t bits, std::size_t...Is>
constexpr auto generate(std::index_sequence<Is...>) {

using word_type = WordType;

struct popcount_type {
constexpr auto operator()(word_type i) const {
int result = 0;
while (i) {
i &= i - 1;
++result;
}
return result;
}
};
constexpr auto popcnt = popcount_type();

return std::array<word_type, sizeof...(Is)>
{
{popcnt(Is)...}
};
}

template<class T>
constexpr auto power2(T x) {
return T(1) << x;
}

template<std::size_t word_size>
struct table {

static constexpr auto table_length = power2(word_size);

using word_type = smallest_word<word_size>;

using array_type = std::array<word_type, table_length>;

static const array_type& get_data() {
static const array_type data = generate<word_type, word_size>(std::make_index_sequence<table_length>());
return data;
};

template<class Type, std::size_t bits>
static constexpr auto n_bits() {
auto result = Type();
auto b = bits;
while(b--) {
result = (result << 1) | Type(1);
}
return result;
};

template<class Uint>
int operator()(Uint i) const {
constexpr auto mask = n_bits<Uint, word_size>();
}

};

template<int bits>
struct use_bits {
static constexpr auto digits = bits;
};

template<class T>
constexpr auto minimum(T x, T y) { return x < y ? x : y; }

template<class Word, class UseBits = use_bits<8>>
int bits_in(Word val, UseBits = UseBits()) {

using word_type = std::make_unsigned_t<Word>;
auto uval = static_cast<word_type>(val);

constexpr auto table_word_size = UseBits::digits;
constexpr auto word_size = std::numeric_limits<word_type>::digits;

auto const& mytable = table<table_word_size>();
int result = 0;
while (uval)
{
result += mytable(uval);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshift-count-overflow"
uval >>= minimum(table_word_size, word_size);
#pragma clang diagnostic pop
}

return result;
}

volatile int results[4];

#include <iostream>

int main() {
auto input = std::uint8_t(127);
results[0] = bits_in(input);
results[1] = bits_in(input, use_bits<4>());
results[2] = bits_in(input, use_bits<11>());
results[3] = bits_in(input, use_bits<15>());

for (auto&& i : results) {
std::cout << i << std::endl;
}

auto input2 = 0xabcdef;
results[0] = bits_in(input2);
results[1] = bits_in(input2, use_bits<4>());
results[2] = bits_in(input2, use_bits<11>());
results[3] = bits_in(input2, use_bits<15>());

for (auto&& i : results) {
std::cout << i << std::endl;
}

auto input3 = -1;
results[0] = bits_in(input3);
results[1] = bits_in(input3, use_bits<4>());
results[2] = bits_in(input3, use_bits<11>());
results[3] = bits_in(input3, use_bits<15>());

for (auto&& i : results) {
std::cout << i << std::endl;
}

return 0;
}
``````

example output:

``````7
7
7
7
17
17
17
17
32
32
32
32
``````

The resulting assembly output for a call to `bits_in(int, use_bits<11>())` for example, becomes:

``````.L16:
mov     edx, edi
and     edx, 2047
movzx   edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx]