// Compiler Explorer
//
// Source code

// https://stackoverflow.com/questions/79178920/bit-packing-of-groups-of-n-repeated-bits-in-a-32-bit-word-compact-to-1-bit-per

#include <stdio.h>
#include <stdint.h>
//#define NDEBUG  // asserts are mostly for documentation about which n ranges different functions handle
#include <assert.h>
#include <stdalign.h>

#ifdef __SSE4_1__
// TODO: MSVC detection of SSE4
#include <immintrin.h>
// Pack 16 groups of 2 repeated bits (32 bits total) down to 1 bit per group,
// using SSE4.1 byte shuffles and PMOVMSKB.
// Letters in the comments name the groups from the lowest (a) to the highest (p).
// Returns the 16 packed bits in the low half of the result (a in bit 0).
uint32_t pack2_sse4(uint32_t x)
{
        __m128i v = _mm_cvtsi32_si128(x);
        // ppoonnmm...ddccbbaa
        // Interleave bytes of (v << 4 within each 16-bit word) with bytes of v, so every
        // pair of groups ends up in its own byte alongside a copy offset by two groups.
        v = _mm_unpacklo_epi8(_mm_slli_epi16(v,4), v);      // movdqa copy + psllw + punpcklbw
        // ppoonnmm | nnmmllkk | ... | ddccbbaa | bbaa0000    // in low 64 bits of vector
        v = _mm_cvtepi8_epi16(v);    // pmovsxbw
        // pppppppp | ppoonnmm | ... | dddddddd | ddccbbaa | bbbbbbbb | bbaa0000   // filling 128-bit xmm
        v = _mm_slli_epi16(v, 2);    // move the lower group of each low byte up to that byte's top bit
        // pppppppp | oonnmm00 | ... | dddddddd | ccbbaa00 | bbbbbbbb | aa000000
        return _mm_movemask_epi8(v); // grab the top bit of each byte.
}

// scalar is better
// Pack 8 groups of 4 repeated bits down to 1 bit per group with SSE4.1.
// Each input byte holds two groups (0xba); after sign-extension to 16 bits and a
// shift by 4, the top bit of every byte is one group's bit, which PMOVMSKB collects.
// Returns the 8 packed bits in the low byte of the result.
uint32_t pack4_sse4(uint32_t x)
{
        // even if x is being loaded from memory, pmovsxbw would load 64 bits from a memory source
        // which would give us high garbage even if it was known safe (non-faulting and not causing an expensive page-split)
        // uint32_t x = *px;
        __m128i v = _mm_cvtsi32_si128(x);  // movd load or from a GPR
        v = _mm_cvtepi8_epi16(v);    // pmovsxbw : widen each 0xba element to 0xbbba
        v = _mm_slli_epi16(v, 4);    // 0xbba0
        return _mm_movemask_epi8(v); // grab the top bit of each byte.
}

// Pack 4 groups of 8 repeated bits: PMOVMSKB gathers the most-significant bit of
// every byte, which is exactly one bit per 8-bit group.
uint32_t pack8_sse2(uint32_t x)
{
        // movd places x in the low dword; a memory source operand could feed pmovmskb's input too
        const __m128i bytes = _mm_cvtsi32_si128((int)x);
        const int top_bits = _mm_movemask_epi8(bytes);
        return (uint32_t)top_bits;
}

#if defined(__GFNI__) && defined(__AVX512VL__)
// Work-in-progress: pack 2-bit groups from 32 bytes at p using GFNI + AVX-512VL.
// NOTE(review): `magic` (the GF(2) affine bit-matrix) is not declared anywhere in the
// visible source, so this only compiles if it is defined elsewhere - confirm.
// NOTE(review): uses GNU vector-extension operators (&=, |=, >>) directly on __m256i.
__m128i pack2_GFNI_AVX2(void *p){
	__m256i v = _mm256_loadu_si256(p);   // unaligned 32-byte load
	v &= _mm256_set1_epi8(0x55);         // keep the low bit of every 2-bit group
	v |= v>>7;  // with AVX512, just shift and vpternlogd to bit-blend, and clear high byte of the word.
	// bottom byte of each word contains the desired 8 bits
	// http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
	v = _mm256_gf2p8affine_epi64_epi8(v, magic, 0);  // put them in the right order
	return  _mm256_cvtepi16_epi8(v);  // AVX512 vpmovwb with truncation
	// AVX2?
	//vpshufb + vpermq?  Or better, 2 vectors, 2x AND + vpackuswb + vpermq
}
#endif // GFNI + AVX512VL
#ifdef __AVX512BITALG__
// Unfinished stub for an AVX512BITALG version: it loads 16 bytes from p and
// broadcasts them to both 128-bit lanes, but does nothing further.
// NOTE(review): there is no return statement although the function is declared to
// return uint64_t, and `v` is never used - do not call this until it is completed.
uint64_t pack2_AVX512BITALG(void *p){
	__m128i vnarrow = _mm_loadu_si128(p);
//	__m256i v = _mm512_cvtepu32_epi64(vnarrow);
	__m256i v = _mm256_broadcastsi128_si256(vnarrow);
		
}
#endif // bitalg
#endif // SSE4

// Pack 16 groups of 2 repeated bits into 16 bits (group a -> bit 0 ... group p -> bit 15).
// Works on "bit-pairs": the top bit of one group sits next to the bottom bit of the
// next group, so two result bits are already adjacent and can be moved together.
uint32_t pack2(uint32_t x)
{
    // Keep bits 1 and 2 of every nibble: one bit from each of two neighbouring groups.
    uint32_t pairs = x & 0x66666666u;
        // 0po00nm00lk00ji00hg00fe00dc00ba0

    // Slide each pair up by 2 so it lands next to the pair above it (LEA on x86),
    // then keep the resulting runs of 4.
    uint32_t quads = pairs + (pairs << 2);
    quads &= 0x78787878u;
        // 0ponm0000lkji0000hgfe0000dcba000

    // Same idea again: join runs of 4 into runs of 8.
    uint32_t octets = quads + (quads << 4);
    octets &= 0x7F807F80u;
        // 0ponmlkji00000000hgfedcba0000000

    // Multiply by (1<<1)|(1<<9) so both bytes meet in the top 16 bits.
    const uint32_t gathered = octets * 0x202u;
        // ponmlkjihgfedcba0000000000000000
    return gathered >> 16;
}

// Pack 10 groups of 3 repeated bits (bits 0..29) into 10 bits; bits 30..31 are ignored.
uint32_t pack3(uint32_t x) {
        // 0000j00i00h00g00f00e00d00c00b00a // one representative bit per group
    // Keep bit-pairs: top bit of an even group next to the bottom bit of the odd group above.
    uint32_t pairs = x & 0x0C30C30Cu;
        //     ji    hg    fe    dc    ba

    // x *= 17: every pair also appears 4 bits higher, touching the pair above it.
    uint32_t joined = pairs + (pairs << 4);
        // ji  jihg  hgfe  fedc  dcba  ba
    joined &= 0x0F00F00Fu;
        //     jihg        fedc        ba

    // Partial products at <<4, <<12 and <<20 stack ba, fedc and jihg contiguously at
    // the very top of the word, so nothing is left above them after the shift.
    const uint32_t stacked = joined * 0x00101010u;
    return stacked >> 22;
}

// unused: same as the LUT.  These were written to see how the patterns worked beyond n=3.
// Pack 8 groups of 4 repeated bits into 8 bits with one mask and one multiply.
uint32_t pack4(uint32_t x) {
        // hhhhggggffffeeeeddddccccbbbbaaaa
    // One bit-pair per byte: top bit of the low nibble, bottom bit of the high nibble.
    const uint32_t pairs = x & 0x18181818u;
        // 000hg000000fe000000dc000000ba000

    // Each pair must move 6 bits closer to the pair above it, hence partial products
    // 6 bits apart; the extra <<3 places the result in the top byte so the final
    // shift leaves no garbage above it.
    const uint32_t multiplier = (1u << 21) | (1u << 15) | (1u << 9) | (1u << 3);
    return (pairs * multiplier) >> (32 - 8);
}

// Pack 6 groups of 5 repeated bits (bits 0..29) into 6 bits; bits 30..31 are ignored.
uint32_t pack5(uint32_t x) {
        // ..fffffeeeeedddddcccccbbbbbaaaaa
    // Three bit-pairs, 10 bits apart: ba at bits 4-5, dc at 14-15, fe at 24-25.
    const uint32_t pairs = x & 0x0300C030u;

    // Partial products 8 bits apart bring the pairs together; the common <<6 puts
    // fe|dc|ba in the top 6 bits so the right shift discards everything else.
    const uint32_t multiplier = (1u << 22) | (1u << 14) | (1u << 6);
    return (pairs * multiplier) >> (32 - 6);
}

// with bitpairs, allowing even simpler AArch64 code
// Pack 4 groups of 8 repeated bits into 4 bits using two bit-pairs.
uint32_t pack8(uint32_t x)
{
    // Bits 7-8 hold the pair "ba", bits 23-24 hold "dc" (this mask can be an AArch64 immediate).
    const uint32_t pairs = x & 0x01800180u;
        // 0000000dc00000000000000ba0000000

    // A copy shifted by 14 puts "ba" at bits 21-22, directly below "dc";
    // the shifted "dc" falls off the top of the word.
    const uint32_t joined = pairs + (pairs << 14);

    // 21 == (32-4) - 7 : bring "dcba" down to bits 3..0.
    return joined >> 21;

    // Alternative with more ILP but more x86 instructions (worse throughput when
    // latency is not the bottleneck):  x = (x >> 7) | (x >> 16);  x &= 0xFF;
}

// Mostly Konstantins's version
// Pack 4 groups of 8 repeated bits into 4 bits, taking the lowest bit of each byte
// individually (no bit-pairs).
uint32_t pack8_nonpair(uint32_t x)
{
    // 0000000d0000000c0000000b0000000a
    const uint32_t lsbs = x & 0x01010101u;

    // Partial products 7 bits apart bring a, b, c, d next to each other; the whole
    // multiplier is shifted by a further 7 so that "dcba" ends up in the top nibble:
    //   a<<28, b<<21, c<<14, d<<7  ->  bits 28, 29, 30, 31
    const uint32_t multiplier = (1u << 28) | (1u << 21) | (1u << 14) | (1u << 7);
    return (lsbs * multiplier) >> (32 - 4);
}

// Pack groups of n repeated bits for 11 <= n <= 16, where exactly two groups are used.
// Their two result bits form a single bit-pair starting at bit n-1.
uint32_t pack_n_11_to_16(uint32_t x, unsigned n)
{
    assert(n>=11 && n<=16);  // n>=17 has only one group.
    const unsigned pair_start = n - 1;   // top bit of the first group
    return (x >> pair_start) & 3u;
}

// handle n large enough that there's at most 2 bit-groups
static inline
uint32_t pack_n_ge_11(uint32_t x, unsigned n)
{
    assert(n>=11 && n<=32);

    // n = 11..16 : two groups, i.e. one bit-pair starting at bit n-1 -> keep 2 bits.
    // n = 17..32 : a single group -> keep 1 bit (bit n-1, the top bit of that group).
    // If the padding above the highest group were known to be zero, keeping 2 bits
    // unconditionally would also work.
    //
    // The mask does not depend on the shift result, so both can be computed in
    // parallel and only meet at the final AND.
    // (When specializing for a known n > 16, `x & 1` would avoid the shift entirely.)
    const uint32_t keep = (n <= 16) ? 3u : 1u;
    const uint32_t shifted = x >> (n - 1);
    return shifted & keep;
}
// nope, this is backwards and off-by-1, would give n>=16 : 3 : 1;
//    uint32_t mask = (n>>3) | 1;  // n is 1..32.  For n=16..31, the 1<<4 = 16 bit is set, and isn't for smaller n.  And all higher bits are clear.
//                                 // for n=32, n>>4 is 2 not 1, but  x>>31  has no set bits above the LSB

// Pack groups of n repeated bits for 7 <= n <= 10 using shifts only (no table).
// There are at most 4 groups, i.e. two bit-pairs; for n = 9 or 10 the upper "pair"
// is a single bit because only three groups fit.
uint32_t pack_n_7_to_10(uint32_t x, unsigned n)
{
    assert(n>=7);   // works(?) but is less efficient for n>10

    // Lower pair: starts at bit n-1 and goes to result bits 0..1.
    const uint32_t low = (x >> (n - 1)) & 3u;

    // Upper pair: starts at bit 3*n-1 and goes to result bits 2..3, hence the
    // shift is 2 less than its position.
    const uint32_t high_keep = (n > 8) ? (1u << 2) : (3u << 2);
    const uint32_t high = (x >> (3*n - 3)) & high_keep;
    // FIXME: for n==11 and higher the upper mask would have to be 0 if there's high garbage.
    assert(n<=10);

    return high | low;
}

// unsigned long n avoids the need to zero-extend to pointer width, if it didn't inline.
static inline
uint32_t pack_LUT(uint32_t x, unsigned long n)
{
    assert(n>=4 && n<=16);  // table currently only goes up to n=16

    enum { FIRST_N = 4, LAST_N = 16 };

    // One mask + multiplier per n: the mask keeps the bit-pairs, the multiply gathers them.
    struct step { uint32_t mask, mult; };

    // Both arrays live in a single static object so that only one base address is needed.
    struct table {
        uint8_t shift[32 - FIRST_N];   // shift counts first: keeps the offset to the next array small
        alignas(8)
        struct step maskmult[LAST_N - FIRST_N + 1];
    };

    static const struct table lut = {
      .shift = {
        32-8, 32-6, 32-5,  // n=4..6 : result sits at the very top, the shift clears high garbage
        18, 21, 24, 27,    // n=7..10 : second bit-pair (or single bit) is already at the top
        // n=11 and up : 2 or fewer groups, trivial shift keeping only 2 or 1 bit
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
        // (multipliers could be chosen so that these are all 30)
      },
      .maskmult = {
        { 0x18181818u, ((1u<<18) | (1u<<12) | (1u<<6) | 1u) << 3 },  // n=4  : 8 groups = 4 pairs
        { 0x0300C030u, ((1u<<16) | (1u<<8)  | 1u) << 6 },            // n=5  : 6 groups = 3 pairs
        { 0x20060060u, ((1u<<20) | (1u<<10) | 1u) << 2 },            // n=6  : 5 groups = 3ish pairs
        { 0x003000C0u, (1u<<12) | 1u },  // n=7  : 4 groups, 2 pairs; no high garbage even unshifted
        { 0x01800180u, (1u<<14) | 1u },  // n=8  : 4 groups, 2 pairs
        { 0x04000300u, (1u<<16) | 1u },  // n=9  : 3 groups, 2ish pairs
        { 0x20000600u, (1u<<18) | 1u },  // n=10 : 3 groups, 2ish pairs
        { 3u << 10, 1u },  // n=11 : a single pair, multiplying is no longer needed
        { 3u << 11, 1u },  // n=12
        { 3u << 12, 1u },  // n=13
        { 3u << 13, 1u },  // n=14
        { 3u << 14, 1u },  // n=15
        { 3u << 15, 1u },  // n=16
        // Larger n use a simpler strategy elsewhere to keep the table small.
      },
    };

    const unsigned long idx = n - FIRST_N;
    const struct step *entry = &lut.maskmult[idx];
    const uint32_t gathered = (x & entry->mask) * entry->mult;
    return gathered >> lut.shift[idx];
}

// Dispatcher: pack groups of n repeated bits in x down to one bit per group,
// choosing the implementation by n.
static inline
uint32_t pack_n(uint32_t x, unsigned n)
{
#ifdef HAVE_PEXT
        return pack_pext(x, n);
#endif
    if (n == 1) {
        return x;  // trivial: already one bit per group
    }
    if (n == 2) {
#ifdef __SSE4_1__
        return pack2_sse4(x);
#else
        return pack2(x);
#endif
    }
    if (n == 3) {
        return pack3(x);
    }
    if (n >= 4 && n <= 16) {
        // Table-driven mask/multiply/shift.  Alternatives that were tried for parts
        // of this range: pack_n_ge_11 / pack_n_11_to_16 for n=11..16, a shift-only
        // version for n=7..10, pack8_sse2 for n=8 and pack4_sse4 for n=4.
        return pack_LUT(x, n);
    }
    // Everything else (n >= 17, and n == 0) keeps only the lowest bit.
    return x & 1;
}

// Toby Speight
// Straightforward reference for n=2: result bit k is input bit 2*k, for k = 0..15.
static uint32_t b2(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 16; k++) {
        packed |= (v >> k) & (1u << k);   // bit 2k moves down by k to bit k
    }
    return packed;
}

// Reference for n=3: result bit k is input bit 3*k, for k = 0..9.
static uint32_t b3(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 10; k++) {
        packed |= (v >> (2 * k)) & (1u << k);   // bit 3k moves down by 2k
    }
    return packed;
}

// Reference for n=4: result bit k is input bit 4*k, for k = 0..7.
static uint32_t b4(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 8; k++) {
        packed |= (v >> (3 * k)) & (1u << k);   // bit 4k moves down by 3k
    }
    return packed;
}

// Reference for n=5: result bit k is input bit 5*k, for k = 0..5.
static uint32_t b5(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 6; k++) {
        packed |= (v >> (4 * k)) & (1u << k);   // bit 5k moves down by 4k
    }
    return packed;
}

// Reference for n=6: result bit k is input bit 6*k, for k = 0..4.
static uint32_t b6(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 5; k++) {
        packed |= (v >> (5 * k)) & (1u << k);   // bit 6k moves down by 5k
    }
    return packed;
}

// Reference for n=7: result bit k is input bit 7*k, for k = 0..3.
static uint32_t b7(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 4; k++) {
        packed |= (v >> (6 * k)) & (1u << k);   // bit 7k moves down by 6k
    }
    return packed;
}

// Reference for n=8: result bit k is input bit 8*k, for k = 0..3.
static uint32_t b8(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 4; k++) {
        packed |= (v >> (7 * k)) & (1u << k);   // bit 8k moves down by 7k
    }
    return packed;
}

// Reference for n=9: result bit k is input bit 9*k, for k = 0..2.
static uint32_t b9(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 3; k++) {
        packed |= (v >> (8 * k)) & (1u << k);   // bit 9k moves down by 8k
    }
    return packed;
}

// Reference for n=10: result bit k is input bit 10*k, for k = 0..2.
static uint32_t b10(uint32_t v)
{
    uint32_t packed = 0;
    for (unsigned k = 0; k < 3; k++) {
        packed |= (v >> (9 * k)) & (1u << k);   // bit 10k moves down by 9k
    }
    return packed;
}

// Reference for n=11: two groups; result bits come from input bits 0 and 11.
static uint32_t b11(uint32_t v)
{
    const uint32_t first = v & 1u;
    const uint32_t second = (v >> 10) & 2u;
    return first | second;
}

// Reference for n=12: two groups; result bits come from input bits 0 and 12.
static uint32_t b12(uint32_t v)
{
    const uint32_t first = v & 1u;
    const uint32_t second = (v >> 11) & 2u;
    return first | second;
}

// Reference for n=13: two groups; result bits come from input bits 0 and 13.
static uint32_t b13(uint32_t v)
{
    const uint32_t first = v & 1u;
    const uint32_t second = (v >> 12) & 2u;
    return first | second;
}

// Reference for n=14: two groups; result bits come from input bits 0 and 14.
static uint32_t b14(uint32_t v)
{
    const uint32_t first = v & 1u;
    const uint32_t second = (v >> 13) & 2u;
    return first | second;
}

// Reference for n=15: two groups; result bits come from input bits 0 and 15.
static uint32_t b15(uint32_t v)
{
    const uint32_t first = v & 1u;
    const uint32_t second = (v >> 14) & 2u;
    return first | second;
}

// Reference for n=16: two groups; result bits come from input bits 0 and 16.
static uint32_t b16(uint32_t v)
{
    const uint32_t first = v & 1u;
    const uint32_t second = (v >> 15) & 2u;
    return first | second;
}

// Reference packer: dispatches on n to the fixed-n helpers b2..b16.
// n == 1 returns value unchanged; n = 17..32 keeps only bit 0.
// Any other n trips assert(0) (and returns 0 when asserts are compiled out).
uint32_t PackBits(uint32_t value, uint8_t n)
{
    switch(n) {
    case 1:  return value;
    case 2:  return b2(value);
    case 3:  return b3(value);
    case 4:  return b4(value);
    case 5:  return b5(value);
    case 6:  return b6(value);
    case 7:  return b7(value);
    case 8:  return b8(value);
    case 9:  return b9(value);
    case 10: return b10(value);
    case 11: return b11(value);
    case 12: return b12(value);
    case 13: return b13(value);
    case 14: return b14(value);
    case 15: return b15(value);
    case 16: return b16(value);

    // From n=17 upward only a single group is packed.
    case 17: case 18: case 19: case 20:
    case 21: case 22: case 23: case 24:
    case 25: case 26: case 27: case 28:
    case 29: case 30: case 31: case 32:
        return value & 1;

    default:
        break;
    }
    assert(0);
    return 0;
}

// from the question, fixed to avoid the top padding
// Looping reference packer (from the question, fixed to avoid the top padding).
//
// value : groups of n repeated bits, lowest group at bit 0
// n     : group width in bits
// Returns one bit per group: bit 0 of value for the first group, then the top bit of
// every further group that fits completely inside the 32-bit word.
// n < 2 returns value unchanged; n > 16 has only one complete group and returns value & 1.
uint32_t PackBits_ref(uint32_t value, unsigned n)
{
    if (n<2) return value;

    // For n > 16 the top bit of the second group (bit 2*n-1) lies outside the word.
    // Returning early keeps every shift count below 32: shifting a 32-bit value by
    // 32 or more is undefined behaviour in C.
    if (n > 16) return value & 1;

    uint32_t res = value & 1;
    uint32_t mask = (uint32_t)1 << (2*n - 1);  // top bit, so it gets shifted out instead of into partial-group padding
    unsigned s = 2 * (n-1);                    // start with the second group: the first iteration is peeled

    do {
        res |= (value & mask) >> s;  // masking after shifting would allow 1-bit shifts of the mask instead
        mask <<= n;                  // next group's top bit; becomes 0 once it leaves the word
        s += n-1;                    // each further group has to travel n-1 bits more
    } while (mask);

    return res;
}

// Minimal two-step xorshift generator: advances *state and returns the new state.
uint64_t xorshift64(uint64_t *state)
{
	const uint64_t s0 = *state;
	const uint64_t s1 = s0 ^ (s0 << 7);
	const uint64_t s2 = s1 ^ (s1 >> 9);
	*state = s2;
	return s2;
}

// Turn arbitrary bits into valid test input: every n-bit group of the result consists
// of n identical bits, chosen by the bit of arg at the bottom of that group.
//
// arg : source of randomness; only the lowest bit of each group is used
// n   : group width, 1..32
// Each group becomes (low bit) - 1, i.e. all-ones when the low bit of arg is 0 and
// all-zeros when it is 1.  A partial group at the top of the word is filled the same way.
uint32_t fill_groups(uint32_t arg, unsigned n)
{
	assert(n >= 1 && n <= 32);  // n == 0 would never terminate; n > 32 has no valid shift

	// One set bit at the bottom of every group.  Built with explicit bit positions so
	// that no shift count ever reaches 32 (undefined behaviour), which matters for n == 32.
	uint32_t lowmask = 1;
	for (unsigned pos = n; pos < 32; pos += n) {
		lowmask |= (uint32_t)1 << pos;
	}
	// One set bit at the top of every complete group.
	const uint32_t highmask = lowmask << (n-1);

	arg &= lowmask;  // isolate bits at the bottom of each group
	// SWAR decrement of 0 or 1 to produce -1 or 0 : https://stackoverflow.com/a/59650412/224132
	// The top bit of each group is set first so a borrow cannot cross into the next
	// group, and is then corrected by the final XOR.
	const uint32_t x1 = arg |  highmask;
	const uint32_t x2 = ~arg & highmask;
	// or x2 = arg ^ x1; to save one instruction if there is no andnot instruction
	return (x1 - lowmask) ^ x2;
}

// Original multi-lane version for n=3: the ten group bits (input bits 0, 3, ..., 27)
// are split over four mask-and-multiply lanes so that no partial products collide.
// In every lane, group k ends up at bit 18+k; other partial products fall outside
// bits 18..27 and are removed by the final shift and mask.
uint32_t pack3_orig(uint32_t x) {
    const uint32_t lane_159 = (x & 0x08008008u) * 0x00010101u;  // groups 1, 5, 9 (bits 3, 15, 27)
    const uint32_t lane_048 = (x & 0x01001001u) * 0x00040404u;  // groups 0, 4, 8 (bits 0, 12, 24)
    const uint32_t lane_37  = (x & 0x00200200u) * 0x00001010u;  // groups 3, 7    (bits 9, 21)
    const uint32_t lane_26  = (x & 0x00040040u) * 0x00004040u;  // groups 2, 6    (bits 6, 18)

    const uint32_t merged = lane_159 | lane_048 | lane_37 | lane_26;
    return (merged >> 18) & 0x3FFu;
}

// Test driver: for every group width n = 1..32, feed random valid inputs to both the
// reference packer (PackBits) and pack_n, and report the first mismatch for that n.
int main() {

/* Manual spot checks, kept for reference:
    uint32_t x;
    printf("pack3 in 13 operations:\n");
    x = 0xC0000007; printf("%08X -> %08X\n", x, pack3(x));
    x = 0xC0000038; printf("%08X -> %08X\n", x, pack3(x));
    x = 0xC000003F; printf("%08X -> %08X\n", x, pack3(x));
    x = 0xC00001C0; printf("%08X -> %08X\n", x, pack3(x));
    x = 0xC71C71C7; printf("%08X -> %08X\n", x, pack3(x));
    x = 0xF8E38E38; printf("%08X -> %08X orig\n", x, pack3_orig(x));
    x = 0xF8E38E38; printf("%08X -> %08X new\n", x, pack3(x));
    x = 0xFFFFFFFF; printf("%08X -> %08X\n", x, pack3(x));

    printf("\npack4 in 7 operations:\n");
    x = 0x0000000F; printf("%08X -> %08X\n", x, pack4(x));
    x = 0x000000F0; printf("%08X -> %08X\n", x, pack4(x));
    x = 0x000000FF; printf("%08X -> %08X\n", x, pack4(x));
    x = 0x00000F00; printf("%08X -> %08X\n", x, pack4(x));
    x = 0x0F0F0F0F; printf("%08X -> %08X\n", x, pack4(x));
    x = 0xF0F0F0F0; printf("%08X -> %08X\n", x, pack4(x));
    x = 0xFFFFFFFF; printf("%08X -> %08X\n", x, pack4(x));

    printf("\npack5 in 6 operations:\n");
    x = 0xC000001F; printf("%08X -> %08X\n", x, pack5(x));
    x = 0xC00003E0; printf("%08X -> %08X\n", x, pack5(x));
    x = 0xC00003FF; printf("%08X -> %08X\n", x, pack5(x));
    x = 0xC0007C00; printf("%08X -> %08X\n", x, pack5(x));
    x = 0xC1F07C1F; printf("%08X -> %08X\n", x, pack5(x));
    x = 0xFE0F83E0; printf("%08X -> %08X\n", x, pack5(x));
    x = 0xFFFFFFFF; printf("%08X -> %08X\n", x, pack5(x));

    printf("\npack8 in 4 operations:\n");
    x = 0x000000FF; printf("%08X -> %08X\n", x, pack8(x));
    x = 0x0000FF00; printf("%08X -> %08X\n", x, pack8(x));
    x = 0x0000FFFF; printf("%08X -> %08X\n", x, pack8(x));
    x = 0x00FF0000; printf("%08X -> %08X\n", x, pack8(x));
    x = 0x00FF00FF; printf("%08X -> %08X\n", x, pack8(x));
    x = 0xFF00FF00; printf("%08X -> %08X\n", x, pack8(x));
    x = 0xFFFFFFFF; printf("%08X -> %08X\n", x, pack8(x));

    //puts("\npack2");
    //x = 0xFFFFFFFF; printf("%08X -> %08X pack2\n", x, pack2(x));
    //x = 0xFFFFFFFF; printf("%08X -> %08X pack2_sse4\n", x, pack2_sse4(x));
*/

    enum { MAX_GROUP_WIDTH = 32, TRIALS_PER_WIDTH = 10240000 };

    uint64_t rng = -1;  // any non-zero seed; one generator is shared by all widths
    puts("testing pack(x, n) for all n with random x");

    for (int n = 1 ; n <= MAX_GROUP_WIDTH ; n++) {
        for (int trial = 0 ; trial < TRIALS_PER_WIDTH ; trial++) {
            // Truncate the 64-bit random value, then expand it into valid n-bit groups.
            const uint32_t x = fill_groups((uint32_t)xorshift64(&rng), n);

            const uint32_t ref = PackBits(x, n);
            const uint32_t peter = pack_n(x, n);
            if (ref == peter) {
                continue;
            }
            printf("x = %08X  n = %d : reference %08X  peter: %08X  mismatch\n", x, n, ref, peter);
            break;  // report only the first mismatch for this n
        }
    }
    return 0;
}

// PackBits is Toby's version
// PackBits_ref is my slightly optimized version of the loop from the question.

/*
** Toby's version on my i7-6700k gcc14.2, without -march=native or -mbmi2
gcc -O3 -fno-plt -Wa,-mbranches-within-32B-boundaries -Wall -Wno-parentheses -g ~/src/SO/bitpack.c -o a.out  && perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,idq.mite_uops,branches,branch-misses  ./a.out

Performance counter stats for './a.out': (PackBits)

1,993.74 msec task-clock                       #    1.000 CPUs utilized             
                 0      context-switches                 #    0.000 /sec                      
                 0      cpu-migrations                   #    0.000 /sec                      
                55      page-faults                      #   27.586 /sec                      
     7,765,072,322      cycles                           #    3.895 GHz                       
    28,774,531,928      instructions                     #    3.71  insn per cycle            
        54,341,097      idq.mite_uops                    #   27.256 M/sec                     
     4,423,707,664      branches                         #    2.219 G/sec                     
            35,233      branch-misses                    #    0.00% of all branches

1.994180652 seconds time elapsed

1.992756000 seconds user
       0.000000000 seconds sys

** Looping version, same build command:
 Performance counter stats for './a.out': (PackBits_ref)

2,104.92 msec task-clock                       #    1.000 CPUs utilized             
                 0      context-switches                 #    0.000 /sec                      
                 0      cpu-migrations                   #    0.000 /sec                      
                56      page-faults                      #   26.604 /sec                      
     8,198,600,149      cycles                           #    3.895 GHz                       
    28,774,532,298      instructions                     #    3.51  insn per cycle            
     2,367,924,586      idq.mite_uops                    #    1.125 G/sec                     
     3,471,387,699      branches                         #    1.649 G/sec                     
            36,970      branch-misses                    #    0.00% of all branches

2.105404390 seconds time elapsed

2.100963000 seconds user
       0.000000000 seconds sys

************ with BMI2 (-mbmi2 added) **************

** Performance counter stats for './a.out':  (PackBits Toby with -mbmi2)

1,805.98 msec task-clock                       #    0.999 CPUs utilized             
                 0      context-switches                 #    0.000 /sec                      
                 0      cpu-migrations                   #    0.000 /sec                      
                57      page-faults                      #   31.562 /sec                      
     6,896,186,339      cycles                           #    3.819 GHz                       
    25,477,251,961      instructions                     #    3.69  insn per cycle            
     2,691,593,087      idq.mite_uops                    #    1.490 G/sec                     
     4,423,707,697      branches                         #    2.449 G/sec                     
        13,259,655      branch-misses                    #    0.30% of all branches

1.807690833 seconds time elapsed

1.804012000 seconds user
       0.000000000 seconds sys

** Performance counter stats for './a.out':  (PackBits_ref with -mbmi2)

1,472.52 msec task-clock                       #    1.000 CPUs utilized             
                 0      context-switches                 #    0.000 /sec                      
                 0      cpu-migrations                   #    0.000 /sec                      
                55      page-faults                      #   37.351 /sec                      
     5,727,937,847      cycles                           #    3.890 GHz                       
    24,586,372,062      instructions                     #    4.29  insn per cycle            
        54,537,253      idq.mite_uops                    #   37.037 M/sec                     
     3,471,387,556      branches                         #    2.357 G/sec                     
            41,516      branch-misses                    #    0.00% of all branches

1.472978769 seconds time elapsed

1.471742000 seconds user
       0.000000000 seconds sys
*/