#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H
#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H

#ifndef ZOO_HEADER_META_BITMASKMAKER_H
#define ZOO_HEADER_META_BITMASKMAKER_H

#include <cstdint>

namespace zoo {
namespace meta {

namespace detail {

/// Repeats the given pattern in the whole of the argument
/// \tparam T the desired integral type
/// \tparam Current the pattern built so far
/// \tparam CurrentSize how many bits the pattern occupies
template<typename T, T Current, int CurrentSize, bool>
struct BitmaskMaker_impl {
    constexpr static T value =
        BitmaskMaker_impl<
            T,
            T(Current | (Current << CurrentSize)),
            CurrentSize*2,
            CurrentSize*2 < sizeof(T)*8
        >::value;
};
template<typename T, T Current, int CurrentSize>
struct BitmaskMaker_impl<T, Current, CurrentSize, false> {
    constexpr static T value = Current;
};

} // detail

/// Repeats the given pattern in the whole of the argument
/// \tparam T the desired integral type
/// \tparam Current the pattern to broadcast
/// \tparam CurrentSize the number of bits in the pattern
template<typename T, T Current, int CurrentSize>
struct BitmaskMaker {
    constexpr static auto value =
        detail::BitmaskMaker_impl<
            T, Current, CurrentSize, CurrentSize*2 <= sizeof(T)*8
        >::value;
};

static_assert(0xF0F0 == BitmaskMaker<uint16_t, 0xF0, 8>::value);
static_assert(0xEDFEDFED == BitmaskMaker<uint32_t, 0xFED, 12>::value);

}} // zoo::meta

#endif

// #pragma once

/// \file Swar.h SWAR operations

#ifndef ZOO_HEADER_META_LOG_H
#define ZOO_HEADER_META_LOG_H

#ifndef ZOO_HEADER_META_POPCOUNT_H
#define ZOO_HEADER_META_POPCOUNT_H

// #include "zoo/meta/BitmaskMaker.h"

namespace zoo {
namespace meta {

namespace detail {

template<int Level, typename T = uint64_t>
struct PopcountMask_impl {
    constexpr static inline auto value =
        BitmaskMaker<
            T, (1 << Level) - 1, (1 << (Level + 1))
        >::value;
};

template<int Bits> struct UInteger_impl;
template<> struct UInteger_impl<8> { using type = uint8_t; };
template<> struct UInteger_impl<16> { using type = uint16_t; };
template<> struct UInteger_impl<32> { using type = uint32_t; };
template<> struct UInteger_impl<64> { using type = uint64_t; };

// unfortunately this does not work since unsigned long long is not
// the same type as unsigned long, but may have the same size!
template<typename> constexpr int BitWidthLog = 0;
template<> constexpr inline int BitWidthLog<uint8_t> = 3;
template<> constexpr inline int BitWidthLog<uint16_t> = 4;
template<> constexpr inline int BitWidthLog<uint32_t> = 5;
template<> constexpr inline int BitWidthLog<uint64_t> = 6;

} // detail

template<int Bits>
using UInteger = typename detail::UInteger_impl<Bits>::type;

template<int LogarithmOfGroupSize, typename T = std::uint64_t>
struct PopcountLogic {
    constexpr static int GroupSize = 1 << LogarithmOfGroupSize;
    constexpr static int HalvedGroupSize = GroupSize / 2;
    constexpr static auto CombiningMask =
        BitmaskMaker<T, T(1 << HalvedGroupSize) - 1, GroupSize>::value;
    using Recursion = PopcountLogic<LogarithmOfGroupSize - 1, T>;

    constexpr static T execute(T input);
};

template<typename T>
struct PopcountLogic<0, T> {
    constexpr static T execute(T input) { return input; }
};

template<typename T>
struct PopcountLogic<1, T> {
    constexpr static T execute(T input) {
        return input - ((input >> 1) & BitmaskMaker<T, 1, 2>::value);
        // For each pair of bits, the expression results in:
        // 11: 3 - 1: 2
        // 10: 2 - 1: 1
        // 01: 1 - 0: 1
        // 00: 0 - 0: 0
    }
};

template<int LogarithmOfGroupSize, typename T>
constexpr T PopcountLogic<LogarithmOfGroupSize, T>::execute(T input) {
    return
        Recursion::execute(input & CombiningMask) +
        Recursion::execute((input >> HalvedGroupSize) & CombiningMask);
}

#ifdef _MSC_VER
template<int LogarithmOfGroupSize, typename T = uint64_t>
using PopcountIntrinsic = PopcountLogic<LogarithmOfGroupSize, T>;
#else
template<int LogarithmOfGroupSize, typename T = uint64_t>
struct PopcountIntrinsic {
    constexpr static T execute(T input) {
        using UI = UInteger<1 << LogarithmOfGroupSize>;
        constexpr auto times = 8*sizeof(T);
        uint64_t rv = 0;
        for(auto n = times; n; ) {
            n -= 8*sizeof(UI);
            UI tmp = input >> n;
            tmp = __builtin_popcountll(tmp);
            rv |= uint64_t(tmp) << n;
        }
        return rv;
    }
};
#endif

}} // zoo::meta

#endif

namespace zoo {
namespace meta {

/// The algorithm is, from the perspective of the most significant bit set,
/// to copy it downward to all positions:
/// first copy it once, making a group of two copies of the most significant
/// bit set; then copy that group again, making a group of four copies,
/// then 8 copies...
template<typename T>
constexpr int logFloor_WithoutIntrinsic(T value) {
    constexpr auto NBitsTotal = sizeof(T) * 8;
    for(auto groupSize = 1; groupSize < NBitsTotal; groupSize <<= 1) {
        value |= (value >> groupSize);
    }
    return PopcountLogic<detail::BitWidthLog<T>, T>::execute(value) - 1;
}

#ifdef _MSC_VER
// change to use the relevant functions in C++ 20's <bit> header
// when bumping to C++ 20
constexpr int logFloor(uint64_t arg) {
    return logFloor_WithoutIntrinsic(arg);
}
#else
constexpr int logFloor(uint64_t arg) {
    return 63 - __builtin_clzll(arg);
}
#endif

constexpr int logCeiling(uint64_t arg) {
    auto floorLog = logFloor(arg);
    return
        floorLog +
        // turn off the most significant bit and convert to 1 or 0
        ((arg ^ (1ull << floorLog)) ? 1 : 0);
}

}} // zoo::meta

#endif

#include <type_traits>

#ifdef _MSC_VER
#include <iso646.h>
#endif

namespace zoo {
namespace swar {

using u64 = uint64_t;
using u32 = uint32_t;
using u16 = uint16_t;
using u8 = std::uint8_t;

template<int LogNBits>
constexpr uint64_t popcount(uint64_t a) noexcept {
    return
        std::conditional_t<
            (4 < LogNBits),
            meta::PopcountIntrinsic<LogNBits>,
            meta::PopcountLogic<LogNBits>
        >::execute(a);
}
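// Illustrative check (added example, not part of the original source):
// with LogNBits == 3 the groups are bytes, so this computes a byte-wise
// population count through the PopcountLogic divide-and-conquer path.
static_assert(
    popcount<3>(0x0103'070F'1F3F'7FFFull) == 0x0102'0304'0506'0708ull
);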
/// Index into the bits of the type T that contains the MSB.
template<typename T>
constexpr std::make_unsigned_t<T> msbIndex(T v) noexcept {
    return meta::logFloor(v);
}

/// Index into the bits of the type T that contains the LSB.
///
/// \todo incorporate __builtin_ctzg when it is more widely available
template<typename T>
constexpr std::make_unsigned_t<T> lsbIndex(T v) noexcept {
    // This check should be SFINAE, but supporting all sorts
    // of base types is an ongoing task, we put a bare-minimum
    // temporary preventive measure with static_assert
    static_assert(sizeof(T) <= 8, "Unsupported");
#ifdef _MSC_VER
    // ~v & (v - 1) turns on all trailing zeroes, zeroes the rest
    return meta::logFloor(1 + (~v & (v - 1)));
#else
    return ~v ? __builtin_ctzll(v) : sizeof(T) * 8;
#endif
}

#ifndef _MSC_VER
constexpr __uint128_t lsbIndex(__uint128_t v) noexcept {
    auto low = (v << 64) >> 64;
    if(low) { return __builtin_ctzll(low); }
    return 64 + __builtin_ctzll(v >> 64);
}
#endif

/// Core abstraction around SIMD Within A Register (SWAR). Specifies 'lanes'
/// of NBits width against a type T, and provides an abstraction for performing
/// SIMD operations against that primitive type T treated as a SIMD register.
/// SWAR operations are usually constant time, log(lane count) cost, or
/// O(lane count) cost.
/// Certain computational workloads can be materially sped up using SWAR
/// techniques.
template<int NBits_, typename T = uint64_t>
struct SWAR {
    using type = std::make_unsigned_t<T>;
    constexpr static inline type
        NBits = NBits_,
        BitWidth = sizeof(T) * 8,
        Lanes = BitWidth / NBits,
        NSlots = Lanes,
        PaddingBitsCount = BitWidth % NBits,
        SignificantBitsCount = BitWidth - PaddingBitsCount,
        AllOnes = ~std::make_unsigned_t<T>{0} >> PaddingBitsCount,
            // Also constructed in RobinHood utils: possible bug?
        LeastSignificantBit =
            meta::BitmaskMaker<T, std::make_unsigned_t<T>{1}, NBits>::value,
        AllOnesInFirstLane = AllOnes >> (NBits * (Lanes - 1)),
        MostSignificantBit = LeastSignificantBit << (NBits - 1),
        LeastSignificantLaneMask =
            []() {
                if constexpr (NBits < sizeof(T) * 8) {
                    return (T(1) << NBits) - 1;
                } else {
                    return ~T(0);
                }
            }(),
        // Use LowerBits in favor of ~MostSignificantBit to not pollute
        // "don't care" bits when non-power-of-two bit lane sizes are supported
        LowerBits = MostSignificantBit - LeastSignificantBit;

    static_assert(
        std::is_unsigned_v<T>,
        "You should not use a signed type as the base for a SWAR type. "
        "If you have used `int` or `long`, please use `uint32_t` or `uint64_t` instead. "
        "This type parameter is only used to determine the total width of the SWAR register. "
        "The signed-ness of the type has no *intentional* semantic meaning to what you're defining and "
        "furthermore, some bitwise operations are different for signed and unsigned types."
    );

    SWAR() = default;
    constexpr explicit SWAR(T v): m_v(v) {}
    constexpr explicit operator T() const noexcept { return m_v; }

    constexpr T value() const noexcept { return m_v; }

    #define SWAR_UNARY_OPERATORS_X_LIST \
        X(SWAR, ~)
    //constexpr SWAR operator~() const noexcept { return SWAR{~m_v}; }

    #define SWAR_BINARY_OPERATORS_X_LIST \
        X(SWAR, &) X(SWAR, ^) X(SWAR, |) X(SWAR, -) X(SWAR, +)

    #define X(rt, op) constexpr rt operator op() const noexcept { return rt(op m_v); }
    SWAR_UNARY_OPERATORS_X_LIST
    #undef X

    #define X(rt, op) constexpr rt operator op (rt o) const noexcept { return rt(m_v op o.m_v); };
    SWAR_BINARY_OPERATORS_X_LIST
    #undef X

    constexpr static T laneMask(int laneIndex) noexcept {
        return LeastSignificantLaneMask << (NBits * laneIndex);
    }
    // Returns lane at position with other lanes cleared.
    constexpr T isolateLane(int laneIndex) const noexcept {
        return m_v & laneMask(laneIndex);
    }

    // Returns lane value at position, in lane 0, rest of SWAR cleared.
    constexpr T at(int position) const noexcept {
        return LeastSignificantLaneMask & (m_v >> (NBits * position));
    }

    constexpr SWAR clear(int position) const noexcept {
        constexpr auto filter = (T(1) << NBits) - 1;
        auto invertedMask = filter << (NBits * position);
        auto mask = ~invertedMask;
        return SWAR(m_v & mask);
    }

    /// The SWAR lane index that contains the MSB. It is not the bit index
    /// of the MSB.
    /// IE: 4 bit wide 32 bit SWAR: 0x0040'0000 will return 5, not 22
    /// (0 indexed).
    constexpr auto top() const noexcept { return msbIndex(m_v) / NBits; }
    constexpr auto lsbIndex() const noexcept { return swar::lsbIndex(m_v) / NBits; }

    constexpr SWAR setBit(int index, int bit) const noexcept {
        return SWAR(m_v | (T(1) << (index * NBits + bit)));
    }

    constexpr auto blitElement(int index, T value) const noexcept {
        auto elementMask = ((T(1) << NBits) - 1) << (index * NBits);
        return SWAR((m_v & ~elementMask) | (value << (index * NBits)));
    }

    constexpr SWAR blitElement(int index, SWAR other) const noexcept {
        constexpr auto OneElementMask = SWAR(~(~T(0) << NBits));
        auto IsolationMask = OneElementMask.shiftLanesLeft(index);
        return (*this & ~IsolationMask) | (other & IsolationMask);
    }

    constexpr SWAR shiftLanesLeft(int laneCount) const noexcept {
        return SWAR(value() << (NBits * laneCount));
    }

    constexpr SWAR shiftLanesRight(int laneCount) const noexcept {
        return SWAR(value() >> (NBits * laneCount));
    }

    /// \brief as the name suggests
    /// \param protectiveMask should clear the bits that would cross the lane.
    /// The bits that will be cleared are directly related to the count of
    /// shifts, it is natural to maintain the protective mask by the caller,
    /// otherwise, the mask would have to be computed in all invocations.
    /// We are not sure the optimizer would maintain this mask somewhere, if it
    /// were to recalculate it, it would be disastrous for performance.
    /// \note the \c static_cast are necessary because of narrowing conversions
    #define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>)
    #define X(name, op) \
        constexpr SWAR \
        shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \
            T shiftC = static_cast<T>(bitCount); \
            auto V = (*this & protectiveMask).value(); \
            auto rv = static_cast<T>(V op shiftC); \
            return SWAR{rv}; \
        }
    SHIFT_INTRALANE_OP_X_LIST
    #undef X
    #undef SHIFT_INTRALANE_OP_X_LIST

    constexpr SWAR multiply(T multiplier) const noexcept {
        return SWAR{m_v * multiplier};
    }

    T m_v;
};

/// Defining operator== on base SWAR types is entirely too error prone.
/// Force a verbose invocation.
template<int NBits, typename T = uint64_t>
constexpr auto horizontalEquality(SWAR<NBits, T> left, SWAR<NBits, T> right) {
    return left.m_v == right.m_v;
}

/// Isolating >= NBits in underlying integer type currently results in disaster.
// TODO(scottbruceheart) Attempting to use binary not (~) results in negative shift warnings.
template<int NBits, typename T = uint64_t>
constexpr auto isolate(T pattern) {
    return pattern & ((T(1) << NBits) - 1);
}

/// Clears the least bit set in type T
template<typename T = uint64_t>
constexpr auto clearLSB(T v) {
    return v & (v - 1);
}
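// Illustrative checks (added examples, not part of the original source):
// clearLSB drops exactly the lowest set bit; SWAR::at() extracts one lane.
static_assert(clearLSB(0b1100u) == 0b1000u);
static_assert(SWAR<4, uint16_t>{0x0A50}.at(1) == 0x5);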
/// Leaves on the least bit set, or all 1s for a 0 input.
template<typename T = uint64_t>
constexpr auto isolateLSB(T v) {
    return v & ~clearLSB(v);
}

template<int NBits, typename T>
constexpr auto leastNBitsMask() {
    return (T{1} << NBits) - 1;
}

template<int NBits, uint64_t T>
constexpr auto leastNBitsMask() {
    return ~((~0ull) << NBits);
}

template<int NBits, typename T = uint64_t>
constexpr T mostNBitsMask() {
    return ~leastNBitsMask<sizeof(T)*8 - NBits, T>();
}

/// Clears the block of N bits anchored at the LSB.
/// clearLSBits<3> applied to binary 00111100 is binary 00100000
template<int NBits, typename T = uint64_t>
constexpr auto clearLSBits(T v) {
    constexpr auto lowMask = leastNBitsMask<NBits, T>();
    return v & (~(lowMask << meta::logFloor(isolateLSB<T>(v))));
}

/// Isolates the block of N bits anchored at the LSB.
/// isolateLSBits<2> applied to binary 00111100 is binary 00001100
template<int NBits, typename T = uint64_t>
constexpr auto isolateLSBits(T v) {
    constexpr auto lowMask = leastNBitsMask<NBits, T>();
    return v & (lowMask << meta::logFloor(isolateLSB<T>(v)));
}

/// Broadcasts the value in the 0th lane of the SWAR to the entire SWAR.
/// Precondition: 0th lane of |v| contains a value to broadcast, remainder
/// of input SWAR zero.
template<int NBits, typename T = uint64_t>
constexpr auto broadcast(SWAR<NBits, T> v) {
    constexpr T Ones = meta::BitmaskMaker<T, 1, NBits>::value;
    return SWAR<NBits, T>(T(v) * Ones);
}

/// BooleanSWAR treats the MSB of each SWAR lane as the boolean associated
/// with that lane.
template<int NBits, typename T>
struct BooleanSWAR: SWAR<NBits, T> {
    using Base = SWAR<NBits, T>;

    // Booleanness is stored in the MSBs
    static constexpr auto MaskMSB =
        broadcast<NBits, T>(Base(T(1) << (NBits - 1)));
    static constexpr auto MaskLSB =
        broadcast<NBits, T>(Base(T(1)));
    // Turns off LSB of each lane
    static constexpr auto MaskNonLSB = ~MaskLSB;
    static constexpr auto MaskNonMSB = ~MaskMSB;

    constexpr explicit BooleanSWAR(T v): Base(v) {}

    constexpr BooleanSWAR clear(int bit) const noexcept {
        constexpr auto Bit = T(1) << (NBits - 1);
        return this->m_v ^ (Bit << (NBits * bit));
    }

    constexpr BooleanSWAR clearLSB() const noexcept {
        return BooleanSWAR(swar::clearLSB(this->value()));
    }

    /// BooleanSWAR treats the MSB of each lane as the boolean associated with
    /// that lane. A logical NOT in this circumstance _only_ flips the MSB of
    /// each lane. This operation is not ones or twos complement.
    constexpr auto operator ~() const noexcept {
        return BooleanSWAR(Base{Base::MostSignificantBit} ^ *this);
    }

    constexpr auto operator not() const noexcept {
        return BooleanSWAR(MaskMSB ^ *this);
    }

    #define BOOLEANSWAR_BINARY_LOGIC_OPERATOR_X_LIST X(^) X(&) X(|)
    #define X(op) constexpr BooleanSWAR operator op(BooleanSWAR other) const noexcept { return this->Base::operator op(other); }
    BOOLEANSWAR_BINARY_LOGIC_OPERATOR_X_LIST
    #undef X

    // BooleanSWAR as a mask:
    // BooleanSWAR<4, u16>(0x0800).MSBtoLaneMask() => SWAR<4, u16>(0x0F00)
    constexpr auto MSBtoLaneMask() const noexcept {
        // Convert pattern 10* to 01*
        const auto MSBMinusOne = this->m_v - (this->m_v >> (NBits - 1));
        // Blit 01* and 10* together for 1* when MSB was on.
        return SWAR<NBits, T>(MSBMinusOne | this->m_v);
    }

    explicit constexpr operator bool() const noexcept { return this->m_v; }

private:
    constexpr BooleanSWAR(Base initializer) noexcept: SWAR<NBits, T>(initializer) {}

    template<int N, int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    constantIsGreaterEqual(SWAR<NB, TT>) noexcept;

    template<int N, int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    constantIsGreaterEqual_MSB_off(SWAR<NB, TT>) noexcept;

    template<int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    greaterEqual(SWAR<NB, TT>, SWAR<NB, TT>) noexcept;

    template<int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    greaterEqual_MSB_off(SWAR<NB, TT>, SWAR<NB, TT>) noexcept;

    template<int NB, typename TT>
    constexpr T indexOfMostSignficantLaneSet(SWAR<NBits, T> test) noexcept;

    template<int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    convertToBooleanSWAR(SWAR<NB, TT> arg) noexcept;
};

template<int NBits, typename T>
constexpr BooleanSWAR<NBits, T> convertToBooleanSWAR(SWAR<NBits, T> arg) noexcept {
    return SWAR<NBits, T>{SWAR<NBits, T>::MostSignificantBit} & arg;
}

template<int N, int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
constantIsGreaterEqual(SWAR<NBits, T> subtrahend) noexcept {
    static_assert(1 < NBits, "Degenerated SWAR");
    constexpr auto MSB_Position = NBits - 1;
    constexpr auto MSB = T(1) << MSB_Position;
    constexpr auto MSB_Mask = SWAR<NBits, T>{meta::BitmaskMaker<T, MSB, NBits>::value};
    constexpr auto Minuend = SWAR<NBits, T>{meta::BitmaskMaker<T, N, NBits>::value};
    constexpr auto N_MSB = MSB & N;

    auto subtrahendWithMSB_on = MSB_Mask & subtrahend;
    auto subtrahendWithMSB_off = ~subtrahendWithMSB_on;
    auto subtrahendMSBs_turnedOff = subtrahend ^ subtrahendWithMSB_on;
    if constexpr(N_MSB) {
        auto leastSignificantComparison = Minuend - subtrahendMSBs_turnedOff;
        auto merged =
            subtrahendMSBs_turnedOff | // the minuend MSBs are on
            leastSignificantComparison;
        return MSB_Mask & merged;
    } else {
        auto minuendWithMSBs_turnedOn = Minuend | MSB_Mask;
        auto leastSignificantComparison =
            minuendWithMSBs_turnedOn - subtrahendMSBs_turnedOff;
        auto merged =
            subtrahendWithMSB_off & // the minuend MSBs are off
            leastSignificantComparison;
        return MSB_Mask & merged;
    }
}

template<int N, int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
constantIsGreaterEqual_MSB_off(SWAR<NBits, T> subtrahend) noexcept {
    static_assert(1 < NBits, "Degenerated SWAR");
    constexpr auto MSB_Position = NBits - 1;
    constexpr auto MSB = T(1) << MSB_Position;
    constexpr auto MSB_Mask = meta::BitmaskMaker<T, MSB, NBits>::value;
    constexpr auto Minuend = meta::BitmaskMaker<T, N, NBits>::value;
    constexpr auto N_MSB = MSB & Minuend;

    auto subtrahendWithMSB_on = subtrahend;
    auto subtrahendWithMSB_off = ~subtrahendWithMSB_on;
    auto subtrahendMSBs_turnedOff = subtrahend ^ subtrahendWithMSB_on;
    if constexpr(N_MSB) {
        return MSB_Mask;
    } else {
        auto minuendWithMSBs_turnedOn = Minuend | MSB_Mask;
        auto leastSignificantComparison =
            minuendWithMSBs_turnedOn - subtrahendMSBs_turnedOff;
        return MSB_Mask & leastSignificantComparison;
    }
}

template<typename T, typename U, typename V>
constexpr T median(T x, U y, V z) {
    return (x | y) & (y | z) & (x | z);
}

template<int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
greaterEqual(SWAR<NBits, T> left, SWAR<NBits, T> right) noexcept {
    // Adapted from TAOCP V4 P152
    // h is msbselector, x is right, l is lower/left. Sets MSB to 1 in lanes
    // in test variable t for when xi < yi for lane i. Invert for greaterEqual.
    // t = h & ~<x~yz>
    // z = (x|h) - (y&~h)
    using S = swar::SWAR<NBits, T>;
    const auto h = S::MostSignificantBit, x = left.value(), y = right.value();
    // x = left, y = right is x < y
    const auto z = (x|h) - (y&~h);
    // bitwise ternary median!
    const auto t = h & ~median(x, ~y, z);
    return ~BooleanSWAR<NBits, T>{static_cast<T>(t)}; // ~(x<y) === x >= y
}

// In the condition where only MSBs will be on, we can fast lookup with 1
// multiply the index of the most significant lane that is on.
// This appears to be a mapping from the (say) 256 unique values of a 64 bit
// int where only MSBs of each 8 bits can be on, but I don't fully understand it.
// Adapted from TAOCP Vol 4A Page 153 Eq 94.
template<int NBits, typename T>
constexpr T indexOfMostSignficantLaneSet(SWAR<NBits, T> test) noexcept {
    const auto TypeWidth = sizeof(T) * 8;
    const auto
        TopVal = (T{1} << (TypeWidth - NBits)) - 1,
        BottomVal = (T{1} << (NBits - 1)) - 1;
    const T MappingConstant = TopVal / BottomVal;
    return (test.value() * MappingConstant) >> (TypeWidth - NBits);
}

template<int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
greaterEqual_MSB_off(SWAR<NBits, T> left, SWAR<NBits, T> right) noexcept {
    constexpr auto MLMSB = SWAR<NBits, T>{SWAR<NBits, T>::MostSignificantBit};
    auto minuend = MLMSB | left;
    return MLMSB & (minuend - right);
}

template<int NB, typename T>
constexpr auto booleans(SWAR<NB, T> arg) noexcept {
    return ~constantIsGreaterEqual<0>(arg);
}

template<int NBits, typename T>
constexpr auto differents(SWAR<NBits, T> a1, SWAR<NBits, T> a2) {
    return booleans(a1 ^ a2);
}

template<int NBits, typename T>
constexpr auto equals(SWAR<NBits, T> a1, SWAR<NBits, T> a2) {
    return ~differents(a1, a2);
}
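// Illustrative checks (added examples, not part of the original source):
// lane-wise comparisons produce BooleanSWAR values that flag each lane's MSB.
// Lanes of 0x3210 are 0, 1, 2, 3; compared against 2 in every lane,
// lanes 2 and 3 satisfy >=:
static_assert(
    greaterEqual_MSB_off(
        SWAR<4, uint16_t>{0x3210}, SWAR<4, uint16_t>{0x2222}
    ).value() == 0x8800
);
// The operands differ only in lane 1, so equality flags lanes 0, 2 and 3:
static_assert(
    equals(SWAR<4, uint16_t>{0x1234}, SWAR<4, uint16_t>{0x1294}).value() == 0x8808
);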
/*
This is just a draft implementation:
1. The isolator needs pre-computing instead of adding 3 ops per iteration
2. The update of the isolator is not needed in the last iteration
3. Consider returning not the logarithm, but the logarithm biased by 1
   (to support 0)
*/
template<int NBits, typename T>
constexpr SWAR<NBits, T> logarithmFloor(SWAR<NBits, T> v) noexcept {
    constexpr auto LogNBits = meta::logFloor(NBits);
    static_assert(
        NBits == (1 << LogNBits),
        "Logarithms of element width not power of two is un-implemented"
    );
    auto whole = v.value();
    auto isolationMask = SWAR<NBits, T>::MostSignificantBit;
    for(auto groupSize = 1; groupSize < NBits; groupSize <<= 1) {
        auto shifted = whole >> groupSize;
        // When shifting down a group to double the size of a group, the upper
        // "groupSize" bits will come from the element above, mask them out
        auto isolator = ~isolationMask;
        auto withoutCrossing = shifted & isolator;
        whole |= withoutCrossing;
        isolationMask |= (isolationMask >> groupSize);
    }
    constexpr auto ones = meta::BitmaskMaker<T, 1, NBits>::value;
    auto popcounts = meta::PopcountLogic<LogNBits, T>::execute(whole);
    return SWAR<NBits, T>{popcounts - ones};
}

static_assert(
    logarithmFloor(SWAR<8>{0x8040201008040201ull}).value() ==
    0x0706050403020100ull
);
static_assert(
    logarithmFloor(SWAR<8>{0xFF7F3F1F0F070301ull}).value() ==
    0x0706050403020100ull
);
static_assert(SWAR<4, uint16_t>::AllOnesInFirstLane == 0b0000'0000'0000'1111);

}} // zoo::swar

#include <cstdint>

//#define ZOO_DEVELOPMENT_DEBUGGING
#ifdef ZOO_DEVELOPMENT_DEBUGGING
#include <iostream>

inline std::ostream &binary(std::ostream &out, uint64_t input, int count) {
    while(count--) {
        out << (1 & input);
        input >>= 1;
    }
    return out;
}

template<int NB, typename B>
std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR<NB, B> s) {
    using S = zoo::swar::SWAR<NB, B>;
    auto shiftCounter = sizeof(B) * 8 / NB;
    out << "<|";
    auto v = s.value();
    do {
        binary(out, v, NB) << '|';
    } while(--shiftCounter);
    return out << ">";
}

#define ZOO_TO_STRING(a) #a

// std::endl is needed within the context of debugging: flush the line
#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl;
#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__)
#else
#define ZOO_TRACEABLE_EXPRESSION(...) (void)(__VA_ARGS__)
#endif

namespace zoo::swar {

/// \note This code should be substituted by an application of "progressive"
/// algebraic iteration
/// \note There is also parallelPrefix (to be implemented)
template<int NB, typename B>
constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto
        shiftClearingMask = S{static_cast<B>(~S::MostSignificantBit)},
        doubling = input,
        result = S{0};
    auto
        bitsToXOR = NB,
        power = 1;
    #define ZTE(...) // ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
    for(;;) {
        ZTE(doubling);
        if(1 & bitsToXOR) {
            ZTE(result ^ doubling);
            result = result ^ doubling;
            ZTE(doubling.shiftIntraLaneLeft(power, shiftClearingMask));
            doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
        }
        ZTE(bitsToXOR >> 1);
        bitsToXOR >>= 1;
        if(!bitsToXOR) { break; }
        auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
        ZTE(shifted);
        ZTE(doubling ^ shifted);
        doubling = doubling ^ shifted;
        // 01...1
        // 001...1
        // 00001...1
        // 000000001...1
        shiftClearingMask =
            shiftClearingMask & S{static_cast<B>(shiftClearingMask.value() >> power)};
        ZTE(power << 1);
        power <<= 1;
    }
    ZTE(input);
    #undef ZTE
    return S{result};
}
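// Illustrative check (added example, not part of the original source):
// parallelSuffix computes, per lane, the running XOR from the LSB upward;
// a lone LSB therefore spreads to every bit of its lane.
static_assert(parallelSuffix(SWAR<4, uint16_t>{0x0001}).value() == 0x000F);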
/*
Binary compress: A fascinating algorithm.

Warren (Hacker's Delight) believes Guy L. Steele is the author of the
following binary compression operation, equivalent to Intel's BMI2
instruction PEXT, "Parallel Extraction":
From a "mask", a selector of bits from an input, we want to put them
together in the output.

For example's sake, this is the selector:
Note: this follows the usual 'big endian' convention of denoting the most
significant bit first:
0001 0011 0111 0111 0110 1110 1100 1010
Imagine the input is the 32-bit or 32-boolean variable expression
abcd efgh ijkl mnop qrst uvxy zABC DEFG
We want the selection
   d   gh  jkl  nop  rs  uvx  zA   D F
to be compressed into the output
0000 0000 0000 00dg hjkl nopr suvx zADF

This algorithm will virtually calculate the count of positions that the
selected bits travel to the right, by constructing the binary encoding of
that count:
It will identify the positions that will travel an odd number of positions
to the right, these are those whose position-travel-count have the units set.
It will then move those positions by one position to the right, and eliminate
them from the yet-to-move positions.
Because it eliminates the positions that would move an odd count, there
remain only positions that move an even number of positions.
Now it finds the positions that move an odd count of /pairs/ of positions,
and moves them 2 positions. This is equivalent to finding the positions that
would have the bit for 2 set in the count of positions to move right.
Then an odd count of /quartets/ of positions, and moves them 4; 8, 16, 32, ...

Complete example (32 bits)
Selection mask:
0001 0011 0111 0111 0110 1110 1100 1010
Input (each letter or variable is a boolean, that can have 0 or 1):
abcd efgh ijkl mnop qrst uvxy zABC DEFG
Selection (using spaces):
   d   gh  jkl  nop  rs  uvx  zA   D F
Desired result: dghjklnoprsuvxzADF

0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk ==
    shift left by 1 == groupSize of ~compressionMask.
    This indicates the positions that have a 0 immediately to the right
    in compressionMask
4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of
    the current position in forParallelSuffix, last decimal digit
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
    forParallelSuffix

We have just identified the positions that need to move an odd number of
positions. Filter them with positions with a bit set in compressionMask:
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of
    compressionMask by 1 == groupSize
0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits
    that will move)
---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize
0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
    forParallelSuffix
1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not
    moved)
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a
    zero immediately to their right)
1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not
    moved => had an even count of zeroes to their right)

At this point, we have removed from compressionMask the positions that moved
an odd number of positions and moved them 1 position; then, we only keep
positions that move an even number of positions.
Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
*/
template<int NB, typename B>
constexpr SWAR<NB, B> compress(SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
    // This solution uses the parallel suffix operation as a primary tool:
    // For every bit position it indicates an odd number of ones to the right,
    // including itself.
    // Because we want to detect the "oddness" of groups of zeroes to the
    // right, we flip the compression mask. To not count the bit position
    // itself, we shift by one.
    #define ZTE(...) // ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
    ZTE(input);
    ZTE(compressionMask);
    using S = SWAR<NB, B>;
    auto result = input & compressionMask;
    auto groupSize = 1;
    auto
        shiftLeftMask = S{S::LowerBits},
        shiftRightMask = S{S::LowerBits << 1};
    ZTE(~compressionMask);
    auto forParallelSuffix = // this is called "mk" in the book
        (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask);
    ZTE(forParallelSuffix);
    // note: forParallelSuffix denotes positions with a zero
    // immediately to the right in 'compressionMask'
    for(;;) {
        ZTE(groupSize);
        ZTE(shiftLeftMask);
        ZTE(shiftRightMask);
        ZTE(result);
        auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
            parallelSuffix(forParallelSuffix);
        ZTE(oddCountOfGroupsOfZerosToTheRight);
        // compress the bits just identified in both the result and the mask
        auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight;
        ZTE(moving);
        compressionMask =
            (compressionMask ^ moving) | // clear the moving
            moving.shiftIntraLaneRight(groupSize, shiftRightMask);
        ZTE(compressionMask);
        auto movingFromInput = result & moving;
        result =
            (result ^ movingFromInput) | // clear the moving from the result
            movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask);
        auto nextGroupSize = groupSize << 1;
        if(NB <= nextGroupSize) { break; }
        auto evenCountOfGroupsOfZerosToTheRight = ~oddCountOfGroupsOfZerosToTheRight;
        forParallelSuffix =
            forParallelSuffix & evenCountOfGroupsOfZerosToTheRight;
        auto newShiftLeftMask =
            shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask);
        shiftRightMask =
            shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask);
        shiftLeftMask = newShiftLeftMask;
        groupSize = nextGroupSize;
    }
    ZTE(result);
    #undef ZTE
    return result;
}
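// Illustrative check (added example, not part of the original source):
// per-lane PEXT semantics, here on a single 8-bit lane. The mask 0xCA
// selects bits 7, 6, 3, 1 of 0xB6, which pack towards the LSB as 0b1001.
static_assert(
    compress(SWAR<8, uint32_t>{0xB6}, SWAR<8, uint32_t>{0xCA}).value() == 0x09
);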
/// \todo because of the desirability of "accumulating" the XORs at the MSB,
/// the parallel suffix operation is more suitable.
template<int NB, typename B>
constexpr SWAR<NB, B> parity(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto preResult = parallelSuffix(input);
    auto onlyMSB = preResult.value() & S::MostSignificantBit;
    return S{onlyMSB};
}

namespace impl {

template<int NB, typename B>
constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR<NB, B> msb, SWAR<NB, B> lsb) {
    auto msbCopiedDown = msb - lsb;
    auto msbReintroduced = msbCopiedDown | msb;
    return msbReintroduced;
}

} // impl

template<int NB, typename B>
constexpr auto makeLaneMaskFromLSB(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto lsb = input & S{S::LeastSignificantBit};
    auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)};
    return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb);
}

template<int NB, typename B>
constexpr auto makeLaneMaskFromMSB(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto msb = input & S{S::MostSignificantBit};
    B val = msb.value() >> (NB - 1);
    auto msbCopiedToLSB = S{val};
    return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB);
}
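// Illustrative checks (added examples, not part of the original source):
// parity flags the MSB of lanes with an odd number of set bits,
static_assert(parity(SWAR<4, uint16_t>{0x0001}).value() == 0x0008);
// and lanes whose LSB is set become all-ones masks (lanes 0 and 2 here).
static_assert(makeLaneMaskFromLSB(SWAR<4, uint16_t>{0x0101}).value() == 0x0F0F);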
template<int NB, typename B>
struct ArithmeticResultTriplet {
    SWAR<NB, B> result;
    BooleanSWAR<NB, B> carry, overflow;
};

/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition
/// and producing the flags for unsigned overflow (carry) and signed overflow.
///
/// This is the function to perform signed addition (that relies on supporting
/// unsigned overflow).
///
/// Notice that the support for unsigned overflow directly allows signed
/// addition (and both signed and unsigned subtraction).
///
/// This function is called "full addition" because it can perform the addition
/// with all the bits of the inputs by making sure the overflow (in the
/// unsigned sense) does not cross the lane boundary.
/// This function has less performance than "optimistic" addition (operator+).
/// The mechanism to manage potential overflow naturally allows the calculation
/// of the carry and signed overflow flags for no extra performance cost.
///
/// The performance relies on the optimizer removing the calculation of
/// the carry or signed overflow if they are not used.
///
/// When interpreted as unsigned addition, carrying out of the result is
/// overflow.
///
/// The carry bit is essential to increase the precision of the results in
/// normal arithmetic, but in unsigned SWAR it is preferable to double the
/// precision before executing addition, thus guaranteeing no overflow will
/// occur and using the more performant operator+ addition. Hence,
/// the carry flag is mostly useful in SWAR for detection of unsigned overflow,
/// as opposed to a helper to achieve higher precision as it is normally used:
/// Once we support non-power-of-two sizes of lanes in number of bits, we could
/// have transformations that would convert a SWAR of lanes of N bits to a SWAR
/// of (N + 1) bits, however, functionally, the performance cost of doing this
/// will be strictly higher than doubling the bit count of the current SWAR.
/// With double the number of bits, not only are additions guaranteed to not
/// overflow (as unsigned) but even multiplications won't. Then it is not
/// practical to use the carry bit for other than detection of unsigned
/// overflow.
///
/// The signed integer interpretation is two's complement, which
/// routinely overflows (when interpreted as unsigned). Signed overflow may
/// only occur if the inputs have the same sign, it is detected when the sign
/// of the result is opposite that of the inputs.
///
/// \todo The library is not explicit with regards to the fact that
/// operator+ is only useful with the unsigned interpretation. A decision
/// must be made to either keep the library as is, or to promote full addition
/// to operator+, and the rationale for the decision
///
/// \todo What is the right place for this function?
/// It was added here because in practice multiplication overflows, as a draft
template<int NB, typename B>
constexpr ArithmeticResultTriplet<NB, B>
fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
    using S = SWAR<NB, B>;
    constexpr auto
        SignBit = S{S::MostSignificantBit},
        LowerBits = SignBit - S{S::LeastSignificantBit};
    // prevent overflow by clearing the most significant bits
    auto
        s1prime = LowerBits & s1,
        s2prime = LowerBits & s2,
        resultPrime = s1prime + s2prime,
        s1Sign = SignBit & s1,
        s2Sign = SignBit & s2,
        signPrime = SignBit & resultPrime,
        result = resultPrime ^ s1Sign ^ s2Sign,
        // carry is set whenever at least two of the sign bits of s1, s2,
        // signPrime are set
        carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime),
        // overflow: the inputs have the same sign and different to result
        // same sign: s1Sign ^ s2Sign
        overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result);
    using BS = BooleanSWAR<NB, B>;
    return { result, BS{carry.value()}, BS{overflow.value()} };
};

template<int NB, typename B>
constexpr SWAR<NB, B> saturatingUnsignedAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
    const auto additionResult = fullAddition(s1, s2);
    // If we carry unsigned, we need to saturate: thus OR the carry bit with
    // the lane bits (carry because it happens to be earlier in the struct
    // declaration)
    return additionResult.carry.MSBtoLaneMask() | additionResult.result;
}

/// \brief Negation is useful only for the signed integer interpretation
template<int NB, typename B>
constexpr auto negate(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    constexpr auto Ones = S{S::LeastSignificantBit};
    return fullAddition(~input, Ones).result;
}
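// Illustrative checks (added examples, not part of the original source),
// on the 4-bit lanes of a uint16_t:
// 15 + 1 wraps to 0 in lane 0 and raises that lane's carry flag...
static_assert(
    fullAddition(SWAR<4, uint16_t>{0x000F}, SWAR<4, uint16_t>{0x0001})
        .result.value() == 0x0000
);
static_assert(
    fullAddition(SWAR<4, uint16_t>{0x000F}, SWAR<4, uint16_t>{0x0001})
        .carry.value() == 0x0008
);
// ...while the saturating flavor clamps the lane at 15 instead.
static_assert(
    saturatingUnsignedAddition(
        SWAR<4, uint16_t>{0x000F}, SWAR<4, uint16_t>{0x0001}
    ).value() == 0x000F
);
// Two's complement negation per lane: -1 is 0xF.
static_assert(negate(SWAR<4, uint16_t>{0x0001}).value() == 0x000F);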
/// \brief Performs a generalized iterated application of an associative
/// operator to a base
///
/// In algebra, the repeated application of an operator to a "base" has
/// different names depending on the operator; for example,
/// "a + a + a + ... + a" n times would be called "repeated addition", and if
/// * is numeric multiplication, "a * a * a * ... * a" n times would be called
/// "exponentiation of a to the nth power".
/// The general term in algebra is "iteration", hence the naming of this
/// function.
/// Since * and "product" are frequently used in algebra to denote the
/// application of a general operator, we keep the option to use the imprecise
/// language of "product, base and exponent". "Iteration" has a very different
/// meaning in programming, and an especially different one in C++.
/// There may be iteration over an operator that is not associative (such as
/// quaternion multiplication); this function leverages the associative
/// property of the operator to "halve" the count of iterations at each step.
/// \note There is a symmetrical operation to be implemented, associative
/// iteration in the "progressive" direction: instead of starting with the
/// most significant bit of the count and working down to the LSB doing
/// "op(result, base, count)", it would go from LSB to MSB doing
/// "op(result, square, exponent)"
/// \tparam Operator a callable with three arguments: the left and right
/// arguments to the operation and the count to be used; the "count" is an
/// artifact of this generalization
/// \tparam IterationCount loosely models the "exponent" in "exponentiation";
/// however, it may not be a number: the iteration count is part of the
/// execution context in which the operator is applied
/// \param forSquaring is an artifact of this generalization
/// \param log2Count potentially reduces the number of iterations if the
/// caller knows a priori that there are fewer iterations than the type of
/// the exponent would allow
template<
    typename Base, typename IterationCount, typename Operator,
    // the critical use of associativity is that it allows halving the
    // iteration count
    typename CountHalver
>
constexpr auto associativeOperatorIterated_regressive(
    Base base, Base neutral, IterationCount count, IterationCount forSquaring,
    Operator op, unsigned log2Count, CountHalver ch
) {
    auto result = neutral;
    if(!log2Count) { return result; }
    for(;;) {
        result = op(result, base, count);
        if(!--log2Count) { break; }
        result = op(result, result, forSquaring);
        count = ch(count);
    }
    return result;
}

template<int ActualBits, int NB, typename T>
constexpr auto multiplication_OverflowUnsafe_SpecificBitCount(
    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
) {
    using S = SWAR<NB, T>;
    // add the multiplicand into the lanes whose current multiplier MSB is set
    auto operation = [](auto left, auto right, auto counts) {
        auto addendums = makeLaneMaskFromMSB(counts);
        return left + (addendums & right);
    };
    // drop the multiplier bit just consumed and bring up the next one
    auto halver = [](auto counts) {
        auto msbCleared = counts & ~S{S::MostSignificantBit};
        T res = msbCleared.value() << 1;
        return S{res};
    };
    T val = multiplier.value() << (NB - ActualBits);
    multiplier = S{val};
    return associativeOperatorIterated_regressive(
        multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation,
        ActualBits, halver
    );
}

/// \note Not removed yet because it is an example of "progressive"
/// associative exponentiation
template<int ActualBits, int NB, typename T>
constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated(
    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
) {
    using S = SWAR<NB, T>;
    constexpr auto LeastBit = S::LeastSignificantBit;
    auto multiplicandDoubling = multiplicand.value();
    auto mplier = multiplier.value();
    auto product = S{0};
    for(auto count = ActualBits;;) {
        auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier});
        product =
            product + (multiplicandDoublingMask & S{multiplicandDoubling});
        if(!--count) { break; }
        multiplicandDoubling <<= 1;
        auto leastBitCleared = mplier & ~LeastBit;
        mplier = leastBitCleared >> 1;
    }
    return product;
}
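// Illustrative checks (not part of the original library): lane-wise products,
// assuming the SWAR<4, uint16_t> instantiation with all 4 bits in use.
namespace example_multiplication {
    using S4 = SWAR<4, uint16_t>;
    // lanes (from least significant): 4*0, 3*1, 2*2, 1*3 -> 0, 3, 4, 3
    static_assert(
        multiplication_OverflowUnsafe_SpecificBitCount<4>(
            S4{0x1234}, S4{0x3210}).value() == 0x3430);
    // "overflow unsafe" means overflow corrupts the next lane up: computing
    // 5 * 5 doubles an intermediate 0xA, carrying into lane 1, so the result
    // is 0x19 spread across two lanes rather than 25 mod 16 within lane 0
    static_assert(
        multiplication_OverflowUnsafe_SpecificBitCount<4>(
            S4{0x0005}, S4{0x0005}).value() == 0x0019);
}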
template<int ActualBits, int NB, typename T>
constexpr auto exponentiation_OverflowUnsafe_SpecificBitCount(
    SWAR<NB, T> x, SWAR<NB, T> exponent
) {
    using S = SWAR<NB, T>;
    // multiply the accumulated result by the base only in the lanes whose
    // current exponent MSB is set
    auto operation = [](auto left, auto right, auto counts) {
        const auto mask = makeLaneMaskFromMSB(counts);
        const auto product =
            multiplication_OverflowUnsafe_SpecificBitCount<ActualBits>(
                left, right);
        return (product & mask) | (left & ~mask);
    };
    // the halver works the same way as in multiplication: drop the exponent
    // bit just consumed and bring up the next one
    auto halver = [](auto counts) {
        auto msbCleared = counts & ~S{S::MostSignificantBit};
        return S{static_cast<T>(msbCleared.value() << 1)};
    };
    exponent = S{static_cast<T>(exponent.value() << (NB - ActualBits))};
    return associativeOperatorIterated_regressive(
        x,
        S{meta::BitmaskMaker<T, 1, NB>().value}, // the neutral element is a
                                                 // 1 in every lane
        exponent, S{S::MostSignificantBit}, operation, ActualBits, halver
    );
}

using S = SWAR<4, u16>;

/** Transforms each lane's value into a binary tally: a lane holding n becomes
 * a lane with its n least significant bits set.
 * E.g. 0b0011 (3) -> 0b0111
 * Note that a tally of the full lane width overflows the lane (2^NBits wraps
 * to 0), which only works out for the most significant lane (see the third
 * static_assert below); shift counts should otherwise stay below NBits.
 */
template <typename S>
constexpr auto base2TallyTransform_Plural(S input) {
    constexpr auto two =
        S{meta::BitmaskMaker<typename S::type, 2, S::NBits>::value};
    constexpr auto one = S::LeastSignificantBit;
    // 2^n - 1 has exactly the n least significant bits set
    typename S::type v =
        exponentiation_OverflowUnsafe_SpecificBitCount<S::NBits>(two, input)
            .value() - one;
    return S{v};
}

static_assert(base2TallyTransform_Plural(S{0b0001'0010'0011'0011}).value()
    == 0b0001'0011'0111'0111);
static_assert(base2TallyTransform_Plural(S{0b0000'0001'0010'0011}).value()
    == 0b0000'0001'0011'0111);
static_assert(base2TallyTransform_Plural(S{0b0100'0001'0010'0011}).value()
    == 0b1111'0001'0011'0111);
static_assert(base2TallyTransform_Plural(S{0b0000'0000'0000'0001}).value()
    == 0b0000'0000'0000'0001);
static_assert(
    base2TallyTransform_Plural(SWAR<8, uint16_t>{0b00000111'00000101}).value()
    == 0b01111111'00011111); // tallies of 7 and 5

/// \brief Shifts each lane of \p input right by the amount held in the
/// corresponding lane of \p shifts
template <typename S>
constexpr auto rightShift_Plural(S input, S shifts) {
    using T = typename S::type;
    // clear, in each lane, the bits that would be shifted out, so that they
    // cannot spill into the next lane down
    auto minimumMask = ~base2TallyTransform_Plural(shifts);
    auto inputMasked = input.value() & minimumMask.value();
    T result = 0;
    for (int i = 0; i < S::Lanes; i++) {
        auto laneMask = S::laneMask(i);
        auto currentShiftAmount = shifts.at(i);
        auto masked = inputMasked & laneMask;
        auto shifted = masked >> currentShiftAmount;
        result |= shifted;
    }
    return S{result};
}

static_assert(1 >> 0 == 1); // sanity check: shifting by zero is the identity
static_assert(rightShift_Plural(
    S{0b0111'0111'0111'0111}, S{0b0010'0010'0010'0010}
).value() == 0b0001'0001'0001'0001);
static_assert(rightShift_Plural(
    S{0b0000'0000'1111'0001}, S{0b0000'0000'0000'0001}
).value() == 0b0000'0000'1111'0000);
static_assert(rightShift_Plural(
    S{0b0000'1000'1000'1000}, S{0b0100'0011'0010'0001}
).value() == 0b0000'0001'0010'0100);
static_assert(rightShift_Plural(
    S{0b1111'1111'1111'1111}, S{0b0001'0001'0001'0001}
).value() == 0b0111'0111'0111'0111);
static_assert(rightShift_Plural(
    S{0b0000'0000'1111'0001}, S{0b0000'0000'0000'0000}
).value() == 0b0000'0000'1111'0001);
static_assert(rightShift_Plural(
    S{0b0000'0000'1111'0001}, S{0b0000'0000'0001'0001}
).value() == 0b0000'0000'0111'0000);

static_assert(S::LeastSignificantLaneMask == 0b0000'0000'0000'1111);
static_assert(S::laneMask(0) == 0b0000'0000'0000'1111);
static_assert(S::laneMask(1) == 0b0000'0000'1111'0000);
static_assert(S::laneMask(2) == 0b0000'1111'0000'0000);
static_assert(S::laneMask(3) == 0b1111'0000'0000'0000);
static_assert(S{S::laneMask(3)}.at(3) == 0b0000'0000'0000'1111);

template<int NB, typename T>
constexpr auto multiplication_OverflowUnsafe(
    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
) {
    return multiplication_OverflowUnsafe_SpecificBitCount<NB>(
        multiplicand, multiplier
    );
}

template<int NB, typename T>
struct SWAR_Pair {
    SWAR<NB, T> even, odd;
};
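// Illustrative check (not part of the original library): lane-wise
// exponentiation, assuming the SWAR<4, uint16_t> instantiation; each lane
// computes base^exponent independently by square-and-multiply.
namespace example_exponentiation {
    using S4 = SWAR<4, uint16_t>;
    // base 2 in every lane, exponents 3, 2, 1, 0 -> 8, 4, 2, 1
    static_assert(
        exponentiation_OverflowUnsafe_SpecificBitCount<4>(
            S4{0x2222}, S4{0x3210}).value() == 0x8421);
}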
template<int NB, typename T>
constexpr SWAR<NB, T> doublingMask() {
    using S = SWAR<NB, T>;
    static_assert(0 == S::Lanes % 2, "Only even number of elements supported");
    using D = SWAR<NB * 2, T>;
    return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit};
}

template<int NB, typename T>
constexpr auto doublePrecision(SWAR<NB, T> input) {
    using S = SWAR<NB, T>;
    static_assert(
        0 == S::NSlots % 2,
        "Precision can only be doubled for SWARs of even element count"
    );
    using RV = SWAR<NB * 2, T>;
    constexpr auto DM = doublingMask<NB, T>();
    return SWAR_Pair<NB * 2, T>{
        RV{(input & DM).value()},
        RV{(input.value() >> NB) & DM.value()}
    };
}

template<int NB, typename T>
constexpr auto halvePrecision(SWAR<NB, T> even, SWAR<NB, T> odd) {
    using S = SWAR<NB, T>;
    static_assert(0 == NB % 2, "Only even lane-bitcounts supported");
    using RV = SWAR<NB/2, T>;
    constexpr auto HalvingMask = doublingMask<NB/2, T>();
    auto
        evenHalf = RV{even.value()} & HalvingMask,
        oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2};
    return evenHalf | oddHalf;
}

}

#endif

using S = zoo::swar::SWAR<8, uint64_t>;

// rightShift_Plural is found by argument-dependent lookup in zoo::swar
auto foo(S a, S b) { return rightShift_Plural(a, b); }
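// Illustrative round trip (not part of the original library): doublePrecision
// splits a 4-bit SWAR into two 8-bit SWARs holding the even- and odd-indexed
// lanes, and halvePrecision reassembles them; names and values are only for
// this sketch.
namespace example_precision {
    using S4 = zoo::swar::SWAR<4, uint16_t>;
    constexpr auto split = zoo::swar::doublePrecision(S4{0x4321});
    // even-indexed lanes (values 1 and 3) widen into the 8-bit lanes
    static_assert(split.even.value() == 0x0301);
    // odd-indexed lanes (values 2 and 4) widen likewise
    static_assert(split.odd.value() == 0x0402);
    static_assert(
        zoo::swar::halvePrecision(split.even, split.odd).value() == 0x4321);
}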