#ifndef ZOO_SWAR_ASSOCIATIVE_ITERATION_H
#define ZOO_SWAR_ASSOCIATIVE_ITERATION_H

#ifndef ZOO_HEADER_META_BITMASKMAKER_H
#define ZOO_HEADER_META_BITMASKMAKER_H

#include <cstdint>

namespace zoo {
namespace meta {

namespace detail {

/// Repeats the given pattern in the whole of the argument
/// \tparam T the desired integral type
/// \tparam Current the pattern built so far
/// \tparam CurrentSize how many bits the pattern occupies
template<typename T, T Current, int CurrentSize, bool>
struct BitmaskMaker_impl {
    constexpr static T value =
        BitmaskMaker_impl<
            T,
            T(Current | (Current << CurrentSize)),
            CurrentSize*2,
            CurrentSize*2 < sizeof(T)*8
        >::value;
};
template<typename T, T Current, int CurrentSize>
struct BitmaskMaker_impl<T, Current, CurrentSize, false> {
    constexpr static T value = Current;
};

} // detail

/// Repeats the given pattern in the whole of the argument
/// \tparam T the desired integral type
/// \tparam Current the pattern to broadcast
/// \tparam CurrentSize the number of bits in the pattern
template<typename T, T Current, int CurrentSize>
struct BitmaskMaker {
    constexpr static auto value =
        detail::BitmaskMaker_impl<
            T, Current, CurrentSize, CurrentSize*2 <= sizeof(T)*8
        >::value;
};

static_assert(0xF0F0 == BitmaskMaker<uint16_t, 0xF0, 8>::value);
static_assert(0xEDFEDFED == BitmaskMaker<uint32_t, 0xFED, 12>::value);

}} // zoo::meta

#endif

// #pragma once

/// \file Swar.h SWAR operations

#ifndef ZOO_HEADER_META_LOG_H
#define ZOO_HEADER_META_LOG_H

#ifndef ZOO_HEADER_META_POPCOUNT_H
#define ZOO_HEADER_META_POPCOUNT_H

// #include "zoo/meta/BitmaskMaker.h"

namespace zoo {
namespace meta {

namespace detail {

template<int Level, typename T = uint64_t>
struct PopcountMask_impl {
    constexpr static inline auto value =
        BitmaskMaker<
            T, (1 << Level) - 1, (1 << (Level + 1))
        >::value;
};

template<int Bits> struct UInteger_impl;
template<> struct UInteger_impl<8> { using type = uint8_t; };
template<> struct UInteger_impl<16> { using type = uint16_t; };
template<> struct UInteger_impl<32> { using type = uint32_t; };
template<> struct UInteger_impl<64> { using type = uint64_t; };

// unfortunately this does not work since unsigned long long is not
// the same type as unsigned long, but may have the same size!
template<typename> constexpr int BitWidthLog = 0;
template<> constexpr inline int BitWidthLog<uint8_t> = 3;
template<> constexpr inline int BitWidthLog<uint16_t> = 4;
template<> constexpr inline int BitWidthLog<uint32_t> = 5;
template<> constexpr inline int BitWidthLog<uint64_t> = 6;

} // detail

template<int Bits>
using UInteger = typename detail::UInteger_impl<Bits>::type;

template<int LogarithmOfGroupSize, typename T = std::uint64_t>
struct PopcountLogic {
    constexpr static int GroupSize = 1 << LogarithmOfGroupSize;
    constexpr static int HalvedGroupSize = GroupSize / 2;
    constexpr static auto CombiningMask =
        BitmaskMaker<T, T(1 << HalvedGroupSize) - 1, GroupSize>::value;
    using Recursion = PopcountLogic<LogarithmOfGroupSize - 1, T>;

    constexpr static T execute(T input);
};

template<typename T>
struct PopcountLogic<0, T> {
    constexpr static T execute(T input) { return input; }
};

template<typename T>
struct PopcountLogic<1, T> {
    constexpr static T execute(T input) {
        return input - ((input >> 1) & BitmaskMaker<T, 1, 2>::value);
        // For each pair of bits, the expression results in:
        // 11: 3 - 1: 2
        // 10: 2 - 1: 1
        // 01: 1 - 0: 1
        // 00: 0 - 0: 0
    }
};

template<int LogarithmOfGroupSize, typename T>
constexpr T PopcountLogic<LogarithmOfGroupSize, T>::execute(T input) {
    return
        Recursion::execute(input & CombiningMask) +
        Recursion::execute((input >> HalvedGroupSize) & CombiningMask);
}

#ifdef _MSC_VER
template<int LogarithmOfGroupSize, typename T = uint64_t>
using PopcountIntrinsic = PopcountLogic<LogarithmOfGroupSize, T>;
#else
template<int LogarithmOfGroupSize, typename T = uint64_t>
struct PopcountIntrinsic {
    constexpr static T execute(T input) {
        using UI = UInteger<1 << LogarithmOfGroupSize>;
        constexpr auto times = 8*sizeof(T);
        uint64_t rv = 0;
        for(auto n = times; n; ) {
            n -= 8*sizeof(UI);
            UI tmp = input >> n;
            tmp = __builtin_popcountll(tmp);
            rv |= uint64_t(tmp) << n;
        }
        return rv;
    }
};
#endif

}} // zoo::meta

#endif

namespace zoo {
namespace meta {

/// The algorithm is, from the perspective of the most significant bit set,
/// to copy it downward to all positions:
/// first copy it once, making a group of two copies of the most significant
/// bit set; then copy that group again, making a group of four copies,
/// then 8 copies...
template<typename T>
constexpr int logFloor_WithoutIntrinsic(T value) {
    constexpr auto NBitsTotal = sizeof(T) * 8;
    for(auto groupSize = 1; groupSize < NBitsTotal; groupSize <<= 1) {
        value |= (value >> groupSize);
    }
    return PopcountLogic<detail::BitWidthLog<T>, T>::execute(value) - 1;
}

#ifdef _MSC_VER
// change to use the relevant functions in C++ 20's <bit> header
// when bumping to C++ 20
constexpr int logFloor(uint64_t arg) {
    return logFloor_WithoutIntrinsic(arg);
}
#else
constexpr int logFloor(uint64_t arg) {
    return 63 - __builtin_clzll(arg);
}
#endif

constexpr int logCeiling(uint64_t arg) {
    auto floorLog = logFloor(arg);
    return
        floorLog +
        // turn off the most significant bit and convert to 1 or 0
        ((arg ^ (1ull << floorLog)) ? 1 : 0);
}

}} // zoo::meta

#endif

#include <type_traits>

#ifdef _MSC_VER
#include <iso646.h>
#endif

namespace zoo {
namespace swar {

using u64 = uint64_t;
using u32 = uint32_t;
using u16 = uint16_t;
using u8 = std::uint8_t;

template<int LogNBits>
constexpr uint64_t popcount(uint64_t a) noexcept {
    return
        std::conditional_t<
            (4 < LogNBits),
            meta::PopcountIntrinsic<LogNBits>,
            meta::PopcountLogic<LogNBits>
        >::execute(a);
}
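// Illustrative check (added example, not part of the original source):
// with LogNBits == 3 the groups are bytes, so this computes a byte-wise
// population count through the PopcountLogic divide-and-conquer path.
static_assert(
    popcount<3>(0x0103'070F'1F3F'7FFFull) == 0x0102'0304'0506'0708ull
);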
/// Index into the bits of the type T that contains the MSB.
template<typename T>
constexpr std::make_unsigned_t<T> msbIndex(T v) noexcept {
    return meta::logFloor(v);
}

/// Index into the bits of the type T that contains the LSB.
///
/// \todo incorporate __builtin_ctzg when it is more widely available
template<typename T>
constexpr std::make_unsigned_t<T> lsbIndex(T v) noexcept {
    // This check should be SFINAE, but supporting all sorts
    // of base types is an ongoing task, we put a bare-minimum
    // temporary preventive measure with static_assert
    static_assert(sizeof(T) <= 8, "Unsupported");
#ifdef _MSC_VER
    // ~v & (v - 1) turns on all trailing zeroes, zeroes the rest
    return meta::logFloor(1 + (~v & (v - 1)));
#else
    return ~v ? __builtin_ctzll(v) : sizeof(T) * 8;
#endif
}

#ifndef _MSC_VER
constexpr __uint128_t lsbIndex(__uint128_t v) noexcept {
    auto low = (v << 64) >> 64;
    if(low) { return __builtin_ctzll(low); }
    return 64 + __builtin_ctzll(v >> 64);
}
#endif

/// Core abstraction around SIMD Within A Register (SWAR). Specifies 'lanes'
/// of NBits width against a type T, and provides an abstraction for performing
/// SIMD operations against that primitive type T treated as a SIMD register.
/// SWAR operations are usually constant time, log(lane count) cost, or
/// O(lane count) cost.
/// Certain computational workloads can be materially sped up using SWAR
/// techniques.
template<int NBits_, typename T = uint64_t>
struct SWAR {
    using type = std::make_unsigned_t<T>;
    constexpr static inline type
        NBits = NBits_,
        BitWidth = sizeof(T) * 8,
        Lanes = BitWidth / NBits,
        NSlots = Lanes,
        PaddingBitsCount = BitWidth % NBits,
        SignificantBitsCount = BitWidth - PaddingBitsCount,
        AllOnes = ~std::make_unsigned_t<T>{0} >> PaddingBitsCount,
            // Also constructed in RobinHood utils: possible bug?
        LeastSignificantBit =
            meta::BitmaskMaker<T, std::make_unsigned_t<T>{1}, NBits>::value,
        AllOnesInFirstLane = AllOnes >> (NBits * (Lanes - 1)),
        MostSignificantBit = LeastSignificantBit << (NBits - 1),
        LeastSignificantLaneMask =
            []() {
                if constexpr (NBits < sizeof(T) * 8) {
                    return (T(1) << NBits) - 1;
                } else {
                    return ~T(0);
                }
            }(),
        // Use LowerBits in favor of ~MostSignificantBit to not pollute
        // "don't care" bits when non-power-of-two bit lane sizes are supported
        LowerBits = MostSignificantBit - LeastSignificantBit;

    static_assert(
        std::is_unsigned_v<T>,
        "You should not use a signed type as the base for a SWAR type. "
        "If you have used `int` or `long`, please use `uint32_t` or `uint64_t` instead. "
        "This type parameter is only used to determine the total width of the SWAR register. "
        "The signed-ness of the type has no *intentional* semantic meaning to what you're defining and "
        "furthermore, some bitwise operations are different for signed and unsigned types."
    );

    SWAR() = default;
    constexpr explicit SWAR(T v): m_v(v) {}
    constexpr explicit operator T() const noexcept { return m_v; }

    constexpr T value() const noexcept { return m_v; }

    #define SWAR_UNARY_OPERATORS_X_LIST \
        X(SWAR, ~)
    //constexpr SWAR operator~() const noexcept { return SWAR{~m_v}; }

    #define SWAR_BINARY_OPERATORS_X_LIST \
        X(SWAR, &) X(SWAR, ^) X(SWAR, |) X(SWAR, -) X(SWAR, +)

    #define X(rt, op) constexpr rt operator op() const noexcept { return rt(op m_v); }
    SWAR_UNARY_OPERATORS_X_LIST
    #undef X

    #define X(rt, op) constexpr rt operator op (rt o) const noexcept { return rt(m_v op o.m_v); };
    SWAR_BINARY_OPERATORS_X_LIST
    #undef X

    constexpr static T laneMask(int laneIndex) noexcept {
        return LeastSignificantLaneMask << (NBits * laneIndex);
    }
    // Returns lane at position with other lanes cleared.
    constexpr T isolateLane(int laneIndex) const noexcept {
        return m_v & laneMask(laneIndex);
    }

    // Returns lane value at position, in lane 0, rest of SWAR cleared.
    constexpr T at(int position) const noexcept {
        return LeastSignificantLaneMask & (m_v >> (NBits * position));
    }

    constexpr SWAR clear(int position) const noexcept {
        constexpr auto filter = (T(1) << NBits) - 1;
        auto invertedMask = filter << (NBits * position);
        auto mask = ~invertedMask;
        return SWAR(m_v & mask);
    }

    /// The SWAR lane index that contains the MSB. It is not the bit index
    /// of the MSB.
    /// IE: 4 bit wide 32 bit SWAR: 0x0040'0000 will return 5, not 22
    /// (0 indexed).
    constexpr auto top() const noexcept { return msbIndex(m_v) / NBits; }
    constexpr auto lsbIndex() const noexcept { return swar::lsbIndex(m_v) / NBits; }

    constexpr SWAR setBit(int index, int bit) const noexcept {
        return SWAR(m_v | (T(1) << (index * NBits + bit)));
    }

    constexpr auto blitElement(int index, T value) const noexcept {
        auto elementMask = ((T(1) << NBits) - 1) << (index * NBits);
        return SWAR((m_v & ~elementMask) | (value << (index * NBits)));
    }

    constexpr SWAR blitElement(int index, SWAR other) const noexcept {
        constexpr auto OneElementMask = SWAR(~(~T(0) << NBits));
        auto IsolationMask = OneElementMask.shiftLanesLeft(index);
        return (*this & ~IsolationMask) | (other & IsolationMask);
    }

    constexpr SWAR shiftLanesLeft(int laneCount) const noexcept {
        return SWAR(value() << (NBits * laneCount));
    }

    constexpr SWAR shiftLanesRight(int laneCount) const noexcept {
        return SWAR(value() >> (NBits * laneCount));
    }

    /// \brief as the name suggests
    /// \param protectiveMask should clear the bits that would cross the lane.
    /// The bits that will be cleared are directly related to the count of
    /// shifts, it is natural to maintain the protective mask by the caller,
    /// otherwise, the mask would have to be computed in all invocations.
    /// We are not sure the optimizer would maintain this mask somewhere, if it
    /// were to recalculate it, it would be disastrous for performance.
    /// \note the \c static_cast are necessary because of narrowing conversions
    #define SHIFT_INTRALANE_OP_X_LIST X(Left, <<) X(Right, >>)
    #define X(name, op) \
        constexpr SWAR \
        shiftIntraLane##name(int bitCount, SWAR protectiveMask) const noexcept { \
            T shiftC = static_cast<T>(bitCount); \
            auto V = (*this & protectiveMask).value(); \
            auto rv = static_cast<T>(V op shiftC); \
            return SWAR{rv}; \
        }
    SHIFT_INTRALANE_OP_X_LIST
    #undef X
    #undef SHIFT_INTRALANE_OP_X_LIST

    constexpr SWAR multiply(T multiplier) const noexcept {
        return SWAR{m_v * multiplier};
    }

    T m_v;
};

/// Defining operator== on base SWAR types is entirely too error prone.
/// Force a verbose invocation.
template<int NBits, typename T = uint64_t>
constexpr auto horizontalEquality(SWAR<NBits, T> left, SWAR<NBits, T> right) {
    return left.m_v == right.m_v;
}

/// Isolating >= NBits in underlying integer type currently results in disaster.
// TODO(scottbruceheart) Attempting to use binary not (~) results in negative shift warnings.
template<int NBits, typename T = uint64_t>
constexpr auto isolate(T pattern) {
    return pattern & ((T(1) << NBits) - 1);
}

/// Clears the least bit set in type T
template<typename T = uint64_t>
constexpr auto clearLSB(T v) {
    return v & (v - 1);
}
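// Illustrative checks (added examples, not part of the original source):
// clearLSB drops exactly the lowest set bit; SWAR::at() extracts one lane.
static_assert(clearLSB(0b1100u) == 0b1000u);
static_assert(SWAR<4, uint16_t>{0x0A50}.at(1) == 0x5);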
/// Leaves on the least bit set, or all 1s for a 0 input.
template<typename T = uint64_t>
constexpr auto isolateLSB(T v) {
    return v & ~clearLSB(v);
}

template<int NBits, typename T>
constexpr auto leastNBitsMask() {
    return (T{1} << NBits) - 1;
}

template<int NBits, uint64_t T>
constexpr auto leastNBitsMask() {
    return ~((~0ull) << NBits);
}

template<int NBits, typename T = uint64_t>
constexpr T mostNBitsMask() {
    return ~leastNBitsMask<sizeof(T)*8 - NBits, T>();
}

/// Clears the block of N bits anchored at the LSB.
/// clearLSBits<3> applied to binary 00111100 is binary 00100000
template<int NBits, typename T = uint64_t>
constexpr auto clearLSBits(T v) {
    constexpr auto lowMask = leastNBitsMask<NBits, T>();
    return v & (~(lowMask << meta::logFloor(isolateLSB<T>(v))));
}

/// Isolates the block of N bits anchored at the LSB.
/// isolateLSBits<2> applied to binary 00111100 is binary 00001100
template<int NBits, typename T = uint64_t>
constexpr auto isolateLSBits(T v) {
    constexpr auto lowMask = leastNBitsMask<NBits, T>();
    return v & (lowMask << meta::logFloor(isolateLSB<T>(v)));
}

/// Broadcasts the value in the 0th lane of the SWAR to the entire SWAR.
/// Precondition: 0th lane of |v| contains a value to broadcast, remainder
/// of input SWAR zero.
template<int NBits, typename T = uint64_t>
constexpr auto broadcast(SWAR<NBits, T> v) {
    constexpr T Ones = meta::BitmaskMaker<T, 1, NBits>::value;
    return SWAR<NBits, T>(T(v) * Ones);
}

/// BooleanSWAR treats the MSB of each SWAR lane as the boolean associated
/// with that lane.
template<int NBits, typename T>
struct BooleanSWAR: SWAR<NBits, T> {
    using Base = SWAR<NBits, T>;

    // Booleanness is stored in the MSBs
    static constexpr auto MaskMSB =
        broadcast<NBits, T>(Base(T(1) << (NBits - 1)));
    static constexpr auto MaskLSB =
        broadcast<NBits, T>(Base(T(1)));
    // Turns off LSB of each lane
    static constexpr auto MaskNonLSB = ~MaskLSB;
    static constexpr auto MaskNonMSB = ~MaskMSB;

    constexpr explicit BooleanSWAR(T v): Base(v) {}

    constexpr BooleanSWAR clear(int bit) const noexcept {
        constexpr auto Bit = T(1) << (NBits - 1);
        return this->m_v ^ (Bit << (NBits * bit));
    }

    constexpr BooleanSWAR clearLSB() const noexcept {
        return BooleanSWAR(swar::clearLSB(this->value()));
    }

    /// BooleanSWAR treats the MSB of each lane as the boolean associated with
    /// that lane. A logical NOT in this circumstance _only_ flips the MSB of
    /// each lane. This operation is not ones or twos complement.
    constexpr auto operator ~() const noexcept {
        return BooleanSWAR(Base{Base::MostSignificantBit} ^ *this);
    }

    constexpr auto operator not() const noexcept {
        return BooleanSWAR(MaskMSB ^ *this);
    }

    #define BOOLEANSWAR_BINARY_LOGIC_OPERATOR_X_LIST X(^) X(&) X(|)
    #define X(op) constexpr BooleanSWAR operator op(BooleanSWAR other) const noexcept { return this->Base::operator op(other); }
    BOOLEANSWAR_BINARY_LOGIC_OPERATOR_X_LIST
    #undef X

    // BooleanSWAR as a mask:
    // BooleanSWAR<4, u16>(0x0800).MSBtoLaneMask() => SWAR<4, u16>(0x0F00)
    constexpr auto MSBtoLaneMask() const noexcept {
        // Convert pattern 10* to 01*
        const auto MSBMinusOne = this->m_v - (this->m_v >> (NBits - 1));
        // Blit 01* and 10* together for 1* when MSB was on.
        return SWAR<NBits, T>(MSBMinusOne | this->m_v);
    }

    explicit constexpr operator bool() const noexcept { return this->m_v; }

private:
    constexpr BooleanSWAR(Base initializer) noexcept: SWAR<NBits, T>(initializer) {}

    template<int N, int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    constantIsGreaterEqual(SWAR<NB, TT>) noexcept;

    template<int N, int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    constantIsGreaterEqual_MSB_off(SWAR<NB, TT>) noexcept;

    template<int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    greaterEqual(SWAR<NB, TT>, SWAR<NB, TT>) noexcept;

    template<int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    greaterEqual_MSB_off(SWAR<NB, TT>, SWAR<NB, TT>) noexcept;

    template<int NB, typename TT>
    constexpr T indexOfMostSignficantLaneSet(SWAR<NBits, T> test) noexcept;

    template<int NB, typename TT>
    friend constexpr BooleanSWAR<NB, TT>
    convertToBooleanSWAR(SWAR<NB, TT> arg) noexcept;
};

template<int NBits, typename T>
constexpr BooleanSWAR<NBits, T> convertToBooleanSWAR(SWAR<NBits, T> arg) noexcept {
    return SWAR<NBits, T>{SWAR<NBits, T>::MostSignificantBit} & arg;
}

template<int N, int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
constantIsGreaterEqual(SWAR<NBits, T> subtrahend) noexcept {
    static_assert(1 < NBits, "Degenerated SWAR");
    constexpr auto MSB_Position = NBits - 1;
    constexpr auto MSB = T(1) << MSB_Position;
    constexpr auto MSB_Mask = SWAR<NBits, T>{meta::BitmaskMaker<T, MSB, NBits>::value};
    constexpr auto Minuend = SWAR<NBits, T>{meta::BitmaskMaker<T, N, NBits>::value};
    constexpr auto N_MSB = MSB & N;

    auto subtrahendWithMSB_on = MSB_Mask & subtrahend;
    auto subtrahendWithMSB_off = ~subtrahendWithMSB_on;
    auto subtrahendMSBs_turnedOff = subtrahend ^ subtrahendWithMSB_on;
    if constexpr(N_MSB) {
        auto leastSignificantComparison = Minuend - subtrahendMSBs_turnedOff;
        auto merged =
            subtrahendMSBs_turnedOff | // the minuend MSBs are on
            leastSignificantComparison;
        return MSB_Mask & merged;
    } else {
        auto minuendWithMSBs_turnedOn = Minuend | MSB_Mask;
        auto leastSignificantComparison =
            minuendWithMSBs_turnedOn - subtrahendMSBs_turnedOff;
        auto merged =
            subtrahendWithMSB_off & // the minuend MSBs are off
            leastSignificantComparison;
        return MSB_Mask & merged;
    }
}

template<int N, int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
constantIsGreaterEqual_MSB_off(SWAR<NBits, T> subtrahend) noexcept {
    static_assert(1 < NBits, "Degenerated SWAR");
    constexpr auto MSB_Position = NBits - 1;
    constexpr auto MSB = T(1) << MSB_Position;
    constexpr auto MSB_Mask = meta::BitmaskMaker<T, MSB, NBits>::value;
    constexpr auto Minuend = meta::BitmaskMaker<T, N, NBits>::value;
    constexpr auto N_MSB = MSB & Minuend;

    auto subtrahendWithMSB_on = subtrahend;
    auto subtrahendWithMSB_off = ~subtrahendWithMSB_on;
    auto subtrahendMSBs_turnedOff = subtrahend ^ subtrahendWithMSB_on;
    if constexpr(N_MSB) {
        return MSB_Mask;
    } else {
        auto minuendWithMSBs_turnedOn = Minuend | MSB_Mask;
        auto leastSignificantComparison =
            minuendWithMSBs_turnedOn - subtrahendMSBs_turnedOff;
        return MSB_Mask & leastSignificantComparison;
    }
}

template<typename T, typename U, typename V>
constexpr T median(T x, U y, V z) {
    return (x | y) & (y | z) & (x | z);
}

template<int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
greaterEqual(SWAR<NBits, T> left, SWAR<NBits, T> right) noexcept {
    // Adapted from TAOCP V4 P152
    // h is msbselector, x is right, l is lower/left. Sets MSB to 1 in lanes
    // in test variable t for when xi < yi for lane i. Invert for greaterEqual.
    // t = h & ~<x~yz>
    // z = (x|h) - (y&~h)
    using S = swar::SWAR<NBits, T>;
    const auto h = S::MostSignificantBit, x = left.value(), y = right.value();
    // x = left, y = right is x < y
    const auto z = (x|h) - (y&~h);
    // bitwise ternary median!
    const auto t = h & ~median(x, ~y, z);
    return ~BooleanSWAR<NBits, T>{static_cast<T>(t)}; // ~(x<y) === x >= y
}

// In the condition where only MSBs will be on, we can fast lookup with 1
// multiply the index of the most significant lane that is on.
// This appears to be a mapping from the (say) 256 unique values of a 64 bit
// int where only MSBs of each 8 bits can be on, but I don't fully understand it.
// Adapted from TAOCP Vol 4A Page 153 Eq 94.
template<int NBits, typename T>
constexpr T indexOfMostSignficantLaneSet(SWAR<NBits, T> test) noexcept {
    const auto TypeWidth = sizeof(T) * 8;
    const auto
        TopVal = (T{1} << (TypeWidth - NBits)) - 1,
        BottomVal = (T{1} << (NBits - 1)) - 1;
    const T MappingConstant = TopVal / BottomVal;
    return (test.value() * MappingConstant) >> (TypeWidth - NBits);
}

template<int NBits, typename T>
constexpr BooleanSWAR<NBits, T>
greaterEqual_MSB_off(SWAR<NBits, T> left, SWAR<NBits, T> right) noexcept {
    constexpr auto MLMSB = SWAR<NBits, T>{SWAR<NBits, T>::MostSignificantBit};
    auto minuend = MLMSB | left;
    return MLMSB & (minuend - right);
}

template<int NB, typename T>
constexpr auto booleans(SWAR<NB, T> arg) noexcept {
    return ~constantIsGreaterEqual<0>(arg);
}

template<int NBits, typename T>
constexpr auto differents(SWAR<NBits, T> a1, SWAR<NBits, T> a2) {
    return booleans(a1 ^ a2);
}

template<int NBits, typename T>
constexpr auto equals(SWAR<NBits, T> a1, SWAR<NBits, T> a2) {
    return ~differents(a1, a2);
}
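// Illustrative checks (added examples, not part of the original source):
// lane-wise comparisons produce BooleanSWAR values that flag each lane's MSB.
// Lanes of 0x3210 are 0, 1, 2, 3; compared against 2 in every lane,
// lanes 2 and 3 satisfy >=:
static_assert(
    greaterEqual_MSB_off(
        SWAR<4, uint16_t>{0x3210}, SWAR<4, uint16_t>{0x2222}
    ).value() == 0x8800
);
// The operands differ only in lane 1, so equality flags lanes 0, 2 and 3:
static_assert(
    equals(SWAR<4, uint16_t>{0x1234}, SWAR<4, uint16_t>{0x1294}).value() == 0x8808
);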
/*
This is just a draft implementation:
1. The isolator needs pre-computing instead of adding 3 ops per iteration
2. The update of the isolator is not needed in the last iteration
3. Consider returning not the logarithm, but the logarithm biased by 1
   (to support 0)
*/
template<int NBits, typename T>
constexpr SWAR<NBits, T> logarithmFloor(SWAR<NBits, T> v) noexcept {
    constexpr auto LogNBits = meta::logFloor(NBits);
    static_assert(
        NBits == (1 << LogNBits),
        "Logarithms of element width not power of two is un-implemented"
    );
    auto whole = v.value();
    auto isolationMask = SWAR<NBits, T>::MostSignificantBit;
    for(auto groupSize = 1; groupSize < NBits; groupSize <<= 1) {
        auto shifted = whole >> groupSize;
        // When shifting down a group to double the size of a group, the upper
        // "groupSize" bits will come from the element above, mask them out
        auto isolator = ~isolationMask;
        auto withoutCrossing = shifted & isolator;
        whole |= withoutCrossing;
        isolationMask |= (isolationMask >> groupSize);
    }
    constexpr auto ones = meta::BitmaskMaker<T, 1, NBits>::value;
    auto popcounts = meta::PopcountLogic<LogNBits, T>::execute(whole);
    return SWAR<NBits, T>{popcounts - ones};
}

static_assert(
    logarithmFloor(SWAR<8>{0x8040201008040201ull}).value() ==
    0x0706050403020100ull
);
static_assert(
    logarithmFloor(SWAR<8>{0xFF7F3F1F0F070301ull}).value() ==
    0x0706050403020100ull
);
static_assert(SWAR<4, uint16_t>::AllOnesInFirstLane == 0b0000'0000'0000'1111);

}} // zoo::swar

#include <cstdint>

//#define ZOO_DEVELOPMENT_DEBUGGING
#ifdef ZOO_DEVELOPMENT_DEBUGGING
#include <iostream>

inline std::ostream &binary(std::ostream &out, uint64_t input, int count) {
    while(count--) {
        out << (1 & input);
        input >>= 1;
    }
    return out;
}

template<int NB, typename B>
std::ostream &operator<<(std::ostream &out, zoo::swar::SWAR<NB, B> s) {
    using S = zoo::swar::SWAR<NB, B>;
    auto shiftCounter = sizeof(B) * 8 / NB;
    out << "<|";
    auto v = s.value();
    do {
        binary(out, v, NB) << '|';
    } while(--shiftCounter);
    return out << ">";
}

#define ZOO_TO_STRING(a) #a

// std::endl is needed within the context of debugging: flush the line
#define ZOO_TRACEABLE_EXP_IMPL(F, L, ...) std::cout << '"' << (__VA_ARGS__) << "\", \"" << F << ':' << L << "\", \"" << ZOO_TO_STRING(__VA_ARGS__) << "\"" << std::endl;
#define ZOO_TRACEABLE_EXPRESSION(...) ZOO_TRACEABLE_EXP_IMPL(__FILE__, __LINE__, __VA_ARGS__)
#else
#define ZOO_TRACEABLE_EXPRESSION(...) (void)(__VA_ARGS__)
#endif

namespace zoo::swar {

/// \note This code should be substituted by an application of "progressive"
/// algebraic iteration
/// \note There is also parallelPrefix (to be implemented)
template<int NB, typename B>
constexpr SWAR<NB, B> parallelSuffix(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto
        shiftClearingMask = S{static_cast<B>(~S::MostSignificantBit)},
        doubling = input,
        result = S{0};
    auto
        bitsToXOR = NB,
        power = 1;
    #define ZTE(...) // ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
    for(;;) {
        ZTE(doubling);
        if(1 & bitsToXOR) {
            ZTE(result ^ doubling);
            result = result ^ doubling;
            ZTE(doubling.shiftIntraLaneLeft(power, shiftClearingMask));
            doubling = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
        }
        ZTE(bitsToXOR >> 1);
        bitsToXOR >>= 1;
        if(!bitsToXOR) { break; }
        auto shifted = doubling.shiftIntraLaneLeft(power, shiftClearingMask);
        ZTE(shifted);
        ZTE(doubling ^ shifted);
        doubling = doubling ^ shifted;
        // 01...1
        // 001...1
        // 00001...1
        // 000000001...1
        shiftClearingMask =
            shiftClearingMask & S{static_cast<B>(shiftClearingMask.value() >> power)};
        ZTE(power << 1);
        power <<= 1;
    }
    ZTE(input);
    #undef ZTE
    return S{result};
}
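// Illustrative check (added example, not part of the original source):
// parallelSuffix computes, per lane, the running XOR from the LSB upward;
// a lone LSB therefore spreads to every bit of its lane.
static_assert(parallelSuffix(SWAR<4, uint16_t>{0x0001}).value() == 0x000F);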
/*
Binary compress: A fascinating algorithm.

Warren (Hacker's Delight) believes Guy L. Steele is the author of the
following binary compression operation, equivalent to Intel's BMI2
instruction PEXT, "Parallel Extraction":
From a "mask", a selector of bits from an input, we want to put them
together in the output.

For example's sake, this is the selector:
Note: this follows the usual 'big endian' convention of denoting the most
significant bit first:
0001 0011 0111 0111 0110 1110 1100 1010
Imagine the input is the 32-bit or 32-boolean variable expression
abcd efgh ijkl mnop qrst uvxy zABC DEFG
We want the selection
   d   gh  jkl  nop  rs  uvx  zA   D F
to be compressed into the output
0000 0000 0000 00dg hjkl nopr suvx zADF

This algorithm will virtually calculate the count of positions that the
selected bits travel to the right, by constructing the binary encoding of
that count:
It will identify the positions that will travel an odd number of positions
to the right, these are those whose position-travel-count have the units set.
It will then move those positions by one position to the right, and eliminate
them from the yet-to-move positions.
Because it eliminates the positions that would move an odd count, there
remain only positions that move an even number of positions.
Now it finds the positions that move an odd count of /pairs/ of positions,
and moves them 2 positions. This is equivalent to finding the positions that
would have the bit for 2 set in the count of positions to move right.
Then an odd count of /quartets/ of positions, and moves them 4; 8, 16, 32, ...

Complete example (32 bits)
Selection mask:
0001 0011 0111 0111 0110 1110 1100 1010
Input (each letter or variable is a boolean, that can have 0 or 1):
abcd efgh ijkl mnop qrst uvxy zABC DEFG
Selection (using spaces):
   d   gh  jkl  nop  rs  uvx  zA   D F
Desired result: dghjklnoprsuvxzADF

0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
1110 1100 1000 1000 1001 0001 0011 0101 ~compressionMask
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix == mk ==
    shift left by 1 == groupSize of ~compressionMask.
    This indicates the positions that have a 0 immediately to the right
    in compressionMask
4322 1000 9999 8888 7765 5554 4432 2110 number of 1s at and to the right of
    the current position in forParallelSuffix, last decimal digit
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
    forParallelSuffix

We have just identified the positions that need to move an odd number of
positions. Filter them with positions with a bit set in compressionMask:
0001 0011 0111 0111 0110 1110 1100 1010 compressionMask
---- ---- -111 ---- -1-- 111- ---- --1- mv == move (compress) these bits of
    compressionMask by 1 == groupSize
0001 0011 0000 0111 0010 0000 1100 1000 mv ^ compressionMask (clear the bits
    that will move)
---- ---- --11 1--- --1- -111 ---- ---1 mv >> 1 == groupSize
0001 0011 0011 1111 0010 0111 1100 1001 pseudo-compressed compressionMask
0100 1000 1111 0000 1101 1110 0010 0110 mp == parallel suffix of
    forParallelSuffix
1011 0111 0000 1111 0010 0001 1101 1001 ~mp == ~parallel suffix (bits not
    moved)
1101 1001 0001 0001 0010 0010 0110 1010 forParallelSuffix (remember: had a
    zero immediately to their right)
1001 0001 0000 0001 0010 0000 0100 1000 new forParallelSuffix (also not
    moved => had an even count of zeroes to their right)

At this point, we have removed from compressionMask the positions that moved
an odd number of positions and moved them 1 position; then, we only keep
positions that move an even number of positions.
Now, we will repeat these steps but for groups of two zeroes, then 4 zeroes, ...
*/
template<int NB, typename B>
constexpr SWAR<NB, B> compress(SWAR<NB, B> input, SWAR<NB, B> compressionMask) {
    // This solution uses the parallel suffix operation as a primary tool:
    // For every bit position it indicates an odd number of ones to the right,
    // including itself.
    // Because we want to detect the "oddness" of groups of zeroes to the
    // right, we flip the compression mask. To not count the bit position
    // itself, we shift by one.
    #define ZTE(...) // ZOO_TRACEABLE_EXPRESSION(__VA_ARGS__)
    ZTE(input);
    ZTE(compressionMask);
    using S = SWAR<NB, B>;
    auto result = input & compressionMask;
    auto groupSize = 1;
    auto
        shiftLeftMask = S{S::LowerBits},
        shiftRightMask = S{S::LowerBits << 1};
    ZTE(~compressionMask);
    auto forParallelSuffix = // this is called "mk" in the book
        (~compressionMask).shiftIntraLaneLeft(groupSize, shiftLeftMask);
    ZTE(forParallelSuffix);
    // note: forParallelSuffix denotes positions with a zero
    // immediately to the right in 'compressionMask'
    for(;;) {
        ZTE(groupSize);
        ZTE(shiftLeftMask);
        ZTE(shiftRightMask);
        ZTE(result);
        auto oddCountOfGroupsOfZerosToTheRight = // called "mp" in the book
            parallelSuffix(forParallelSuffix);
        ZTE(oddCountOfGroupsOfZerosToTheRight);
        // compress the bits just identified in both the result and the mask
        auto moving = compressionMask & oddCountOfGroupsOfZerosToTheRight;
        ZTE(moving);
        compressionMask =
            (compressionMask ^ moving) | // clear the moving
            moving.shiftIntraLaneRight(groupSize, shiftRightMask);
        ZTE(compressionMask);
        auto movingFromInput = result & moving;
        result =
            (result ^ movingFromInput) | // clear the moving from the result
            movingFromInput.shiftIntraLaneRight(groupSize, shiftRightMask);
        auto nextGroupSize = groupSize << 1;
        if(NB <= nextGroupSize) { break; }
        auto evenCountOfGroupsOfZerosToTheRight = ~oddCountOfGroupsOfZerosToTheRight;
        forParallelSuffix =
            forParallelSuffix & evenCountOfGroupsOfZerosToTheRight;
        auto newShiftLeftMask =
            shiftLeftMask.shiftIntraLaneRight(groupSize, shiftRightMask);
        shiftRightMask =
            shiftRightMask.shiftIntraLaneLeft(groupSize, shiftLeftMask);
        shiftLeftMask = newShiftLeftMask;
        groupSize = nextGroupSize;
    }
    ZTE(result);
    #undef ZTE
    return result;
}
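// Illustrative check (added example, not part of the original source):
// per-lane PEXT semantics, here on a single 8-bit lane. The mask 0xCA
// selects bits 7, 6, 3, 1 of 0xB6, which pack towards the LSB as 0b1001.
static_assert(
    compress(SWAR<8, uint32_t>{0xB6}, SWAR<8, uint32_t>{0xCA}).value() == 0x09
);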
/// \todo because of the desirability of "accumulating" the XORs at the MSB,
/// the parallel suffix operation is more suitable.
template<int NB, typename B>
constexpr SWAR<NB, B> parity(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto preResult = parallelSuffix(input);
    auto onlyMSB = preResult.value() & S::MostSignificantBit;
    return S{onlyMSB};
}

namespace impl {

template<int NB, typename B>
constexpr auto makeLaneMaskFromMSB_and_LSB(SWAR<NB, B> msb, SWAR<NB, B> lsb) {
    auto msbCopiedDown = msb - lsb;
    auto msbReintroduced = msbCopiedDown | msb;
    return msbReintroduced;
}

} // impl

template<int NB, typename B>
constexpr auto makeLaneMaskFromLSB(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto lsb = input & S{S::LeastSignificantBit};
    auto lsbCopiedToMSB = S{lsb.value() << (NB - 1)};
    return impl::makeLaneMaskFromMSB_and_LSB(lsbCopiedToMSB, lsb);
}

template<int NB, typename B>
constexpr auto makeLaneMaskFromMSB(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    auto msb = input & S{S::MostSignificantBit};
    B val = msb.value() >> (NB - 1);
    auto msbCopiedToLSB = S{val};
    return impl::makeLaneMaskFromMSB_and_LSB(msb, msbCopiedToLSB);
}
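// Illustrative checks (added examples, not part of the original source):
// parity flags the MSB of lanes with an odd number of set bits,
static_assert(parity(SWAR<4, uint16_t>{0x0001}).value() == 0x0008);
// and lanes whose LSB is set become all-ones masks (lanes 0 and 2 here).
static_assert(makeLaneMaskFromLSB(SWAR<4, uint16_t>{0x0101}).value() == 0x0F0F);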
template<int NB, typename B>
struct ArithmeticResultTriplet {
    SWAR<NB, B> result;
    BooleanSWAR<NB, B> carry, overflow;
};

/// \brief "Safe" addition, meaning non-corrupting unsigned overflow addition
/// and producing the flags for unsigned overflow (carry) and signed overflow.
///
/// This is the function to perform signed addition (that relies on supporting
/// unsigned overflow).
///
/// Notice that the support for unsigned overflow directly allows signed
/// addition (and both signed and unsigned subtraction).
///
/// This function is called "full addition" because it can perform the addition
/// with all the bits of the inputs by making sure the overflow (in the
/// unsigned sense) does not cross the lane boundary.
/// This function has less performance than "optimistic" addition (operator+).
/// The mechanism to manage potential overflow naturally allows the calculation
/// of the carry and signed overflow flags for no extra performance cost.
///
/// The performance relies on the optimizer removing the calculation of
/// the carry or signed overflow if they are not used.
///
/// When interpreted as unsigned addition, carrying out of the result is
/// overflow.
///
/// The carry bit is essential to increase the precision of the results in
/// normal arithmetic, but in unsigned SWAR it is preferable to double the
/// precision before executing addition, thus guaranteeing no overflow will
/// occur and using the more performant operator+ addition. Hence,
/// the carry flag is mostly useful in SWAR for detection of unsigned overflow,
/// as opposed to a helper to achieve higher precision as it is normally used:
/// Once we support non-power-of-two sizes of lanes in number of bits, we could
/// have transformations that would convert a SWAR of lanes of N bits to a SWAR
/// of (N + 1) bits, however, functionally, the performance cost of doing this
/// will be strictly higher than doubling the bit count of the current SWAR.
/// With double the number of bits, not only are additions guaranteed to not
/// overflow (as unsigned) but even multiplications won't. Then it is not
/// practical to use the carry bit for other than detection of unsigned
/// overflow.
///
/// The signed integer interpretation is two's complement, which
/// routinely overflows (when interpreted as unsigned). Signed overflow may
/// only occur if the inputs have the same sign, it is detected when the sign
/// of the result is opposite that of the inputs.
///
/// \todo The library is not explicit with regards to the fact that
/// operator+ is only useful with the unsigned interpretation. A decision
/// must be made to either keep the library as is, or to promote full addition
/// to operator+, and the rationale for the decision
///
/// \todo What is the right place for this function?
/// It was added here because in practice multiplication overflows, as a draft
template<int NB, typename B>
constexpr ArithmeticResultTriplet<NB, B>
fullAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
    using S = SWAR<NB, B>;
    constexpr auto
        SignBit = S{S::MostSignificantBit},
        LowerBits = SignBit - S{S::LeastSignificantBit};
    // prevent overflow by clearing the most significant bits
    auto
        s1prime = LowerBits & s1,
        s2prime = LowerBits & s2,
        resultPrime = s1prime + s2prime,
        s1Sign = SignBit & s1,
        s2Sign = SignBit & s2,
        signPrime = SignBit & resultPrime,
        result = resultPrime ^ s1Sign ^ s2Sign,
        // carry is set whenever at least two of the sign bits of s1, s2,
        // signPrime are set
        carry = (s1Sign & s2Sign) | (s1Sign & signPrime) | (s2Sign & signPrime),
        // overflow: the inputs have the same sign and different to result
        // same sign: s1Sign ^ s2Sign
        overflow = (s1Sign ^ s2Sign ^ SignBit) & (s1Sign ^ result);
    using BS = BooleanSWAR<NB, B>;
    return { result, BS{carry.value()}, BS{overflow.value()} };
};

template<int NB, typename B>
constexpr SWAR<NB, B> saturatingUnsignedAddition(SWAR<NB, B> s1, SWAR<NB, B> s2) {
    const auto additionResult = fullAddition(s1, s2);
    // If we carry unsigned, we need to saturate: thus OR the carry bit with
    // the lane bits (carry because it happens to be earlier in the struct
    // declaration)
    return additionResult.carry.MSBtoLaneMask() | additionResult.result;
}

/// \brief Negation is useful only for the signed integer interpretation
template<int NB, typename B>
constexpr auto negate(SWAR<NB, B> input) {
    using S = SWAR<NB, B>;
    constexpr auto Ones = S{S::LeastSignificantBit};
    return fullAddition(~input, Ones).result;
}
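// Illustrative checks (added examples, not part of the original source),
// on the 4-bit lanes of a uint16_t:
// 15 + 1 wraps to 0 in lane 0 and raises that lane's carry flag...
static_assert(
    fullAddition(SWAR<4, uint16_t>{0x000F}, SWAR<4, uint16_t>{0x0001})
        .result.value() == 0x0000
);
static_assert(
    fullAddition(SWAR<4, uint16_t>{0x000F}, SWAR<4, uint16_t>{0x0001})
        .carry.value() == 0x0008
);
// ...while the saturating flavor clamps the lane at 15 instead.
static_assert(
    saturatingUnsignedAddition(
        SWAR<4, uint16_t>{0x000F}, SWAR<4, uint16_t>{0x0001}
    ).value() == 0x000F
);
// Two's complement negation per lane: -1 is 0xF.
static_assert(negate(SWAR<4, uint16_t>{0x0001}).value() == 0x000F);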
/// \brief Performs a generalized iterated application of an associative
/// operator to a base
///
/// In algebra, the repeated application of an operator to a "base" has
/// different names depending on the operator; for example,
/// "a + a + a + ... + a" n times would be called "repeated addition", and if
/// * is numeric multiplication, "a * a * a * ... * a" n times would be called
/// "exponentiation of a to the nth power".
/// The general term in algebra is "iteration", hence the naming of this
/// function.
/// Since * and "product" are frequently used in algebra to denote the
/// application of a general operator, we keep the option to use the imprecise
/// language of "product, base and exponent". "Iteration" has a very different
/// meaning in programming, and an especially different one in C++.
/// There may be iteration over an operator that is not associative (such as
/// quaternion multiplication); this function leverages the associative
/// property of the operator to "halve" the count of iterations at each step.
/// \note There is a symmetrical operation to be implemented, associative
/// iteration in the "progressive" direction: instead of starting with the
/// most significant bit of the count and working down to the LSB doing
/// "op(result, base, count)", it would go from LSB to MSB doing
/// "op(result, square, exponent)"
/// \tparam Operator a callable with three arguments: the left and right
/// arguments to the operation and the count to be used; the "count" is an
/// artifact of this generalization
/// \tparam IterationCount loosely models the "exponent" in "exponentiation";
/// however, it may not be a number: the iteration count is part of the
/// execution context in which the operator is applied
/// \param forSquaring is an artifact of this generalization
/// \param log2Count potentially reduces the number of iterations if the
/// caller knows a priori that there are fewer iterations than the type of
/// the exponent would allow
template<
    typename Base, typename IterationCount, typename Operator,
    // the critical use of associativity is that it allows halving the
    // iteration count
    typename CountHalver
>
constexpr auto associativeOperatorIterated_regressive(
    Base base, Base neutral, IterationCount count, IterationCount forSquaring,
    Operator op, unsigned log2Count, CountHalver ch
) {
    auto result = neutral;
    if(!log2Count) { return result; }
    for(;;) {
        result = op(result, base, count);
        if(!--log2Count) { break; }
        result = op(result, result, forSquaring);
        count = ch(count);
    }
    return result;
}

template<int ActualBits, int NB, typename T>
constexpr auto multiplication_OverflowUnsafe_SpecificBitCount(
    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
) {
    using S = SWAR<NB, T>;
    // add the multiplicand into the lanes whose current multiplier MSB is set
    auto operation = [](auto left, auto right, auto counts) {
        auto addendums = makeLaneMaskFromMSB(counts);
        return left + (addendums & right);
    };
    // drop the multiplier bit just consumed and bring up the next one
    auto halver = [](auto counts) {
        auto msbCleared = counts & ~S{S::MostSignificantBit};
        T res = msbCleared.value() << 1;
        return S{res};
    };
    T val = multiplier.value() << (NB - ActualBits);
    multiplier = S{val};
    return associativeOperatorIterated_regressive(
        multiplicand, S{0}, multiplier, S{S::MostSignificantBit}, operation,
        ActualBits, halver
    );
}

/// \note Not removed yet because it is an example of "progressive"
/// associative exponentiation
template<int ActualBits, int NB, typename T>
constexpr auto multiplication_OverflowUnsafe_SpecificBitCount_deprecated(
    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
) {
    using S = SWAR<NB, T>;
    constexpr auto LeastBit = S::LeastSignificantBit;
    auto multiplicandDoubling = multiplicand.value();
    auto mplier = multiplier.value();
    auto product = S{0};
    for(auto count = ActualBits;;) {
        auto multiplicandDoublingMask = makeLaneMaskFromLSB(S{mplier});
        product =
            product + (multiplicandDoublingMask & S{multiplicandDoubling});
        if(!--count) { break; }
        multiplicandDoubling <<= 1;
        auto leastBitCleared = mplier & ~LeastBit;
        mplier = leastBitCleared >> 1;
    }
    return product;
}
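// Illustrative checks (not part of the original library): lane-wise products,
// assuming the SWAR<4, uint16_t> instantiation with all 4 bits in use.
namespace example_multiplication {
    using S4 = SWAR<4, uint16_t>;
    // lanes (from least significant): 4*0, 3*1, 2*2, 1*3 -> 0, 3, 4, 3
    static_assert(
        multiplication_OverflowUnsafe_SpecificBitCount<4>(
            S4{0x1234}, S4{0x3210}).value() == 0x3430);
    // "overflow unsafe" means overflow corrupts the next lane up: computing
    // 5 * 5 doubles an intermediate 0xA, carrying into lane 1, so the result
    // is 0x19 spread across two lanes rather than 25 mod 16 within lane 0
    static_assert(
        multiplication_OverflowUnsafe_SpecificBitCount<4>(
            S4{0x0005}, S4{0x0005}).value() == 0x0019);
}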
template<int ActualBits, int NB, typename T>
constexpr auto exponentiation_OverflowUnsafe_SpecificBitCount(
    SWAR<NB, T> x, SWAR<NB, T> exponent
) {
    using S = SWAR<NB, T>;
    // multiply the accumulated result by the base only in the lanes whose
    // current exponent MSB is set
    auto operation = [](auto left, auto right, auto counts) {
        const auto mask = makeLaneMaskFromMSB(counts);
        const auto product =
            multiplication_OverflowUnsafe_SpecificBitCount<ActualBits>(
                left, right);
        return (product & mask) | (left & ~mask);
    };
    // the halver works the same way as in multiplication: drop the exponent
    // bit just consumed and bring up the next one
    auto halver = [](auto counts) {
        auto msbCleared = counts & ~S{S::MostSignificantBit};
        return S{static_cast<T>(msbCleared.value() << 1)};
    };
    exponent = S{static_cast<T>(exponent.value() << (NB - ActualBits))};
    return associativeOperatorIterated_regressive(
        x,
        S{meta::BitmaskMaker<T, 1, NB>().value}, // the neutral element is a
                                                 // 1 in every lane
        exponent, S{S::MostSignificantBit}, operation, ActualBits, halver
    );
}

using S = SWAR<4, u16>;

/** Transforms each lane's value into a binary tally: a lane holding n becomes
 * a lane with its n least significant bits set.
 * E.g. 0b0011 (3) -> 0b0111
 * Note that a tally of the full lane width overflows the lane (2^NBits wraps
 * to 0), which only works out for the most significant lane (see the third
 * static_assert below); shift counts should otherwise stay below NBits.
 */
template <typename S>
constexpr auto base2TallyTransform_Plural(S input) {
    constexpr auto two =
        S{meta::BitmaskMaker<typename S::type, 2, S::NBits>::value};
    constexpr auto one = S::LeastSignificantBit;
    // 2^n - 1 has exactly the n least significant bits set
    typename S::type v =
        exponentiation_OverflowUnsafe_SpecificBitCount<S::NBits>(two, input)
            .value() - one;
    return S{v};
}

static_assert(base2TallyTransform_Plural(S{0b0001'0010'0011'0011}).value()
    == 0b0001'0011'0111'0111);
static_assert(base2TallyTransform_Plural(S{0b0000'0001'0010'0011}).value()
    == 0b0000'0001'0011'0111);
static_assert(base2TallyTransform_Plural(S{0b0100'0001'0010'0011}).value()
    == 0b1111'0001'0011'0111);
static_assert(base2TallyTransform_Plural(S{0b0000'0000'0000'0001}).value()
    == 0b0000'0000'0000'0001);
static_assert(
    base2TallyTransform_Plural(SWAR<8, uint16_t>{0b00000111'00000101}).value()
    == 0b01111111'00011111); // tallies of 7 and 5

/// \brief Shifts each lane of \p input right by the amount held in the
/// corresponding lane of \p shifts
template <typename S>
constexpr auto rightShift_Plural(S input, S shifts) {
    using T = typename S::type;
    // clear, in each lane, the bits that would be shifted out, so that they
    // cannot spill into the next lane down
    auto minimumMask = ~base2TallyTransform_Plural(shifts);
    auto inputMasked = input.value() & minimumMask.value();
    T result = 0;
    for (int i = 0; i < S::Lanes; i++) {
        auto laneMask = S::laneMask(i);
        auto currentShiftAmount = shifts.at(i);
        auto masked = inputMasked & laneMask;
        auto shifted = masked >> currentShiftAmount;
        result |= shifted;
    }
    return S{result};
}

static_assert(1 >> 0 == 1); // sanity check: shifting by zero is the identity
static_assert(rightShift_Plural(
    S{0b0111'0111'0111'0111}, S{0b0010'0010'0010'0010}
).value() == 0b0001'0001'0001'0001);
static_assert(rightShift_Plural(
    S{0b0000'0000'1111'0001}, S{0b0000'0000'0000'0001}
).value() == 0b0000'0000'1111'0000);
static_assert(rightShift_Plural(
    S{0b0000'1000'1000'1000}, S{0b0100'0011'0010'0001}
).value() == 0b0000'0001'0010'0100);
static_assert(rightShift_Plural(
    S{0b1111'1111'1111'1111}, S{0b0001'0001'0001'0001}
).value() == 0b0111'0111'0111'0111);
static_assert(rightShift_Plural(
    S{0b0000'0000'1111'0001}, S{0b0000'0000'0000'0000}
).value() == 0b0000'0000'1111'0001);
static_assert(rightShift_Plural(
    S{0b0000'0000'1111'0001}, S{0b0000'0000'0001'0001}
).value() == 0b0000'0000'0111'0000);

static_assert(S::LeastSignificantLaneMask == 0b0000'0000'0000'1111);
static_assert(S::laneMask(0) == 0b0000'0000'0000'1111);
static_assert(S::laneMask(1) == 0b0000'0000'1111'0000);
static_assert(S::laneMask(2) == 0b0000'1111'0000'0000);
static_assert(S::laneMask(3) == 0b1111'0000'0000'0000);
static_assert(S{S::laneMask(3)}.at(3) == 0b0000'0000'0000'1111);

template<int NB, typename T>
constexpr auto multiplication_OverflowUnsafe(
    SWAR<NB, T> multiplicand, SWAR<NB, T> multiplier
) {
    return multiplication_OverflowUnsafe_SpecificBitCount<NB>(
        multiplicand, multiplier
    );
}

template<int NB, typename T>
struct SWAR_Pair {
    SWAR<NB, T> even, odd;
};
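// Illustrative check (not part of the original library): lane-wise
// exponentiation, assuming the SWAR<4, uint16_t> instantiation; each lane
// computes base^exponent independently by square-and-multiply.
namespace example_exponentiation {
    using S4 = SWAR<4, uint16_t>;
    // base 2 in every lane, exponents 3, 2, 1, 0 -> 8, 4, 2, 1
    static_assert(
        exponentiation_OverflowUnsafe_SpecificBitCount<4>(
            S4{0x2222}, S4{0x3210}).value() == 0x8421);
}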
template<int NB, typename T>
constexpr SWAR<NB, T> doublingMask() {
    using S = SWAR<NB, T>;
    static_assert(0 == S::Lanes % 2, "Only even number of elements supported");
    using D = SWAR<NB * 2, T>;
    return S{(D::LeastSignificantBit << NB) - D::LeastSignificantBit};
}

template<int NB, typename T>
constexpr auto doublePrecision(SWAR<NB, T> input) {
    using S = SWAR<NB, T>;
    static_assert(
        0 == S::NSlots % 2,
        "Precision can only be doubled for SWARs of even element count"
    );
    using RV = SWAR<NB * 2, T>;
    constexpr auto DM = doublingMask<NB, T>();
    return SWAR_Pair<NB * 2, T>{
        RV{(input & DM).value()},
        RV{(input.value() >> NB) & DM.value()}
    };
}

template<int NB, typename T>
constexpr auto halvePrecision(SWAR<NB, T> even, SWAR<NB, T> odd) {
    using S = SWAR<NB, T>;
    static_assert(0 == NB % 2, "Only even lane-bitcounts supported");
    using RV = SWAR<NB/2, T>;
    constexpr auto HalvingMask = doublingMask<NB/2, T>();
    auto
        evenHalf = RV{even.value()} & HalvingMask,
        oddHalf = RV{(RV{odd.value()} & HalvingMask).value() << NB/2};
    return evenHalf | oddHalf;
}

}

#endif

using S = zoo::swar::SWAR<8, uint64_t>;

// rightShift_Plural is found by argument-dependent lookup in zoo::swar
auto foo(S a, S b) { return rightShift_Plural(a, b); }
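// Illustrative round trip (not part of the original library): doublePrecision
// splits a 4-bit SWAR into two 8-bit SWARs holding the even- and odd-indexed
// lanes, and halvePrecision reassembles them; names and values are only for
// this sketch.
namespace example_precision {
    using S4 = zoo::swar::SWAR<4, uint16_t>;
    constexpr auto split = zoo::swar::doublePrecision(S4{0x4321});
    // even-indexed lanes (values 1 and 3) widen into the 8-bit lanes
    static_assert(split.even.value() == 0x0301);
    // odd-indexed lanes (values 2 and 4) widen likewise
    static_assert(split.odd.value() == 0x0402);
    static_assert(
        zoo::swar::halvePrecision(split.even, split.odd).value() == 0x4321);
}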