Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Algol68
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C with Coccinelle
C++ with Coccinelle
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Mojo
Nim
Numba
Nix
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
PTX
Python
Racket
Raku
Ruby
Rust
Sail
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
Triton
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Zig
Javascript
GIMPLE
Ygen
sway
c++ source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
6502-c++ 11.1.0
ARM GCC 10.2.0
ARM GCC 10.3.0
ARM GCC 10.4.0
ARM GCC 10.5.0
ARM GCC 11.1.0
ARM GCC 11.2.0
ARM GCC 11.3.0
ARM GCC 11.4.0
ARM GCC 12.1.0
ARM GCC 12.2.0
ARM GCC 12.3.0
ARM GCC 12.4.0
ARM GCC 12.5.0
ARM GCC 13.1.0
ARM GCC 13.2.0
ARM GCC 13.2.0 (unknown-eabi)
ARM GCC 13.3.0
ARM GCC 13.3.0 (unknown-eabi)
ARM GCC 13.4.0
ARM GCC 13.4.0 (unknown-eabi)
ARM GCC 14.1.0
ARM GCC 14.1.0 (unknown-eabi)
ARM GCC 14.2.0
ARM GCC 14.2.0 (unknown-eabi)
ARM GCC 14.3.0
ARM GCC 14.3.0 (unknown-eabi)
ARM GCC 15.1.0
ARM GCC 15.1.0 (unknown-eabi)
ARM GCC 15.2.0
ARM GCC 15.2.0 (unknown-eabi)
ARM GCC 4.5.4
ARM GCC 4.6.4
ARM GCC 5.4
ARM GCC 6.3.0
ARM GCC 6.4.0
ARM GCC 7.3.0
ARM GCC 7.5.0
ARM GCC 8.2.0
ARM GCC 8.5.0
ARM GCC 9.3.0
ARM GCC 9.4.0
ARM GCC 9.5.0
ARM GCC trunk
ARM gcc 10.2.1 (none)
ARM gcc 10.3.1 (2021.07 none)
ARM gcc 10.3.1 (2021.10 none)
ARM gcc 11.2.1 (none)
ARM gcc 5.4.1 (none)
ARM gcc 7.2.1 (none)
ARM gcc 8.2 (WinCE)
ARM gcc 8.3.1 (none)
ARM gcc 9.2.1 (none)
ARM msvc v19.0 (ex-WINE)
ARM msvc v19.10 (ex-WINE)
ARM msvc v19.14 (ex-WINE)
ARM64 Morello gcc 10.1 Alpha 2
ARM64 gcc 10.2
ARM64 gcc 10.3
ARM64 gcc 10.4
ARM64 gcc 10.5.0
ARM64 gcc 11.1
ARM64 gcc 11.2
ARM64 gcc 11.3
ARM64 gcc 11.4.0
ARM64 gcc 12.1
ARM64 gcc 12.2.0
ARM64 gcc 12.3.0
ARM64 gcc 12.4.0
ARM64 gcc 12.5.0
ARM64 gcc 13.1.0
ARM64 gcc 13.2.0
ARM64 gcc 13.3.0
ARM64 gcc 13.4.0
ARM64 gcc 14.1.0
ARM64 gcc 14.2.0
ARM64 gcc 14.3.0
ARM64 gcc 15.1.0
ARM64 gcc 15.2.0
ARM64 gcc 4.9.4
ARM64 gcc 5.4
ARM64 gcc 5.5.0
ARM64 gcc 6.3
ARM64 gcc 6.4
ARM64 gcc 7.3
ARM64 gcc 7.5
ARM64 gcc 8.2
ARM64 gcc 8.5
ARM64 gcc 9.3
ARM64 gcc 9.4
ARM64 gcc 9.5
ARM64 gcc trunk
ARM64 msvc v19.14 (ex-WINE)
AVR gcc 10.3.0
AVR gcc 11.1.0
AVR gcc 12.1.0
AVR gcc 12.2.0
AVR gcc 12.3.0
AVR gcc 12.4.0
AVR gcc 12.5.0
AVR gcc 13.1.0
AVR gcc 13.2.0
AVR gcc 13.3.0
AVR gcc 13.4.0
AVR gcc 14.1.0
AVR gcc 14.2.0
AVR gcc 14.3.0
AVR gcc 15.1.0
AVR gcc 15.2.0
AVR gcc 4.5.4
AVR gcc 4.6.4
AVR gcc 5.4.0
AVR gcc 9.2.0
AVR gcc 9.3.0
Arduino Mega (1.8.9)
Arduino Uno (1.8.9)
BPF clang (trunk)
BPF clang 13.0.0
BPF clang 14.0.0
BPF clang 15.0.0
BPF clang 16.0.0
BPF clang 17.0.1
BPF clang 18.1.0
BPF clang 19.1.0
BPF clang 20.1.0
BPF clang 21.1.0
EDG (experimental reflection)
EDG 6.5
EDG 6.5 (GNU mode gcc 13)
EDG 6.6
EDG 6.6 (GNU mode gcc 13)
EDG 6.7
EDG 6.7 (GNU mode gcc 14)
FRC 2019
FRC 2020
FRC 2023
HPPA gcc 14.2.0
HPPA gcc 14.3.0
HPPA gcc 15.1.0
HPPA gcc 15.2.0
KVX ACB 4.1.0 (GCC 7.5.0)
KVX ACB 4.1.0-cd1 (GCC 7.5.0)
KVX ACB 4.10.0 (GCC 10.3.1)
KVX ACB 4.11.1 (GCC 10.3.1)
KVX ACB 4.12.0 (GCC 11.3.0)
KVX ACB 4.2.0 (GCC 7.5.0)
KVX ACB 4.3.0 (GCC 7.5.0)
KVX ACB 4.4.0 (GCC 7.5.0)
KVX ACB 4.6.0 (GCC 9.4.1)
KVX ACB 4.8.0 (GCC 9.4.1)
KVX ACB 4.9.0 (GCC 9.4.1)
KVX ACB 5.0.0 (GCC 12.2.1)
KVX ACB 5.2.0 (GCC 13.2.1)
LoongArch64 clang (trunk)
LoongArch64 clang 17.0.1
LoongArch64 clang 18.1.0
LoongArch64 clang 19.1.0
LoongArch64 clang 20.1.0
LoongArch64 clang 21.1.0
M68K gcc 13.1.0
M68K gcc 13.2.0
M68K gcc 13.3.0
M68K gcc 13.4.0
M68K gcc 14.1.0
M68K gcc 14.2.0
M68K gcc 14.3.0
M68K gcc 15.1.0
M68K gcc 15.2.0
M68k clang (trunk)
MRISC32 gcc (trunk)
MSP430 gcc 4.5.3
MSP430 gcc 5.3.0
MSP430 gcc 6.2.1
MinGW clang 14.0.3
MinGW clang 14.0.6
MinGW clang 15.0.7
MinGW clang 16.0.0
MinGW clang 16.0.2
MinGW gcc 11.3.0
MinGW gcc 12.1.0
MinGW gcc 12.2.0
MinGW gcc 13.1.0
RISC-V (32-bits) gcc (trunk)
RISC-V (32-bits) gcc 10.2.0
RISC-V (32-bits) gcc 10.3.0
RISC-V (32-bits) gcc 11.2.0
RISC-V (32-bits) gcc 11.3.0
RISC-V (32-bits) gcc 11.4.0
RISC-V (32-bits) gcc 12.1.0
RISC-V (32-bits) gcc 12.2.0
RISC-V (32-bits) gcc 12.3.0
RISC-V (32-bits) gcc 12.4.0
RISC-V (32-bits) gcc 12.5.0
RISC-V (32-bits) gcc 13.1.0
RISC-V (32-bits) gcc 13.2.0
RISC-V (32-bits) gcc 13.3.0
RISC-V (32-bits) gcc 13.4.0
RISC-V (32-bits) gcc 14.1.0
RISC-V (32-bits) gcc 14.2.0
RISC-V (32-bits) gcc 14.3.0
RISC-V (32-bits) gcc 15.1.0
RISC-V (32-bits) gcc 15.2.0
RISC-V (32-bits) gcc 8.2.0
RISC-V (32-bits) gcc 8.5.0
RISC-V (32-bits) gcc 9.4.0
RISC-V (64-bits) gcc (trunk)
RISC-V (64-bits) gcc 10.2.0
RISC-V (64-bits) gcc 10.3.0
RISC-V (64-bits) gcc 11.2.0
RISC-V (64-bits) gcc 11.3.0
RISC-V (64-bits) gcc 11.4.0
RISC-V (64-bits) gcc 12.1.0
RISC-V (64-bits) gcc 12.2.0
RISC-V (64-bits) gcc 12.3.0
RISC-V (64-bits) gcc 12.4.0
RISC-V (64-bits) gcc 12.5.0
RISC-V (64-bits) gcc 13.1.0
RISC-V (64-bits) gcc 13.2.0
RISC-V (64-bits) gcc 13.3.0
RISC-V (64-bits) gcc 13.4.0
RISC-V (64-bits) gcc 14.1.0
RISC-V (64-bits) gcc 14.2.0
RISC-V (64-bits) gcc 14.3.0
RISC-V (64-bits) gcc 15.1.0
RISC-V (64-bits) gcc 15.2.0
RISC-V (64-bits) gcc 8.2.0
RISC-V (64-bits) gcc 8.5.0
RISC-V (64-bits) gcc 9.4.0
RISC-V rv32gc clang (trunk)
RISC-V rv32gc clang 10.0.0
RISC-V rv32gc clang 10.0.1
RISC-V rv32gc clang 11.0.0
RISC-V rv32gc clang 11.0.1
RISC-V rv32gc clang 12.0.0
RISC-V rv32gc clang 12.0.1
RISC-V rv32gc clang 13.0.0
RISC-V rv32gc clang 13.0.1
RISC-V rv32gc clang 14.0.0
RISC-V rv32gc clang 15.0.0
RISC-V rv32gc clang 16.0.0
RISC-V rv32gc clang 17.0.1
RISC-V rv32gc clang 18.1.0
RISC-V rv32gc clang 19.1.0
RISC-V rv32gc clang 20.1.0
RISC-V rv32gc clang 21.1.0
RISC-V rv32gc clang 9.0.0
RISC-V rv32gc clang 9.0.1
RISC-V rv64gc clang (trunk)
RISC-V rv64gc clang 10.0.0
RISC-V rv64gc clang 10.0.1
RISC-V rv64gc clang 11.0.0
RISC-V rv64gc clang 11.0.1
RISC-V rv64gc clang 12.0.0
RISC-V rv64gc clang 12.0.1
RISC-V rv64gc clang 13.0.0
RISC-V rv64gc clang 13.0.1
RISC-V rv64gc clang 14.0.0
RISC-V rv64gc clang 15.0.0
RISC-V rv64gc clang 16.0.0
RISC-V rv64gc clang 17.0.1
RISC-V rv64gc clang 18.1.0
RISC-V rv64gc clang 19.1.0
RISC-V rv64gc clang 20.1.0
RISC-V rv64gc clang 21.1.0
RISC-V rv64gc clang 9.0.0
RISC-V rv64gc clang 9.0.1
Raspbian Buster
Raspbian Stretch
SPARC LEON gcc 12.2.0
SPARC LEON gcc 12.3.0
SPARC LEON gcc 12.4.0
SPARC LEON gcc 12.5.0
SPARC LEON gcc 13.1.0
SPARC LEON gcc 13.2.0
SPARC LEON gcc 13.3.0
SPARC LEON gcc 13.4.0
SPARC LEON gcc 14.1.0
SPARC LEON gcc 14.2.0
SPARC LEON gcc 14.3.0
SPARC LEON gcc 15.1.0
SPARC LEON gcc 15.2.0
SPARC gcc 12.2.0
SPARC gcc 12.3.0
SPARC gcc 12.4.0
SPARC gcc 12.5.0
SPARC gcc 13.1.0
SPARC gcc 13.2.0
SPARC gcc 13.3.0
SPARC gcc 13.4.0
SPARC gcc 14.1.0
SPARC gcc 14.2.0
SPARC gcc 14.3.0
SPARC gcc 15.1.0
SPARC gcc 15.2.0
SPARC64 gcc 12.2.0
SPARC64 gcc 12.3.0
SPARC64 gcc 12.4.0
SPARC64 gcc 12.5.0
SPARC64 gcc 13.1.0
SPARC64 gcc 13.2.0
SPARC64 gcc 13.3.0
SPARC64 gcc 13.4.0
SPARC64 gcc 14.1.0
SPARC64 gcc 14.2.0
SPARC64 gcc 14.3.0
SPARC64 gcc 15.1.0
SPARC64 gcc 15.2.0
TI C6x gcc 12.2.0
TI C6x gcc 12.3.0
TI C6x gcc 12.4.0
TI C6x gcc 12.5.0
TI C6x gcc 13.1.0
TI C6x gcc 13.2.0
TI C6x gcc 13.3.0
TI C6x gcc 13.4.0
TI C6x gcc 14.1.0
TI C6x gcc 14.2.0
TI C6x gcc 14.3.0
TI C6x gcc 15.1.0
TI C6x gcc 15.2.0
TI CL430 21.6.1
Tricore gcc 11.3.0 (EEESlab)
VAX gcc NetBSDELF 10.4.0
VAX gcc NetBSDELF 10.5.0 (Nov 15 03:50:22 2023)
VAX gcc NetBSDELF 12.4.0 (Apr 16 05:27 2025)
WebAssembly clang (trunk)
Xtensa ESP32 gcc 11.2.0 (2022r1)
Xtensa ESP32 gcc 12.2.0 (20230208)
Xtensa ESP32 gcc 14.2.0 (20241119)
Xtensa ESP32 gcc 8.2.0 (2019r2)
Xtensa ESP32 gcc 8.2.0 (2020r1)
Xtensa ESP32 gcc 8.2.0 (2020r2)
Xtensa ESP32 gcc 8.4.0 (2020r3)
Xtensa ESP32 gcc 8.4.0 (2021r1)
Xtensa ESP32 gcc 8.4.0 (2021r2)
Xtensa ESP32-S2 gcc 11.2.0 (2022r1)
Xtensa ESP32-S2 gcc 12.2.0 (20230208)
Xtensa ESP32-S2 gcc 14.2.0 (20241119)
Xtensa ESP32-S2 gcc 8.2.0 (2019r2)
Xtensa ESP32-S2 gcc 8.2.0 (2020r1)
Xtensa ESP32-S2 gcc 8.2.0 (2020r2)
Xtensa ESP32-S2 gcc 8.4.0 (2020r3)
Xtensa ESP32-S2 gcc 8.4.0 (2021r1)
Xtensa ESP32-S2 gcc 8.4.0 (2021r2)
Xtensa ESP32-S3 gcc 11.2.0 (2022r1)
Xtensa ESP32-S3 gcc 12.2.0 (20230208)
Xtensa ESP32-S3 gcc 14.2.0 (20241119)
Xtensa ESP32-S3 gcc 8.4.0 (2020r3)
Xtensa ESP32-S3 gcc 8.4.0 (2021r1)
Xtensa ESP32-S3 gcc 8.4.0 (2021r2)
arm64 msvc v19.20 VS16.0
arm64 msvc v19.21 VS16.1
arm64 msvc v19.22 VS16.2
arm64 msvc v19.23 VS16.3
arm64 msvc v19.24 VS16.4
arm64 msvc v19.25 VS16.5
arm64 msvc v19.27 VS16.7
arm64 msvc v19.28 VS16.8
arm64 msvc v19.28 VS16.9
arm64 msvc v19.29 VS16.10
arm64 msvc v19.29 VS16.11
arm64 msvc v19.30 VS17.0
arm64 msvc v19.31 VS17.1
arm64 msvc v19.32 VS17.2
arm64 msvc v19.33 VS17.3
arm64 msvc v19.34 VS17.4
arm64 msvc v19.35 VS17.5
arm64 msvc v19.36 VS17.6
arm64 msvc v19.37 VS17.7
arm64 msvc v19.38 VS17.8
arm64 msvc v19.39 VS17.9
arm64 msvc v19.40 VS17.10
arm64 msvc v19.41 VS17.11
arm64 msvc v19.42 VS17.12
arm64 msvc v19.43 VS17.13
arm64 msvc v19.latest
armv7-a clang (trunk)
armv7-a clang 10.0.0
armv7-a clang 10.0.1
armv7-a clang 11.0.0
armv7-a clang 11.0.1
armv7-a clang 12.0.0
armv7-a clang 12.0.1
armv7-a clang 13.0.0
armv7-a clang 13.0.1
armv7-a clang 14.0.0
armv7-a clang 15.0.0
armv7-a clang 16.0.0
armv7-a clang 17.0.1
armv7-a clang 18.1.0
armv7-a clang 19.1.0
armv7-a clang 20.1.0
armv7-a clang 21.1.0
armv7-a clang 9.0.0
armv7-a clang 9.0.1
armv8-a clang (all architectural features, trunk)
armv8-a clang (trunk)
armv8-a clang 10.0.0
armv8-a clang 10.0.1
armv8-a clang 11.0.0
armv8-a clang 11.0.1
armv8-a clang 12.0.0
armv8-a clang 13.0.0
armv8-a clang 14.0.0
armv8-a clang 15.0.0
armv8-a clang 16.0.0
armv8-a clang 17.0.1
armv8-a clang 18.1.0
armv8-a clang 19.1.0
armv8-a clang 20.1.0
armv8-a clang 21.1.0
armv8-a clang 9.0.0
armv8-a clang 9.0.1
clad trunk (clang 21.1.0)
clad v1.10 (clang 20.1.0)
clad v1.8 (clang 18.1.0)
clad v1.9 (clang 19.1.0)
clad v2.00 (clang 20.1.0)
clang-cl 18.1.0
ellcc 0.1.33
ellcc 0.1.34
ellcc 2017-07-16
ez80-clang 15.0.0
ez80-clang 15.0.7
hexagon-clang 16.0.5
llvm-mos atari2600-3e
llvm-mos atari2600-4k
llvm-mos atari2600-common
llvm-mos atari5200-supercart
llvm-mos atari8-cart-megacart
llvm-mos atari8-cart-std
llvm-mos atari8-cart-xegs
llvm-mos atari8-common
llvm-mos atari8-dos
llvm-mos c128
llvm-mos c64
llvm-mos commodore
llvm-mos cpm65
llvm-mos cx16
llvm-mos dodo
llvm-mos eater
llvm-mos mega65
llvm-mos nes
llvm-mos nes-action53
llvm-mos nes-cnrom
llvm-mos nes-gtrom
llvm-mos nes-mmc1
llvm-mos nes-mmc3
llvm-mos nes-nrom
llvm-mos nes-unrom
llvm-mos nes-unrom-512
llvm-mos osi-c1p
llvm-mos pce
llvm-mos pce-cd
llvm-mos pce-common
llvm-mos pet
llvm-mos rp6502
llvm-mos rpc8e
llvm-mos supervision
llvm-mos vic20
loongarch64 gcc 12.2.0
loongarch64 gcc 12.3.0
loongarch64 gcc 12.4.0
loongarch64 gcc 12.5.0
loongarch64 gcc 13.1.0
loongarch64 gcc 13.2.0
loongarch64 gcc 13.3.0
loongarch64 gcc 13.4.0
loongarch64 gcc 14.1.0
loongarch64 gcc 14.2.0
loongarch64 gcc 14.3.0
loongarch64 gcc 15.1.0
loongarch64 gcc 15.2.0
mips clang 13.0.0
mips clang 14.0.0
mips clang 15.0.0
mips clang 16.0.0
mips clang 17.0.1
mips clang 18.1.0
mips clang 19.1.0
mips clang 20.1.0
mips clang 21.1.0
mips gcc 11.2.0
mips gcc 12.1.0
mips gcc 12.2.0
mips gcc 12.3.0
mips gcc 12.4.0
mips gcc 12.5.0
mips gcc 13.1.0
mips gcc 13.2.0
mips gcc 13.3.0
mips gcc 13.4.0
mips gcc 14.1.0
mips gcc 14.2.0
mips gcc 14.3.0
mips gcc 15.1.0
mips gcc 15.2.0
mips gcc 4.9.4
mips gcc 5.4
mips gcc 5.5.0
mips gcc 9.3.0 (codescape)
mips gcc 9.5.0
mips64 (el) gcc 12.1.0
mips64 (el) gcc 12.2.0
mips64 (el) gcc 12.3.0
mips64 (el) gcc 12.4.0
mips64 (el) gcc 12.5.0
mips64 (el) gcc 13.1.0
mips64 (el) gcc 13.2.0
mips64 (el) gcc 13.3.0
mips64 (el) gcc 13.4.0
mips64 (el) gcc 14.1.0
mips64 (el) gcc 14.2.0
mips64 (el) gcc 14.3.0
mips64 (el) gcc 15.1.0
mips64 (el) gcc 15.2.0
mips64 (el) gcc 4.9.4
mips64 (el) gcc 5.4.0
mips64 (el) gcc 5.5.0
mips64 (el) gcc 9.5.0
mips64 clang 13.0.0
mips64 clang 14.0.0
mips64 clang 15.0.0
mips64 clang 16.0.0
mips64 clang 17.0.1
mips64 clang 18.1.0
mips64 clang 19.1.0
mips64 clang 20.1.0
mips64 clang 21.1.0
mips64 gcc 11.2.0
mips64 gcc 12.1.0
mips64 gcc 12.2.0
mips64 gcc 12.3.0
mips64 gcc 12.4.0
mips64 gcc 12.5.0
mips64 gcc 13.1.0
mips64 gcc 13.2.0
mips64 gcc 13.3.0
mips64 gcc 13.4.0
mips64 gcc 14.1.0
mips64 gcc 14.2.0
mips64 gcc 14.3.0
mips64 gcc 15.1.0
mips64 gcc 15.2.0
mips64 gcc 4.9.4
mips64 gcc 5.4.0
mips64 gcc 5.5.0
mips64 gcc 9.5.0
mips64el clang 13.0.0
mips64el clang 14.0.0
mips64el clang 15.0.0
mips64el clang 16.0.0
mips64el clang 17.0.1
mips64el clang 18.1.0
mips64el clang 19.1.0
mips64el clang 20.1.0
mips64el clang 21.1.0
mipsel clang 13.0.0
mipsel clang 14.0.0
mipsel clang 15.0.0
mipsel clang 16.0.0
mipsel clang 17.0.1
mipsel clang 18.1.0
mipsel clang 19.1.0
mipsel clang 20.1.0
mipsel clang 21.1.0
mipsel gcc 12.1.0
mipsel gcc 12.2.0
mipsel gcc 12.3.0
mipsel gcc 12.4.0
mipsel gcc 12.5.0
mipsel gcc 13.1.0
mipsel gcc 13.2.0
mipsel gcc 13.3.0
mipsel gcc 13.4.0
mipsel gcc 14.1.0
mipsel gcc 14.2.0
mipsel gcc 14.3.0
mipsel gcc 15.1.0
mipsel gcc 15.2.0
mipsel gcc 4.9.4
mipsel gcc 5.4.0
mipsel gcc 5.5.0
mipsel gcc 9.5.0
nanoMIPS gcc 6.3.0 (mtk)
power gcc 11.2.0
power gcc 12.1.0
power gcc 12.2.0
power gcc 12.3.0
power gcc 12.4.0
power gcc 12.5.0
power gcc 13.1.0
power gcc 13.2.0
power gcc 13.3.0
power gcc 13.4.0
power gcc 14.1.0
power gcc 14.2.0
power gcc 14.3.0
power gcc 15.1.0
power gcc 15.2.0
power gcc 4.8.5
power64 AT12.0 (gcc8)
power64 AT13.0 (gcc9)
power64 gcc 11.2.0
power64 gcc 12.1.0
power64 gcc 12.2.0
power64 gcc 12.3.0
power64 gcc 12.4.0
power64 gcc 12.5.0
power64 gcc 13.1.0
power64 gcc 13.2.0
power64 gcc 13.3.0
power64 gcc 13.4.0
power64 gcc 14.1.0
power64 gcc 14.2.0
power64 gcc 14.3.0
power64 gcc 15.1.0
power64 gcc 15.2.0
power64 gcc trunk
power64le AT12.0 (gcc8)
power64le AT13.0 (gcc9)
power64le clang (trunk)
power64le gcc 11.2.0
power64le gcc 12.1.0
power64le gcc 12.2.0
power64le gcc 12.3.0
power64le gcc 12.4.0
power64le gcc 12.5.0
power64le gcc 13.1.0
power64le gcc 13.2.0
power64le gcc 13.3.0
power64le gcc 13.4.0
power64le gcc 14.1.0
power64le gcc 14.2.0
power64le gcc 14.3.0
power64le gcc 15.1.0
power64le gcc 15.2.0
power64le gcc 6.3.0
power64le gcc trunk
powerpc64 clang (trunk)
qnx 8.0.0
s390x gcc 11.2.0
s390x gcc 12.1.0
s390x gcc 12.2.0
s390x gcc 12.3.0
s390x gcc 12.4.0
s390x gcc 12.5.0
s390x gcc 13.1.0
s390x gcc 13.2.0
s390x gcc 13.3.0
s390x gcc 13.4.0
s390x gcc 14.1.0
s390x gcc 14.2.0
s390x gcc 14.3.0
s390x gcc 15.1.0
s390x gcc 15.2.0
sh gcc 12.2.0
sh gcc 12.3.0
sh gcc 12.4.0
sh gcc 12.5.0
sh gcc 13.1.0
sh gcc 13.2.0
sh gcc 13.3.0
sh gcc 13.4.0
sh gcc 14.1.0
sh gcc 14.2.0
sh gcc 14.3.0
sh gcc 15.1.0
sh gcc 15.2.0
sh gcc 4.9.4
sh gcc 9.5.0
vast (trunk)
x64 msvc v19.0 (ex-WINE)
x64 msvc v19.10 (ex-WINE)
x64 msvc v19.14 (ex-WINE)
x64 msvc v19.20 VS16.0
x64 msvc v19.21 VS16.1
x64 msvc v19.22 VS16.2
x64 msvc v19.23 VS16.3
x64 msvc v19.24 VS16.4
x64 msvc v19.25 VS16.5
x64 msvc v19.27 VS16.7
x64 msvc v19.28 VS16.8
x64 msvc v19.28 VS16.9
x64 msvc v19.29 VS16.10
x64 msvc v19.29 VS16.11
x64 msvc v19.30 VS17.0
x64 msvc v19.31 VS17.1
x64 msvc v19.32 VS17.2
x64 msvc v19.33 VS17.3
x64 msvc v19.34 VS17.4
x64 msvc v19.35 VS17.5
x64 msvc v19.36 VS17.6
x64 msvc v19.37 VS17.7
x64 msvc v19.38 VS17.8
x64 msvc v19.39 VS17.9
x64 msvc v19.40 VS17.10
x64 msvc v19.41 VS17.11
x64 msvc v19.42 VS17.12
x64 msvc v19.43 VS17.13
x64 msvc v19.latest
x86 djgpp 4.9.4
x86 djgpp 5.5.0
x86 djgpp 6.4.0
x86 djgpp 7.2.0
x86 msvc v19.0 (ex-WINE)
x86 msvc v19.10 (ex-WINE)
x86 msvc v19.14 (ex-WINE)
x86 msvc v19.20 VS16.0
x86 msvc v19.21 VS16.1
x86 msvc v19.22 VS16.2
x86 msvc v19.23 VS16.3
x86 msvc v19.24 VS16.4
x86 msvc v19.25 VS16.5
x86 msvc v19.27 VS16.7
x86 msvc v19.28 VS16.8
x86 msvc v19.28 VS16.9
x86 msvc v19.29 VS16.10
x86 msvc v19.29 VS16.11
x86 msvc v19.30 VS17.0
x86 msvc v19.31 VS17.1
x86 msvc v19.32 VS17.2
x86 msvc v19.33 VS17.3
x86 msvc v19.34 VS17.4
x86 msvc v19.35 VS17.5
x86 msvc v19.36 VS17.6
x86 msvc v19.37 VS17.7
x86 msvc v19.38 VS17.8
x86 msvc v19.39 VS17.9
x86 msvc v19.40 VS17.10
x86 msvc v19.41 VS17.11
x86 msvc v19.42 VS17.12
x86 msvc v19.43 VS17.13
x86 msvc v19.latest
x86 nvc++ 22.11
x86 nvc++ 22.7
x86 nvc++ 22.9
x86 nvc++ 23.1
x86 nvc++ 23.11
x86 nvc++ 23.3
x86 nvc++ 23.5
x86 nvc++ 23.7
x86 nvc++ 23.9
x86 nvc++ 24.1
x86 nvc++ 24.11
x86 nvc++ 24.3
x86 nvc++ 24.5
x86 nvc++ 24.7
x86 nvc++ 24.9
x86 nvc++ 25.1
x86 nvc++ 25.3
x86 nvc++ 25.5
x86 nvc++ 25.7
x86-64 Zapcc 190308
x86-64 clang (-fimplicit-constexpr)
x86-64 clang (Chris Bazley N3089)
x86-64 clang (EricWF contracts)
x86-64 clang (amd-staging)
x86-64 clang (assertions trunk)
x86-64 clang (clangir)
x86-64 clang (experimental -Wlifetime)
x86-64 clang (experimental P1061)
x86-64 clang (experimental P1144)
x86-64 clang (experimental P1221)
x86-64 clang (experimental P2998)
x86-64 clang (experimental P3068)
x86-64 clang (experimental P3309)
x86-64 clang (experimental P3367)
x86-64 clang (experimental P3372)
x86-64 clang (experimental P3385)
x86-64 clang (experimental P3776)
x86-64 clang (experimental metaprogramming - P2632)
x86-64 clang (old concepts branch)
x86-64 clang (p1974)
x86-64 clang (pattern matching - P2688)
x86-64 clang (reflection - C++26)
x86-64 clang (reflection - TS)
x86-64 clang (resugar)
x86-64 clang (string interpolation - P3412)
x86-64 clang (thephd.dev)
x86-64 clang (trunk)
x86-64 clang (variadic friends - P2893)
x86-64 clang (widberg)
x86-64 clang 10.0.0
x86-64 clang 10.0.0 (assertions)
x86-64 clang 10.0.1
x86-64 clang 11.0.0
x86-64 clang 11.0.0 (assertions)
x86-64 clang 11.0.1
x86-64 clang 12.0.0
x86-64 clang 12.0.0 (assertions)
x86-64 clang 12.0.1
x86-64 clang 13.0.0
x86-64 clang 13.0.0 (assertions)
x86-64 clang 13.0.1
x86-64 clang 14.0.0
x86-64 clang 14.0.0 (assertions)
x86-64 clang 15.0.0
x86-64 clang 15.0.0 (assertions)
x86-64 clang 16.0.0
x86-64 clang 16.0.0 (assertions)
x86-64 clang 17.0.1
x86-64 clang 17.0.1 (assertions)
x86-64 clang 18.1.0
x86-64 clang 18.1.0 (assertions)
x86-64 clang 19.1.0
x86-64 clang 19.1.0 (assertions)
x86-64 clang 2.6.0 (assertions)
x86-64 clang 2.7.0 (assertions)
x86-64 clang 2.8.0 (assertions)
x86-64 clang 2.9.0 (assertions)
x86-64 clang 20.1.0
x86-64 clang 20.1.0 (assertions)
x86-64 clang 21.1.0
x86-64 clang 21.1.0 (assertions)
x86-64 clang 3.0.0
x86-64 clang 3.0.0 (assertions)
x86-64 clang 3.1
x86-64 clang 3.1 (assertions)
x86-64 clang 3.2
x86-64 clang 3.2 (assertions)
x86-64 clang 3.3
x86-64 clang 3.3 (assertions)
x86-64 clang 3.4 (assertions)
x86-64 clang 3.4.1
x86-64 clang 3.5
x86-64 clang 3.5 (assertions)
x86-64 clang 3.5.1
x86-64 clang 3.5.2
x86-64 clang 3.6
x86-64 clang 3.6 (assertions)
x86-64 clang 3.7
x86-64 clang 3.7 (assertions)
x86-64 clang 3.7.1
x86-64 clang 3.8
x86-64 clang 3.8 (assertions)
x86-64 clang 3.8.1
x86-64 clang 3.9.0
x86-64 clang 3.9.0 (assertions)
x86-64 clang 3.9.1
x86-64 clang 4.0.0
x86-64 clang 4.0.0 (assertions)
x86-64 clang 4.0.1
x86-64 clang 5.0.0
x86-64 clang 5.0.0 (assertions)
x86-64 clang 5.0.1
x86-64 clang 5.0.2
x86-64 clang 6.0.0
x86-64 clang 6.0.0 (assertions)
x86-64 clang 6.0.1
x86-64 clang 7.0.0
x86-64 clang 7.0.0 (assertions)
x86-64 clang 7.0.1
x86-64 clang 7.1.0
x86-64 clang 8.0.0
x86-64 clang 8.0.0 (assertions)
x86-64 clang 8.0.1
x86-64 clang 9.0.0
x86-64 clang 9.0.0 (assertions)
x86-64 clang 9.0.1
x86-64 clang rocm-4.5.2
x86-64 clang rocm-5.0.2
x86-64 clang rocm-5.1.3
x86-64 clang rocm-5.2.3
x86-64 clang rocm-5.3.3
x86-64 clang rocm-5.7.0
x86-64 clang rocm-6.0.2
x86-64 clang rocm-6.1.2
x86-64 clang rocm-6.2.4
x86-64 clang rocm-6.3.3
x86-64 clang rocm-6.4.0
x86-64 gcc (P2034 lambdas)
x86-64 gcc (contract labels)
x86-64 gcc (contracts natural syntax)
x86-64 gcc (contracts)
x86-64 gcc (coroutines)
x86-64 gcc (modules)
x86-64 gcc (trunk)
x86-64 gcc 10.1
x86-64 gcc 10.2
x86-64 gcc 10.3
x86-64 gcc 10.3 (assertions)
x86-64 gcc 10.4
x86-64 gcc 10.4 (assertions)
x86-64 gcc 10.5
x86-64 gcc 10.5 (assertions)
x86-64 gcc 11.1
x86-64 gcc 11.1 (assertions)
x86-64 gcc 11.2
x86-64 gcc 11.2 (assertions)
x86-64 gcc 11.3
x86-64 gcc 11.3 (assertions)
x86-64 gcc 11.4
x86-64 gcc 11.4 (assertions)
x86-64 gcc 12.1
x86-64 gcc 12.1 (assertions)
x86-64 gcc 12.2
x86-64 gcc 12.2 (assertions)
x86-64 gcc 12.3
x86-64 gcc 12.3 (assertions)
x86-64 gcc 12.4
x86-64 gcc 12.4 (assertions)
x86-64 gcc 12.5
x86-64 gcc 12.5 (assertions)
x86-64 gcc 13.1
x86-64 gcc 13.1 (assertions)
x86-64 gcc 13.2
x86-64 gcc 13.2 (assertions)
x86-64 gcc 13.3
x86-64 gcc 13.3 (assertions)
x86-64 gcc 13.4
x86-64 gcc 13.4 (assertions)
x86-64 gcc 14.1
x86-64 gcc 14.1 (assertions)
x86-64 gcc 14.2
x86-64 gcc 14.2 (assertions)
x86-64 gcc 14.3
x86-64 gcc 14.3 (assertions)
x86-64 gcc 15.1
x86-64 gcc 15.1 (assertions)
x86-64 gcc 15.2
x86-64 gcc 15.2 (assertions)
x86-64 gcc 3.4.6
x86-64 gcc 4.0.4
x86-64 gcc 4.1.2
x86-64 gcc 4.4.7
x86-64 gcc 4.5.3
x86-64 gcc 4.6.4
x86-64 gcc 4.7.1
x86-64 gcc 4.7.2
x86-64 gcc 4.7.3
x86-64 gcc 4.7.4
x86-64 gcc 4.8.1
x86-64 gcc 4.8.2
x86-64 gcc 4.8.3
x86-64 gcc 4.8.4
x86-64 gcc 4.8.5
x86-64 gcc 4.9.0
x86-64 gcc 4.9.1
x86-64 gcc 4.9.2
x86-64 gcc 4.9.3
x86-64 gcc 4.9.4
x86-64 gcc 5.1
x86-64 gcc 5.2
x86-64 gcc 5.3
x86-64 gcc 5.4
x86-64 gcc 5.5
x86-64 gcc 6.1
x86-64 gcc 6.2
x86-64 gcc 6.3
x86-64 gcc 6.4
x86-64 gcc 6.5
x86-64 gcc 7.1
x86-64 gcc 7.2
x86-64 gcc 7.3
x86-64 gcc 7.4
x86-64 gcc 7.5
x86-64 gcc 8.1
x86-64 gcc 8.2
x86-64 gcc 8.3
x86-64 gcc 8.4
x86-64 gcc 8.5
x86-64 gcc 9.1
x86-64 gcc 9.2
x86-64 gcc 9.3
x86-64 gcc 9.4
x86-64 gcc 9.5
x86-64 icc 13.0.1
x86-64 icc 16.0.3
x86-64 icc 17.0.0
x86-64 icc 18.0.0
x86-64 icc 19.0.0
x86-64 icc 19.0.1
x86-64 icc 2021.1.2
x86-64 icc 2021.10.0
x86-64 icc 2021.2.0
x86-64 icc 2021.3.0
x86-64 icc 2021.4.0
x86-64 icc 2021.5.0
x86-64 icc 2021.6.0
x86-64 icc 2021.7.0
x86-64 icc 2021.7.1
x86-64 icc 2021.8.0
x86-64 icc 2021.9.0
x86-64 icx 2021.1.2
x86-64 icx 2021.2.0
x86-64 icx 2021.3.0
x86-64 icx 2021.4.0
x86-64 icx 2022.0.0
x86-64 icx 2022.1.0
x86-64 icx 2022.2.0
x86-64 icx 2022.2.1
x86-64 icx 2023.0.0
x86-64 icx 2023.1.0
x86-64 icx 2023.2.1
x86-64 icx 2024.0.0
x86-64 icx 2024.1.0
x86-64 icx 2024.2.0
x86-64 icx 2024.2.1
x86-64 icx 2025.0.0
x86-64 icx 2025.0.1
x86-64 icx 2025.0.3
x86-64 icx 2025.0.4
x86-64 icx 2025.1.0
x86-64 icx 2025.1.1
x86-64 icx 2025.2.0
x86-64 icx 2025.2.1
x86-64 icx 2025.2.1
z180-clang 15.0.0
z180-clang 15.0.7
z80-clang 15.0.0
z80-clang 15.0.7
zig c++ 0.10.0
zig c++ 0.11.0
zig c++ 0.12.0
zig c++ 0.12.1
zig c++ 0.13.0
zig c++ 0.14.0
zig c++ 0.14.1
zig c++ 0.15.1
zig c++ 0.6.0
zig c++ 0.7.0
zig c++ 0.7.1
zig c++ 0.8.0
zig c++ 0.9.0
zig c++ trunk
Options
Source code
#define _USE_MATH_DEFINES #include <cmath> #include <vector> #include <chrono> #include <type_traits> #include <cstdint> #include <mutex> #include <iostream> #include <iomanip> #include <random> #include <functional> #include <stdexcept> #include <memory> #include <thread> #include <array> #include <optional> #include <set> #include <algorithm> #include <unordered_map> #include <sstream> #include <complex> #include <cstring> #include <omp.h> #include <emmintrin.h> #include <immintrin.h> #if (defined(_MSC_VER) && !defined(__clang__)) #define IS_MSVC 1 #else #define IS_MSVC 0 #endif #ifdef _WIN32 #define IS_WIN 1 #else #define IS_WIN 0 #endif #ifdef __SSE2__ #define HAS_SSE2 1 #else #define HAS_SSE2 0 #endif #ifdef __AVX__ #define HAS_AVX 1 #else #define HAS_AVX 0 #endif #define HAS_SSE2 1 #define HAS_AVX 1 #define HAS_INT128 (!IS_MSVC) #define USE_BOOST_MP_128 (!HAS_INT128) #if USE_BOOST_MP_128 #include <boost/multiprecision/cpp_int.hpp> #endif #if IS_WIN #include <intrin.h> #endif #define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg: '" + std::string(msg) + "'."); } #define ASSERT(cond) ASSERT_MSG(cond, "") #define IS_DEBUG 0 #if IS_DEBUG #define DASSERT_MSG(cond, msg) ASSERT_MSG(cond, msg) #else #define DASSERT_MSG(cond, msg) #endif #define DASSERT(cond) DASSERT_MSG(cond, "") #define COUT(code) { std::unique_lock<std::mutex> lock(cout_mux); std::cout code; std::cout << std::flush; } #define LN //{ COUT(<< "LN " << __LINE__ << std::endl); } #define DUMP(var) //{ COUT(<< __LINE__ << " : " << #var << " = (" << (var) << ")" << std::endl); } #define JOIN0(x, y) x##y #define JOIN(x, y) JOIN0(x, y) #define TEST(name) static void JOIN(Test_##name, __LINE__)(); static bool const JOIN(reg_test_##name, __LINE__) = []{ g_tests.push_back(std::make_pair(#name, &JOIN(Test_##name, __LINE__))); return true; }(); static void JOIN(Test_##name, __LINE__)() #define RUN_TESTS() { for (auto & [name, f]: g_tests) { std::cout << "Test " << name << " " << std::flush; auto tb = Time(); f(); tb = Time() - tb; std::cout << std::fixed << std::setprecision(3) << tb << " sec" << std::endl; } } #define VAT(v, i) VAt(v, i, __LINE__, #v ", " #i) #define bisizeof(x) (sizeof(x) * 8) // https://stackoverflow.com/a/9504047/941531 #if defined(_MSC_VER) #define ATTR_PACKED #define PACKED_BEGIN __pragma(pack(push, 1)) #define PACKED_END __pragma(pack(pop)) #elif defined(__GNUC__) #define ATTR_PACKED __attribute__((__packed__)) #define PACKED_BEGIN #define PACKED_END #endif using i8 = int8_t; using u8 = uint8_t; using i16 = int16_t; using u16 = uint16_t; using i32 = int32_t; using u32 = uint32_t; using i64 = int64_t; using u64 = uint64_t; #if HAS_INT128 using u128_cl = unsigned __int128; using i128_cl = signed __int128; #endif static std::vector<std::pair<std::string, std::function<void()>>> g_tests; static std::mutex cout_mux; PACKED_BEGIN template <typename T> class ATTR_PACKED DWordNum { template <typename S> constexpr static bool IsDwT = std::is_same_v<std::decay_t<S>, DWordNum<T>>; static bool constexpr has_int128 = HAS_INT128; static size_t constexpr dw_bysize = sizeof(T) * 2, dw_bisize = dw_bysize * 8; template <typename S> constexpr static void AssertHalf(S const & x) { static_assert(sizeof(S) * 2 <= dw_bysize); } constexpr static void Assert128() { static_assert(dw_bisize == 128); } public: constexpr DWordNum() {} template <typename S> constexpr DWordNum(S const & x) { Set(x); } template <typename A, typename B> constexpr DWordNum(A const & a, B const & b) : lo(a), hi(b) {} template <typename S> constexpr DWordNum & operator = (S const & x) { Set(x); return *this; } static u8 AddCarry(u8 carry_in, u64 a, u64 b, u64 * res) { #if IS_MSVC return _addcarry_u64(carry_in, a, b, res); #else unsigned long long carry_out; *res = __builtin_addcll(a, b, carry_in, &carry_out); return bool(carry_out); #endif } template <typename S> constexpr DWordNum & operator += (S const & x) { Assert128(); if constexpr(IsDwT<S>) { hi += x.hi + AddCarry(0, lo, x.lo, &lo); } else { AssertHalf(x); hi += AddCarry(0, lo, x, &lo); } return *this; } template <typename S> constexpr DWordNum operator + (S const & x) const { auto c = *this; c += x; return c; } template <typename S> constexpr DWordNum & operator -= (S const & x) { Assert128(); if constexpr(IsDwT<S>) { hi -= x.hi + _subborrow_u64(0, lo, x.lo, &lo); } else { AssertHalf(x); hi -= _subborrow_u64(0, lo, x, &lo); } return *this; } template <typename S> constexpr DWordNum operator - (S const & x) const { auto c = *this; c -= x; return c; } template <typename S> constexpr DWordNum & operator *= (S const & x) { Assert128(); static_assert(!IsDwT<S>); AssertHalf(x); #if HAS_INT128 *(u128_cl*)this *= x; #else u64 h; lo = _umul128(lo, x, &h); hi = hi * x + h; #endif return *this; } template <typename S> constexpr DWordNum operator * (S const & x) const { auto c = *this; c *= x; return c; } template <typename S> constexpr DWordNum & operator %= (S const & x) { Assert128(); AssertHalf(x); lo = Mod64(x); hi = 0; return *this; } template <typename S> constexpr DWordNum operator % (S const & x) const { auto c = *this; c %= x; return c; } DWordNum & SetMul64x64(u64 a, u64 b) { Assert128(); #if HAS_INT128 *(u128_cl*)this = u128_cl(a) * b; #else lo = _umul128(a, b, &hi); #endif return *this; } u64 Mod64(u64 x) const { Assert128(); #if HAS_INT128 return u64(*(u128_cl*)this % x); #else u64 r; _udiv128(hi, lo, x, &r); return r; #endif } T Lo() const { return lo; } T Hi() const { return hi; } private: template <typename S> constexpr void Set(S const & x) { if constexpr(IsDwT<S>) { lo = x.lo; hi = x.hi; } else { AssertHalf(x); lo = x; hi = 0; } } T lo, hi; }; PACKED_END using u128 = DWordNum<u64>; inline u128 Mul128(u64 a, u64 b) { return DWordNum<u64>().SetMul64x64(a, b); } #if USE_BOOST_MP_128 using u128_b = boost::multiprecision::uint128_t; using i128_b = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<128, 128, boost::multiprecision::signed_magnitude, boost::multiprecision::unchecked, void>>; #endif #if HAS_INT128 using u128_r = u128_cl; using i128_r = i128_cl; #else using u128_r = u128_b; using i128_r = i128_b; #endif template <typename VT> inline auto & VAt(VT & v, size_t i, size_t line = 0, char const * code = "") { using T = typename std::decay_t<decltype(v)>::value_type; DASSERT_MSG(i < v.size(), "line " + std::to_string(line) + " i " + std::to_string(i) + " size() " + std::to_string(v.size()) + " code '" + std::string(code) + "'"); return v[i]; } template <typename VT> inline auto const & VAt(VT const & v, size_t i, size_t line = 0, char const * code = "") { return VAt(const_cast<VT &>(v), i, line, code); } double Time() { static auto const gtb = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast<std::chrono::duration<double>>( std::chrono::high_resolution_clock::now() - gtb).count(); } size_t NThreads() { return std::thread::hardware_concurrency(); } class Timer { public: Timer() { Start(); } void Start() { start_ = Time(); } double Elapsed() const { return Time() - start_; } void Save() { saved_ = Elapsed(); } double Saved() const { return saved_; } void Add() { saved_ += Elapsed(); } private: double start_ = 0, saved_ = 0; }; template <typename T> struct MakeSigned : std::type_identity<std::make_signed_t<T>> {}; //template <> struct MakeSigned<i128> : std::type_identity<i128> {}; //template <> struct MakeSigned<u128> : std::type_identity<i128> {}; template <typename T> using MakeSignedT = typename MakeSigned<T>::type; template <typename T, typename Enable = void> struct DWordOf; template <typename T> struct DWordOf<T, std::enable_if_t<std::is_signed_v<std::decay_t<T>>>> { static size_t constexpr num_bits = sizeof(T) * 8; using type = std::conditional_t<num_bits <= 8, i16, std::conditional_t<num_bits <= 16, i32, std::conditional_t<num_bits <= 32, i64, //std::conditional_t<num_bits <= 64, i128, void>>>; }; template <typename T> struct DWordOf<T, std::enable_if_t<!std::is_signed_v<std::decay_t<T>>>> { static size_t constexpr num_bits = sizeof(T) * 8; using type = std::conditional_t<num_bits <= 8, u16, std::conditional_t<num_bits <= 16, u32, std::conditional_t<num_bits <= 32, u64, std::conditional_t<num_bits <= 64, u128, void>>>>; }; template <typename T> using DWordOfT = typename DWordOf<T>::type; template <typename T> inline auto MulD(T const & a, T const & b) { using DT = DWordOfT<T>; static_assert(std::is_same_v<std::decay_t<T>, u64>); //return DT(a) * b; return Mul128(a, b); } template <typename T> inline T To(DWordNum<T> const & x) { return x.Lo(); } template <typename T, typename S> inline T To(S const & x) { return T(x); } template <typename T> constexpr auto EGCD(T const & a, T const & b) { static_assert(std::is_same_v<std::decay_t<T>, u64>); using ST = i64; //MakeSignedT<T>; using DST = i128_r; //DWordOfT<ST>; T ro = 0, r = 0, qu = 0, re = 0; ST so = 0, s = 0; std::tie(ro, r, so, s) = std::make_tuple(a, b, 1, 0); while (r != 0) { std::tie(qu, re) = std::make_tuple(ro / r, ro % r); std::tie(ro, r) = std::make_tuple(r, re); std::tie(so, s) = std::make_tuple(s, ST(so - DST(s) * ST(qu))); } ST const to = ST((DST(ro) - DST(a) * so) / ST(b)); return std::make_tuple(ro, so, to); } template <typename T> constexpr T ModInv(T x, T mod) { using ST = MakeSignedT<T>; using DT = DWordOfT<T>; x %= mod; auto [g, s, t] = EGCD(x, mod); //ASSERT(g == 1); if (s < 0) { //ASSERT(ST(mod) + s >= 0); s += mod; } else { //ASSERT(s < mod); } //ASSERT((DT(x) * s) % mod == 1); return s; } bool constexpr use_montg = 1; constexpr std::tuple<u64, u64, u64, u64> MontgKRR(u64 n) { u128_r constexpr r = u128_r(1) << 64; u64 const rmod = u64(r % n), rmod2 = u64((u128_r(rmod) * rmod) % n), rinv = ModInv<u64>(rmod, n); u128_r const k0 = (r * u128_r(rinv) - 1) / n; //ASSERT(k0 < (u128_r(1) << 64)); u64 const k = u64(k0); return std::make_tuple(k, rmod, rmod2, rinv); } constexpr u64 Adjust(u64 x, u64 mod) { u64 const mask = ~u64(i64(x - mod) >> 63); return x - (mod & mask); } template <u64 mod> constexpr u64 AdjustC(u64 x) { u64 const mask = ~u64(i64(x - mod) >> 63); return x - (mod & mask); } template <u64 mod> constexpr u64 AdjustMod2C(u64 x) { return AdjustC<mod * 2>(x); } // https://www.nayuki.io/page/montgomery-reduction-algorithm template <bool adjust = false> constexpr u64 MontgMod(u128 const & x, u64 k, u64 n) { if constexpr(!use_montg) return x.Mod64(n); u64 const s = x.Lo() * k; u128 const t = x + MulD<u64>(s, n); u64 const u = t.Hi(); if constexpr(adjust) return Adjust(u, n); else return u; } template <u64 k, u64 n, bool adjust = false> constexpr u64 MontgModC(u128 const & x) { if constexpr(!use_montg) return x.Mod64(n); u64 const s = x.Lo() * k; u128 const t = x + MulD<u64>(s, n); u64 const u = t.Hi(); if constexpr(adjust) return AdjustC<n>(u); else return u; } template <bool adjust = false> constexpr u64 ToMontg(u64 x, u64 rmod2, u64 k, u64 n) { if constexpr(!use_montg) return x; return MontgMod<adjust>(MulD<u64>(x, rmod2), k, n); } template <u64 rmod2, u64 k, u64 n, bool adjust = false> constexpr u64 ToMontgC(u64 x) { if constexpr(!use_montg) return x; return MontgModC<k, n, adjust>(MulD<u64>(x, rmod2)); } void CalcRev(auto & rev, size_t n) { using RevT = typename std::decay_t<decltype(rev)>::value_type; static auto const tab = []{ std::array<u8, 256> tab{}; for (size_t i = 0; i < 256; ++i) for (size_t j = 0; j < 8; ++j) if (i & (1 << j)) tab[u8(i)] |= u8(1) << (7 - j); return tab; }(); size_t log_n = 0; for (; (size_t(1) << log_n) < n; ++log_n); size_t const nby = (log_n + 7) / 8, sh = nby * 8 - log_n; rev.resize(n); for (size_t i = 0; i < n; ++i) { size_t const ish = i << sh; size_t r = 0; if constexpr(1) for (size_t j = 0; j < nby; ++j) r |= size_t(tab[u8(ish >> ((nby - 1 - j) * 8))]) << (j * 8); else for (size_t j = 0; j < log_n; ++j) if (i & (1 << j)) r |= 1 << (log_n - 1 - j); rev[i] = RevT(r); } } template <size_t N, size_t I = 0> inline void LoopUnroll(auto const & f) { if constexpr(I < N) { f(std::integral_constant<size_t, I>{}); LoopUnroll<N, I + 1>(f); } } template <typename VT> void Reverse(VT & a) { size_t n = a.size(); static std::unordered_map<size_t, std::shared_ptr<std::vector<u32>>> revs; static std::mutex mux; std::vector<u32> const * prev = nullptr; { std::unique_lock<std::mutex> lock(mux); if (!revs.contains(n)) { std::vector<u32> crev; CalcRev(crev, n); revs[n] = std::make_shared<std::vector<u32>>(std::move(crev)); } prev = &*revs.at(n); } auto const rev = *prev; VT acv1(n); size_t constexpr cnt = 4; std::array<size_t, cnt> sh{}; size_t const hi = n / cnt; for (size_t i = 0; i < cnt; ++i) sh[i] = i * hi; #pragma omp parallel for for (i64 i = 0; i < hi; ++i) LoopUnroll<cnt>([&](auto j){ size_t const k = sh[size_t(j)] + i; acv1[k] = a[rev[k]]; }); a = std::move(acv1); } template <typename T> struct FindNttEntry { size_t k = 0, c = 0; T p = 0, g = 0, root = 0; double plog2 = 0; }; template <typename T, FindNttEntry<T> root_entry> void NttTransform(std::vector<T> & a, bool invert, bool to_montg = true, bool from_montg = true, bool verbose = false) { using ST = MakeSignedT<T>; using DT = DWordOfT<T>; Timer timer_transform; u64 constexpr mod = root_entry.p, root = root_entry.root, root_pw = u64(1) << root_entry.k; u64 constexpr root_1 = ModInv<u64>(root, mod); bool constexpr new_reverse = 1; size_t const nthr = NThreads(); // https://cp-algorithms.com/algebra/fft.html size_t const n = a.size(); ASSERT_MSG(n > 0 && (n & (n - 1)) == 0, "n " + std::to_string(n)); Timer timer_swap_slow; if constexpr(!new_reverse) { std::vector<T> * pa = nullptr; std::vector<T> a2; if constexpr(new_reverse) { a2 = a; pa = &a2; } else pa = &a; for (size_t i = 1, j = 0; i < n; ++i) { size_t bit = n >> 1; for (; j & bit; bit >>= 1) j ^= bit; j ^= bit; if (i < j) std::swap((*pa)[i], (*pa)[j]); } } timer_swap_slow.Save(); Timer timer_swap; if constexpr(new_reverse) Reverse(a); timer_swap.Save(); auto constexpr montg_krr = MontgKRR(mod); auto constexpr mk = std::get<0>(montg_krr), mrm = std::get<1>(montg_krr), mrm2 = std::get<2>(montg_krr), mri = std::get<3>(montg_krr); Timer timer_to_montg; if (to_montg) { #pragma omp parallel for for (i64 i = 0; i < n; ++i) a[i] = ToMontgC<mrm2, mk, mod>(a[i]); } timer_to_montg.Save(); std::vector<T> wlens; wlens.reserve(n / 2); Timer timer_main, timer_mainA, timer_mainB; for (size_t len = 2; len <= n; len <<= 1) { timer_mainA.Start(); { T wlen = ToMontgC<mrm2, mk, mod>(invert ? root_1 : root); for (T i = len; i < root_pw; i <<= 1) wlen = MontgModC<mk, mod>(MulD<T>(wlen, wlen)); size_t const len2 = len / 2, block = (len2 + nthr) / (nthr + 1); T w = ToMontgC<mrm2, mk, mod>(1); wlens.clear(); wlens.resize(len2); for (size_t j = 0; j < block; ++j) { wlens[j] = w; w = MontgModC<mk, mod>(MulD<T>(w, wlen)); } ASSERT(!wlens.empty()); #pragma omp parallel for for (i64 ithr = 1; ithr < (nthr + 1); ++ithr) { size_t const begin = ithr * block, end = std::min<size_t>((ithr + 1) * block, len2); auto wcur = wlens[0]; for (size_t j = 0; j < ithr; ++j) wcur = MontgModC<mk, mod>(MulD<T>(wcur, w)); for (size_t j = begin; j < end; ++j) { wlens[j] = wcur; wcur = MontgModC<mk, mod>(MulD<T>(wcur, wlen)); } } } timer_mainA.Add(); timer_mainB.Start(); #pragma omp parallel for for (i64 i = 0; i < n; i += len) { size_t const len2 = len / 2; for (size_t j = 0, ij = i; j < len2; ++j, ++ij) { T const w = wlens[j], u = AdjustMod2C<mod>(a[ij]), v = MontgModC<mk, mod>(MulD<T>(a[ij + len2], w)); a[ij] = u + v; a[ij + len2] = (mod * 2) + u - v; } } timer_mainB.Add(); } timer_main.Save(); Timer timer_invert; if (invert) { T ni = ModInv<T>(n, mod); if (!from_montg) ni = ToMontgC<mrm2, mk, mod>(ni); #pragma omp parallel for for (i64 i = 0; i < n; ++i) a[i] = MontgModC<mk, mod, true>(MulD<T>(a[i], ni)); } else if (from_montg) { #pragma omp parallel for for (i64 i = 0; i < n; ++i) a[i] = MontgModC<mk, mod, true>(a[i]); } timer_invert.Save(); timer_transform.Save(); if (verbose) COUT(<< "Swap " << std::fixed << std::setprecision(3) << timer_swap.Saved() << " (Slow " << timer_swap_slow.Saved() <<") ToMontg " << timer_to_montg.Saved() << " Main " << timer_main.Saved() << " (" << timer_mainA.Saved() << ", " << timer_mainB.Saved() << ") Invert " << timer_invert.Saved() << " All " << timer_transform.Saved() << std::endl); } template <typename Q, std::size_t N> class AlignmentAllocator { public: typedef Q value_type; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; typedef Q * pointer; typedef const Q * const_pointer; typedef Q & reference; typedef const Q & const_reference; public: inline AlignmentAllocator() throw() {} template <typename T2> inline AlignmentAllocator(const AlignmentAllocator<T2, N> &) throw() {} inline ~AlignmentAllocator() throw() {} inline pointer adress(reference r) { return &r; } inline const_pointer adress(const_reference r) const { return &r; } inline pointer allocate(size_type n) { #if IS_WIN auto p = (pointer)_aligned_malloc(n * sizeof(value_type), N); #else auto p = (pointer)std::aligned_alloc(N, n * sizeof(value_type)); #endif ASSERT(p); return p; } inline void deallocate(pointer p, size_type) { #if IS_WIN _aligned_free(p); #else std::free(p); #endif } inline void construct(pointer p, const value_type & wert) { new (p) value_type(wert); } inline void destroy(pointer p) { p->~value_type(); } inline size_type max_size() const throw() { return size_type(-1) / sizeof(value_type); } template <typename T2> struct rebind { typedef AlignmentAllocator<T2, N> other; }; bool operator!=(const AlignmentAllocator<Q, N> & other) const { return !(*this == other); } bool operator==(const AlignmentAllocator<Q, N> & other) const { return true; } }; template <typename Q> using Aligned64Vector = std::vector<Q, AlignmentAllocator<Q, 64>>; template <typename T> size_t constexpr SimdArrBitSize = HAS_AVX ? 256 * 4 : HAS_SSE2 ? 128 * 4 : 0; template <typename T, size_t Size, typename Arr = std::array<T, Size>> void ArrCopy(Arr & dst, Arr const & src) { if constexpr(0 && !(HAS_AVX && Size * bisizeof(T) >= 256 || !HAS_AVX && HAS_SSE2 && Size * bisizeof(T) >= 128)) { std::memcpy(&dst, &src, sizeof(Arr)); return; } else { static_assert(std::is_same_v<T, double>); if constexpr(HAS_AVX) { size_t constexpr bits = 256; static_assert((Size * bisizeof(T)) % bits == 0); size_t constexpr reg_size = bits / bisizeof(T), num_regs = Size * bisizeof(T) / bits; DASSERT_MSG(size_t(&dst) % 32 == 0, "dst " + std::to_string(size_t(&dst))); DASSERT_MSG(size_t(&src) % 32 == 0, "src " + std::to_string(size_t(&src))); LoopUnroll<num_regs>([&](auto I){ _mm256_store_pd( (T*)&std::get<I * reg_size>(dst), _mm256_load_pd((T*)&std::get<I * reg_size>(src)) ); }); } else if constexpr(HAS_SSE2) { size_t constexpr bits = 128; static_assert((Size * bisizeof(T)) % bits == 0); size_t constexpr reg_size = bits / bisizeof(T), num_regs = Size * bisizeof(T) / bits; DASSERT_MSG(size_t(&dst) % 16 == 0, "dst " + std::to_string(size_t(&dst))); DASSERT_MSG(size_t(&src) % 16 == 0, "src " + std::to_string(size_t(&src))); LoopUnroll<num_regs>([&](auto I){ _mm_store_pd( (T*)&std::get<I * reg_size>(dst), _mm_load_pd((T*)&std::get<I * reg_size>(src)) ); }); } else static_assert([]{ return false; }()); } } template <typename T, size_t Size, size_t IOp, typename Arr = std::array<T, Size>> void ArrOp(Arr const & a, Arr const & b, Arr & c) { if constexpr(0 && !(HAS_AVX && Size * bisizeof(T) >= 256 || !HAS_AVX && HAS_SSE2 && Size * bisizeof(T) >= 128)) { LoopUnroll<Size>([&](auto I){ if constexpr(IOp == 0) std::get<I>(c) = std::get<I>(a) + std::get<I>(b); else if constexpr(IOp == 1) std::get<I>(c) = std::get<I>(a) - std::get<I>(b); else if constexpr(IOp == 2) std::get<I>(c) = std::get<I>(a) * std::get<I>(b); else static_assert([]{ return false; }()); }); return; } else { static_assert(std::is_same_v<T, double>); auto Op = [](auto bits, auto const & a, auto const & b){ if constexpr(IOp == 0) { if constexpr(bits == 128) return _mm_add_pd(a, b); else if constexpr(bits == 256) return _mm256_add_pd(a, b); } else if constexpr(IOp == 1) { if constexpr(bits == 128) return _mm_sub_pd(a, b); else if constexpr(bits == 256) return _mm256_sub_pd(a, b); } else if constexpr(IOp == 2) { if constexpr(bits == 128) return _mm_mul_pd(a, b); else if constexpr(bits == 256) return _mm256_mul_pd(a, b); } else static_assert([]{ return false; }()); }; if constexpr(HAS_AVX) { size_t constexpr bits = 256; static_assert((Size * bisizeof(T)) % bits == 0); size_t constexpr reg_size = bits / bisizeof(T), num_regs = Size * bisizeof(T) / bits; DASSERT_MSG(size_t(&a) % 32 == 0, "a " + std::to_string(size_t(&a))); DASSERT_MSG(size_t(&b) % 32 == 0, "b " + std::to_string(size_t(&b))); DASSERT_MSG(size_t(&c) % 32 == 0, "c " + std::to_string(size_t(&c))); LoopUnroll<num_regs>([&](auto I){ _mm256_store_pd( (T*)&std::get<I * reg_size>(c), Op(std::integral_constant<size_t, bits>{}, _mm256_load_pd((T*)&std::get<I * reg_size>(a)), _mm256_load_pd((T*)&std::get<I * reg_size>(b)) ) ); }); } else if constexpr(HAS_SSE2) { size_t constexpr bits = 128; static_assert((Size * bisizeof(T)) % bits == 0); size_t constexpr reg_size = bits / bisizeof(T), num_regs = Size * bisizeof(T) / bits; DASSERT_MSG(size_t(&a) % 16 == 0, "a " + std::to_string(size_t(&a))); DASSERT_MSG(size_t(&b) % 16 == 0, "b " + std::to_string(size_t(&b))); DASSERT_MSG(size_t(&c) % 16 == 0, "c " + std::to_string(size_t(&c))); LoopUnroll<num_regs>([&](auto I){ _mm_store_pd( (T*)&std::get<I * reg_size>(c), Op(std::integral_constant<size_t, bits>{}, _mm_load_pd((T*)&std::get<I * reg_size>(a)), _mm_load_pd((T*)&std::get<I * reg_size>(b)) ) ); }); } else static_assert([]{ return false; }()); } } PACKED_BEGIN template <typename T, size_t Size> class ATTR_PACKED ComplexArr { using This = ComplexArr; using Complex = std::complex<T>; public: using Arr = std::array<T, Size>; using Arr2 = std::array<T, Size * 2>; ComplexArr() { Init(); } ComplexArr(Arr const & _real) : real(_real) { Init(); ClearImag(); } ComplexArr(Arr const & _real, Arr const & _imag) : real(_real), imag(_imag) { Init(); } ComplexArr(ComplexArr const & other) { Init(); CopyFrom(other); } This & operator = (ComplexArr const & other) { return CopyFrom(other); } This & CopyFrom(ComplexArr const & other) { if constexpr(0) { real = other.real; imag = other.imag; return *this; } ArrCopy<T, Size * 2>(*(Arr2*)&real, *(Arr2*)&other.real); return *this; } void ClearImag() { Arr constexpr zero{}; imag = zero; } void Clear() { Arr constexpr zero{}; real = zero; imag = zero; } This & Set1(T const & re) { for (auto & x: real) x = re; ClearImag(); return *this; } This & Mul(ComplexArr const & other) { if constexpr(IS_DEBUG) { alignas(64) Arr t0, t1, t2, t3, t4, t5, t6, t7; ArrOp<T, Size, 2>(real, other.real, t0); ArrOp<T, Size, 2>(imag, other.imag, t1); ArrOp<T, Size, 2>(real, other.imag, t2); ArrOp<T, Size, 2>(imag, other.real, t3); ArrOp<T, Size, 1>(t0, t1, t4); ArrOp<T, Size, 0>(t2, t3, t5); for (size_t i = 0; i < Size; ++i) { auto x = Complex(real[i], imag[i]) * Complex(other.real[i], other.imag[i]); t6[i] = x.real(); t7[i] = x.imag(); } ASSERT(t4 == t6); ASSERT(t5 == t7); } alignas(64) Arr t0, t1, t2, t3; ArrOp<T, Size, 2>(real, other.real, t0); ArrOp<T, Size, 2>(imag, other.imag, t1); ArrOp<T, Size, 2>(real, other.imag, t2); ArrOp<T, Size, 2>(imag, other.real, t3); ArrOp<T, Size, 1>(t0, t1, real); ArrOp<T, Size, 0>(t2, t3, imag); return *this; } This & Sqr() { if constexpr(IS_DEBUG) { alignas(64) Arr t0, t1, t2, t3, t4, t5, t6; ArrOp<T, Size, 2>(real, real, t0); ArrOp<T, Size, 2>(imag, imag, t1); ArrOp<T, Size, 2>(real, imag, t2); ArrOp<T, Size, 1>(t0, t1, t3); ArrOp<T, Size, 0>(t2, t2, t4); for (size_t i = 0; i < Size; ++i) { auto x = Complex(real[i], imag[i]) * Complex(real[i], imag[i]); t5[i] = x.real(); t6[i] = x.imag(); } ASSERT(t3 == t5); ASSERT(t4 == t6); } alignas(64) Arr t0, t1, t2; ArrOp<T, Size, 2>(real, real, t0); ArrOp<T, Size, 2>(imag, imag, t1); ArrOp<T, Size, 2>(real, imag, t2); ArrOp<T, Size, 1>(t0, t1, real); ArrOp<T, Size, 0>(t2, t2, imag); return *this; } This & Add(ComplexArr const & other) { if constexpr(IS_DEBUG) { alignas(64) Arr t0, t1, t2, t3; ArrOp<T, Size, 0>(real, other.real, t0); ArrOp<T, Size, 0>(imag, other.imag, t1); for (size_t i = 0; i < Size; ++i) { auto x = Complex(real[i], imag[i]) + Complex(other.real[i], other.imag[i]); t2[i] = x.real(); t3[i] = x.imag(); } ASSERT(t0 == t2); ASSERT(t1 == t3); } ArrOp<T, Size, 0>(real, other.real, real); ArrOp<T, Size, 0>(imag, other.imag, imag); return *this; } This & Sub(ComplexArr const & other) { if constexpr(IS_DEBUG) { alignas(64) Arr t0, t1, t2, t3; ArrOp<T, Size, 1>(real, other.real, t0); ArrOp<T, Size, 1>(imag, other.imag, t1); for (size_t i = 0; i < Size; ++i) { auto x = Complex(real[i], imag[i]) - Complex(other.real[i], other.imag[i]); t2[i] = x.real(); t3[i] = x.imag(); } ASSERT(t0 == t2); ASSERT(t1 == t3); } ArrOp<T, Size, 1>(real, other.real, real); ArrOp<T, Size, 1>(imag, other.imag, imag); return *this; } public: Arr real, imag; private: inline void Init() { static_assert(Size > 0 && (Size & (Size - 1)) == 0); size_t constexpr mult = 64; //std::min<size_t>(Size * sizeof(T), 64); static_assert(sizeof(real) % mult == 0); static_assert(sizeof(imag) % mult == 0); DASSERT(size_t(&real) % mult == 0); DASSERT(size_t(&imag) % mult == 0); } }; PACKED_END template <typename T> class ComplexVec { public: using This = ComplexVec; static_assert(std::is_same_v<T, double>); static size_t constexpr ArrCnt = SimdArrBitSize<T> / bisizeof(T); using Complex = std::complex<T>; using CArr = ComplexArr<T, ArrCnt>; using Arr = typename CArr::Arr; public: ComplexVec() {} template <typename It> ComplexVec(It it, It end) { alignas(64) CArr a; while (it != end) { size_t i = 0; for (i = 0; i < ArrCnt && it != end; ++it, ++i) a.real[i] = *it; ASSERT(i == ArrCnt); nums_.emplace_back(a); } } ComplexVec(std::vector<Complex> const & other) { auto it = other.begin(), end = other.end(); alignas(64) CArr a; while (it != end) { size_t i = 0; for (i = 0; i < ArrCnt && it != end; ++it, ++i) { a.real[i] = it->real(); a.imag[i] = it->imag(); } ASSERT(i == ArrCnt); nums_.emplace_back(a); } } void FromComplex(Complex const * ptr, size_t size) { ASSERT(size % ArrCnt == 0); nums_.resize(size / ArrCnt); size_t const i_end = size / ArrCnt; #pragma omp parallel for for (i64 i = 0; i < i_end; ++i) { auto & e = nums_[i]; for (size_t j0 = 0, j = i * ArrCnt; j0 < ArrCnt; ++j, ++j0) { e.real[j0] = ptr[j].real(); e.imag[j0] = ptr[j].imag(); } } } void ToComplex(Complex * ptr, size_t size) { ASSERT(this->size() == size); ASSERT(size % ArrCnt == 0); size_t const i_end = size / ArrCnt; #pragma omp parallel for for (i64 i = 0; i < i_end; ++i) { auto const & e = nums_[i]; for (size_t j0 = 0, j = i * ArrCnt; j0 < ArrCnt; ++j, ++j0) { ptr[j].real(e.real[j0]); ptr[j].imag(e.imag[j0]); } } } CArr & GetArr(size_t i) { return VAT(nums_, i); } CArr const & GetArr(size_t i) const { return const_cast<ComplexVec &>(*this).GetArr(i); } template <size_t S> ComplexArr<T, S> const & GetArrS(size_t i) const { DASSERT(S <= ArrCnt); if constexpr(S == ArrCnt) { return GetArr(i); } static_assert(S > ArrCnt || ArrCnt % S == 0); size_t constexpr mult = ArrCnt / S; size_t const idx = i / mult, off = (i % mult) * S; auto const & e = VAT(nums_, idx); alignas(64) thread_local ComplexArr<T, S> r; std::memcpy(&r.real, &e.real[off], S * sizeof(T)); std::memcpy(&r.imag, &e.imag[off], S * sizeof(T)); return r; } Complex Get(size_t i) const { auto const & e = VAT(nums_, i / ArrCnt); size_t const j = i % ArrCnt; return Complex(e.real[j], e.imag[j]); } void SetArr(size_t i, CArr const & a) { VAT(nums_, i) = a; } template <size_t S> void SetArrS(size_t i, ComplexArr<T, S> const & a) { DASSERT(S <= ArrCnt); if constexpr(S == ArrCnt) { return SetArr(i, a); } static_assert(S > ArrCnt || ArrCnt % S == 0); size_t constexpr mult = ArrCnt / S; size_t const idx = i / mult, off = (i % mult) * S; auto & e = VAT(nums_, idx); std::memcpy(&e.real[off], &a.real, S * sizeof(T)); std::memcpy(&e.imag[off], &a.imag, S * sizeof(T)); } void Set(size_t i, T const & real, T const & imag) { auto & e = VAT(nums_, i / ArrCnt); size_t const j = i % ArrCnt; e.real[j] = real; e.imag[j] = imag; } void Set(size_t i, Complex const & c) { Set(i, c.real(), c.imag()); } size_t size() const { return nums_.size() * ArrCnt; } bool empty() const { return nums_.empty(); } void resize(size_t size) { ASSERT_MSG(size % ArrCnt == 0, "size " + std::to_string(size) + " ArrCnt " + std::to_string(ArrCnt)); nums_.resize(size / ArrCnt); } void reserve(size_t size) { ASSERT_MSG(size % ArrCnt == 0, "size " + std::to_string(size) + " ArrCnt " + std::to_string(ArrCnt)); nums_.reserve(size / ArrCnt); } void clear() { nums_.clear(); } private: Aligned64Vector<CArr> nums_; }; template <typename C0> void FftTransform(std::vector<C0> & a0, bool invert, bool verbose = false) { using T = double; using C = typename ComplexVec<T>::Complex; static_assert(std::is_same_v<C0, C>); using CArr = typename ComplexVec<T>::CArr; bool constexpr new_reverse = 1; size_t constexpr ArrCnt = ComplexVec<T>::ArrCnt; size_t const nthr = NThreads(); Timer timer_transform; // https://cp-algorithms.com/algebra/fft.html size_t const n = a0.size(); ASSERT_MSG(n > 0 && (n & (n - 1)) == 0, "n " + std::to_string(n)); ASSERT_MSG(n % ArrCnt == 0 && n >= SimdArrBitSize<T> * 2 / 32, "n " + std::to_string(n)); Timer timer_swap_slow; if constexpr(!new_reverse) { std::vector<C> * pa = nullptr; std::vector<C> a2; if constexpr(new_reverse) { a2 = a0; pa = &a2; } else pa = &a0; for (size_t i = 1, j = 0; i < n; ++i) { size_t bit = n >> 1; for (; j & bit; bit >>= 1) j ^= bit; j ^= bit; if (i < j) std::swap((*pa)[i], (*pa)[j]); } } timer_swap_slow.Save(); Timer timer_swap; if constexpr(new_reverse) Reverse(a0); timer_swap.Save(); Timer timer_from_complex; ComplexVec<T> a, wlens; wlens.resize(n / 2); a.FromComplex(a0.data(), a0.size()); timer_from_complex.Save(); Timer timer_main, timer_mainA, timer_mainB; for (size_t len = 2; len <= n; len <<= 1) { timer_mainA.Start(); { double const angle = 2.0 * M_PI / len * (invert ? -1 : 1); C const wlen(std::cos(angle), std::sin(angle)); size_t const len2 = len / 2, block = (len2 + nthr) / (nthr + 1); C w(1); for (size_t j = 0; j < block; ++j) { wlens.Set(j, w); w *= wlen; } ASSERT(!wlens.empty()); #pragma omp parallel for for (i64 ithr = 1; ithr < (nthr + 1); ++ithr) { size_t const begin = ithr * block, end = std::min<size_t>((ithr + 1) * block, len2); auto wcur = wlens.Get(0); for (size_t j = 0; j < ithr; ++j) wcur *= w; for (size_t j = begin; j < end; ++j) { wlens.Set(j, wcur); wcur *= wlen; } } } timer_mainA.Add(); timer_mainB.Start(); #pragma omp parallel for for (i64 i = 0; i < n; i += len) { size_t const len2 = len / 2; auto Process = [&](auto SubArrCnt0){ ASSERT(SubArrCnt0 <= ArrCnt); size_t constexpr SubArrCnt = std::min<size_t>(SubArrCnt0, ArrCnt); ASSERT_MSG(len2 % SubArrCnt == 0, "len2 " + std::to_string(len2) + " SubArrCnt " + std::to_string(SubArrCnt) + " len2 % SubArrCnt " + std::to_string(len2 % SubArrCnt)); ASSERT_MSG(i % SubArrCnt == 0, "i " + std::to_string(i) + " SubArrCnt " + std::to_string(SubArrCnt)); ASSERT(SubArrCnt <= ArrCnt); size_t const len2ac = len2 / SubArrCnt; alignas(64) ComplexArr<T, SubArrCnt> cau, cat, cav; for (size_t j = 0, ij = i / SubArrCnt; j < len2ac; ++j, ++ij) { auto const & caw = wlens.GetArr(j); cau = a.GetArr(ij); cat = cau; cav = a.GetArr(ij + len2ac); cav.Mul(caw); a.SetArr(ij, cau.Add(cav)); a.SetArr(ij + len2ac, cat.Sub(cav)); } }; if (len2 < ArrCnt) { for (size_t j = 0, ij = i; j < len2; ++j, ++ij) { C const w = wlens.Get(j), u = a.Get(ij), v = a.Get(ij + len2) * w; a.Set(ij, u + v); a.Set(ij + len2, u - v); } } else Process(std::integral_constant<size_t, ArrCnt>{}); } timer_mainB.Add(); } timer_main.Save(); Timer timer_invert; if (invert) { double const ni = 1.0 / n; alignas(64) CArr nia; nia.Set1(ni); #pragma omp parallel for for (i64 i = 0; i < n / ArrCnt; ++i) a.GetArr(i).Mul(nia); } timer_invert.Save(); Timer timer_to_complex; a0.resize(a.size()); a.ToComplex(a0.data(), a0.size()); timer_to_complex.Save(); timer_transform.Save(); if (verbose) COUT(<< "Swap " << std::fixed << std::setprecision(3) << timer_swap.Saved() << " FromComplex " << timer_from_complex.Saved() << " Main " << timer_main.Saved() << " (" << timer_mainA.Saved() << ", " << timer_mainB.Saved() << ") Invert " << timer_invert.Saved() << " ToComplex " << timer_to_complex.Saved() << " All " << timer_transform.Saved() << std::endl); } void NttTransformOriginal(std::vector<int> & a, bool invert) { // https://cp-algorithms.com/algebra/fft.html const int mod = 7340033; const int root = 5; const int root_1 = 4404020; const int root_pw = 1 << 20; int n = a.size(); for (int i = 1, j = 0; i < n; i++) { int bit = n >> 1; for (; j & bit; bit >>= 1) j ^= bit; j ^= bit; if (i < j) std::swap(a[i], a[j]); } for (int len = 2; len <= n; len <<= 1) { int wlen = invert ? root_1 : root; for (int i = len; i < root_pw; i <<= 1) wlen = (int)(1LL * wlen * wlen % mod); for (int i = 0; i < n; i += len) { int w = 1; for (int j = 0; j < len / 2; j++) { int u = a[i+j], v = (int)(1LL * a[i+j+len/2] * w % mod); a[i+j] = u + v < mod ? u + v : u + v - mod; a[i+j+len/2] = u - v >= 0 ? u - v : u - v + mod; w = (int)(1LL * w * wlen % mod); } } } if (invert) { int n_1 = ModInv<u64>(n, mod); for (int & x : a) x = (int)(1LL * x * n_1 % mod); } } template <typename cd> void FftTransformOriginal(std::vector<cd> & a, bool invert, bool verbose = false) { // https://cp-algorithms.com/algebra/fft.html int n = a.size(); for (int i = 1, j = 0; i < n; i++) { int bit = n >> 1; for (; j & bit; bit >>= 1) j ^= bit; j ^= bit; if (i < j) std::swap(a[i], a[j]); } for (int len = 2; len <= n; len <<= 1) { double ang = 2.0 * M_PI / len * (invert ? -1 : 1); cd wlen(std::cos(ang), std::sin(ang)); for (int i = 0; i < n; i += len) { cd w(1); for (int j = 0; j < len / 2; j++) { cd u = a[i+j], v = a[i+j+len/2] * w; a[i+j] = u + v; a[i+j+len/2] = u - v; w *= wlen; } } } if (invert) { for (cd & x : a) x /= n; } } template <typename T> auto MultReg(std::vector<T> const & a, std::vector<T> const & b) { std::vector<T> c(a.size() + b.size() - 1); for (size_t i = 0; i < a.size(); ++i) for (size_t j = 0; j < b.size(); ++j) c[i + j] += a[i] * b[j]; return c; } std::array<FindNttEntry<u64>, 9> constexpr root_entries = { FindNttEntry<u64>{.k = 54, .c = 127, .p = 2287828610704211969, .g = 3, .root = 878887558841786394, .plog2 = 60.99}, FindNttEntry<u64>{.k = 53, .c = 233, .p = 2098677426354651137, .g = 3, .root = 358459497095251936, .plog2 = 60.86}, FindNttEntry<u64>{.k = 55, .c = 57, .p = 2053641430080946177, .g = 7, .root = 640559856471874596, .plog2 = 60.83}, FindNttEntry<u64>{.k = 56, .c = 27, .p = 1945555039024054273, .g = 5, .root = 1613915479851665306, .plog2 = 60.75}, FindNttEntry<u64>{.k = 52, .c = 397, .p = 1787929052066086913, .g = 3, .root = 582754759099465746, .plog2 = 60.63}, FindNttEntry<u64>{.k = 53, .c = 161, .p = 1450159080013299713, .g = 3, .root = 359678689516082930, .plog2 = 60.33}, FindNttEntry<u64>{.k = 52, .c = 295, .p = 1328561890074296321, .g = 6, .root = 776720884896391373, .plog2 = 60.2}, FindNttEntry<u64>{.k = 53, .c = 143, .p = 1288029493427961857, .g = 3, .root = 531113314168589713, .plog2 = 60.16}, FindNttEntry<u64>{.k = 55, .c = 35, .p = 1261007895663738881, .g = 6, .root = 397650301651152680, .plog2 = 60.13}, }; size_t constexpr use_root_entry = 0; template <typename T> auto MultNtt(std::vector<T> a, std::vector<T> b0, bool square = false, bool verbose = false) { using DT = DWordOfT<T>; auto & b = square ? a : b0; size_t const final_size = std::max<size_t>(1, a.size() + b.size()) - 1; size_t n = 1; while (n < final_size) n <<= 1; a.resize(n); b.resize(n); auto constexpr root_entry = root_entries[use_root_entry]; u64 constexpr mod = root_entry.p; auto constexpr montg_krr = MontgKRR(mod); auto constexpr mk = std::get<0>(montg_krr); NttTransform<T, root_entry>(a, false, true, false, verbose); if (!square) NttTransform<T, root_entry>(b, false, true, false, verbose); Timer timer_mul; #pragma omp parallel for for (i64 i = 0; i < n; ++i) a[i] = MontgModC<mk, mod>(MulD<T>(a[i], b[i])); timer_mul.Save(); if (verbose) COUT(<< "MidMul " << std::fixed << std::setprecision(3) << timer_mul.Saved() << std::endl); NttTransform<T, root_entry>(a, true, false, true, verbose); for (size_t i = final_size; i < a.size(); ++i) ASSERT_MSG(a[i] == 0, "i " + std::to_string(i) + " a[i] " + std::to_string(a[i])); a.resize(final_size); return a; } template <typename T> auto MultFft(std::vector<T> a, std::vector<T> b0, bool square = false, bool verbose = false) { using C = std::complex<double>; auto & b = square ? a : b0; size_t const final_size = std::max<size_t>(1, a.size() + b.size()) - 1; size_t n = 1; while (n < std::max<size_t>(SimdArrBitSize<double> * 2 / 32, final_size)) n <<= 1; a.resize(n); b.resize(n); auto Assign = [](auto & vc, auto const & v){ vc.resize(v.size()); #pragma omp parallel for for (i64 i = 0; i < v.size(); ++i) vc[i].real(v[i]); }; Timer timer_assign; std::vector<C> ac, bc0; Assign(ac, a); if (!square) Assign(bc0, b); auto & bc = square ? ac : bc0; timer_assign.Save(); if (verbose) COUT(<< "AssignComplex " << std::setprecision(3) << timer_assign.Saved() << std::endl); #define FftTransformF FftTransform FftTransformF(ac, false, verbose); if (!square) FftTransformF(bc, false, verbose); Timer timer_mul; #pragma omp parallel for for (i64 i = 0; i < n; ++i) ac[i] *= bc[i]; timer_mul.Save(); if (verbose) COUT(<< "MidMul " << std::fixed << std::setprecision(3) << timer_mul.Saved() << std::endl); FftTransformF(ac, true, verbose); #undef FftTransformF Timer timer_round; #pragma omp parallel for for (i64 i = 0; i < a.size(); ++i) a[i] = size_t(ac[i].real() + 0.5); timer_round.Save(); if (verbose) COUT(<< "Round " << std::setprecision(3) << timer_round.Saved() << std::endl); //for (size_t i = final_size; i < a.size(); ++i) DASSERT_MSG(a[i] == 0, "i " + std::to_string(i) + " a[i] " + std::to_string(a[i])); a.resize(final_size); return std::move(a); } constexpr size_t NumBits(u64 n) { return 64 - std::countl_zero(n); } template <typename T> constexpr T PowMod(T a, T b, T const & c) { using DT = DWordOfT<T>; T r = 1; while (b != 0) { if (b & 1) r = To<T>(MulD<T>(r, a) % c); a = To<T>(MulD<T>(a, a) % c); b >>= 1; } return r; } template <typename T> constexpr bool IsPrime_Fermat(T const & n, size_t ntrials = 32) { if (n <= 16) return n == 2 || n == 3 || n == 5 || n == 7 || n == 11 || n == 13; std::mt19937_64 rng{1234567}; for (size_t i = 0; i < ntrials; ++i) if (PowMod(rng() % (n - 3) + 2, n - 1, n) != 1) return false; return true; } template <typename T> constexpr bool IsPrime_TrialDiv(T const & n, u32 limit = u32(-1)) { if (n <= 16) return n == 2 || n == 3 || n == 5 || n == 7 || n == 11 || n == 13; if ((n & 1) == 0) return false; for (u32 d = 3; d < limit && u64(d) * d <= n; d += 2) if (n % d == 0) return false; return true; } template <typename T> constexpr bool IsPrime(T const & n) { if (!IsPrime_TrialDiv(n, 1 << 14)) return false; if (!IsPrime_Fermat(n)) return false; return true; } template <typename T> constexpr std::pair<std::vector<T>, T> Factor_TrialDiv(T n, u32 limit = u32(-1)) { ASSERT(n > 0); std::vector<T> fs; while ((n & 1) == 0) { fs.push_back(2); n >>= 1; } u32 d = 0; for (d = 3; d < limit && u64(d) * d <= n; d += 2) while (n % d == 0) { fs.push_back(d); n /= d; } if (u64(d) * d > n && n != 1) { fs.push_back(n); n = 1; } return {std::move(fs), n}; } template <typename T> constexpr std::optional<std::vector<T>> Factor(T n, u32 limit = u32(-1)) { auto [fs, rem] = Factor_TrialDiv(n, limit); if (rem == 1) return fs; if (IsPrime(rem)) { fs.push_back(rem); return fs; } return std::nullopt; } template <typename T> constexpr std::optional<T> Generator(T const & p) { // https://cp-algorithms.com/algebra/primitive-root.html ASSERT(IsPrime(p)); T const phi = p - 1; auto const fs = Factor(phi, 1 << 20); if (!fs) return std::nullopt; for (T res = 2; res <= p; ++res) { bool ok = true; for (size_t i = 0; i < fs->size() && ok; ++i) ok &= PowMod(res, phi / (*fs)[i], p) != 1; if (ok) return res; } return std::nullopt; } template <typename T, size_t MaxCnt = 10> constexpr auto FindNttMod(size_t k_begin = 0, size_t k_end = 63, size_t pbits_begin = 0, size_t pbits_end = 65) { // https://cp-algorithms.com/algebra/fft.html std::array<FindNttEntry<T>, MaxCnt> entries{}; size_t cnt_entries = 0; for (ptrdiff_t k = ptrdiff_t(k_end) - 1; k >= ptrdiff_t(k_begin); --k) { T const pow2 = T(1) << k; for (size_t c = 1;; ++c) { auto const p128 = u128_r(c) * pow2 + 1; if (p128 >= u64(-1)) break; T const p = c * pow2 + 1; size_t const pbits = NumBits(p); if (!(pbits_begin <= pbits && pbits < pbits_end)) continue; if (!IsPrime(p)) continue; auto const g = Generator(p); if (!g) continue; entries[cnt_entries] = FindNttEntry<T>{.k = size_t(k), .c = c, .p = p, .g = *g, .root = PowMod<T>(*g, c, p), .plog2 = std::llround(std::log2(p) * 100.0) / 100.0}; ++cnt_entries; if (cnt_entries >= MaxCnt) return entries; } } return entries; } TEST(FindNttMod) { COUT(<< std::endl); auto const x0 = FindNttMod<u64, 20>(50, 63, 61, 63); std::vector<FindNttEntry<u64>> x; std::set<u64> used; for (size_t i = 0; i < x0.size(); ++i) { if (x0[i].p == 0) break; if (used.count(x0[i].p)) continue; x.push_back(x0[i]); used.insert(x0[i].p); } std::sort(x.begin(), x.end(), [](auto const & a, auto const & b){ return a.p > b.p; }); for (size_t i = 0; i < x.size(); ++i) { COUT(<< "FindNttEntry<T>{.k = " << x[i].k << ", .c = " << x[i].c << ", .p = " << x[i].p << ", .g = " << x[i].g << ", .root = " << x[i].root << ", .plog2 = " << x[i].plog2 << "}," << std::endl); } } template <typename T> std::string IntToStr(T n) { if (n == 0) return "0"; std::string r; bool sign = false; if (n < 0) { sign = true; n = -n; } while (n != 0) { u32 constexpr mod = 1'000'000'000U; std::ostringstream ss; auto const nm = u32(n % mod); n /= mod; if (n != 0) ss << std::setw(9) << std::setfill('0'); ss << nm; r = ss.str() + r; } return (sign ? "-" : "") + r; } template <typename T> std::string DiffVec(std::vector<T> const & a, std::vector<T> const & b) { std::string r; for (size_t i = 0; i < std::min(a.size(), b.size()); ++i) r += a[i] == b[i] ? ".," : IntToStr(a[i]) + "|" + IntToStr(b[i]) + ","; for (size_t i = std::min(a.size(), b.size()); i < std::max(a.size(), b.size()); ++i) r += i < a.size() ? IntToStr(a[i]) + "|_," : "_|" + IntToStr(b.at(i)) + ","; return r; } TEST(CompareNttMultWithReg) { COUT(<< std::endl); using T = u64; bool constexpr is_square = 1; std::mt19937_64 rng{123}; size_t ibits = 0; for (auto bits: {14, 19}) { size_t constexpr limit = 1 << 11; size_t const n = (1ULL << bits) * 3 / 2; std::vector<T> a, b; for (size_t i = 0; i < n; ++i) { a.push_back(rng() % limit); if (!is_square) b.push_back(rng() % limit); } Timer timer_ntt; auto const rntt = MultNtt(a, is_square ? std::vector<T>() : b, is_square, ibits >= 1); timer_ntt.Save(); Timer timer_fft; auto const rfft = MultFft(a, is_square ? std::vector<T>() : b, is_square, ibits >= 1); timer_fft.Save(); Timer timer_reg; std::vector<T> rreg; if (ibits <= 0) rreg = MultReg(a, is_square ? a : b); timer_reg.Save(); COUT(<< "Time NTT " << std::fixed << std::setprecision(3) << timer_ntt.Saved() << " FFT " << timer_fft.Saved()); if (ibits <= 0) { COUT(<< " Reg " << timer_reg.Saved() << " Boost_NTT " << timer_reg.Saved() / timer_ntt.Saved() << "x (FFT " << timer_reg.Saved() / timer_fft.Saved() << ")"); ASSERT_MSG(rreg == rntt, DiffVec(rreg, rntt)); ASSERT_MSG(rreg == rfft, DiffVec(rreg, rfft)); } if (ibits >= 1) COUT(<< " Boost_NTT " << timer_fft.Saved() / timer_ntt.Saved() << "x"); COUT(<< std::endl); ++ibits; } } // https://github.com/shaunakshende/FBSI_3152/blob/598d3325a9b1ee5a6933000261eee1e7747e8823/petsc-3.15.2/arch-linux2-c-debug/externalpackages/openmpi-4.1.0/ompi/mca/op/avx/op_avx_component.c static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) { #if defined(_MSC_VER) __cpuidex((int*)abcd, eax, ecx); #else uint32_t ebx = 0, edx = 0; #if defined( __i386__ ) && defined ( __PIC__ ) /* in case of PIC under 32-bit EBX cannot be clobbered */ __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), #else __asm__ ( "cpuid" : "+b" (ebx), #endif /* defined( __i386__ ) && defined ( __PIC__ ) */ "+a" (eax), "+c" (ecx), "=d" (edx) ); abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; #endif } #define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200 #define OMPI_OP_AVX_HAS_AVX512F_FLAG 0x00000100 #define OMPI_OP_AVX_HAS_AVX2_FLAG 0x00000020 #define OMPI_OP_AVX_HAS_AVX_FLAG 0x00000010 #define OMPI_OP_AVX_HAS_SSE4_1_FLAG 0x00000008 #define OMPI_OP_AVX_HAS_SSE3_FLAG 0x00000004 #define OMPI_OP_AVX_HAS_SSE2_FLAG 0x00000002 #define OMPI_OP_AVX_HAS_SSE_FLAG 0x00000001 static uint32_t has_intel_AVX_features(void) { /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */ const uint32_t avx512f_mask = (1U << 16); // AVX512F (EAX = 7, ECX = 0) : EBX const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW (EAX = 7, ECX = 0) : EBX const uint32_t avx2_mask = (1U << 5); // AVX2 (EAX = 7, ECX = 0) : EBX const uint32_t avx_mask = (1U << 28); // AVX (EAX = 1, ECX = 0) : ECX const uint32_t sse4_1_mask = (1U << 19); // SSE4.1 (EAX = 1, ECX = 0) : ECX const uint32_t sse3_mask = (1U << 0); // SSE3 (EAX = 1, ECX = 0) : ECX const uint32_t sse2_mask = (1U << 26); // SSE2 (EAX = 1, ECX = 0) : EDX const uint32_t sse_mask = (1U << 15); // SSE (EAX = 1, ECX = 0) : EDX uint32_t flags = 0, abcd[4]; run_cpuid( 1, 0, abcd ); flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0; flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0; flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0; flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0; flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0; #if defined(__APPLE__) uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */ // OS supports extended processor state management ? if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask ) return 0; #endif /* defined(__APPLE__) */ run_cpuid( 7, 0, abcd ); flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0; flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0; flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0; return flags; } void CheckSimdAvailability() { #if 0 // https://github.com/kodirovsshik/libksn/blob/4b6945d1780eea7430ae92960cf71491b0baad1d/ksn/include/ksn_old/cpu_features.h #define _FEATURE_SSE2 0x00000040ULL #define _FEATURE_AVX 0x00010000ULL #define _FEATURE_AVX512F 0x08000000ULL if constexpr(HAS_SSE2) ASSERT_MSG(_may_i_use_cpu_feature(_FEATURE_SSE2), "SSE2 not supported, please modify HAS_SSE2 macro at start of this C++ file!"); if constexpr(HAS_AVX) ASSERT_MSG(_may_i_use_cpu_feature(_FEATURE_AVX), "AVX not supported, please modify HAS_AVX macro at start of this C++ file!"); #endif static_assert(HAS_SSE2); auto flags = has_intel_AVX_features(); if constexpr(HAS_SSE2) ASSERT_MSG(flags & OMPI_OP_AVX_HAS_SSE2_FLAG, "SSE2 not supported, please modify HAS_SSE2 macro at start of this C++ file!"); if constexpr(HAS_AVX) ASSERT_MSG(flags & OMPI_OP_AVX_HAS_AVX_FLAG, "AVX not supported, please modify HAS_AVX macro at start of this C++ file!"); } void Init() { omp_set_num_threads(NThreads()); omp_set_dynamic(0); //ASSERT(omp_get_thread_limit() >= NThreads()); CheckSimdAvailability(); COUT(<< "Using SIMD " << (HAS_AVX ? "AVX" : "SSE2") << std::endl); } int main() { try { Init(); RUN_TESTS(); return 0; } catch (std::exception const & ex) { COUT(<< "Exception: " << ex.what() << std::endl); return -1; } }
Become a Patron
Sponsor on GitHub
Donate via PayPal
Compiler Explorer Shop
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
Statistics
Changelog
Version tree