assembly source #1
Source code
; t=sieve-cordes; nasm -felf64 $t.asm && ld $t.o -o $t && taskset -c 3 perf stat --all-user -d ./$t 40000

;SYSREAD equ 0              ; command line arg instead

default rel                 ; we don't need any static storage anyway
section .text
global _start
_start:
    mov rsi, [rsp+16]       ; argv[1] is above argv[0] and argc
    test rsi, rsi           ; NULL pointer check, in case too few args were passed.
    jz .exit_error

;;; n = atoi(argv[1])
; from my answer on https://stackoverflow.com/a/49548057/224132
    movzx eax, byte [rsi]   ; start with the first digit
    sub eax, '0'            ; convert from ASCII to number
    cmp al, 9               ; check that it's a decimal digit [0..9]
    jbe .loop_entry         ; too low -> wraps to high value, fails unsigned compare check

.exit_error:
    mov edi, 1              ; else: bad first digit
    mov eax, 231
    syscall                 ; exit_group(1)

; rotate the loop so we can put the JCC at the bottom where it belongs
; but still check the digit before messing up our total
.next_digit:                ; do {
    lea eax, [rax*4 + rax]  ; total *= 5
    lea eax, [rax*2 + rcx]  ; total = (total*5)*2 + digit
.loop_entry:
    inc rsi
    movzx ecx, byte [rsi]
    sub ecx, '0'
    cmp ecx, 9
    jbe .next_digit         ; } while( digit <= 9 )

allocate:
    mov r15d, eax
    shr eax, 3              ; bytes = bits/8. (Not rounded up, possible corner case?)
    lea esi, [rax+4095]
    and esi, -4096          ; size in bytes, rounded up to a page size
    xor r9d, r9d            ; offset
    mov r8d, -1             ; fd
    mov r10d, 0x22          ; MAP_PRIVATE|MAP_ANONYMOUS
    mov edx, 3              ; PROT_READ|PROT_WRITE
    xor edi, edi            ; hint address
    mov eax, 9              ; __NR_mmap
    syscall                 ; mmap(0, size_rounded_up, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    mov rbx, rax            ; RBX = base of bit-array
                            ; R15 = max count to sieve up to.

    mov rdi, rax
    ; mov esi,              ; size already set
    mov edx, 14             ; MADV_HUGEPAGE
    ; mov edx, 3            ; MADV_WILLNEED
    mov eax, 28             ; __NR_madvise
    syscall

    mov r8d, 10000          ; benchmark rep count
sieve:
    mov r14d, r15d
    shr r14d, 1             ; nbits = n/2 = number of bits in the bitmap
    mov eax, -1             ; 1 = potential prime, 0 = crossed off.  Inverted so TZCNT can work directly; use BTR to reset bits to zero.
                            ; But even without benchmark repeat, have to memset instead of using zeroed memory
    mov rdi, rbx            ; including the first byte so rep stosb gets an aligned address, even though we'll store it separately
    mov ecx, r15d
    shr ecx, 3 + 1          ; bytes per bit, and only store the odd numbers
    rep stosb               ; memset(rdi, al, (R15/2) >>3)

; primes in the first byte: 15:composite, 13:prime, 11:prime, 9:composite, 7:prime, 5:prime, 3:prime, 1:composite
    mov byte [rbx], 0b01101110

;;; bitmap[j] represents i = j*2 + 1
;;; i is represented by bitmap[j = i/2] rounding down, i.e. just a right shift works, because the candidate primes are always odd.
;;; TODO: manual memset with 3x XMM or YMM registers, crossing out 3
    mov r11d, 3             ; i = current prime to cross out
    mov ecx, 9/2            ; j = bit position of i*i, first position to cross it out.  Smaller primes have crossed out its lower multiples
    ; lea ecx, [r11-3 + 9]  ; golf for alignment
    mov r12d, BITALIGN_NMASK_r12    ; constants for ANDN and SHRX to save mov reg,reg instructions in the loop
    mov r10d, BIT_TO_CHUNK_r10
;align 8
.sieve_outer:
    lea esi, [r11+2]        ; doing this early means less to do after a branch miss on loop exit
    shr esi, 3+1            ; byte offset that contains the next bit to look at
    ; load+TZCNT early, too?  Only the first bit past here has to be accurate, and the initial byte store avoids startup problems

.crossout:                  ; do{
%ifdef BTS_MEM
    btr [rbx], ecx          ; bitarr[j] = 0;  quite slow, but compact
BIT_TO_CHUNK_r10 equ 0      ;; dummy
%else
;;; 4 uops instead of 10 to clear a bit in memory on Skylake
    ; mov eax, ecx
    ; shr eax, BIT_TO_DWORD
%if 1
BIT_TO_CHUNK_r10 equ 3
    shrx eax, ecx, r10d     ; index of aligned byte containing that bit (ecx is already a bit-position, not the number represented).
    movzx edx, byte [rbx+rax]
    andn ebp, r12d, ecx
    ; mov ebp, ecx
    ; and ebp, 7
    btr edx, ebp            ; BTS reg,reg is fast and does the mod 32 part for free.  There is no BTS byte reg so we unfortunately can't do narrower accesses to gain more parallelism
    mov [rbx+rax], dl
%else
%if 1
BIT_TO_CHUNK_r10 equ 4
    shrx eax, ecx, r10d     ; index of aligned dword containing that bit (ecx is already a bit-position, not the number represented).  Byte offsets for misaligned dwords would be worse, store-forwarding stalls as well as misalignment
    movzx ebp, word [rbx+rax*2]
    btr bp, cx              ; BTS reg,reg is fast and does the mod 32 part for free.  There is no BTS byte reg so we unfortunately can't do narrower accesses to gain more parallelism
    mov [rbx+rax*2], bp
%else
BIT_TO_CHUNK_r10 equ 5
    shrx eax, ecx, r10d     ; index of aligned dword containing that bit (ecx is already a bit-position, not the number represented).  Byte offsets for misaligned dwords would be worse, store-forwarding stalls as well as misalignment
    mov ebp, [rbx+rax*4]
    btr ebp, ecx            ; BTS reg,reg is fast and does the mod 32 part for free.  There is no BTS byte reg so we unfortunately can't do narrower accesses to gain more parallelism
    mov [rbx+rax*4], ebp
%endif
%endif
%endif
; TODO: unroll by 2?  Run two i values in parallel to interleave store/reload bottlenecks and increase cache locality?
; TODO: loop within the same dword or qword for small i?
    add ecx, r11d           ; j += i   next odd multiple. (num_represented += 2*i; skipping even multiples)
    cmp ecx, r14d
    jb .crossout            ; }while(j < arrsize)

; update i to the next prime: search for a 1 bit
    mov rax, [rbx + rsi]    ; grab 64 bits, byte-aligned thus starting up to 7 bits before the one to look at
BITALIGN_NMASK_r12 equ ~7
    ; mov ecx, r11d
    ; and ecx, ~BITALIGN_MASK   ; bit-position of next_i within the low byte of this qword
    ; lea ecx, [r11 + 2]        ; first candidate prime is the next odd number
    add r11d, 2
    mov ecx, r11d
    shr ecx, 1
    and ecx, 7              ; bit-position of next_i within the low byte of this qword
    shrx rax, rax, rcx      ; shift out the bits below sieve[next_i]
    tzcnt rax, rax          ; bit-index of lowest set bit.  Not using SIMD pcmpeqb here; optimize for the in-first-qword case.
;;; rax = bitset[i + 0..63].trailing_ones()
    jc abort                ;nextprime_retry    ; CF=1 if input was zero.  First case at 31397, which is sqrt(985771609)
    ; i=31,397 has a prime gap of 72 (and a worst-case shift)  https://en.wikipedia.org/wiki/Prime_gap#Numerical_results
    lea ecx, [r11 + rax*2]  ; i += distance to next prime.  (From the first candidate; r11 already incremented to that)
    mov r11d, ecx           ;; set up for next iteration
    imul ecx, ecx
    shr ecx, 1              ; j = bitmap index representing i*i
    cmp ecx, r14d           ; j < bitmap_size
    jb .sieve_outer         ; if initial j is outside the array, we're done.
;; fall through when done

    dec r8d
    jnz sieve

.done_sieve:
    mov edi, 2              ; special case, implicit in the bitmap
    ; print edi
    mov ecx, 1              ; base value for bitmap[chunkstart] = 0*2 + 1
    ; shr r14d, 3           ; TODO, accurate bounds check for the end?
print_primes:
    mov rax, [rbx]
    test rax, rax
    jz .none_this_chunk     ; first occurrence at ECX=188032; the earlier gap of 72 is split across qword boundaries
.bits:                      ; loop over the non-crossed-off bits in this qword
    tzcnt rdx, rax          ; on Intel before Skylake, false dependency on destination register.
    lea edi, [rcx + rdx*2]  ; materialize the prime number in a register (set a breakpoint after this and display $edi in GDB)
.prime_ready:
    ; print edi
    blsr rax, rax
    jnz .bits               ; loop until we've found all the non-crossed-out entries in this chunk
.none_this_chunk:
    add rbx, 8
    add ecx, 128
    cmp ecx, r15d
    jb print_primes         ; TODO: sieve to the end of the last chunk, or early-out.

;;; for actual printing,
;;; https://stackoverflow.com/questions/13166064/how-do-i-print-an-integer-in-assembly-level-programming-without-printf-from-the/46301894#46301894

;;; exit
    xor edi, edi
    mov eax, 231
    syscall                 ; exit_group(0)

abort:
    ud2

; TODO: search loop in 64-bit chunks.  No shifts needed for later chunks.
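For comparison, here is a rough C sketch (an illustrative addition, not part of the original listing) of the odds-only layout the assembly uses: bit j of the bitmap represents the odd number 2*j + 1, a set bit means "not yet crossed off", crossing out for a prime i starts at i*i, and each inner step advances the bit index by i (2*i in value, skipping even multiples). Helper names such as test_bit and clear_bit are assumptions of this sketch; it uses malloc and a plain scan loop in place of the mmap/madvise setup and the SHRX/TZCNT next-prime search.

/* Hedged C sketch of the odds-only bit sieve above; names are illustrative, not from the asm. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int  test_bit(const uint8_t *bm, uint32_t j) { return (bm[j >> 3] >> (j & 7)) & 1; }
static void clear_bit(uint8_t *bm, uint32_t j)      { bm[j >> 3] &= (uint8_t)~(1u << (j & 7)); }

int main(int argc, char **argv)
{
    if (argc < 2) return 1;
    uint32_t n = (uint32_t)strtoul(argv[1], NULL, 10);  /* the asm open-codes this atoi */

    uint32_t nbits = n / 2;                  /* one bit per odd number below n, as in the asm */
    uint8_t *bitmap = malloc(nbits / 8 + 8);
    if (!bitmap) return 1;
    memset(bitmap, 0xFF, nbits / 8 + 8);     /* all odd candidates start as "maybe prime" */
    clear_bit(bitmap, 0);                    /* bit 0 represents 1, which is not prime */

    for (uint32_t i = 3; (uint64_t)i * i / 2 < nbits; ) {
        /* cross out odd multiples of i, starting at i*i (smaller multiples already handled) */
        for (uint64_t j = (uint64_t)i * i / 2; j < nbits; j += i)
            clear_bit(bitmap, (uint32_t)j);
        /* advance i to the next not-crossed-off odd number
           (the asm does this with a 64-bit load + SHRX + TZCNT) */
        uint32_t k = i / 2 + 1;
        while (!test_bit(bitmap, k)) k++;
        i = 2 * k + 1;
    }

    printf("2\n");                           /* 2 is a special case, implicit in the bitmap */
    for (uint32_t j = 1; j < nbits; j++)
        if (test_bit(bitmap, j))
            printf("%u\n", 2 * j + 1);
    free(bitmap);
    return 0;
}

Note that the assembly's print loop only materializes each prime in EDI (actual printing is left as a TODO with the Stack Overflow link above) and walks the bitmap in 64-bit chunks, so its behaviour at the tail near n can differ slightly from this sketch.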