; t=sieve-cordes; nasm -felf64 $t.asm && ld $t.o -o $t && taskset -c 3 perf stat --all-user -d ./$t 40000
;;
;; Performance counter stats for './sieve-cordes 40000':
;;
;;          141.17 msec task-clock             #    0.998 CPUs utilized
;;               0      context-switches       #    0.000 /sec
;;               0      cpu-migrations         #    0.000 /sec
;;               3      page-faults            #   21.252 /sec
;;     587,601,684      cycles                 #    4.162 GHz                         (61.75%)
;;   1,889,590,243      instructions           #    3.22  insn per cycle              (74.50%)
;;     269,885,148      branches               #    1.912 G/sec                       (74.50%)
;;         393,524      branch-misses          #    0.15% of all branches             (74.50%)
;;     268,836,453      L1-dcache-loads        #    1.904 G/sec                       (74.50%)
;;           5,167      L1-dcache-load-misses  #    0.00% of all L1-dcache accesses   (76.50%)
;;             166      LLC-loads              #    1.176 K/sec                       (51.00%)
;;               0      LLC-load-misses        #    0.00% of all LL-cache accesses    (49.01%)
;;
;;     0.141430062 seconds time elapsed
;;     0.141221000 seconds user
;;     0.000000000 seconds sys
;;
;; timing from my i7-6700k CPU.
;; With an earlier version, still using 32-bit chunks for load/BTS/store, for a repeat count of 10k sieves

;SYSREAD equ 0             ; command line arg instead

default rel                ; we don't need any static storage anyway
section .text
global _start
_start:
;    xor   eax, eax        ; __NR_read
;    push  rax             ; non-digit terminator after the read buffer
;    mov   edx, 64-8       ; multiple-of-16 buffer size, including the push
;    sub   rsp, rdx
;    xor   edi, edi
;    mov   rsi, rsp
;    syscall               ; read(0, buf, size)

    mov   rsi, [rsp+16]    ; argv[1] is above argv[0] and argc
    test  rsi, rsi         ; NULL pointer check, in case too few args were passed.
    jz    .exit_error

; from my answer on https://stackoverflow.com/a/49548057/224132
    movzx eax, byte [rsi]  ; start with the first digit
    sub   eax, '0'         ; convert from ASCII to number
    cmp   al, 9            ; check that it's a decimal digit [0..9]
    jbe   .loop_entry      ; too low -> wraps to a high value, fails the unsigned compare check

.exit_error:               ; else: bad first digit
    mov   edi, 1
    mov   eax, 231
    syscall                ; exit_group(1)

; rotate the loop so we can put the JCC at the bottom where it belongs,
; but still check the digit before messing up our total
.next_digit:                   ; do {
    lea   eax, [rax*4 + rax]   ;   total *= 5
    lea   eax, [rax*2 + rcx]   ;   total = (total*5)*2 + digit
    ; imul eax, 10 / add eax, ecx
.loop_entry:
    inc   rsi
    movzx ecx, byte [rsi]
    sub   ecx, '0'
    cmp   ecx, 9
    jbe   .next_digit      ; } while( digit <= 9 )

;; allocate stack space for a buffer.  64 bytes were already reserved
;    and   eax, -64
;    sub   rsp, rax        ; Including the 64 bytes allocated earlier, round up the input size and alloc that many bytes.
;; Nope: that would be vulnerable to stack-clash attacks with huge user-input sizes,
;; and 8MiB Linux stacks aren't big enough for large problem sizes.

allocate:
    mov   r15d, eax
    shr   eax, 3           ; bytes = bits/8.  (Not rounded up, possible corner case?)
    lea   esi, [rax+4095]
    and   esi, -4096       ; size in bytes, rounded up to a page size
    xor   r9d, r9d         ; offset
    mov   r8d, -1          ; fd
    mov   r10d, 0x22       ; MAP_PRIVATE|MAP_ANONYMOUS
    mov   edx, 3           ; PROT_READ|PROT_WRITE
    xor   edi, edi         ; hint address
    mov   eax, 9           ; __NR_mmap
    syscall                ; mmap(0, size_rounded_up, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    mov   rbx, rax         ; RBX = base of bit-array
                           ; R15 = max count to sieve up to.
    mov   rdi, rax
    ; mov esi,             ; size already set
    mov   edx, 14          ; MADV_HUGEPAGE
;   mov   edx, 3           ; MADV_WILLNEED
    mov   eax, 28          ; __NR_madvise
    syscall

;;; benchmark rep count
    mov   r14d, 10000

algorithm:
    mov   eax, 0b01010101  ; 0 = potential prime, 1 = crossed off.
        ; TODO: invert so TZCNT can work directly; use BTR to reset bits to zero.
        ; But then compressing to not store even numbers would still require init.
    mov   rdi, rbx         ; including the first byte, so rep stosb gets an aligned address even though we'll store it separately
    mov   ecx, r15d
    shr   ecx, 3
    rep stosb              ; memset(rdi, al, R15>>3)

; special-case crossing out 2 and all its multiples; ignore it from now on.  TODO: compressed sieve
    mov   byte [rbx], 0b01010011   ; don't count 0 and 1 as prime, but 2 itself is prime; uncross it

    mov   r12d, ~7         ; mask for % 8 with andnot
    mov   r11d, 3          ; i = current prime to cross out
    mov   ecx, 9           ; j = i*i = first position to cross out.  Smaller primes have already crossed out its lower multiples
    mov   r13d, 4
    ; lea ecx, [r11-3 + 9]

.sieve_outer:
    lea   edx, [r11 + r11]
    add   r11d, 2          ; setup for next iter: first candidate prime is the next odd number
    mov   esi, r11d        ; doing this early means less to do after a branch miss on loop exit
    shr   esi, 3           ; byte offset that contains the next bit to look at

.crossout:                 ; do {
%ifdef BTS_MEM
    bts   [rbx], ecx       ; bitarr[j] = 1;  quite slow, but compact
%else
  ;;; 4 uops instead of 10 to set a bit in memory on Skylake
    ; mov  eax, ecx
    ; shr  eax, 4          ; words are the narrowest BTS operand-size, so the narrowest we can go and still get count masking for free
    shrx  eax, ecx, r13d   ; eax = j >> 4 = index of the aligned word containing the bit.  Byte offsets would be worse: store-forwarding stalls as well as misalignment
    movzx ebp, word [rbx+rax*2]
    bts   bp, cx           ; BTS reg,reg is fast and does the mod 16 part for free.  There is no BTS byte-reg, so we unfortunately can't do narrower accesses to gain more parallelism
    mov   [rbx+rax*2], bp
%endif
    add   ecx, edx         ; j += 2*i: next odd multiple.  (; lea edx, [rdx + r11*2])  Even multiples were already crossed off by 2
    cmp   ecx, r15d
    jb    .crossout        ; } while(j < arrsize)

; update i to the next prime: search for a 0 bit
    mov   rax, [rbx + rsi] ; grab 64 bits, byte-aligned, thus starting up to 7 bits before the one to look at
    ; mov  ecx, r11d
    ; and  ecx, 7          ; bit-position of next_i within the low byte of this qword
    andn  ecx, r12d, r11d
    not   rax              ; tzcnt searches for a 1, but we want to search for a 0
    shrx  rax, rax, rcx    ; shift out the bits below sieve[next_i].  Shift in zeros *after* flipping, to avoid false positives
    tzcnt rax, rax         ; bit-index of the lowest set bit.  Not using SIMD here; optimize for the in-first-qword case.
                           ;;; rax = bitset[i + 0..63].trailing_ones()
    jc    abort ;nextprime_retry    ; CF=1 if the input was zero.  First case at i=31397, which is sqrt(985771609).
        ; i=31,397 has a prime gap of 72 (and a worst-case shift)  https://en.wikipedia.org/wiki/Prime_gap#Numerical_results
    add   r11d, eax        ; i += distance to next prime.  (From the first candidate; r11 was already incremented to that)

  ;; set up for the next iteration
    mov   ecx, r11d
    imul  ecx, ecx         ; j = i*i
    cmp   ecx, r15d
    jb    .sieve_outer     ; if the initial j is outside the array, we're done.
;; fall through when done

    dec   r14d
    jnz   algorithm
.done_sieve:

    xor   ecx, ecx         ; i/8 value for the low bit of the chunk at [rbx]
    shr   r15d, 3          ; FIXME: accurate bounds check for the end?
print_primes:
    mov   rax, [rbx + rcx]
    cmp   rax, -1          ; xor rax,-1 / jz would have smaller code size, but 1 cycle extra latency to detect a branch miss (cmp/jz fuses)
    jz    .skip            ; first occurrence at ECX=188032; the earlier gap of 72 is split across qword boundaries
    not   rax              ; doesn't set FLAGS
.bits:                     ; loop over the non-crossed-off bits in this qword
    tzcnt rdx, rax         ; on Intel before Skylake: false dependency on the destination register.
    lea   edi, [rcx*8+rdx] ; materialize the prime number in a register (set a breakpoint after this and display $edi in GDB)
        ; [rdx + rcx*8] scaled index could avoid a separate counter?
    blsr  rax, rax
    jnz   .bits            ; loop until we've found all the non-crossed-out entries in this chunk
.skip:
    ; add rbx, 8
    add   ecx, 8           ; 8 bytes = 64 bits
    cmp   ecx, r15d
    jb    print_primes
; TODO: sieve to the end of the last chunk, or early-out.

;;; exit
    xor   edi, edi
    mov   eax, 231
    syscall                ; exit_group(0)

abort:
    ud2

; TODO: loop in 64-bit chunks.  No shifts needed for later chunks.
; for actual printing, see
; https://stackoverflow.com/questions/13166064/how-do-i-print-an-integer-in-assembly-level-programming-without-printf-from-the/46301894#46301894
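; For readers who want the algorithm without the micro-optimizations, below is a
; minimal C sketch of what the sieve loop above computes -- my paraphrase, not
; part of the original source.  `limit` stands in for R15 (the array size in
; bits); it mirrors the 0b01010101 init trick, the .sieve_outer / .crossout
; structure, and the linear scan for the next un-crossed bit that the
; NOT/SHRX/TZCNT sequence implements.  The allocation rounds up to whole bytes,
; which the asm deliberately doesn't (its "possible corner case?" comment).

/* Hedged C sketch of the sieve above.  Bit i of s[] is 1 once i has been
 * crossed off as composite. */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static inline int  test_bit(const uint8_t *s, uint32_t i) { return (s[i >> 3] >> (i & 7)) & 1; }
static inline void set_bit (uint8_t *s, uint32_t i)       { s[i >> 3] |= (uint8_t)(1 << (i & 7)); }

uint8_t *sieve(uint32_t limit)                /* caller frees the returned array */
{
    size_t bytes = (limit + 7) / 8;           /* rounded up, unlike the asm's bits/8 */
    uint8_t *s = malloc(bytes);
    memset(s, 0x55, bytes);                   /* 0b01010101: pre-cross every even number */
    s[0] = 0x53;                              /* 0b01010011: cross 0 and 1, but un-cross 2 */

    for (uint32_t i = 3; (uint64_t)i * i < limit; ) {
        for (uint32_t j = i * i; j < limit; j += 2 * i)   /* the .crossout loop     */
            set_bit(s, j);                    /* even multiples are already crossed  */
        for (i += 2; test_bit(s, i); i++)     /* scan for the next un-crossed (odd)  */
            ;                                 /* bit, as NOT/SHRX/TZCNT do above      */
    }
    return s;                                 /* like the asm, no careful handling of the last chunk */
}

; The asm's wins over this scalar C come from the word-sized load/BTS/store
; read-modify-write in .crossout (4 uops vs. 10 for bts [mem],reg on Skylake,
; per the comments above) and from examining 64 bits at a time when searching
; for the next prime, instead of a bit-at-a-time scan.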