assembly source #1
Source code
; t=sieve-cordes; nasm -felf64 $t.asm && ld $t.o -o $t && taskset -c 3 perf stat --all-user -d ./$t 40000

;SYSREAD equ 0              ; command line arg instead

default rel                 ; we don't need any static storage anyway
section .text
global _start
_start:
    mov rsi, [rsp+16]       ; argv[1] is above argv[0] and argc
    test rsi, rsi           ; NULL pointer check, in case too few args were passed.
    jz .exit_error

;;; n = atoi(argv[1])
; from my answer on https://stackoverflow.com/a/49548057/224132
    movzx eax, byte [rsi]   ; start with the first digit
    sub eax, '0'            ; convert from ASCII to number
    cmp al, 9               ; check that it's a decimal digit [0..9]
    jbe .loop_entry         ; too low -> wraps to high value, fails unsigned compare check

.exit_error:
    mov edi, 1              ; else: bad first digit
    mov eax, 231
    syscall                 ; exit_group(1)

; rotate the loop so we can put the JCC at the bottom where it belongs
; but still check the digit before messing up our total
.next_digit:                ; do {
    lea eax, [rax*4 + rax]  ; total *= 5
    lea eax, [rax*2 + rcx]  ; total = (total*5)*2 + digit
.loop_entry:
    inc rsi
    movzx ecx, byte [rsi]
    sub ecx, '0'
    cmp ecx, 9
    jbe .next_digit         ; } while( digit <= 9 )

allocate:
    mov r15d, eax
    shr eax, 3              ; bytes = bits/8. (Not rounded up, possible corner case?)
    lea esi, [rax+4095]
    and esi, -4096          ; size in bytes, rounded up to a page size
    xor r9d, r9d            ; offset
    mov r8d, -1             ; fd
    mov r10d, 0x22          ; MAP_PRIVATE|MAP_ANONYMOUS
    mov edx, 3              ; PROT_READ|PROT_WRITE
    xor edi, edi            ; hint address
    mov eax, 9              ; __NR_mmap
    syscall                 ; mmap(0, size_rounded_up, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    mov rbx, rax            ; RBX = base of bit-array
                            ; R15 = max count to sieve up to.

    mov rdi, rax
    ; mov esi,              ; size already set
    mov edx, 14             ; MADV_HUGEPAGE
    ; mov edx, 3            ; MADV_WILLNEED
    mov eax, 28             ; __NR_madvise
    syscall

    mov r8d, 10000          ; benchmark rep count
sieve:
    mov r14d, r15d
    shr r14d, 1             ; nbits = n/2 = number of bits in the bitmap
    mov eax, -1             ; 1 = potential prime, 0 = crossed off.  Inverted so TZCNT can work directly; use BTR to reset bits to zero.
                            ; But even without benchmark repeat, have to memset instead of using zeroed memory
    mov rdi, rbx            ; including the first byte so rep stosb gets an aligned address, even though we'll store it separately
    mov ecx, r15d
    shr ecx, 3 + 1          ; bytes per bit, and only store the odd numbers
    rep stosb               ; memset(rdi, al, (R15/2) >>3)

; primes in the first byte: 15:composite, 13:prime, 11:prime, 9:composite, 7:prime, 5:prime, 3:prime, 1:composite
    mov byte [rbx], 0b01101110

;;; bitmap[j] represents i = j*2 + 1
;;; i is represented by bitmap[j = i/2] rounding down, i.e. just a right shift works, because the candidate primes are always odd.
;;; TODO: manual memset with 3x XMM or YMM registers, crossing out 3
    mov r11d, 3             ; i = current prime to cross out
    mov ecx, 9/2            ; j = bit position of i*i, first position to cross it out.  Smaller primes have crossed out its lower multiples
    ; lea ecx, [r11-3 + 9]  ; golf for alignment
    mov r12d, BITALIGN_NMASK_r12    ; constants for ANDN and SHRX to save mov reg,reg instructions in the loop
    mov r10d, BIT_TO_CHUNK_r10
;align 8
.sieve_outer:
    lea esi, [r11+2]        ; doing this early means less to do after a branch miss on loop exit
    shr esi, 3+1            ; byte offset that contains the next bit to look at
    ; load+TZCNT early, too?  Only the first bit past here has to be accurate, and the initial byte store avoids startup problems

.crossout:                  ; do{
%ifdef BTS_MEM
    btr [rbx], ecx          ; bitarr[j] = 0;  quite slow, but compact
BIT_TO_CHUNK_r10 equ 0      ;; dummy
%else
;;; 4 uops instead of 10 to clear a bit in memory on Skylake
    ; mov eax, ecx
    ; shr eax, BIT_TO_DWORD
%if 1
BIT_TO_CHUNK_r10 equ 3
    shrx eax, ecx, r10d     ; index of aligned byte containing that bit (ecx is already a bit-position, not the number represented).
    movzx edx, byte [rbx+rax]
    andn ebp, r12d, ecx
    ; mov ebp, ecx
    ; and ebp, 7
    btr edx, ebp            ; BTS reg,reg is fast and does the mod 32 part for free.  There is no BTS byte reg so we unfortunately can't do narrower accesses to gain more parallelism
    mov [rbx+rax], dl
%else
%if 1
BIT_TO_CHUNK_r10 equ 4
    shrx eax, ecx, r10d     ; index of aligned dword containing that bit (ecx is already a bit-position, not the number represented).  Byte offsets for misaligned dwords would be worse, store-forwarding stalls as well as misalignment
    movzx ebp, word [rbx+rax*2]
    btr bp, cx              ; BTS reg,reg is fast and does the mod 32 part for free.  There is no BTS byte reg so we unfortunately can't do narrower accesses to gain more parallelism
    mov [rbx+rax*2], bp
%else
BIT_TO_CHUNK_r10 equ 5
    shrx eax, ecx, r10d     ; index of aligned dword containing that bit (ecx is already a bit-position, not the number represented).  Byte offsets for misaligned dwords would be worse, store-forwarding stalls as well as misalignment
    mov ebp, [rbx+rax*4]
    btr ebp, ecx            ; BTS reg,reg is fast and does the mod 32 part for free.  There is no BTS byte reg so we unfortunately can't do narrower accesses to gain more parallelism
    mov [rbx+rax*4], ebp
%endif
%endif
%endif
; TODO: unroll by 2?  Run two i values in parallel to interleave store/reload bottlenecks and increase cache locality?
; TODO: loop within the same dword or qword for small i?
    add ecx, r11d           ; j += i   next odd multiple. (num_represented += 2*i; skipping even multiples)
    cmp ecx, r14d
    jb .crossout            ; }while(j < arrsize)

; update i to the next prime: search for a 1 bit
    mov rax, [rbx + rsi]    ; grab 64 bits, byte-aligned thus starting up to 7 bits before the one to look at
BITALIGN_NMASK_r12 equ ~7
    ; mov ecx, r11d
    ; and ecx, ~BITALIGN_MASK   ; bit-position of next_i within the low byte of this qword
    ; lea ecx, [r11 + 2]        ; first candidate prime is the next odd number
    add r11d, 2
    mov ecx, r11d
    shr ecx, 1
    and ecx, 7              ; bit-position of next_i within the low byte of this qword
    shrx rax, rax, rcx      ; shift out the bits below sieve[next_i]
    tzcnt rax, rax          ; bit-index of lowest set bit.  Not using SIMD pcmpeqb here; optimize for the in-first-qword case.
;;; rax = bitset[i + 0..63].trailing_ones()
    jc abort                ;nextprime_retry    ; CF=1 if input was zero.  First case at 31397, which is sqrt(985771609)
    ; i=31,397 has a prime gap of 72 (and a worst-case shift)  https://en.wikipedia.org/wiki/Prime_gap#Numerical_results
    lea ecx, [r11 + rax*2]  ; i += distance to next prime.  (From the first candidate; r11 already incremented to that)
    mov r11d, ecx           ;; set up for next iteration
    imul ecx, ecx
    shr ecx, 1              ; j = bitmap index representing i*i
    cmp ecx, r14d           ; j < bitmap_size
    jb .sieve_outer         ; if initial j is outside the array, we're done.
;; fall through when done

    dec r8d
    jnz sieve

.done_sieve:
    mov edi, 2              ; special case, implicit in the bitmap
    ; print edi
    mov ecx, 1              ; base value for bitmap[chunkstart] = 0*2 + 1
    ; shr r14d, 3           ; TODO, accurate bounds check for the end?
print_primes:
    mov rax, [rbx]
    test rax, rax
    jz .none_this_chunk     ; first occurrence at ECX=188032; the earlier gap of 72 is split across qword boundaries
.bits:                      ; loop over the non-crossed-off bits in this qword
    tzcnt rdx, rax          ; on Intel before Skylake, false dependency on destination register.
    lea edi, [rcx + rdx*2]  ; materialize the prime number in a register (set a breakpoint after this and display $edi in GDB)
.prime_ready:
    ; print edi
    blsr rax, rax
    jnz .bits               ; loop until we've found all the non-crossed-out entries in this chunk
.none_this_chunk:
    add rbx, 8
    add ecx, 128
    cmp ecx, r15d
    jb print_primes         ; TODO: sieve to the end of the last chunk, or early-out.

;;; for actual printing,
;;; https://stackoverflow.com/questions/13166064/how-do-i-print-an-integer-in-assembly-level-programming-without-printf-from-the/46301894#46301894

;;; exit
    xor edi, edi
    mov eax, 231
    syscall                 ; exit_group(0)

abort:
    ud2

; TODO: search loop in 64-bit chunks.  No shifts needed for later chunks.
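For comparison, here is a rough C sketch (an illustrative addition, not part of the original listing) of the odds-only layout the assembly uses: bit j of the bitmap represents the odd number 2*j + 1, a set bit means "not yet crossed off", crossing out for a prime i starts at i*i, and each inner step advances the bit index by i (2*i in value, skipping even multiples). Helper names such as test_bit and clear_bit are assumptions of this sketch; it uses malloc and a plain scan loop in place of the mmap/madvise setup and the SHRX/TZCNT next-prime search.

/* Hedged C sketch of the odds-only bit sieve above; names are illustrative, not from the asm. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int  test_bit(const uint8_t *bm, uint32_t j) { return (bm[j >> 3] >> (j & 7)) & 1; }
static void clear_bit(uint8_t *bm, uint32_t j)      { bm[j >> 3] &= (uint8_t)~(1u << (j & 7)); }

int main(int argc, char **argv)
{
    if (argc < 2) return 1;
    uint32_t n = (uint32_t)strtoul(argv[1], NULL, 10);  /* the asm open-codes this atoi */

    uint32_t nbits = n / 2;                  /* one bit per odd number below n, as in the asm */
    uint8_t *bitmap = malloc(nbits / 8 + 8);
    if (!bitmap) return 1;
    memset(bitmap, 0xFF, nbits / 8 + 8);     /* all odd candidates start as "maybe prime" */
    clear_bit(bitmap, 0);                    /* bit 0 represents 1, which is not prime */

    for (uint32_t i = 3; (uint64_t)i * i / 2 < nbits; ) {
        /* cross out odd multiples of i, starting at i*i (smaller multiples already handled) */
        for (uint64_t j = (uint64_t)i * i / 2; j < nbits; j += i)
            clear_bit(bitmap, (uint32_t)j);
        /* advance i to the next not-crossed-off odd number
           (the asm does this with a 64-bit load + SHRX + TZCNT) */
        uint32_t k = i / 2 + 1;
        while (!test_bit(bitmap, k)) k++;
        i = 2 * k + 1;
    }

    printf("2\n");                           /* 2 is a special case, implicit in the bitmap */
    for (uint32_t j = 1; j < nbits; j++)
        if (test_bit(bitmap, j))
            printf("%u\n", 2 * j + 1);
    free(bitmap);
    return 0;
}

Note that the assembly's print loop only materializes each prime in EDI (actual printing is left as a TODO with the Stack Overflow link above) and walks the bitmap in 64-bit chunks, so its behaviour at the tail near n can differ slightly from this sketch.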