; t=sieve-cordes; nasm -felf64 $t.asm && ld $t.o -o $t && taskset -c 3 perf stat --all-user -d ./$t 40000
;;
;; Performance counter stats for './sieve-cordes 40000':
;;
;;          141.17 msec task-clock             #    0.998 CPUs utilized
;;               0      context-switches       #    0.000 /sec
;;               0      cpu-migrations         #    0.000 /sec
;;               3      page-faults            #   21.252 /sec
;;     587,601,684      cycles                 #    4.162 GHz                         (61.75%)
;;   1,889,590,243      instructions           #    3.22  insn per cycle              (74.50%)
;;     269,885,148      branches               #    1.912 G/sec                       (74.50%)
;;         393,524      branch-misses          #    0.15% of all branches             (74.50%)
;;     268,836,453      L1-dcache-loads        #    1.904 G/sec                       (74.50%)
;;           5,167      L1-dcache-load-misses  #    0.00% of all L1-dcache accesses   (76.50%)
;;             166      LLC-loads              #    1.176 K/sec                       (51.00%)
;;               0      LLC-load-misses        #    0.00% of all LL-cache accesses    (49.01%)
;;
;;     0.141430062 seconds time elapsed
;;     0.141221000 seconds user
;;     0.000000000 seconds sys
;;
;; timing from my i7-6700k CPU.
;; With an earlier version, still using 32-bit chunks for load/BTS/store, for a repeat count of 10k sieves

;SYSREAD equ 0             ; command line arg instead

default rel                ; we don't need any static storage anyway
section .text
global _start
_start:
;    xor   eax, eax        ; __NR_read
;    push  rax             ; non-digit terminator after the read buffer
;    mov   edx, 64-8       ; multiple-of-16 buffer size, including the push
;    sub   rsp, rdx
;    xor   edi, edi
;    mov   rsi, rsp
;    syscall               ; read(0, buf, size)

    mov   rsi, [rsp+16]    ; argv[1] is above argv[0] and argc
    test  rsi, rsi         ; NULL pointer check, in case too few args were passed.
    jz    .exit_error

; from my answer on https://stackoverflow.com/a/49548057/224132
    movzx eax, byte [rsi]  ; start with the first digit
    sub   eax, '0'         ; convert from ASCII to number
    cmp   al, 9            ; check that it's a decimal digit [0..9]
    jbe   .loop_entry      ; too low -> wraps to a high value, fails the unsigned compare check

.exit_error:               ; else: bad first digit
    mov   edi, 1
    mov   eax, 231
    syscall                ; exit_group(1)

; rotate the loop so we can put the JCC at the bottom where it belongs,
; but still check the digit before messing up our total
.next_digit:                   ; do {
    lea   eax, [rax*4 + rax]   ;   total *= 5
    lea   eax, [rax*2 + rcx]   ;   total = (total*5)*2 + digit
    ; imul eax, 10 / add eax, ecx
.loop_entry:
    inc   rsi
    movzx ecx, byte [rsi]
    sub   ecx, '0'
    cmp   ecx, 9
    jbe   .next_digit      ; } while( digit <= 9 )

;; allocate stack space for a buffer.  64 bytes were already reserved
;    and   eax, -64
;    sub   rsp, rax        ; Including the 64 bytes allocated earlier, round up the input size and alloc that many bytes.
;; Nope: that would be vulnerable to stack-clash attacks with huge user-input sizes,
;; and 8MiB Linux stacks aren't big enough for large problem sizes.

allocate:
    mov   r15d, eax
    shr   eax, 3           ; bytes = bits/8.  (Not rounded up, possible corner case?)
    lea   esi, [rax+4095]
    and   esi, -4096       ; size in bytes, rounded up to a page size
    xor   r9d, r9d         ; offset
    mov   r8d, -1          ; fd
    mov   r10d, 0x22       ; MAP_PRIVATE|MAP_ANONYMOUS
    mov   edx, 3           ; PROT_READ|PROT_WRITE
    xor   edi, edi         ; hint address
    mov   eax, 9           ; __NR_mmap
    syscall                ; mmap(0, size_rounded_up, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    mov   rbx, rax         ; RBX = base of bit-array
                           ; R15 = max count to sieve up to.
    mov   rdi, rax
    ; mov esi,             ; size already set
    mov   edx, 14          ; MADV_HUGEPAGE
;   mov   edx, 3           ; MADV_WILLNEED
    mov   eax, 28          ; __NR_madvise
    syscall

;;; benchmark rep count
    mov   r14d, 10000

algorithm:
    mov   eax, 0b01010101  ; 0 = potential prime, 1 = crossed off.
        ; TODO: invert so TZCNT can work directly; use BTR to reset bits to zero.
        ; But then compressing to not store even numbers would still require init.
    mov   rdi, rbx         ; including the first byte, so rep stosb gets an aligned address even though we'll store it separately
    mov   ecx, r15d
    shr   ecx, 3
    rep stosb              ; memset(rdi, al, R15>>3)

; special-case crossing out 2 and all its multiples; ignore it from now on.  TODO: compressed sieve
    mov   byte [rbx], 0b01010011   ; don't count 0 and 1 as prime, but 2 itself is prime; uncross it

    mov   r12d, ~7         ; mask for % 8 with andnot
    mov   r11d, 3          ; i = current prime to cross out
    mov   ecx, 9           ; j = i*i = first position to cross out.  Smaller primes have already crossed out its lower multiples
    mov   r13d, 4
    ; lea ecx, [r11-3 + 9]

.sieve_outer:
    lea   edx, [r11 + r11]
    add   r11d, 2          ; setup for next iter: first candidate prime is the next odd number
    mov   esi, r11d        ; doing this early means less to do after a branch miss on loop exit
    shr   esi, 3           ; byte offset that contains the next bit to look at

.crossout:                 ; do {
%ifdef BTS_MEM
    bts   [rbx], ecx       ; bitarr[j] = 1;  quite slow, but compact
%else
  ;;; 4 uops instead of 10 to set a bit in memory on Skylake
    ; mov  eax, ecx
    ; shr  eax, 4          ; words are the narrowest BTS operand-size, so the narrowest we can go and still get count masking for free
    shrx  eax, ecx, r13d   ; eax = j >> 4 = index of the aligned word containing the bit.  Byte offsets would be worse: store-forwarding stalls as well as misalignment
    movzx ebp, word [rbx+rax*2]
    bts   bp, cx           ; BTS reg,reg is fast and does the mod 16 part for free.  There is no BTS byte-reg, so we unfortunately can't do narrower accesses to gain more parallelism
    mov   [rbx+rax*2], bp
%endif
    add   ecx, edx         ; j += 2*i: next odd multiple.  (; lea edx, [rdx + r11*2])  Even multiples were already crossed off by 2
    cmp   ecx, r15d
    jb    .crossout        ; } while(j < arrsize)

; update i to the next prime: search for a 0 bit
    mov   rax, [rbx + rsi] ; grab 64 bits, byte-aligned, thus starting up to 7 bits before the one to look at
    ; mov  ecx, r11d
    ; and  ecx, 7          ; bit-position of next_i within the low byte of this qword
    andn  ecx, r12d, r11d
    not   rax              ; tzcnt searches for a 1, but we want to search for a 0
    shrx  rax, rax, rcx    ; shift out the bits below sieve[next_i].  Shift in zeros *after* flipping, to avoid false positives
    tzcnt rax, rax         ; bit-index of the lowest set bit.  Not using SIMD here; optimize for the in-first-qword case.
                           ;;; rax = bitset[i + 0..63].trailing_ones()
    jc    abort ;nextprime_retry    ; CF=1 if the input was zero.  First case at i=31397, which is sqrt(985771609).
        ; i=31,397 has a prime gap of 72 (and a worst-case shift)  https://en.wikipedia.org/wiki/Prime_gap#Numerical_results
    add   r11d, eax        ; i += distance to next prime.  (From the first candidate; r11 was already incremented to that)

  ;; set up for the next iteration
    mov   ecx, r11d
    imul  ecx, ecx         ; j = i*i
    cmp   ecx, r15d
    jb    .sieve_outer     ; if the initial j is outside the array, we're done.
;; fall through when done

    dec   r14d
    jnz   algorithm
.done_sieve:

    xor   ecx, ecx         ; i/8 value for the low bit of the chunk at [rbx]
    shr   r15d, 3          ; FIXME: accurate bounds check for the end?
print_primes:
    mov   rax, [rbx + rcx]
    cmp   rax, -1          ; xor rax,-1 / jz would have smaller code size, but 1 cycle extra latency to detect a branch miss (cmp/jz fuses)
    jz    .skip            ; first occurrence at ECX=188032; the earlier gap of 72 is split across qword boundaries
    not   rax              ; doesn't set FLAGS
.bits:                     ; loop over the non-crossed-off bits in this qword
    tzcnt rdx, rax         ; on Intel before Skylake: false dependency on the destination register.
    lea   edi, [rcx*8+rdx] ; materialize the prime number in a register (set a breakpoint after this and display $edi in GDB)
        ; [rdx + rcx*8] scaled index could avoid a separate counter?
    blsr  rax, rax
    jnz   .bits            ; loop until we've found all the non-crossed-out entries in this chunk
.skip:
    ; add rbx, 8
    add   ecx, 8           ; 8 bytes = 64 bits
    cmp   ecx, r15d
    jb    print_primes
; TODO: sieve to the end of the last chunk, or early-out.

;;; exit
    xor   edi, edi
    mov   eax, 231
    syscall                ; exit_group(0)

abort:
    ud2

; TODO: loop in 64-bit chunks.  No shifts needed for later chunks.
; for actual printing, see
; https://stackoverflow.com/questions/13166064/how-do-i-print-an-integer-in-assembly-level-programming-without-printf-from-the/46301894#46301894
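; For readers who want the algorithm without the micro-optimizations, below is a
; minimal C sketch of what the sieve loop above computes -- my paraphrase, not
; part of the original source.  `limit` stands in for R15 (the array size in
; bits); it mirrors the 0b01010101 init trick, the .sieve_outer / .crossout
; structure, and the linear scan for the next un-crossed bit that the
; NOT/SHRX/TZCNT sequence implements.  The allocation rounds up to whole bytes,
; which the asm deliberately doesn't (its "possible corner case?" comment).

/* Hedged C sketch of the sieve above.  Bit i of s[] is 1 once i has been
 * crossed off as composite. */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static inline int  test_bit(const uint8_t *s, uint32_t i) { return (s[i >> 3] >> (i & 7)) & 1; }
static inline void set_bit (uint8_t *s, uint32_t i)       { s[i >> 3] |= (uint8_t)(1 << (i & 7)); }

uint8_t *sieve(uint32_t limit)                /* caller frees the returned array */
{
    size_t bytes = (limit + 7) / 8;           /* rounded up, unlike the asm's bits/8 */
    uint8_t *s = malloc(bytes);
    memset(s, 0x55, bytes);                   /* 0b01010101: pre-cross every even number */
    s[0] = 0x53;                              /* 0b01010011: cross 0 and 1, but un-cross 2 */

    for (uint32_t i = 3; (uint64_t)i * i < limit; ) {
        for (uint32_t j = i * i; j < limit; j += 2 * i)   /* the .crossout loop     */
            set_bit(s, j);                    /* even multiples are already crossed  */
        for (i += 2; test_bit(s, i); i++)     /* scan for the next un-crossed (odd)  */
            ;                                 /* bit, as NOT/SHRX/TZCNT do above      */
    }
    return s;                                 /* like the asm, no careful handling of the last chunk */
}

; The asm's wins over this scalar C come from the word-sized load/BTS/store
; read-modify-write in .crossout (4 uops vs. 10 for bts [mem],reg on Skylake,
; per the comments above) and from examining 64 bits at a time when searching
; for the next prime, instead of a bit-at-a-time scan.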