; Compiler Explorer

; Source code

%if 0
$ taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,branches,branch-misses,instructions,idq.dsb_uops,idq.mite_uops -r 2 ./a.out 
ans: 12509316
ans: 12509316

Performance counter stats for './a.out' (2 runs):

593.33 msec task-clock                #    0.999 CPUs utilized            ( +-  0.10% )
                 0      context-switches          #    0.000 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
                58      page-faults               #    0.098 K/sec                    ( +-  3.45% )
     2,311,512,376      cycles                    #    3.896 GHz                      ( +-  0.10% )
     2,000,637,317      branches                  # 3371.896 M/sec                    ( +-  0.00% )
         1,690,747      branch-misses             #    0.08% of all branches          ( +-  0.60% )
    11,015,217,579      instructions              #    4.77  insn per cycle           ( +-  0.00% )
     9,204,430,548      idq.dsb_uops              # 15513.249 M/sec                   ( +-  0.00% )
         1,008,922      idq.mite_uops             #    1.700 M/sec                    ( +- 20.51% )

0.593699 +- 0.000523 seconds time elapsed  ( +-  0.09% )

%endif

; NASM smartalign package: `align` pads with multi-byte NOPs instead of
; single-byte ones.  "p6" selects the long-NOP encodings; 64 overrides the
; padding size above which NASM would emit a jump over the padding.
%use SMARTALIGN
alignmode p6, 64

section .bss
; Byte lookup table with one entry per 16-bit value (65536 entries).
; main fills entry i with the number of set bits in i; entry 0 is never
; written by main's init loop (which starts at index 1).
 wordbits: resb 65536
; n and k were once run-time inputs (see the commented-out GET_DEC lines in
; main); they are now constants in .rodata below.
;    n resd 1
;    k resd 1
; One reserved dword.  main keeps its running count in ebx and does not
; reference this label.
    ans resd 1
section .rodata
; Upper bound of the search: main's counting loop runs ecx from n down to 1.
  n: dd 1000000000
; Loaded into esi by main and used in its per-value comparison
; (bsr(x) == popcount(x) + k - 1).
  k: dd 8
; printf format string for the final count (NASM backquoted string, so \n is
; a real newline); NUL-terminated.
  print_fmt: db `ans: %d\n`, 0

section .text

global main
extern printf

;-----------------------------------------------------------------------
; int main(void)
;
; Counts the integers x in [1, n] for which
;       bit_length(x) - popcount(x) == k
; i.e. whose binary representation (without leading zeros) contains
; exactly k zero bits, prints the count with printf(print_fmt, count),
; and returns 0.
;
; ABI:   32-bit cdecl (arguments on the stack, caller pops)
; Regs:  edi = base address of the wordbits table
;        esi = k
;        ecx = table index during init, then x counting down from n
;        ebx = scratch popcount during init, then the running count
;        eax, edx = scratch
; Saves: edi, esi, ebx (call-preserved)
;-----------------------------------------------------------------------
main:            ; no popcnt version
    push  edi    ; save the call-preserved registers used below
    push  esi
    push  ebx

    ; Keep the table base in a register: [edi + idx] encodes shorter than
    ; [wordbits + idx] with its 32-bit absolute displacement.
    mov   edi, wordbits

    ; Fill wordbits[i] = popcount(i) for i in 1..65535.
    ; wordbits[0] is left alone: .bss is zero-initialised, and popcount(0) = 0.
    mov   ecx, 1
.init_loop:
    mov   eax, ecx
    xor   ebx, ebx            ; ebx = number of set bits cleared so far
  .popc_loop:                 ; do-while is safe: eax is non-zero on entry
    lea   edx, [eax - 1]
    inc   ebx
    and   eax, edx            ; v &= v - 1;  clears the lowest set bit (blsr)
    jnz   .popc_loop

    mov   [edi + ecx], bl     ; wordbits[i] = popcount(i)

    inc   ecx
    cmp   ecx, 65536          ; table has 65536 entries: valid indices 0..65535
    jb    .init_loop          ; stop before index 65536 (one past the end)

;    GET_DEC 4,n
;    GET_DEC 4,k
    mov   ecx, [n]            ; ecx counts x from n down to 1
    mov   esi, [k]
    xor   ebx, ebx            ; ebx = ans
    test  ecx, ecx
    jz    .print_and_exit     ; n == 0: nothing to count (the do-while below
                              ; would otherwise wrap and run 2^32 times)

align 32
.loop:
    ; edx = popcount(ecx) from two 16-bit table lookups
    ; (stand-in for: popcnt edx, ecx)
    movzx  edx, cx
    mov    eax, ecx
    shr    eax, 16
    movzx  edx, byte [edi + edx]
    add    dl, [edi + eax]        ; sum <= 32 fits in dl; no partial-reg stall
                                  ; on Sandybridge-family for this

    lea    edx, [edx + esi - 1]   ; edx = popcount + k - 1

;    xor eax, eax         ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr    eax, ecx               ; eax = 31 - lzcnt(ecx); ecx is non-zero here
    ; want:  32 - clz(x) - popcount(x) == k
    ;   <=>  31 - clz(x) == popcount(x) + k - 1
    cmp    eax, edx
    je     .yesk                  ; not-taken is the more common fast path
    dec    ecx
    jnz    .loop                  ; } while (--x != 0)

.print_and_exit:
    ; printf(print_fmt, ans)
    ; The i386 SysV ABI wants esp 16-byte aligned at the call.  On entry esp
    ; was 12 mod 16 (return address pushed); the three saved registers made
    ; it 0 mod 16, so 8 bytes of padding plus the two 4-byte arguments keep
    ; it aligned.
    sub   esp, 8
    push  ebx
    push  print_fmt
    call  printf
    add   esp, 16                 ; drop the 2 arguments and the padding

    pop   ebx
    pop   esi
    pop   edi
    xor   eax, eax                ; return 0
    ret

align 8
.yesk:                            ; out-of-line path for a matching x
    inc   ebx                     ; ++ans
    ; Duplicate the loop tail here instead of jumping back to the dec/jnz in
    ; the main loop body: tail duplication is a *tiny* bit faster.
    dec   ecx
    jnz   .loop
    jmp   .print_and_exit