; Compiler Explorer

; Source code

;-----------------------------------------------------------------------
; rwhash4 -- the block_size=4 special case.
; (Started writing this before realizing block_size=4 was just an example.)
;
; In:    rsi = pointer to the uint8_t input bytes
;        ecx = unsigned length in bytes
;        rdi = pointer to 4x uint16_t that receives the result
; Out:   [rdi] = the 4 accumulator elements after the final mixing step,
;        as integer values (not their ASCII characters)
; Clobb: ecx, flags, mm0, mm1, mm2, mm7; MMX state is released with emms
; Needs: MMX plus pshufw (SSE / MMXEXT)
;
; Unfinished: the length%4 leftover bytes are not hashed yet (see .tail)
; and the final % 36 step is still TODO.  The main loop only loads the
; 4 bytes it uses; the plan sketched for the tail assumes reads past the
; end get garbage but don't fault.
;-----------------------------------------------------------------------
rwhash4:
    pxor    mm0, mm0              ; mm0 = 4x 16-bit accumulator, starts at 0
    pxor    mm7, mm7              ; mm7 = 0: unpack partner while looping

    sub     ecx, 4                ; count down from the end
    jb      .tail                 ; fewer than 4 bytes: no full group to process
.loop:
    movd    mm1, [rsi+rcx]        ; load exactly the 4 bytes of this group
    punpcklbw  mm1, mm7           ; data in low bytes, 0 in high bytes (emulate pmovzxbw)
    paddb   mm0, mm1              ; byte add: avoid polluting the high byte of each element
    sub     ecx, 4
    jae     .loop                 ; no borrow => another full group of 4 remains
.tail:
    ; Here ecx = (length % 4) - 4 as a 32-bit value.  Because the loop walks
    ; backward from the end, the bytes not yet consumed are the FIRST
    ; length%4 bytes at [rsi].
    ; TODO: handle those 0..3 bytes, maybe 4-byte load and shift to zero the rest?
    ; possibly bzhi?
    ; NOTE(review): a byte at input offset i lands in element (i - length) mod 4,
    ; which equals i mod 4 only when length is a multiple of 4 -- confirm this
    ; is what the hash wants before finishing the tail.

    ; final processing on accumulator mm0
    pcmpeqw  mm7, mm7             ; all-ones (compare a register with itself)
    psrlw    mm7, 16-5            ; set1_epi16(0x1f) mask

    ; both rotations are taken from mm0 BEFORE it is modified below
    pshufw   mm1, mm0, 0b00111001  ; rotate right by 1 word element
    pshufw   mm2, mm0, 0b01001110  ; rotate by 2 word elements
    pand     mm1, mm7
    paddb    mm0, mm1

    psrlw    mm7, 4               ; mask is now set1_epi16(0x1)
    pand     mm2, mm7
    paddb    mm0, mm2
    ; each 16-bit element is one of the 4 "block" elements.

    ; TODO: multiplicative inverse with pmulhw to implement % 36
    ; https://stackoverflow.com/questions/41183935/why-does-gcc-use-multiplication-by-a-strange-number-in-implementing-integer-divi

    movq     [rdi], mm0           ; array of 4 integer values, not their ASCII characters.

    emms                          ; leave MMX state so later x87 code works
    ret

;    pshufw   mm1, mm0, 0b00000001    ; 2-byte rotate of low 4.

;   movd  eax, mm0       ; or try scalar
    ;   mov  ecx, eax
    ;   ror  eax, 8
    ;   and  eax, 0x1f1f1f1f
    ;       add  ecx, eax     ; nope, and SWAR add requires a lot of masking.  https://www.chessprogramming.org/SIMD_and_SWAR_Techniques
    ;   movd back to mm1?

; Some versions of ARM and/or MIPS, IIRC, have SIMD byte-element adds within general-purpose registers; that would be great for this.