; Compiler Explorer
;
; Source code  (NASM syntax, x86-64, Linux)

;-----------------------------------------------------------------------
; fxor -- XOR the significands of two x87 80-bit extended-precision
;         values after aligning x2's significand to x1's exponent.
;
; ABI:   custom (not SysV)
; In:    rdi -> two long doubles in consecutive 16-byte slots:
;                 [rdi+0]  = x1,  [rdi+16] = x2
;               Each slot holds the 64-bit significand (explicit leading
;               1 bit) at +0 and the sign/exponent word at +8.
;               [rdi] must be 16-byte aligned (xorps memory operand).
; Out:   xmm0 = x1's whole 16-byte slot, with its significand XORed with
;               (x2 significand >> (e1 - e2)).  Sign, exponent and
;               padding bytes are x1's.
; Clobb: ecx, xmm0, xmm7, flags
;
; The shift count uses only the 15-bit biased exponents.  The sign bits
; are masked off so operands of different sign are still aligned
; correctly instead of producing a bogus count >= 64.
;
; Known limitations (still open):
;   * e1 < e2: the count wraps to a huge unsigned value, psrlq saturates,
;     x2's significand becomes 0 and x1 is returned unchanged.  Handling
;     it needs |e1 - e2| (pabsd) *and* swapping which operand is shifted
;     vs. XORed (pointer adjustment via jcc, or LEA rax,[rdi+16] earlier
;     and jb over an xchg).
;   * FIXME: the result's significand may be unnormalized (leading bit
;     cleared), which the hardware won't renormalize for us -- tested on
;     Skylake.
;-----------------------------------------------------------------------
FXOR_SLOT_SIZE  equ 16                  ; bytes between x1 and x2
FXOR_SE_OFFSET  equ 8                   ; offset of the sign/exponent word
FXOR_EXP_MASK   equ 0x7fff              ; low 15 bits = biased exponent

fxor:
        movzx   ecx, word [rdi + FXOR_SE_OFFSET]
        and     ecx, FXOR_EXP_MASK              ; ecx = e1 (sign dropped)
        movd    xmm7, ecx                       ; xmm7 = e1, upper bits zero

        movzx   ecx, word [rdi + FXOR_SLOT_SIZE + FXOR_SE_OFFSET]
        and     ecx, FXOR_EXP_MASK              ; ecx = e2 (sign dropped)
        movd    xmm0, ecx                       ; xmm0 is scratch until the movq below

        psubd   xmm7, xmm0                      ; low dword = e1 - e2; other lanes stay 0

        movq    xmm0, [rdi + FXOR_SLOT_SIZE]    ; zero-extending load of just x2's significand
        psrlq   xmm0, xmm7                      ; x2 significand >> (e1 - e2); counts >= 64 give 0
        xorps   xmm0, [rdi]                     ; fold into x1; upper 8 bytes of xmm0 are 0,
                                                ; so x1's sign/exponent pass through unchanged
        ret

; Fixable with lzcnt / shl mantissa by that count / subtract the lzcnt from the exponent
; (shifting the mantissa left must be balanced by a correspondingly smaller exponent).
; Full cancellation of the mantissa needs to change the exponent to produce 0.0.
; Maybe easier to manually saturate the shift count in integer regs,
; so we can branch on flags set by the mantissa xor.

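; I think non-normal results only happen with equal exponents,
; where we didn't shift out any (would-be significant) bits: with e1 > e2 the
; shifted mantissa has a 0 in its top bit, so the other operand's leading 1 survives the XOR.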

; AArch64 has saturating integer shifts, but it's harder to justify operating on 80-bit floats on AArch64

;-----------------------------------------------------------------------
; _start -- process entry point / test driver for fxor.
;
; Loads both operands from `args` onto the x87 stack, calls fxor on them,
; spills the 16-byte result to the stack, reloads it as an 80-bit float,
; adds it to 0.0 and doubles it a few times, then exits with status 0.
; Nothing is printed; the x87 registers are meant to be inspected in a
; debugger.
;-----------------------------------------------------------------------
START_SCRATCH_BYTES     equ 32          ; stack scratch area reserved at entry
START_OPERAND_STRIDE    equ 16          ; distance between the two operands in `args`
START_DOUBLINGS         equ 4           ; number of st0 += st0 steps after the add
SYS_EXIT_GROUP          equ 231         ; Linux _NR_exit_group

        align   16
        global  _start
_start:
        sub     rsp, START_SCRATCH_BYTES

        ; Alternative that was tried and left disabled: copy the operands to the stack.
        ;   vmovaps   ymm0, [args]
        ;   vmovaps   [rsp], ymm0
        ; A pointer is passed instead of stack values, because [rsp]-based
        ; addressing modes cost an extra addressing byte (SIB).
        lea     rdi, [rel args]                         ; rdi -> operand pair for fxor
        fld     tword [rdi]                             ; push x1
        fld     tword [rdi + START_OPERAND_STRIDE]      ; push x2 (st0 = x2, st1 = x1)
        call    fxor                                    ; result comes back in xmm0

        vmovaps [rsp], xmm0                             ; spill result so x87 can load it
        fldz
        fld     tword [rsp]     ; hardware does *not* renormalize an 80-bit float
                                ; whose high mantissa bit is cleared
        faddp                                           ; st0 = 0.0 + result
%rep START_DOUBLINGS
        fadd    st0                                     ; st0 += st0
%endrep

        xor     edi, edi                                ; exit status 0
        mov     eax, SYS_EXIT_GROUP
        syscall

section .rodata
default rel
align 16
; args: two 80-bit extended-precision operands, each in its own 16-byte slot
; (10 bytes of value followed by 6 zero bytes).  Both slots are padded, so
; every operand is a complete zero-padded 16-byte slot as fxor's
; "padded with zeros" input contract expects, and reads of the
; sign/exponent field wider than 16 bits stay inside this object.
  args: dt 3.141592653589793
        times 3 dw 0            ; align 16 with 0s, not NOPs
        dt 2.141592653589793
        times 3 dw 0            ; zero padding for the second operand too
;result: from info reg $st0  in GDB
; st0            <invalid float value> (raw 0x40004000000000000000)    ; after fld
; st0            -nan(0xc000000000000000) (raw 0xffffc000000000000000) ; after faddp with 0.0
; hardware doesn't like it either (Skylake CPU)

%if 0
; Disabled by the preprocessor: nothing between %if 0 and %endif is assembled.
; Abandoned sketch of an alternative fxor working on xmm registers.
; NOTE(review): the 52- and 12-bit shifts suggest it targets IEEE binary64
; values (52-bit mantissa field) rather than 80-bit floats -- confirm.
; It is incomplete: vmaxpd / vminpd have no operands, and nothing is returned.
;fxor:
       
;        vpminuq xmm2, xmm1, xmm0       ; AVX-512F+VL
;        vpmaxuq xmm3, xmm1, xmm0       ; the larger magnitude one.  Keep it, right shift the other's mantissa
                                       ; nope, doesn't XOR the leading 1 in the mantissa
        vmaxpd                       ; AVX1 / SSE2, IDK what I was thinking with AVX-512 integer
        vminpd                       ; (operands for this max/min pair were never filled in)

;       vpsubq xmm2, xmm1, xmm0
        vpsubd xmm2, xmm1, xmm0      ; xmm2 = xmm1 - xmm0, per 32-bit lane
        psrlq  xmm2, 52              ; shift each qword right by 52; presumably to bring an
                                     ; exponent-field difference down to bit 0 as a shift count
        psllq  xmm1, 12
        psrlq  xmm1, 12            ; clear the sign/exponent field of xmm1: the shift pair
                                   ; zeroes the top 12 bits of each qword, keeping the low 52
%endif