Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Algol68
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C with Coccinelle
C++ with Coccinelle
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Mojo
Nim
Numba
Nix
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
PTX
Python
Racket
Raku
Ruby
Rust
Sail
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
Triton
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Zig
Javascript
GIMPLE
Ygen
sway
assembly source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
AArch64 binutils 2.28
AArch64 binutils 2.31.1
AArch64 binutils 2.33.1
AArch64 binutils 2.35.1
AArch64 binutils 2.38
AArch64 binutils 2.45
ARM binutils 2.25
ARM binutils 2.28
ARM binutils 2.31.1
ARM gcc 10.2 (linux)
ARM gcc 13.2 (linux)
ARM gcc 15.1 (linux)
ARM gcc 9.3 (linux)
ARMhf binutils 2.28
BeebAsm 1.09
NASM 2.12.02
NASM 2.13.02
NASM 2.13.03
NASM 2.14.02
NASM 2.16.01
RISC-V binutils 2.31.1
RISC-V binutils 2.31.1
RISC-V binutils 2.35.1
RISC-V binutils 2.35.1
RISC-V binutils 2.37.0
RISC-V binutils 2.37.0
RISC-V binutils 2.38.0
RISC-V binutils 2.38.0
RISC-V binutils 2.42.0
RISC-V binutils 2.42.0
RISC-V binutils 2.45.0
RISC-V binutils 2.45.0
x86-64 binutils (trunk)
x86-64 binutils 2.27
x86-64 binutils 2.28
x86-64 binutils 2.29.1
x86-64 binutils 2.34
x86-64 binutils 2.36.1
x86-64 binutils 2.38
x86-64 binutils 2.42
x86-64 binutils 2.45
x86-64 clang (assertions trunk)
x86-64 clang (trunk)
x86-64 clang 10.0.0
x86-64 clang 10.0.1
x86-64 clang 11.0.0
x86-64 clang 11.0.1
x86-64 clang 12.0.0
x86-64 clang 12.0.1
x86-64 clang 13.0.0
x86-64 clang 14.0.0
x86-64 clang 15.0.0
x86-64 clang 16.0.0
x86-64 clang 17.0.1
x86-64 clang 18.1.0
x86-64 clang 19.1.0
x86-64 clang 20.1.0
x86-64 clang 21.1.0
x86-64 clang 3.0.0
x86-64 clang 3.1
x86-64 clang 3.2
x86-64 clang 3.3
x86-64 clang 3.4.1
x86-64 clang 3.5
x86-64 clang 3.5.1
x86-64 clang 3.5.2
x86-64 clang 3.6
x86-64 clang 3.7
x86-64 clang 3.7.1
x86-64 clang 3.8
x86-64 clang 3.8.1
x86-64 clang 3.9.0
x86-64 clang 3.9.1
x86-64 clang 4.0.0
x86-64 clang 4.0.1
x86-64 clang 5.0.0
x86-64 clang 6.0.0
x86-64 clang 7.0.0
x86-64 clang 8.0.0
x86-64 clang 9.0.0
Options
Source code
///////. Compiled BY ptxas, works fine LDC R1, c[0x0][0x28] S2UR UR13, SR_CgaCtaId S2R R0, SR_TID.X UMOV UR12, 0x400 BSSY B0, 0x4a0 UIADD3 UR4, UR12, 0x6000, URZ IADD3 R1, R1, -0x40, RZ UPRMT UR5, UR13, 0x654, UR4 ULDC UR4, c[0x0][0x0] UIADD3 UR4, -UR4, 0x100000, URZ USHF.L.U32 UR7, UR4, 0xb, URZ ISETP.NE.AND P0, PT, R0, RZ, PT USHF.L.U32 UR6, UR4, 0x1, URZ NOP SYNCS.EXCH.64 URZ, [UR5], UR6 BAR.SYNC 0x0 @P0 BRA 0x480 LDC R0, c[0x0][0x20] IMAD.MOV.U32 R4, RZ, RZ, 0x80 IMAD.MOV.U32 R5, RZ, RZ, 0x0 IMAD.MOV.U32 R2, RZ, RZ, 0x40 IMAD.MOV.U32 R3, RZ, RZ, 0x0 LDC R8, c[0x0][0x210] STL.64 [R1], R4 STL.64 [R1+0x8], R2 LDC R10, c[0x0][0x218] IMAD.IADD R11, R1, 0x1, R0 IADD3 R15, -R0.reuse, 0x10, R11.reuse IADD3 R13, -R0, 0x20, R11.reuse LEA R6, R8, R1, 0x3 IADD3 R11, -R0, 0x30, R11 IMAD R8, R8, 0x8, R13 LDL R6, [R6] IMAD R0, R10.reuse, 0x8, R15 IMAD R9, R10, 0x8, R11 STL.64 [R15], R4 STL.64 [R15+0x8], R2 LDL R7, [R0] STL.64 [R13], R2 STL.64 [R13+0x8], R2 LDL R8, [R8] STL.64 [R11], R2 STL.64 [R11+0x8], R2 LDL R9, [R9] PLOP3.LUT P0, PT, PT, PT, PT, 0x80, 0x0 UPRMT UR4, UR13, 0x654, UR12 ULDC.64 UR8, c[0x0][0x220] UMOV UR6, URZ IMAD R6, R7, R6, RZ IMAD R6, R8, R9, R6 @P0 ELECT P1, URZ, PT UMOV UR7, UR6 UTMALDG.2D [UR4], [UR8] @P1 PLOP3.LUT P0, PT, P1, PT, PT, 0x8, 0x0 PLOP3.LUT P1, PT, PT, PT, PT, 0x8, 0x0 @P0 BRA.U.ANY 0x320 UIADD3 UR4, UR12, 0x4000, URZ PLOP3.LUT P0, PT, PT, PT, PT, 0x80, 0x0 ULDC.64 UR14, c[0x0][0x228] UPRMT UR4, UR13, 0x654, UR4 @P0 ELECT P1, URZ, PT UMOV UR9, UR5 UMOV UR10, UR6 UMOV UR11, UR6 UMOV UR8, UR4 UTMALDG.2D [UR8], [UR14] @P1 PLOP3.LUT P0, PT, P1, PT, PT, 0x8, 0x0 PLOP3.LUT P1, PT, PT, PT, PT, 0x8, 0x0 @P0 BRA.U.ANY 0x3c0 SHF.L.U32 R0, R6, 0x1, RZ SYNCS.ARRIVE.TRANS64 RZ, [UR5], R0 BRA 0x490 SYNCS.ARRIVE.TRANS64 RZ, [UR5], RZ BSYNC B0 IMAD.U32 R0, RZ, RZ, UR5 SYNCS.PHASECHK.TRANS64.TRYWAIT P0, [R0+URZ], RZ DEPBAR.LE SB0, 0x0 @!P0 NANOSLEEP.SYNCS 0x989681 @!P0 SYNCS.PHASECHK.TRANS64 P0, [R0+URZ], RZ @P0 EXIT SYNCS.PHASECHK.TRANS64.TRYWAIT P0, 
[R0+URZ], RZ DEPBAR.LE SB0, 0x0 @!P0 NANOSLEEP.SYNCS 0x989681 @!P0 SYNCS.PHASECHK.TRANS64 P0, [R0+URZ], RZ @!P0 BRA 0x500 EXIT BRA 0x560
assembly source #2
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
AArch64 binutils 2.28
AArch64 binutils 2.31.1
AArch64 binutils 2.33.1
AArch64 binutils 2.35.1
AArch64 binutils 2.38
AArch64 binutils 2.45
ARM binutils 2.25
ARM binutils 2.28
ARM binutils 2.31.1
ARM gcc 10.2 (linux)
ARM gcc 13.2 (linux)
ARM gcc 15.1 (linux)
ARM gcc 9.3 (linux)
ARMhf binutils 2.28
BeebAsm 1.09
NASM 2.12.02
NASM 2.13.02
NASM 2.13.03
NASM 2.14.02
NASM 2.16.01
RISC-V binutils 2.31.1
RISC-V binutils 2.31.1
RISC-V binutils 2.35.1
RISC-V binutils 2.35.1
RISC-V binutils 2.37.0
RISC-V binutils 2.37.0
RISC-V binutils 2.38.0
RISC-V binutils 2.38.0
RISC-V binutils 2.42.0
RISC-V binutils 2.42.0
RISC-V binutils 2.45.0
RISC-V binutils 2.45.0
x86-64 binutils (trunk)
x86-64 binutils 2.27
x86-64 binutils 2.28
x86-64 binutils 2.29.1
x86-64 binutils 2.34
x86-64 binutils 2.36.1
x86-64 binutils 2.38
x86-64 binutils 2.42
x86-64 binutils 2.45
x86-64 clang (assertions trunk)
x86-64 clang (trunk)
x86-64 clang 10.0.0
x86-64 clang 10.0.1
x86-64 clang 11.0.0
x86-64 clang 11.0.1
x86-64 clang 12.0.0
x86-64 clang 12.0.1
x86-64 clang 13.0.0
x86-64 clang 14.0.0
x86-64 clang 15.0.0
x86-64 clang 16.0.0
x86-64 clang 17.0.1
x86-64 clang 18.1.0
x86-64 clang 19.1.0
x86-64 clang 20.1.0
x86-64 clang 21.1.0
x86-64 clang 3.0.0
x86-64 clang 3.1
x86-64 clang 3.2
x86-64 clang 3.3
x86-64 clang 3.4.1
x86-64 clang 3.5
x86-64 clang 3.5.1
x86-64 clang 3.5.2
x86-64 clang 3.6
x86-64 clang 3.7
x86-64 clang 3.7.1
x86-64 clang 3.8
x86-64 clang 3.8.1
x86-64 clang 3.9.0
x86-64 clang 3.9.1
x86-64 clang 4.0.0
x86-64 clang 4.0.1
x86-64 clang 5.0.0
x86-64 clang 6.0.0
x86-64 clang 7.0.0
x86-64 clang 8.0.0
x86-64 clang 9.0.0
Options
Source code
/////// Compiled by driver (default MLIR), /// Thread 1 "mlir-cpu-runner" received signal /// CUDA_EXCEPTION_6, Warp Misaligned Address. LDC R1, c[0x0][0x28] S2R R0, SR_TID.X ULDC UR6, c[0x0][0x0] UMOV UR4, 0x0 UMOV UR8, 0x400 BSSY B0, 0x4c0 UIADD3 UR4, UR4, UR8, URZ IADD3 R1, R1, -0x40, RZ S2UR UR9, SR_CgaCtaId UIADD3 UR6, -UR6, 0x100000, URZ USHF.L.U32 UR7, UR6, 0xb, URZ USHF.L.U32 UR6, UR6, 0x1, URZ ISETP.NE.AND P0, PT, R0, RZ, PT UPRMT UR5, UR9, 0x654, UR4 NOP FENCE.VIEW.ASYNC.S SYNCS.EXCH.64 URZ, [UR5], UR6 BAR.SYNC 0x0 @P0 BRA 0x4a0 LDC R0, c[0x0][0x20] HFMA2.MMA R8, -RZ, RZ, 0, 7.62939453125e-06 IMAD.MOV.U32 R9, RZ, RZ, 0x0 MOV R2, 0x40 IMAD.MOV.U32 R3, RZ, RZ, 0x0 LDC R6, c[0x0][0x210] STL.64 [R1], R8 LDC R12, c[0x0][0x218] STL.64 [R1+0x8], R2 UMOV UR4, 0x2008 PLOP3.LUT P0, PT, PT, PT, PT, 0x80, 0x0 ULDC.64 UR10, c[0x0][0x220] IMAD.IADD R11, R1, 0x1, R0 IADD3 R15, -R0.reuse, 0x10, R11.reuse IADD3 R13, -R0, 0x20, R11.reuse LEA R10, R6, R1, 0x3 IADD3 R11, -R0, 0x30, R11 LEA R6, R6, R13, 0x3 LDL R0, [R10] IMAD R4, R12, 0x8, R15 IMAD R7, R12, 0x8, R11 STL.64 [R15], R8 STL.64 [R15+0x8], R2 LDL R5, [R4] STL.64 [R13], R2 STL.64 [R13+0x8], R2 LDL R6, [R6] STL.64 [R11], R2 STL.64 [R11+0x8], R2 LDL R7, [R7] UIADD3 UR4, UR8, UR4, URZ UMOV UR6, URZ UPRMT UR4, UR9, 0x654, UR4 IMAD R0, R5, R0, RZ IMAD R0, R6, R7, R0 @P0 ELECT P1, URZ, PT UMOV UR7, UR6 UTMALDG.2D [UR4], [UR10] @P1 PLOP3.LUT P0, PT, P1, PT, PT, 0x8, 0x0 PLOP3.LUT P1, PT, PT, PT, PT, 0x8, 0x0 @P0 BRA.U.ANY 0x360 UMOV UR4, 0x8 PLOP3.LUT P0, PT, PT, PT, PT, 0x80, 0x0 UIADD3 UR4, UR8, UR4, URZ ULDC.64 UR10, c[0x0][0x228] UPRMT UR4, UR9, 0x654, UR4 @P0 ELECT P1, URZ, PT UMOV UR7, UR6 UTMALDG.2D [UR4], [UR10] @P1 PLOP3.LUT P0, PT, P1, PT, PT, 0x8, 0x0 PLOP3.LUT P1, PT, PT, PT, PT, 0x8, 0x0 @P0 BRA.U.ANY 0x410 SHF.L.U32 R0, R0, 0x1, RZ SYNCS.ARRIVE.TRANS64 RZ, [UR5], R0 BRA 0x4b0 SYNCS.ARRIVE.TRANS64 RZ, [UR5], RZ BSYNC B0 MOV R0, UR5 SYNCS.PHASECHK.TRANS64.TRYWAIT P0, [R0+URZ], RZ DEPBAR.LE SB0, 0x0 @!P0 
NANOSLEEP.SYNCS 0x989680 @!P0 SYNCS.PHASECHK.TRANS64 P0, [R0+URZ], RZ @P0 EXIT SYNCS.PHASECHK.TRANS64.TRYWAIT P0, [R0+URZ], RZ DEPBAR.LE SB0, 0x0 @!P0 NANOSLEEP.SYNCS 0x989680 @!P0 SYNCS.PHASECHK.TRANS64 P0, [R0+URZ], RZ @!P0 BRA 0x520 EXIT BRA 0x580
assembly source #3
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
AArch64 binutils 2.28
AArch64 binutils 2.31.1
AArch64 binutils 2.33.1
AArch64 binutils 2.35.1
AArch64 binutils 2.38
AArch64 binutils 2.45
ARM binutils 2.25
ARM binutils 2.28
ARM binutils 2.31.1
ARM gcc 10.2 (linux)
ARM gcc 13.2 (linux)
ARM gcc 15.1 (linux)
ARM gcc 9.3 (linux)
ARMhf binutils 2.28
BeebAsm 1.09
NASM 2.12.02
NASM 2.13.02
NASM 2.13.03
NASM 2.14.02
NASM 2.16.01
RISC-V binutils 2.31.1
RISC-V binutils 2.31.1
RISC-V binutils 2.35.1
RISC-V binutils 2.35.1
RISC-V binutils 2.37.0
RISC-V binutils 2.37.0
RISC-V binutils 2.38.0
RISC-V binutils 2.38.0
RISC-V binutils 2.42.0
RISC-V binutils 2.42.0
RISC-V binutils 2.45.0
RISC-V binutils 2.45.0
x86-64 binutils (trunk)
x86-64 binutils 2.27
x86-64 binutils 2.28
x86-64 binutils 2.29.1
x86-64 binutils 2.34
x86-64 binutils 2.36.1
x86-64 binutils 2.38
x86-64 binutils 2.42
x86-64 binutils 2.45
x86-64 clang (assertions trunk)
x86-64 clang (trunk)
x86-64 clang 10.0.0
x86-64 clang 10.0.1
x86-64 clang 11.0.0
x86-64 clang 11.0.1
x86-64 clang 12.0.0
x86-64 clang 12.0.1
x86-64 clang 13.0.0
x86-64 clang 14.0.0
x86-64 clang 15.0.0
x86-64 clang 16.0.0
x86-64 clang 17.0.1
x86-64 clang 18.1.0
x86-64 clang 19.1.0
x86-64 clang 20.1.0
x86-64 clang 21.1.0
x86-64 clang 3.0.0
x86-64 clang 3.1
x86-64 clang 3.2
x86-64 clang 3.3
x86-64 clang 3.4.1
x86-64 clang 3.5
x86-64 clang 3.5.1
x86-64 clang 3.5.2
x86-64 clang 3.6
x86-64 clang 3.7
x86-64 clang 3.7.1
x86-64 clang 3.8
x86-64 clang 3.8.1
x86-64 clang 3.9.0
x86-64 clang 3.9.1
x86-64 clang 4.0.0
x86-64 clang 4.0.1
x86-64 clang 5.0.0
x86-64 clang 6.0.0
x86-64 clang 7.0.0
x86-64 clang 8.0.0
x86-64 clang 9.0.0
Options
Source code
/////// PTX code generated by MLIR
//
// main_kernel: thread 0 issues two 2-D bulk tensor copies (global -> shared)
// that complete on a shared-memory mbarrier; every thread then waits on that
// mbarrier and returns.
//
// NOTE(review): the two shared buffers below were originally declared with
// `.align 2`. They are the shared-memory destinations of
// cp.async.bulk.tensor, which requires a 128-byte-aligned destination address.
// With only 2-byte alignment the final placement is up to the compiler: the
// driver-compiled listing on this page forms the destinations from offsets 0x8
// and 0x2008 and reports CUDA_EXCEPTION_6 (Warp Misaligned Address), whereas
// the ptxas-compiled listing happens to use 0x0 and 0x4000 and works. Declaring
// the required alignment removes the dependence on placement luck.

.version 8.0
.target sm_90
.address_size 64

        // .globl main_kernel
.shared .align 128 .b8 bufferLhsGlobal[16384];  // copy destination #1 (was .align 2)
.shared .align 128 .b8 bufferRhsGlobal[8192];   // copy destination #2 (was .align 2)
.shared .align 8 .b8 __mbarrier[8];             // 64-bit mbarrier object

// Parameters:
//   main_kernel_param_0, main_kernel_param_1 : 64-bit values used below only
//       as indices into small two-entry tables held in local memory.
//   main_kernel_param_2, main_kernel_param_3 : 64-bit operands passed as the
//       tensor-map argument of the two bulk tensor copies
//       (presumably pointers to tensor-map descriptors; confirm with the host code).
.visible .entry main_kernel(
        .param .u64 main_kernel_param_0,
        .param .u64 main_kernel_param_1,
        .param .u64 main_kernel_param_2,
        .param .u64 main_kernel_param_3
)
{
        .local .align 8 .b8     __local_depot0[64];
        .reg .b64       %SP;
        .reg .b64       %SPL;
        .reg .pred      %p<2>;
        .reg .b32       %r<33>;
        .reg .b64       %rd<20>;

        mov.u64         %SPL, __local_depot0;
        cvta.local.u64  %SP, %SPL;
        mov.u32         %r1, %ntid.x;
        mov.u32         %r2, %tid.x;

        // Initialise the mbarrier with an expected arrival count of ntid.x,
        // then make the initialisation visible to the whole block.
        mov.u32         %r11, __mbarrier;
        mbarrier.init.shared.b64 [%r11], %r1;
        bar.sync        0;

        // Only the thread with tid.x == 0 takes the copy-issuing path.
        setp.ne.s32     %p1, %r2, 0;
        mov.u32         %r8, 0;                 // constant 0: tx-count, coordinates, parity
        @%p1 bra        $L__BB0_2;
        bra.uni         $L__BB0_1;

$L__BB0_2:
        // tid.x != 0: arrive on the barrier expecting zero transaction bytes.
        // begin inline asm
        mbarrier.arrive.expect_tx.shared.b64 _, [%r11], %r8;
        // end inline asm
        bra.uni         $L__BB0_3;

$L__BB0_1:
        // tid.x == 0: load the kernel parameters.
        ld.param.u64    %rd6, [main_kernel_param_3];
        ld.param.u64    %rd5, [main_kernel_param_2];
        ld.param.u64    %rd2, [main_kernel_param_1];
        ld.param.u64    %rd1, [main_kernel_param_0];

        // Table at depot+0 = {128, 64}; %rd10 = table[param_0].
        add.u64         %rd7, %SP, 0;
        { .reg .b64 %tmp;
          cvta.to.local.u64     %tmp, %rd7;
          cvt.u32.u64   %r16, %tmp;
        }
        mov.u64         %rd8, 128;
        st.local.u64    [%r16], %rd8;
        mov.u64         %rd9, 64;
        st.local.u64    [%r16+8], %rd9;
        cvt.u32.u64     %r17, %rd1;
        shl.b32         %r18, %r17, 3;          // param_0 * 8 (byte offset of a u64 entry)
        add.s32         %r19, %r16, %r18;
        ld.local.u64    %rd10, [%r19];

        // Table at depot+16 = {128, 64}; %rd12 = table[param_1].
        add.u64         %rd11, %SP, 16;
        { .reg .b64 %tmp;
          cvta.to.local.u64     %tmp, %rd11;
          cvt.u32.u64   %r20, %tmp;
        }
        st.local.u64    [%r20], %rd8;
        st.local.u64    [%r20+8], %rd9;
        cvt.u32.u64     %r21, %rd2;
        shl.b32         %r22, %r21, 3;          // param_1 * 8
        add.s32         %r23, %r20, %r22;
        ld.local.u64    %rd12, [%r23];

        // Table at depot+32 = {64, 64}; %rd14 = table[param_0].
        add.u64         %rd13, %SP, 32;
        { .reg .b64 %tmp;
          cvta.to.local.u64     %tmp, %rd13;
          cvt.u32.u64   %r24, %tmp;
        }
        st.local.u64    [%r24], %rd9;
        st.local.u64    [%r24+8], %rd9;
        add.s32         %r25, %r24, %r18;
        ld.local.u64    %rd14, [%r25];

        // Table at depot+48 = {64, 64}; %rd16 = table[param_1].
        add.u64         %rd15, %SP, 48;
        { .reg .b64 %tmp;
          cvta.to.local.u64     %tmp, %rd15;
          cvt.u32.u64   %r26, %tmp;
        }
        st.local.u64    [%r26], %rd9;
        st.local.u64    [%r26+8], %rd9;
        add.s32         %r27, %r26, %r22;
        ld.local.u64    %rd16, [%r27];

        // %rd19 = %rd12 * %rd10 + %rd16 * %rd14; doubled further down to form
        // the expected transaction byte count (presumably element count times a
        // 2-byte element size; confirm against the tensor-map element type).
        mul.lo.s64      %rd17, %rd12, %rd10;
        mul.lo.s64      %rd18, %rd16, %rd14;
        add.s64         %rd19, %rd18, %rd17;

        // Bulk tensor copy #1: coordinates {0, 0} -> bufferLhsGlobal, completing
        // on __mbarrier. The destination must be 128-byte aligned.
        mov.u32         %r6, bufferLhsGlobal;
        // begin inline asm
        cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [%r6], [%rd5, {%r8, %r8} ], [%r11];
        // end inline asm

        // Bulk tensor copy #2: coordinates {0, 0} -> bufferRhsGlobal, same barrier.
        mov.u32         %r10, bufferRhsGlobal;
        // begin inline asm
        cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [%r10], [%rd6, {%r8, %r8} ], [%r11];
        // end inline asm

        // Arrive on the barrier and register the expected transaction bytes.
        cvt.u32.u64     %r28, %rd19;
        shl.b32         %r15, %r28, 1;          // * 2
        // begin inline asm
        mbarrier.arrive.expect_tx.shared.b64 _, [%r11], %r15;
        // end inline asm

$L__BB0_3:
        // All threads: spin on try_wait for phase parity 0 until it succeeds.
        // %r31 is the optional time-limit operand of try_wait.
        mov.u32         %r31, 10000000;
        // begin inline asm
        {
        .reg .pred      P1;
        LAB_WAIT:
        mbarrier.try_wait.parity.shared.b64 P1, [%r11], %r8, %r31;
        @P1 bra.uni     DONE;
        bra.uni         LAB_WAIT;
        DONE:
        }
        // end inline asm
        ret;
}
Become a Patron
Sponsor on GitHub
Donate via PayPal
Compiler Explorer Shop
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
Statistics
Changelog
Version tree