Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Nim
Objective-C
Objective-C++
OCaml
OpenCL C
Pascal
Pony
Python
Racket
Ruby
Rust
Snowball
Scala
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
TypeScript Native
V
Vala
Visual Basic
WASM
Zig
Javascript
GIMPLE
Ygen
assembly source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
AArch64 binutils 2.28
AArch64 binutils 2.31.1
AArch64 binutils 2.33.1
AArch64 binutils 2.35.1
AArch64 binutils 2.38
ARM binutils 2.25
ARM binutils 2.28
ARM binutils 2.31.1
ARM gcc 10.2 (linux)
ARM gcc 9.3 (linux)
ARMhf binutils 2.28
BeebAsm 1.09
NASM 2.12.02
NASM 2.13.02
NASM 2.13.03
NASM 2.14.02
NASM 2.16.01
PTX Assembler 10.0.130
PTX Assembler 10.1.105
PTX Assembler 10.1.168
PTX Assembler 10.1.243
PTX Assembler 10.2.89
PTX Assembler 11.0.2
PTX Assembler 11.0.3
PTX Assembler 11.1.0
PTX Assembler 11.1.1
PTX Assembler 11.2.0
PTX Assembler 11.2.1
PTX Assembler 11.2.2
PTX Assembler 11.3.0
PTX Assembler 11.3.1
PTX Assembler 11.4.0
PTX Assembler 11.4.1
PTX Assembler 11.5.0
PTX Assembler 11.6.1
PTX Assembler 11.6.2
PTX Assembler 11.7.0
PTX Assembler 11.7.1
PTX Assembler 11.8.0
PTX Assembler 12.0.0
PTX Assembler 12.0.1
PTX Assembler 12.1.0
PTX Assembler 12.2.1
PTX Assembler 12.3.1
PTX Assembler 12.4.1
PTX Assembler 12.5.1
PTX Assembler 12.6.1
PTX Assembler 9.1.85
PTX Assembler 9.2.88
RISC-V binutils 2.31.1
RISC-V binutils 2.31.1
RISC-V binutils 2.35.1
RISC-V binutils 2.35.1
RISC-V binutils 2.37.0
RISC-V binutils 2.37.0
RISC-V binutils 2.38.0
RISC-V binutils 2.38.0
x86-64 binutils (trunk)
x86-64 binutils 2.27
x86-64 binutils 2.28
x86-64 binutils 2.29.1
x86-64 binutils 2.34
x86-64 binutils 2.36.1
x86-64 binutils 2.38
x86-64 clang (assertions trunk)
x86-64 clang (trunk)
x86-64 clang 10.0.0
x86-64 clang 10.0.1
x86-64 clang 11.0.0
x86-64 clang 11.0.1
x86-64 clang 12.0.0
x86-64 clang 12.0.1
x86-64 clang 13.0.0
x86-64 clang 14.0.0
x86-64 clang 15.0.0
x86-64 clang 16.0.0
x86-64 clang 17.0.1
x86-64 clang 18.1.0
x86-64 clang 3.0.0
x86-64 clang 3.1
x86-64 clang 3.2
x86-64 clang 3.3
x86-64 clang 3.4.1
x86-64 clang 3.5
x86-64 clang 3.5.1
x86-64 clang 3.5.2
x86-64 clang 3.6
x86-64 clang 3.7
x86-64 clang 3.7.1
x86-64 clang 3.8
x86-64 clang 3.8.1
x86-64 clang 3.9.0
x86-64 clang 3.9.1
x86-64 clang 4.0.0
x86-64 clang 4.0.1
x86-64 clang 5.0.0
x86-64 clang 6.0.0
x86-64 clang 7.0.0
x86-64 clang 8.0.0
x86-64 clang 9.0.0
Options
Source code
;-----------------------------------------------------------------------
; uop-replay-cache-miss: static Linux executable (NASM or YASM syntax).
;
; Each loop iteration performs five 32-bit loads from `buf`, spaced
; LOAD_SPACING bytes apart, each followed by an `lea` that depends on
; the loaded value.  The index then advances by 5*LOAD_SPACING bytes
; and is wrapped to BUF_BYTES with a mask.  The program exits with
; status 0 after ITERATIONS iterations.
; NOTE(review): judging by the file name and the perf events below, this
; is presumably meant to observe uop replays of load-dependent uops on
; cache misses -- confirm with the author.
;
; Register roles:
;   rdi = base address of buf
;   rsi = address of bufsrc (loaded but not used by the loop)
;   rax = byte offset into buf (written as eax, so zero-extended)
;   ecx = last loaded value, edx = lea result (never read)
;   ebp = remaining iteration count
;
; Build / run (original notes):
;; t=uop-replay-cache-miss; asm-link -d -- "$t.asm" && taskset -c 3 perf stat -etask-clock:u,context-switches,cpu-migrations,page-faults,cycles:u,instructions:u,uops_dispatched_port.port_2:u,uops_dispatched_port.port_3:u,uops_dispatched_port.port_7:u,uops_dispatched_port.port_0:u,uops_dispatched_port.port_1:u,uops_dispatched_port.port_5:u,uops_dispatched_port.port_6:u -r1 ./"$t"
;; nasm -felf64 -Worphan-labels uop-replay-cache-miss.asm
;; ld -o uop-replay-cache-miss uop-replay-cache-miss.o
;; taskset -c 3 perf stat -etask-clock:u,context-switches,cpu-migrations,page-faults,cycles:u,instructions:u,uops_dispatched_port.port_2:u,uops_dispatched_port.port_3:u,uops_dispatched_port.port_7:u,uops_dispatched_port.port_0:u,uops_dispatched_port.port_1:u,uops_dispatched_port.port_5:u,uops_dispatched_port.port_6:u -r1 ./uop-replay-cache-miss
;-----------------------------------------------------------------------

default rel

ITERATIONS      equ 100000000           ; loop trip count
BUF_BYTES       equ 4096*4096           ; size of buf; power of two so it can be used as a mask
LOAD_SPACING    equ 128*2               ; distance in bytes between the loads of one iteration
SRC_BYTES       equ 4096                ; size of bufsrc
TAIL_BYTES      equ 100                 ; extra reservation after bufsrc
NR_EXIT_32      equ 1                   ; sys_exit, 32-bit int 0x80 ABI
NR_EXIT_GROUP   equ 231                 ; __NR_exit_group from /usr/include/asm/unistd_64.h

%ifdef __YASM_VER__
    ; CPU
    ; CPU intelnop
    CPU Conroe AMD
    CPU Skylake AMD
%else
%use smartalign
alignmode p6, 64
%endif

; Optional IACA start/end markers; they expand to nothing unless
; IACA_MARKS is defined.
%ifdef IACA_MARKS
%macro IACA_start 0
        mov     ebx, 111
        db      0x64, 0x67, 0x90
%endmacro
%macro IACA_end 0
        mov     ebx, 222
        db      0x64, 0x67, 0x90
%endmacro
%else
%define IACA_start
%define IACA_end
%endif

global _start
_start:
%if 1
        lea     rdi, [buf]
        lea     rsi, [bufsrc]
%endif

        vzeroupper
        mov     ebp, ITERATIONS
        xor     eax, eax                ; index starts at 0: do not rely on the register state at process entry
        xor     edx, edx                ; edx is only ever written inside the loop
        ; xorps   xmm0, xmm0
        ; vbroadcastss xmm1, [one]
        ; vbroadcastss xmm2, [small_number]

align 64
IACA_start
.loop:
        mov     ecx, [rdi + rax + LOAD_SPACING*0]
        lea     edx, [rcx + rcx]        ; consumes the load result
        ; add   ebx, ecx
        mov     ecx, [rdi + rax + LOAD_SPACING*1]
        lea     edx, [rcx + rcx]
        ; add   ebx, ecx
        mov     ecx, [rdi + rax + LOAD_SPACING*2]
        lea     edx, [rcx + rcx]
        ; add   r8d, ecx
        mov     ecx, [rdi + rax + LOAD_SPACING*3]
        lea     edx, [rcx + rcx]
        ; add   r9d, ecx
        mov     ecx, [rdi + rax + LOAD_SPACING*4]
        lea     edx, [rcx + rcx]
        ; add   r10d, ecx               ; dependent ALU uop

        add     eax, LOAD_SPACING*5     ; step past the five locations just read
        and     eax, BUF_BYTES-1        ; wrap the offset; the last loads may land in bufsrc, which directly follows buf
        ; and   eax, 0                  ; get L1d hits instead of just L3 hits

        dec     ebp
        jnz     .loop
IACA_end
.end:

        ;;NASM-only, not YASM: %if __BITS__ == 32
%ifidn __OUTPUT_FORMAT__, elf32
        mov     eax, NR_EXIT_32
        xor     ebx, ebx
        int     0x80                    ; sys_exit(0) 32-bit ABI
%else
        xor     edi, edi
        mov     eax, NR_EXIT_GROUP
        syscall                         ; sys_exit_group(0)
%endif

section .bss
align 4096
buf:    resb    BUF_BYTES
bufsrc: resb    SRC_BYTES
        resb    TAIL_BYTES
Become a Patron
Sponsor on GitHub
Donate via PayPal
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
About the author
Statistics
Changelog
Version tree