Thanks for using Compiler Explorer
Sponsors
C with Coccinelle
C++ with Coccinelle
Jakt
C++
Ada
Algol68
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Nim
Numba
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
Python
Racket
Ruby
Rust
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Zig
Javascript
GIMPLE
Ygen
sway
cuda source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
10.0.0 sm_75 CUDA-10.2
10.0.1 sm_75 CUDA-10.2
11.0.0 sm_75 CUDA-10.2
16.0.0 sm_90 CUDA-11.8
17.0.1(libc++) sm_90 CUDA-12.1
18.1.0(libc++) sm_90 CUDA-12.3.1
19.1.0 sm_90 CUDA-12.5.1
20.1.0 sm_90 CUDA-12.5.1
NVCC 10.0.130
NVCC 10.1.105
NVCC 10.1.168
NVCC 10.1.243
NVCC 10.2.89
NVCC 11.0.2
NVCC 11.0.3
NVCC 11.1.0
NVCC 11.1.1
NVCC 11.2.0
NVCC 11.2.1
NVCC 11.2.2
NVCC 11.3.0
NVCC 11.3.1
NVCC 11.4.0
NVCC 11.4.1
NVCC 11.4.2
NVCC 11.4.3
NVCC 11.4.4
NVCC 11.5.0
NVCC 11.5.1
NVCC 11.5.2
NVCC 11.6.0
NVCC 11.6.1
NVCC 11.6.2
NVCC 11.7.0
NVCC 11.7.1
NVCC 11.8.0
NVCC 12.0.0
NVCC 12.0.1
NVCC 12.1.0
NVCC 12.2.1
NVCC 12.3.1
NVCC 12.4.1
NVCC 12.5.1
NVCC 12.6.1
NVCC 12.6.2
NVCC 9.1.85
NVCC 9.2.88
NVRTC 11.0.2
NVRTC 11.0.3
NVRTC 11.1.0
NVRTC 11.1.1
NVRTC 11.2.0
NVRTC 11.2.1
NVRTC 11.2.2
NVRTC 11.3.0
NVRTC 11.3.1
NVRTC 11.4.0
NVRTC 11.4.1
NVRTC 11.5.0
NVRTC 11.5.1
NVRTC 11.5.2
NVRTC 11.6.0
NVRTC 11.6.1
NVRTC 11.6.2
NVRTC 11.7.0
NVRTC 11.7.1
NVRTC 11.8.0
NVRTC 12.0.0
NVRTC 12.0.1
NVRTC 12.1.0
clang 7.0.0 sm_70 CUDA-9.1
clang 8.0.0 sm_75 CUDA-10.0
clang 9.0.0 sm_75 CUDA-10.1
clang rocm-4.5.2
clang rocm-5.0.2
clang rocm-5.1.3
clang rocm-5.2.3
clang rocm-5.3.2
clang rocm-5.7.0
clang rocm-6.0.2
clang rocm-6.1.2
clang staging rocm-6.1.2
clang trunk rocm-6.1.2
trunk sm_90 CUDA-12.6.1
Options
Source code
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <iostream> const unsigned long long int InitValue = 0x1000'0000'0000'0000ull; // the value that initializes the volatile variable const unsigned long long int BigValue = 0x7000'0000'0000'0000ull; // the amount written by the one volatile write (or atomicExch()) const unsigned long long int Delta = 0x0000'0001'0000'0001ull; // the amount added by each atomicAdd() -- note the two 1s const int N_AddPerThr = 0x0400; // all threads (except one) loop this many times and do one atomicAdd() per loop // for understanding the ffc00 pattern seen in main()'s bad value reports: // # of threads doing atomic adds * adds (loops) per thread // ------------------------------ ----------------------- // (1024 - 1) * 0x0400 = 0xffc00 // example ***** non-problematic ***** kernel: // k0() always leads to "ok value" reports from main() __global__ void k0( volatile unsigned long long int *d ){ if ( blockIdx.x == 0 ) { for ( int i = 0; i < N_AddPerThr; i++) atomicAdd((unsigned long long int *)d, Delta ); } else if ( blockIdx.x == 19 ) { if ( threadIdx.x == 0 ) { // while ( *d == 0 ); // don't create a time dependency on an atomicAdd() from another thread while ( ( clock() & 0xff0 ) != 0 ); // wait a random few ns instead *d = BigValue; } } else { while ( ( clock() & 0xff0 ) != 0 ); // wait a random few ns and then exit } } // for this kernel see: // https://stackoverflow.com/questions/15331009/when-to-use-volatile-with-shared-cuda-memory // lines commmented out below do not significantly change main()'s reports, but changing the VolatileWrite argument at the time of launch does // troublesome kernel: // k1() often leads to "bad value" reports from main() // k1() uses d only to report the final value of Victim __global__ void k1( volatile unsigned long long int *d, bool VolatileWrite, int I_Launch ) { __shared__ volatile unsigned long long int Victim; //__shared__ unsigned long long int Victim; //volatile unsigned long long int* VolVictim = const_cast<volatile unsigned long long int*>( & Victim ); if ( blockIdx.x != 0 ) return; if ( threadIdx.x == 0 ) Victim = InitValue; //if ( threadIdx.x == 0 ) *VolVictim = InitValue; __syncthreads(); if ( threadIdx.x == 0 ) { while ( ( clock() & 0x03f0 ) != 0 ); // wait a random few ns if ( VolatileWrite ) { Victim = BigValue; // this does not always have the intended effect, which is to reset the count to BigValue // *VolVictim = BigValue; } else atomicExch( & (unsigned long long) Victim, BigValue ); // this has has the intended effect, and main() reports only "ok value" } else { //else if ( threadIdx.x > 31 ) { for ( int i = 0; i < N_AddPerThr; i++ ) atomicAdd( & (unsigned long long int) Victim, Delta ); } __syncthreads(); if ( threadIdx.x == 0 ) *d = Victim; // report result to main() //if ( threadIdx.x == 0 ) *d = *VolVictim; // this produces the same result as the printout in main(): // if ( threadIdx.x == 0 ) { if ( Victim < BigValue ) printf( "got bad value: %llx on launch %x <-- from k1()\n", Victim, I_Launch ); } } int main(){ unsigned long long int *d; cudaMallocManaged(&d, sizeof(*d)); for ( int Run = 0; Run < 3; ++ Run ) { if ( Run == 0 ) std::cout << "\nStarting run that repeatedly launches k0():" << std::endl; if ( Run == 1 ) std::cout << "\nStarting run that repeatedly launches k1() with volatile write,\n(debug build yields only a few good values -- try another run for examples):" << std::endl; if ( Run == 2 ) std::cout << "\nStarting run that repeatedly launches k1() with atomicExch(),\n(debug build runs very slow):" << std::endl; int N_Bad = 0; // used to abort after too many bad values bool WasBad = false; // used to show a sample ok value after a bad value, i.e., indicates a bad value was observed in the previous iteration for ( int i = 0; i < 1'000; ++ i ) { if ( N_Bad >= 30 ) { std::cout << "Aborting after 30 bad values in " << std::dec << i << " launches! ***** " << std::endl; break; } *d = InitValue; if ( Run == 0 ) k0<<< 20,1024>>>( d ); // always leads to "ok value" if ( Run == 1 ) k1<<< 1,1024>>>( d, true, i ); // true --> volatile write -- often leads to "bad value" if ( Run == 2 ) k1<<< 1,1024>>>( d, false, i ); // false --> atomicExch() -- always leads to "ok value" cudaDeviceSynchronize(); // report: show all bad values, a sample of good values, and the final value std::cout << std::hex; if ( *d < BigValue ) { std::cout << "got bad value: 0x" << *d << " on launch 0x" << i << " ***** " << std::endl; ++ N_Bad; WasBad = true; } else if ( WasBad || ( ( i & 0x0ff ) == 0 ) ) { std::cout << "got ok value: 0x" << *d << " on launch 0x" << i << std::endl; WasBad = false; } // occasional updates } std::cout << std::hex; std::cout << "final value: 0x" << *d << std::endl; } }
Become a Patron
Sponsor on GitHub
Donate via PayPal
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
About the author
Statistics
Changelog
Version tree