Compiler Explorer

Source code

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>

const unsigned long long int  InitValue   =  0x1000'0000'0000'0000ull;      // the value that initializes the volatile variable
const unsigned long long int  BigValue    =  0x7000'0000'0000'0000ull;      // the amount written by the one volatile write (or atomicExch())
const unsigned long long int  Delta       =  0x0000'0001'0000'0001ull;      // the amount added by each atomicAdd() -- note the two 1s
const int                     N_AddPerThr =  0x0400;                        // all threads (except one) loop this many times and do one atomicAdd() per loop

// for understanding the ffc00 pattern seen in main()'s bad value reports:
//   # of threads doing atomic adds  *  adds (loops) per thread
//   ------------------------------     -----------------------
//                       (1024 - 1)  *  0x0400                   =  0xffc00

// example ***** non-problematic ***** kernel:
// k0() always leads to "ok value" reports from main()
__global__ void k0( volatile unsigned long long int *d ){

if ( blockIdx.x == 0 ) {
    for ( int i = 0; i < N_AddPerThr; i++)  atomicAdd((unsigned long long int *)d, Delta );
  }

else if ( blockIdx.x == 19 ) {
    if ( threadIdx.x  ==  0 ) {
//    while ( *d == 0                    );        // don't create a time dependency on an atomicAdd() from another thread
      while ( ( clock() & 0xff0 )  !=  0 );        // wait a random few ns instead

*d  =  BigValue;
    }
  }
  else {
    while ( ( clock() & 0xff0 )  !=  0 );            // wait a random few ns and then exit
  }

}

// for this kernel see:
//   https://stackoverflow.com/questions/15331009/when-to-use-volatile-with-shared-cuda-memory

// lines commmented out below do not significantly change main()'s reports, but changing the VolatileWrite argument at the time of launch does

// troublesome kernel:
// k1() often leads to "bad value" reports from main()
// k1() uses d only to report the final value of Victim
__global__ void k1( volatile unsigned long long int *d, bool VolatileWrite, int I_Launch ) {

__shared__  volatile  unsigned long long int      Victim;
//__shared__            unsigned long long int      Victim;
//volatile              unsigned long long int*  VolVictim  =  const_cast<volatile unsigned long long int*>( & Victim );

if ( blockIdx.x   !=  0 )  return;

if ( threadIdx.x  ==  0 )      Victim  =  InitValue;
//if ( threadIdx.x  ==  0 )  *VolVictim  =  InitValue;

__syncthreads();

if ( threadIdx.x  ==  0 ) {
    while ( ( clock() & 0x03f0 )  !=  0 );        // wait a random few ns

if ( VolatileWrite ) {
        Victim  =  BigValue;                                         // this does not always have the intended effect, which is to reset the count to BigValue
//  *VolVictim  =  BigValue;
    }
    else  atomicExch( & (unsigned long long) Victim, BigValue );     // this has has the intended effect, and main() reports only "ok value"
  }

else {
//else if ( threadIdx.x > 31 ) {
    for ( int i = 0; i < N_AddPerThr; i++ )  atomicAdd( & (unsigned long long int) Victim, Delta );
  }

__syncthreads();

if ( threadIdx.x  ==  0 )  *d  =      Victim;   // report result to main()
//if ( threadIdx.x  ==  0 )  *d  =  *VolVictim;

// this produces the same result as the printout in main():
//  if ( threadIdx.x  ==  0 ) {  if  ( Victim  <  BigValue ) printf( "got bad value: %llx on launch %x   <-- from k1()\n", Victim, I_Launch ); }

}

int main(){

unsigned long long int *d;
  cudaMallocManaged(&d, sizeof(*d));

for ( int Run   =  0; Run < 3; ++ Run ) {
    if (    Run  ==  0 )  std::cout << "\nStarting run that repeatedly launches k0():" << std::endl;
    if (    Run  ==  1 )  std::cout << "\nStarting run that repeatedly launches k1() with volatile write,\n(debug build yields only a few good values -- try another run for examples):" << std::endl;
    if (    Run  ==  2 )  std::cout << "\nStarting run that repeatedly launches k1() with atomicExch(),\n(debug build runs very slow):"   << std::endl;

int   N_Bad   =  0;      // used to abort after too many bad values
    bool  WasBad  =  false;  // used to show a sample ok value after a bad value, i.e., indicates a bad value was observed in the previous iteration

for ( int i = 0; i <  1'000; ++ i ) {
      if ( N_Bad >= 30 ) { std::cout << "Aborting after 30 bad values in " << std::dec << i << " launches!  ***** " << std::endl;  break; }
      *d = InitValue;
      if ( Run  ==  0 )  k0<<< 20,1024>>>( d    );           // always leads to "ok  value"
      if ( Run  ==  1 )  k1<<<  1,1024>>>( d,  true, i );    // true  --> volatile write  -- often  leads to "bad value"
      if ( Run  ==  2 )  k1<<<  1,1024>>>( d, false, i );    // false --> atomicExch()    -- always leads to "ok  value"
      cudaDeviceSynchronize();

// report: show all bad values, a sample of good values, and the final value
      std::cout << std::hex;
      if      ( *d  <  BigValue          ) { std::cout << "got bad value: 0x" << *d << " on launch 0x" << i << "  ***** " << std::endl;  ++ N_Bad;  WasBad  =  true; }
      else if (  WasBad  ||
                ( ( i & 0x0ff )  ==  0 ) ) { std::cout << "got ok  value: 0x" << *d << " on launch 0x" << i <<               std::endl;  WasBad  =  false; }  // occasional updates
    }
    std::cout << std::hex;                   std::cout << "final   value: 0x" << *d << std::endl;
  }
}