Compiler Explorer

Source code

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

// Fixed-size wrapper around a plain C array of `size` elements of type T.
// It can be passed to and returned from functions by value. Element access
// and the fill constructor are callable from both host and device code.
template <typename T, int size>
struct Array {
  T data[size];

  // Read-only access: returns a copy of element i. No bounds checking.
  __host__ __device__ T operator[](int i) const {
    return data[i];
  }
  // Mutable access: returns a reference to element i. No bounds checking.
  __host__ __device__ T& operator[](int i) {
    return data[i];
  }

  // Defaulted special members. Note that default-initialising an Array of a
  // scalar T (e.g. `Array<float, 4> a;`) leaves the elements uninitialised.
  Array() = default;
  Array(const Array&) = default;
  Array& operator=(const Array&) = default;

  // Fill the array with x.
  __host__ __device__ Array(T x) {
    for (int i = 0; i < size; i++) {
      data[i] = x;
    }
  }
};
  
  
  // Counter-based Philox generator: a 128-bit counter held as four 32-bit
  // words, a 64-bit key held as two 32-bit words, and ten mixing rounds per
  // generated block of four 32-bit outputs.
  class Philox {
  public:
    // seed        : becomes the key (low word in key[0], high word in key[1]).
    // subsequence : becomes the upper 64 bits of the counter
    //               (counter[2] = low word, counter[3] = high word).
    // offset      : number of outputs to skip. Each call to operator() yields
    //               four outputs per counter step, so the counter is advanced
    //               by offset / 4 and the remainder (offset % 4) is discarded.
    __device__ inline Philox(unsigned long long seed,
                             unsigned long long subsequence,
                             unsigned long long offset) {
      key[0] = (unsigned int)seed;
      key[1] = (unsigned int)(seed >> 32);
      counter = Array<uint, 4>(0);
      counter[2] = (unsigned int)(subsequence);
      counter[3] = (unsigned int)(subsequence >> 32);
      incr_n(offset / 4);
    }
    // Produces the next block of four 32-bit outputs and advances the counter
    // by one. The stored counter and key are not modified by the rounds; only
    // local copies are mixed.
    __device__ inline Array<uint, 4> operator()() {
      Array<uint, 4> counter_ = counter;
      Array<uint, 2> key_ = key;
      // Nine rounds, bumping the key after each one; the tenth round follows.
      for (int i = 0; i < 9; i++) {
        counter_ = single_round(counter_, key_);
        key_[0] += kPhilox10A;
        key_[1] += kPhilox10B;
      }
      output = single_round(counter_, key_);
      incr();
      return output;
    }
  private:
    Array<uint, 4> counter;  // 128-bit counter, least significant word first
    Array<uint, 4> output;   // most recently generated block
    Array<uint, 2> key;      // 64-bit key derived from the seed
    // Adds the 64-bit value n to the 128-bit counter.
    // The low 64 bits are updated with a single 64-bit addition so that the
    // carry into counter[2] is detected reliably, including the case where
    // the high word of n is 0xFFFFFFFF and a carry arrives from the low word.
    __device__ inline void incr_n(unsigned long long n) {
      const unsigned long long low =
          ((unsigned long long)counter[1] << 32) | counter[0];
      const unsigned long long sum = low + n;
      counter[0] = (unsigned int)sum;
      counter[1] = (unsigned int)(sum >> 32);
      // Unsigned addition wrapped around iff the result is smaller than n.
      if (sum >= n)
        return;
      if (++counter[2])
        return;
      ++counter[3];
    }
    // Adds one to the 128-bit counter, rippling the carry word by word.
    __device__ inline void incr() {
      if (++counter[0])
        return;
      if (++counter[1])
        return;
      if (++counter[2])
        return;
      ++counter[3];
    }
    // 32x32 -> 64 bit multiply: returns the low 32 bits of a*b and stores the
    // high 32 bits in *result_high.
    __device__ inline unsigned int mulhilo32(unsigned int a, unsigned int b,
                                             unsigned int *result_high) {
      *result_high = __umulhi(a, b);
      return a * b;
    }
    // One mixing round: multiplies ctr[0] and ctr[2] by the round multipliers
    // and combines the products with ctr[1], ctr[3] and the round key.
    __device__ inline Array<uint, 4> single_round(Array<uint, 4> ctr,
                                                  Array<uint, 2> round_key) {
      unsigned int hi0;
      unsigned int hi1;
      unsigned int lo0 = mulhilo32(kPhiloxSA, ctr[0], &hi0);
      unsigned int lo1 = mulhilo32(kPhiloxSB, ctr[2], &hi1);
      Array<uint, 4> ret;
      ret[0] = hi1 ^ ctr[1] ^ round_key[0];
      ret[1] = lo1;
      ret[2] = hi0 ^ ctr[3] ^ round_key[1];
      ret[3] = lo0;
      return ret;
    }
    // All four constants are 32-bit quantities, so they are stored as
    // unsigned int rather than a type whose width varies between platforms.
    static constexpr unsigned int kPhilox10A = 0x9E3779B9;  // key[0] increment per round
    static constexpr unsigned int kPhilox10B = 0xBB67AE85;  // key[1] increment per round
    static constexpr unsigned int kPhiloxSA = 0xD2511F53;   // multiplier applied to ctr[0]
    static constexpr unsigned int kPhiloxSB = 0xCD9E8D57;   // multiplier applied to ctr[2]
  };
  // Reciprocal of 2^32, written as a single-precision literal.
  #define M_RAN_INVM32 2.3283064e-10f
  // Converts four 32-bit unsigned words into four floats by scaling each word
  // with M_RAN_INVM32, i.e. mapping the 32-bit range onto roughly [0, 1].
  __device__  __inline__ Array<float, 4> uniform(Array<uint, 4> x) {
      Array<float, 4> scaled;
      for (int lane = 0; lane < 4; ++lane) {
          scaled[lane] = x[lane] * M_RAN_INVM32;
      }
      return scaled;
  }
// Device-side driver: seeds a Philox generator with (seed=123, subsequence=0,
// offset=0), converts one generated block to floats, and prints the addresses
// of the four resulting elements.
__device__ void launchFunction() {
    Philox rng(123, 0, 0);
    Array<float, 4> samples;
    samples = uniform(rng());

    // Take the element addresses up front so the printf call stays readable.
    void* addr0 = (void*)&samples[0];
    void* addr1 = (void*)&samples[1];
    void* addr2 = (void*)&samples[2];
    void* addr3 = (void*)&samples[3];
    printf("%p, %p, %p, %p\n", addr0, addr1, addr2, addr3);
}
// Kernel entry point: takes no arguments and does no thread indexing; every
// launched thread simply runs launchFunction().
__global__ void myKernel() {
    launchFunction();
}
// Launches myKernel with a single block of a single thread, then waits for it
// to finish. Returns 0 on success and 1 if the launch or the execution failed.
int main() {
    myKernel<<<1,1>>>();

    // A kernel launch returns no status itself: configuration errors are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Block until the kernel has finished. This surfaces asynchronous
        // execution errors and flushes the device-side printf buffer, which
        // is not flushed automatically when the program exits.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}