// Compiler Explorer

// Source code

#include <cuda/std/atomic>

#include<cuda_runtime.h>
#include<cstdlib>
#include<iostream>
#include<chrono>
#include <cuda/annotated_ptr>

// Host-side timing shorthands: `clock_type` is the clock sampled for
// wall-clock measurements, `chrono` abbreviates the std::chrono namespace.
using clock_type = std::chrono::high_resolution_clock;
namespace chrono = std::chrono;

// Element-wise square: y[i] = x[i] * x[i] for every i in [0, n).
//
// Ptr_y / Ptr_x: any types indexable with operator[] (raw device pointers in
//                this file's launches).
// n:             number of elements to process.
//
// Launch layout: 1-D grid of 1-D blocks, any size. Each thread starts at its
// global index and advances by the total number of launched threads, so the
// kernel covers all n elements regardless of the grid/block dimensions.
//
// The index and stride are kept in 64-bit: blockIdx.x * blockDim.x is a 32-bit
// unsigned product that can wrap (or turn negative once stored in an int) for
// large grids, which would let the `i < n` guard admit out-of-bounds indices.
template<typename Ptr_y, typename Ptr_x>
__global__ void squre(Ptr_y y, Ptr_x x, int n)
{
    const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        y[i] = x[i] * x[i];
    }
}

// Sums `val` across a warp with a shuffle-down tree (distances 16, 8, 4, 2, 1).
//
// Every shuffle names all 32 lanes in its mask (0xFFFFFFFF), so the caller is
// expected to invoke this from all 32 lanes of the warp. After the last round
// lane 0 holds the total of the 32 inputs; the value returned on the other
// lanes is only a partial sum and should not be used as the result.
__inline__ __device__ float warp_reduce(float val) {
  int distance = 32 / 2;
  while (distance > 0) {
    val += __shfl_down_sync(0xFFFFFFFF, val, distance);
    distance /= 2;
  }
  return val;
}

// Accumulates sum over i in [0, N) of (x[i] + y[i]) into *z.
//
// Ptr_z:         passed straight to atomicAdd, so it must be a type atomicAdd
//                accepts (float* in this file's launches).
// Ptr_x / Ptr_y: any types indexable with operator[].
// N:             number of elements in x and y.
//
// The kernel only ADDS to *z; it never clears it. The caller must zero *z
// before the launch if it wants the sum of this launch alone.
//
// Launch layout: 1-D grid of 1-D blocks. blockDim.x must be a multiple of 32:
// warp_reduce shuffles with the full-warp mask, and the lane test below uses
// threadIdx.x & 31.
//
// The index and stride are kept in 64-bit so that large grids cannot wrap the
// 32-bit blockIdx.x * blockDim.x product and slip past the `i < N` guard.
template<typename Ptr_z, typename Ptr_x, typename Ptr_y>
__global__ void reduce(Ptr_z z, Ptr_x x, Ptr_y y, int N)
{
    constexpr unsigned kWarpSize = 32;

    // Per-thread partial sum over this thread's share of the elements.
    float sum = 0.0f;
    const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < N; i += stride) {
        sum += x[i] + y[i];
    }

    // Combine the 32 per-thread partials of this warp.
    sum = warp_reduce(sum);

    // Lane 0 of each warp publishes the warp total: one atomic per warp.
    // (threadIdx.x & 31) is the same test as (threadIdx.x % 32) == 0.
    if ((threadIdx.x & (kWarpSize - 1)) == 0)
        atomicAdd(z, sum);
}

// Exits with a diagnostic when a CUDA runtime call the benchmark cannot
// proceed without has failed.
static void require_ok(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        std::exit(EXIT_FAILURE);
    }
}

// Warns about, but tolerates, the failure of a call that is only a
// performance hint (prefetch, L2 persistence). Returns true on success.
static bool hint_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "warning: " << what << " failed: " << cudaGetErrorString(status) << "\n";
    // Clear the recorded error so a later cudaGetLastError() after a kernel
    // launch is not attributed to this call.
    (void)cudaGetLastError();
    return false;
}

// Attaches an access policy window over [base, base + num_bytes) to `stream`:
// hits are marked persisting, misses streaming, with a 0.9 hit-ratio hint.
static void set_persisting_window(cudaStream_t stream, void *base, size_t num_bytes)
{
    cudaStreamAttrValue attr = {};
    attr.accessPolicyWindow.base_ptr  = base;
    attr.accessPolicyWindow.num_bytes = num_bytes;
    //TODO: How to set this hitRatio
    attr.accessPolicyWindow.hitRatio  = 0.9f;
    attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
    attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
    hint_ok(cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr),
            "cudaStreamSetAttribute(access policy window)");
}

// Benchmark driver: squares x and y into x1 and y1 on two streams, then sums
// x1[i] + y1[i] into *res, timing 10 iterations.
//
// Usage: ./a [device_id] [num_blocks] [block_size_1d] [N]
//   device_id      CUDA device ordinal            (default 2)
//   num_blocks     grid size for every kernel     (default 512)
//   block_size_1d  block size, multiple of 32     (default 32)
//   N              number of elements per array   (default 70000000)
//
// Returns 0 on success; exits with EXIT_FAILURE on invalid arguments or when a
// required CUDA call fails.
int main(int argc, char *argv[])
{
    int N = 70000000;
    int device_id = 2;
    int num_blocks = 512;
    int block_size_1d = 32;
    if (argc >= 2) device_id = atoi(argv[1]);
    if (argc >= 3) num_blocks = atoi(argv[2]);
    if (argc >= 4) block_size_1d = atoi(argv[3]);
    if (argc >= 5) N = atoi(argv[4]);

    require_ok(cudaSetDevice(device_id), "cudaSetDevice");

    cudaDeviceProp prop;
    require_ok(cudaGetDeviceProperties(&prop, device_id), "cudaGetDeviceProperties");

    if (N <= 0 || num_blocks <= 0) {
        std::cerr << "N and num_blocks must be positive\n";
        return EXIT_FAILURE;
    }
    // warp_reduce shuffles with the full-warp mask, so every warp of a block
    // must be complete; a partial warp would read lanes that do not exist.
    if (block_size_1d <= 0 || block_size_1d % 32 != 0 ||
        block_size_1d > prop.maxThreadsPerBlock) {
        std::cerr << "block_size_1d must be a positive multiple of 32 and at most "
                  << prop.maxThreadsPerBlock << "\n";
        return EXIT_FAILURE;
    }

    // Set aside 3/4 of the L2 cache for persisting accesses, or the maximum
    // the device allows if that is smaller. Devices that report 0 here have no
    // persisting L2 support and the whole feature is skipped.
    const int three_quarters_l2 = int(prop.l2CacheSize * 0.75);
    const int set_aside = three_quarters_l2 < prop.persistingL2CacheMaxSize
                              ? three_quarters_l2
                              : prop.persistingL2CacheMaxSize;
    const size_t l2_size = set_aside > 0 ? size_t(set_aside) : 0;
    std::cout<<"set aside L2 cache has "<<l2_size<<" Byte\n";
    bool l2_persistence = false;
    if (l2_size > 0) {
        l2_persistence = hint_ok(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, l2_size),
                                 "cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize)");
    }

    const size_t bytes = sizeof(float) * size_t(N);
    float * x = nullptr;
    float * y = nullptr;
    float * x1 = nullptr;
    float * y1 = nullptr;
    float * res = nullptr;

    require_ok(cudaMallocManaged((void**)&x, bytes), "cudaMallocManaged(x)");
    require_ok(cudaMallocManaged((void**)&y, bytes), "cudaMallocManaged(y)");
    require_ok(cudaMallocManaged((void**)&x1, bytes), "cudaMallocManaged(x1)");
    require_ok(cudaMallocManaged((void**)&y1, bytes), "cudaMallocManaged(y1)");
    require_ok(cudaMallocManaged((void**)&res, sizeof(float)), "cudaMallocManaged(res)");

    cudaStream_t s1,s2;
    require_ok(cudaStreamCreate(&s1), "cudaStreamCreate(s1)");
    require_ok(cudaStreamCreate(&s2), "cudaStreamCreate(s2)");

    cudaEvent_t e1;
    require_ok(cudaEventCreate(&e1), "cudaEventCreate");

    // Initialise the inputs on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0 / (i + 1);
        y[i] = 2.0 / (i + 1);
    }

    // Prefetch everything to the device. This is only a hint: on failure the
    // pages are still migrated on demand when the kernels touch them.
    hint_ok(cudaMemPrefetchAsync(x, bytes, device_id, 0), "cudaMemPrefetchAsync(x)");
    hint_ok(cudaMemPrefetchAsync(y, bytes, device_id, 0), "cudaMemPrefetchAsync(y)");
    hint_ok(cudaMemPrefetchAsync(x1, bytes, device_id, 0), "cudaMemPrefetchAsync(x1)");
    hint_ok(cudaMemPrefetchAsync(y1, bytes, device_id, 0), "cudaMemPrefetchAsync(y1)");
    hint_ok(cudaMemPrefetchAsync(res, sizeof(float), device_id, 0), "cudaMemPrefetchAsync(res)");
    require_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize after prefetch");

    // Bind an L2 access policy window to each stream: x1 on s1, y1 on s2.
    // The windows do not change between iterations, so they are set once here
    // rather than inside the timed loop. The window must not exceed
    // cudaDeviceProp::accessPolicyMaxWindowSize and must not run past the end
    // of the allocation it describes.
    if (l2_persistence && prop.accessPolicyMaxWindowSize > 0) {
        size_t window_bytes = l2_size < bytes ? l2_size : bytes;
        if (window_bytes > size_t(prop.accessPolicyMaxWindowSize))
            window_bytes = size_t(prop.accessPolicyMaxWindowSize);
        set_persisting_window(s1, reinterpret_cast<void*>(x1), window_bytes);
        set_persisting_window(s2, reinterpret_cast<void*>(y1), window_bytes);
    }

    // Alternative "easy API" for L2 cache hints, kept for reference:
    // cuda::annotated_ptr<float, cuda::access_property::persisting> x1_p{x1};
    // cuda::annotated_ptr<float, cuda::access_property::persisting> y1_p{y1};

    // Compute: 10 timed iterations.
    for(int i = 0; i < 10; i++){
        auto start = clock_type::now();

        // Single-stream baseline, kept for reference:
        // squre<<<num_blocks, block_size_1d, 0, s1>>>(x1, x, N);
        // squre<<<num_blocks, block_size_1d, 0, s1>>>(y1, y, N);
        // reduce<<<num_blocks, block_size_1d, 0, s1>>>(res, x1, y1, N);
        // cudaDeviceSynchronize();

        // Multi-stream version: the two squares may run concurrently.
        squre<<<num_blocks, block_size_1d, 0, s1>>>(x1, x, N);
        squre<<<num_blocks, block_size_1d, 0, s2>>>(y1, y, N);
        require_ok(cudaEventRecord(e1, s1), "cudaEventRecord");

        // reduce runs on s2: it follows the y1 square by stream order and the
        // x1 square through the event.
        require_ok(cudaStreamWaitEvent(s2, e1), "cudaStreamWaitEvent");

        // reduce only adds into *res, so clear it first. Without this the
        // value accumulates across iterations and the final result is the sum
        // of all 10 runs on top of whatever the allocation initially held.
        require_ok(cudaMemsetAsync(res, 0, sizeof(float), s2), "cudaMemsetAsync(res)");
        reduce<<<num_blocks, block_size_1d, 0, s2>>>(res, x1, y1, N);
        require_ok(cudaGetLastError(), "kernel launch");

        // Waiting for the device is what makes the host timestamps meaningful;
        // it also surfaces any fault raised while the kernels executed.
        require_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

        auto end = clock_type::now();
        auto it_time = chrono::duration_cast<chrono::microseconds>(end - start).count();

        std::cout<<"Iteration "<<i<<" : "<<(float)it_time / 1000.0<<" ms\n";
    }

    // Validate: the last iteration's sum (the device was synchronised above).
    std::cout<<"Result is "<<*res<<"\n";

    require_ok(cudaEventDestroy(e1), "cudaEventDestroy");
    require_ok(cudaStreamDestroy(s1), "cudaStreamDestroy(s1)");
    require_ok(cudaStreamDestroy(s2), "cudaStreamDestroy(s2)");

    require_ok(cudaFree(x), "cudaFree(x)");
    require_ok(cudaFree(y), "cudaFree(y)");
    require_ok(cudaFree(x1), "cudaFree(x1)");
    require_ok(cudaFree(y1), "cudaFree(y1)");
    require_ok(cudaFree(res), "cudaFree(res)");
    return 0;
}