Thanks for using Compiler Explorer
cuda source #1
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <assert.h>
#include <math.h>
#include <ctype.h>
#include <float.h>

__device__ float& vec_at(float4& vec, int index) {
    return reinterpret_cast<float*>(&vec)[index];
}

__device__ float vec_at(const float4& vec, int index) {
    return reinterpret_cast<const float*>(&vec)[index];
}

struct SoftmaxParams {
    float Scale;
    float Offset;
};

namespace cg = cooperative_groups;

__global__ void unroll_success_funcarg(float* out, const float* inp, int idx, int V, int block_size) {
    // same but not float4
    // one row of inp, i.e. inp[idx, :] of shape (V,)
    const float* x = inp + idx * V;
    float thread_maxval = -INFINITY;
    float thread_sumval = 0.0f;
    // do the loop in reverse to maximise probability of L2 cache hits
    // so even small L2s get some hits on the 2nd read of the same thread
    #pragma unroll 4
    for (int i = threadIdx.x; i < (V - block_size); i += block_size) {
        float v = x[V - block_size - i];
        float old_maxval = thread_maxval;
        thread_maxval = fmaxf(thread_maxval, v);
        thread_sumval *= expf((old_maxval - thread_maxval));
        thread_sumval += expf(v - thread_maxval);
    }
    // simplifying test case by removing rest of code and returning early
    out[idx * V] = thread_maxval + thread_sumval;
}

__global__ void unroll_bad_blockdim(float* out, const float* inp, int idx, int V) {
    // same but not float4
    // one row of inp, i.e. inp[idx, :] of shape (V,)
    const float* x = inp + idx * V;
    float thread_maxval = -INFINITY;
    float thread_sumval = 0.0f;
    // do the loop in reverse to maximise probability of L2 cache hits
    // so even small L2s get some hits on the 2nd read of the same thread
    #pragma unroll 4
    for (int i = threadIdx.x; i < (V - blockDim.x); i += blockDim.x) {
        float v = x[V - blockDim.x - i];
        float old_maxval = thread_maxval;
        thread_maxval = fmaxf(thread_maxval, v);
        thread_sumval *= expf((old_maxval - thread_maxval));
        thread_sumval += expf(v - thread_maxval);
    }
    // simplifying test case by removing rest of code and returning early
    out[idx * V] = thread_maxval + thread_sumval;
}