// Compiler Explorer
//
// Source code

// Enable the code guarded by `#if defined(GGML_USE_HIP)` in this file.
#define GGML_USE_HIP
// Used as the default `width` template argument of the shuffle/reduction helpers in this file.
#define WARP_SIZE 32

#if defined(GGML_USE_HIP)
// Thin typed wrapper around __builtin_amdgcn_update_dpp.
//
// `old` and `v` are reinterpreted bit-for-bit as int, handed to the builtin together with the
// compile-time control fields (dpp_ctrl, row_mask, bank_mask, bound_ctrl), and the int result is
// reinterpreted back as T. Because of the bit casts, T must have the same size as int.
template <int dpp_ctrl, typename T, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = true>
static __device__ T hip_move_dpp(T old, T v) {
    const int old_bits = __builtin_bit_cast(int, old);
    const int src_bits = __builtin_bit_cast(int, v);

    const int moved_bits =
        __builtin_amdgcn_update_dpp(old_bits, src_bits, dpp_ctrl, row_mask, bank_mask, bound_ctrl);

    return __builtin_bit_cast(T, moved_bits);
}

// Thin typed wrapper around __builtin_amdgcn_ds_swizzle.
//
// `v` is reinterpreted bit-for-bit as int, passed to the builtin with the compile-time swizzle
// pattern `mask`, and the int result is reinterpreted back as T. Because of the bit casts, T must
// have the same size as int.
template <int mask, typename T>
static __device__ T hip_ds_swizzle(T v) {
    const int src_bits      = __builtin_bit_cast(int, v);
    const int swizzled_bits = __builtin_amdgcn_ds_swizzle(src_bits, mask);
    return __builtin_bit_cast(T, swizzled_bits);
}
#endif // GGML_USE_HIP

// XOR ("butterfly") lane exchange: returns the value of `x` held by the lane selected by
// XOR-ing the caller's lane id with `offset`.
//
// On HIP, offsets 1, 2, 4, 8 and 16 are dispatched to hip_move_dpp / hip_ds_swizzle; every other
// offset falls back to __shfl_xor(x, offset, width). The fast paths do not look at `width`, so
// they match the fallback only when the exchange partner lies inside the caller's `width`-sized
// group, i.e. callers are expected to pass offset < width.
// Without GGML_USE_HIP this is a plain __shfl_xor_sync with a full 0xffffffff mask.
template<int width = WARP_SIZE, typename T>
static __device__ __forceinline__ T ggml_cuda_shfl_xor_sync(T x, int offset) {
#if defined(GGML_USE_HIP)
    // hip_move_dpp / hip_ds_swizzle bit-cast T to int, which only compiles for equal sizes;
    // state that requirement up front so a violation yields a readable diagnostic.
    static_assert(sizeof(T) == sizeof(int), "ggml_cuda_shfl_xor_sync (HIP): T must be the same size as int");

    // `x` is also passed as the DPP "old" operand. hip_move_dpp is used with its default full
    // row/bank masks and bound_ctrl = true, so the old operand is not expected to contribute to
    // the result; passing `x` avoids keeping a function-local `static` object around in device
    // code just to have something to pass.

    // for some reason clang (v20) will not unroll loops with just the plain `int offset`
    // inside the switch, but it will optimize the switch case out when it does.
    switch (~offset) {
        // width != warp size shouldn't affect normal butterfly shuffle pattern
        case ~1: return hip_move_dpp<0x160 + 1>(x, x);  // row_xor_mask: offset
        case ~2: return hip_move_dpp<0x160 + 2>(x, x);
        case ~4: return hip_move_dpp<0x160 + 4>(x, x);
        case ~8: return hip_move_dpp<0x160 + 8>(x, x);
        case ~16: return hip_ds_swizzle<0x401f>(x);  // swap neighboring groups of 16
        default: return __shfl_xor(x, offset, width);
    }
#else
    return __shfl_xor_sync(0xffffffff, x, offset, width);
#endif // GGML_USE_HIP
}

// Butterfly sum reduction over `width` lanes.
//
// Starting at width/2 and halving each step, every lane adds the value obtained from
// ggml_cuda_shfl_xor_sync<width> for the current offset to its running total, then returns
// that total.
template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
    constexpr int first_offset = width/2;

    int total = x;
#pragma unroll
    for (int offset = first_offset; offset > 0; offset >>= 1) {
        const int partner = ggml_cuda_shfl_xor_sync<width>(total, offset);
        total += partner;
    }

    return total;
}

// Kernel: reduces `x` with warp_reduce_sum<64> and stores the result to *o.
__global__ void test_64(int x, int * o) {
    const int reduced = warp_reduce_sum<64>(x);
    *o = reduced;
}

// Kernel: reduces `x` with warp_reduce_sum<32> and stores the result to *o.
__global__ void test_32(int x, int * o) {
    const int reduced = warp_reduce_sum<32>(x);
    *o = reduced;
}

// Kernel: same butterfly sum as warp_reduce_sum, but the starting offset comes from the runtime
// argument `width` (width/2) while the shuffle itself is instantiated with a fixed width of 32.
// The final sum is stored to *o.
__global__ void test_32_no_unroll(int x, int width, int * o) {
    int total = x;

    #pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        const int partner = ggml_cuda_shfl_xor_sync<32>(total, offset);
        total += partner;
    }

    *o = total;
}

// Kernel: butterfly sum over offsets 16, 8, 4, 2, 1 that calls __shfl_xor directly (width 32)
// instead of going through ggml_cuda_shfl_xor_sync. The final sum is stored to *o.
__global__ void test_32_old_shfl(int x, int * o) {
    int total = x;

    #pragma unroll
    for (int offset = 32/2; offset > 0; offset >>= 1) {
        const int partner = __shfl_xor(total, offset, 32);
        total += partner;
    }

    *o = total;
}

// Kernel: butterfly sum that calls __shfl_xor directly (width 32), with the starting offset
// taken from the runtime argument `width` (width/2). The final sum is stored to *o.
__global__ void test_32__shfl_no_unroll(int x, int width, int * o) {
    int total = x;

    #pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        const int partner = __shfl_xor(total, offset, 32);
        total += partner;
    }

    *o = total;
}

// Kernel: reduces `x` with warp_reduce_sum<16> and stores the result to *o.
__global__ void test_16(int x, int * o) {
    const int reduced = warp_reduce_sum<16>(x);
    *o = reduced;
}

// Kernel: reduces `x` with warp_reduce_sum<8> and stores the result to *o.
__global__ void test_8(int x, int * o) {
    const int reduced = warp_reduce_sum<8>(x);
    *o = reduced;
}

// Kernel: stores `x` to *o unchanged, with no shuffle or reduction.
__global__ void test_noop(int x, int * o) {
    o[0] = x;
}