// Source code exported from Compiler Explorer.

#include <cuda_fp16.h>
#include <iostream>
#include <mma.h>
using namespace nvcuda;

// Packs the half-precision encoding of `value` into both 16-bit halves of one
// 32-bit register (the packed two-half operand form passed to mma.sync below).
static __device__ __forceinline__ unsigned int packHalfPair(float value)
{
    const unsigned int bits = __half_as_ushort(__float2half(value));
    return (bits << 16) | bits;
}

// Runs one warp-wide `mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16` with
// constant operands and writes the four result tiles to C1..C4.
//
// Launch layout: one full warp (32 threads). The lane id is threadIdx.x % 32,
// so additional warps would simply recompute and rewrite the same tiles.
// Requires SM70+ (the m8n8k4 f16 mma.sync instruction).
//
// Operands (per the PTX ISA fragment figures for m8n8k4):
//   - The warp is treated as four quad pairs: lanes {0-3,16-19}, {4-7,20-23},
//     {8-11,24-27} and {12-15,28-31}. Quad pair q feeds an A fragment filled
//     with the value q + 1 (i.e. 1, 2, 3, 4).
//   - Every lane feeds a B fragment filled with 1.
//   - The accumulator input is zero.
//
// Output: each of C1..C4 must point to 8 * 8 halfs of device memory that is
// 16-byte aligned, because every lane stores its four 32-bit result registers
// (8 halfs) with a single 16-byte vector store. Quad pair q writes tile
// C(q+1); lanes 0-15 write rows 0-3 and lanes 16-31 write rows 4-7.
__global__ void mma_test(half* C1, half* C2, half *C3, half *C4)
{
    const int lane = threadIdx.x % 32;
    const int quadPair = (lane % 16) / 4;          // 0..3 -> tiles C1..C4
    const int row = (lane % 4) + 4 * (lane / 16);  // row of the 8x8 tile owned by this lane

    // Build the operands arithmetically instead of aliasing an integer array
    // through a half pointer; all four halfs of each fragment are identical.
    // The values could just as easily be loaded from global or shared memory.
    const unsigned int aPacked = packHalfPair(static_cast<float>(quadPair + 1));
    const unsigned int bPacked = packHalfPair(1.0f);

    // Doubles as the zero accumulator input (C) and the result registers (D).
    unsigned int out[4] = { 0u, 0u, 0u, 0u };

    // mma.sync is a warp-collective instruction: it is issued unconditionally
    // so that every lane of the warp participates.
    asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 "
                 "{ %0, %1, %2, %3 },"
                 "{ %4, %5 },"
                 "{ %6, %7 },"
                 "{ %8, %9, %10, %11 };\n"
                 : "=r"(out[0]), "=r"(out[1]), "=r"(out[2]), "=r"(out[3])
                 : "r"(aPacked), "r"(aPacked),
                   "r"(bPacked), "r"(bPacked),
                   "r"(out[0]), "r"(out[1]), "r"(out[2]), "r"(out[3]));

    half* const tile = (quadPair == 0) ? C1
                     : (quadPair == 1) ? C2
                     : (quadPair == 2) ? C3
                                       : C4;

    // Assemble the 16-byte row by value: `out` is only guaranteed 4-byte
    // alignment, so it must not itself be read through a uint4 pointer.
    reinterpret_cast<uint4*>(tile)[row] = make_uint4(out[0], out[1], out[2], out[3]);
}

// Prints a diagnostic to stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Launches mma_test on a single warp, copies the four 8x8 half tiles back to
// the host and prints each one as 64 space-separated values under the labels
// "C1: " .. "C4: ".
//
// Returns 0 on success. If any CUDA call fails, an error is reported on
// stderr, nothing is printed to stdout and 1 is returned, so that
// uninitialized host memory is never presented as a result.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int kNumTiles = 4;
    const int kTileElems = 8 * 8;
    const size_t kTileBytes = sizeof(half) * kTileElems;

    half hostTiles[kNumTiles][kTileElems];
    half* deviceTiles[kNumTiles] = {};  // null until allocated, so cleanup is always safe
    bool ok = true;

    for (int t = 0; ok && t < kNumTiles; t++) {
        ok = cudaSucceeded(cudaMalloc(&deviceTiles[t], kTileBytes), "cudaMalloc");
    }

    if (ok) {
        mma_test<<<1, 32>>>(deviceTiles[0], deviceTiles[1], deviceTiles[2], deviceTiles[3]);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = cudaSucceeded(cudaGetLastError(), "mma_test launch");
    }

    // The blocking copies also surface any error raised while the kernel ran.
    for (int t = 0; ok && t < kNumTiles; t++) {
        ok = cudaSucceeded(cudaMemcpy(hostTiles[t], deviceTiles[t], kTileBytes,
                                      cudaMemcpyDeviceToHost),
                           "cudaMemcpy");
    }

    if (ok) {
        for (int t = 0; t < kNumTiles; t++) {
            std::cout << "C" << (t + 1) << ": " << std::endl;
            for (int i = 0; i < kTileElems; i++) {
                std::cout << __half2float(hostTiles[t][i]) << " ";
            }
            std::cout << std::endl;
        }
    }

    // Release device memory on every path; freeing a null pointer is a no-op.
    for (int t = 0; t < kNumTiles; t++) {
        cudaFree(deviceTiles[t]);
    }

    return ok ? 0 : 1;
}