// Compiler Explorer
//
// Source code

#include <cub/device/device_histogram.cuh>
#include <cub/iterator/constant_input_iterator.cuh>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>

#include <cuda/std/array>

#include <cstddef>
#include <exception>
#include <iostream>
#include <vector>

// Number of elements in each vector
constexpr int element_count = 5;
// Element type. CompositionOp also uses element values as indices into
// another vector.
using element_t = int;
// Vector type (one vector corresponds to one item in the scan)
using item_t = cuda::std::array<element_t, element_count>;
// The identity vector: element i holds the value i, so composing it with any
// vector under CompositionOp (on either side) leaves that vector unchanged.
constexpr item_t identity{0, 1, 2, 3, 4};
// The literal above lists exactly five entries. If element_count were raised,
// the extra entries would be zero-initialized and the vector would no longer
// be an identity, so fail the build instead of producing a wrong scan seed.
static_assert(element_count == 5,
              "identity must list the values 0..element_count-1; update it together with element_count");

// Function composition operator used as the scan's binary operation.
//
// Each item_t is read as a lookup table: element i holds the value the
// function maps i to. Combining lhs and rhs yields the table for
// "apply lhs, then rhs":
//     result[i] = rhs[lhs[i]]
//
// Precondition: every element of lhs is used as an index into rhs, so it must
// lie in [0, element_count); anything else indexes rhs out of range.
struct CompositionOp
{
    // Const-qualified: the functor has no state, and this lets it be invoked
    // through a const object or const reference as well.
    __host__ __device__ __forceinline__
    item_t operator()(item_t lhs, item_t rhs) const
    {
        item_t result{};
        // NOTE (!!!#1): the author would like to parallelize across this loop
        for(int i = 0; i < element_count; i++)
        {
            // NOTE (!!!#2): the author would like at least rhs to be stored in shmem
            result[i] = rhs[lhs[i]];
        }
        return result;
    }
};

// Writes every vector in `items` to std::cout in the form "[a,b,c,d,e]",
// separating consecutive vectors with ", ". Nothing is printed for an empty
// container, and no trailing newline is written.
//
// `items` is taken by const reference so the caller's host_vector is not
// copied just to be read.
void print_vectors(const thrust::host_vector<item_t>& items)
{
    // size() is unsigned; use an unsigned index to avoid a signed/unsigned
    // comparison in the loop condition.
    const std::size_t item_count = items.size();
    for(std::size_t item = 0; item < item_count; item++)
    {
        // Separator goes in front of every vector except the first
        if(item != 0)
        {
            std::cout << ", ";
        }

        std::cout << "[";
        for(int element = 0; element < element_count; element++)
        {
            // Comma goes in front of every element except the first
            if(element != 0)
            {
                std::cout << ",";
            }
            std::cout << items[item][element];
        }
        std::cout << "]";
    }
}

// Runs an exclusive scan over a small set of sample vectors, using function
// composition (CompositionOp) as the scan operator and `identity` as the
// initial value, then prints the resulting vectors.
//
// Returns 0 on success and 1 if an exception (for example a Thrust/CUDA
// error or an allocation failure) is thrown along the way.
int main(int argc, char **argv)
{
    // Command-line arguments are not used
    (void)argc;
    (void)argv;

    try
    {
        // Sample data
        const std::vector<item_t> sample_data = {{2, 1, 2, 2, 1}, {1, 1, 1, 1, 1}, {2, 2, 2, 2, 2}};

        // Run exclusive scan using composition as custom scan operator.
        // Parentheses are used on purpose: with braces, an initializer-list
        // constructor would take precedence whenever the argument is
        // convertible to the element type, turning a size into a single element.
        thrust::device_vector<item_t> d_in(sample_data);
        thrust::device_vector<item_t> d_out(sample_data.size());
        thrust::exclusive_scan(d_in.cbegin(), d_in.cend(), d_out.begin(), identity, CompositionOp{});

        // Get results & print
        thrust::host_vector<item_t> h_out(d_out);
        print_vectors(h_out);
    }
    catch(const std::exception& e)
    {
        // Report the failure instead of letting the exception terminate the program
        std::cerr << "error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}