// Compiler Explorer

// Source code

#include <cuco/static_multimap.cuh>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>

using namespace std;
using namespace std::chrono;

/**
 * @brief Inserts the pair {i, i} through `insert_view` for every i in [0, num_items).
 *
 * The thread block is partitioned into tiles of `cg_size` threads. Each tile handles
 * one item per loop iteration and then advances by the number of tiles in the grid,
 * so every item is visited exactly once for any launch configuration that provides
 * at least one full tile.
 *
 * Every lane of the tile executes the debug `printf`, so each item is reported
 * `cg_size` times.
 *
 * @tparam cg_size    Number of threads per cooperative-group tile
 * @tparam InsertView View type exposing `insert(tile, {key, value})`
 * @param insert_view View used to perform the insertions
 * @param num_items   Number of consecutive integer keys to insert
 */
template <uint32_t cg_size, typename InsertView>
__global__ void map_filler(InsertView insert_view, int num_items)
{
  auto const tile =
    cooperative_groups::tiled_partition<cg_size>(cooperative_groups::this_thread_block());

  // Work is distributed per tile, so both the first index and the stride must be
  // expressed in tiles. Striding by the thread count would skip every item whose
  // index is >= the number of tiles in the grid.
  int const tiles_in_grid = static_cast<int>((gridDim.x * blockDim.x) / cg_size);
  int const first_item    = static_cast<int>((threadIdx.x + blockIdx.x * blockDim.x) / cg_size);

  // A launch with fewer threads than one tile cannot make progress; bail out rather
  // than spin forever with a zero stride.
  if (tiles_in_grid == 0) { return; }

  for (int item = first_item; item < num_items; item += tiles_in_grid) {
    printf("Inserting for tid %d\n", item);
    insert_view.insert(tile, {item, item});
  }
}

/**
 * @brief For every key i in [0, num_items), accumulates `count_view.count(tile, i)` into
 * a shared-memory counter and reports keys that `contains` finds but whose count is 0.
 *
 * The thread block is partitioned into tiles of `cg_size` threads; each tile handles
 * one key per loop iteration and advances by the number of tiles in the grid.
 *
 * Launch requirement: at least `num_items * sizeof(cuda::std::atomic<int>)` bytes of
 * dynamic shared memory, because the counters are indexed by key.
 *
 * NOTE(review): every lane of the tile adds its own return value of `count()` to the
 * counter. This presumes `count()` yields a per-lane partial result that must be summed
 * across the tile — confirm against the cuCollections documentation.
 *
 * @tparam cg_size   Number of threads per cooperative-group tile
 * @tparam CountView View type exposing `count(tile, key)` and `contains(tile, key)`
 * @param count_view View used to query the map
 * @param num_items  Number of consecutive integer keys to check
 */
template <uint32_t cg_size, typename CountView>
__global__ void count_checker(CountView count_view, int num_items)
{
  auto const tile =
    cooperative_groups::tiled_partition<cg_size>(cooperative_groups::this_thread_block());

  extern __shared__ cuda::std::atomic<int> count[];

  // Shared memory starts out uninitialised: zero every counter before anything is
  // accumulated into it, and make the zeros visible to the whole block.
  for (int i = static_cast<int>(threadIdx.x); i < num_items; i += static_cast<int>(blockDim.x)) {
    count[i].store(0);
  }
  __syncthreads();

  // Work is distributed per tile, so the first index and the stride are in tiles.
  int const tiles_in_grid = static_cast<int>((gridDim.x * blockDim.x) / cg_size);
  int const first_key     = static_cast<int>((threadIdx.x + blockIdx.x * blockDim.x) / cg_size);

  // A launch with fewer threads than one tile cannot make progress (zero stride).
  if (tiles_in_grid == 0) { return; }

  for (int key = first_key; key < num_items; key += tiles_in_grid) {
    count[key] += count_view.count(tile, key);

    // All lanes of the tile must have contributed before any lane inspects the sum;
    // without this barrier a lane can observe 0 while its peers are still adding.
    // The loop condition is uniform across the tile, so every lane reaches the barrier.
    tile.sync();

    if (count_view.contains(tile, key) && count[key] == 0) {
      printf("tile ID %d exists in map but has count 0\n", key);
    }
  }
}

/**
 * @brief Fills a small `cuco::static_multimap` on the device, then runs the
 * count/contains consistency kernel, printing the host-measured time of each phase.
 *
 * Each phase is timed from just before the kernel launch until
 * `cudaDeviceSynchronize()` returns, so the figure includes launch overhead and any
 * device-side `printf` traffic.
 *
 * @return 0 on success, 1 if a kernel launch or device synchronisation reports an error
 */
int main(int argc, char** argv)
{
  using clock_type  = std::chrono::high_resolution_clock;
  using millis_type = std::chrono::duration<double, std::milli>;

  // Reports a failed CUDA call on stderr; returns true when `status` is cudaSuccess.
  auto const cuda_ok = [](cudaError_t status, char const* what) {
    if (status == cudaSuccess) { return true; }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
  };

  int num_items         = 2;
  int threads_per_block = 1024;
  std::size_t capacity  = 2 * num_items;
  cuco::static_multimap<int, int> test_map{capacity, cuco::empty_key{-1}, cuco::empty_value{-1}};
  auto constexpr cg_size = test_map.cg_size();

  // Each item is processed by a whole tile of cg_size threads, so the grid is sized
  // from the number of threads needed, not from the number of items.
  int const num_threads = num_items * static_cast<int>(cg_size);
  int const num_blocks  = (num_threads + threads_per_block - 1) / threads_per_block;

  // Insert into the map
  auto start = clock_type::now();
  map_filler<cg_size>
    <<<num_blocks, threads_per_block>>>(test_map.get_device_mutable_view(), num_items);
  // Launch-configuration errors surface via cudaGetLastError(); faults raised while the
  // kernel runs surface at the synchronisation point.
  if (!cuda_ok(cudaGetLastError(), "map_filler launch") ||
      !cuda_ok(cudaDeviceSynchronize(), "map_filler execution")) {
    return 1;
  }
  auto end                = clock_type::now();
  millis_type duration_ms = end - start;
  std::cout << "Insert time of " << duration_ms.count() << " ms" << std::endl;

  // Query the map
  start = clock_type::now();
  // One counter per item lives in dynamic shared memory.
  count_checker<cg_size>
    <<<num_blocks, threads_per_block, num_items * sizeof(cuda::std::atomic<int>)>>>(
      test_map.get_device_view(), num_items);
  if (!cuda_ok(cudaGetLastError(), "count_checker launch") ||
      !cuda_ok(cudaDeviceSynchronize(), "count_checker execution")) {
    return 1;
  }
  end         = clock_type::now();
  duration_ms = end - start;
  std::cout << "Count time of " << duration_ms.count() << " ms" << std::endl;

  return 0;
}