Compiler Explorer

Source code

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include <omp.h>

// Much simplified
struct double2 {double x,y;};
struct dim3 {unsigned int x,y,z; constexpr dim3(unsigned int x=1u,unsigned int y=1u,unsigned int z=1u):x(x),y(y),z(z){}};

//
// include/complex_quda.h
//
template <typename ValueType> struct complex;
template<> struct complex <double> : public double2
{
	typedef double value_type;
	inline complex<double>() {};
	constexpr complex<double>(const double & re, const double& im = double()) :
		double2{re, im} { }
	constexpr double real() const { return x; }
	constexpr double imag() const { return y; }
	inline void real(double re){ x = re; }
	inline void imag(double im){ y = im; }

// cast operators
	template <typename T>
	inline operator complex<T>() const { return complex<T>(static_cast<T>(real()),static_cast<T>(imag())); }
};

//
// TARGET device
//
namespace device {
	constexpr int warp_size() { return 8; }
	template <int block_size_y = 1, int block_size_z = 1>
	constexpr unsigned int max_block_size()
	{
		return std::max(warp_size(), 512 / (block_size_y * block_size_z));
	}
	unsigned int processor_count()
	{
		static int m = -1;
		if(m<0){
			#pragma omp target map(tofrom:m)
			m = omp_get_num_procs();
		}
		return m/warp_size();
	}
	unsigned int max_threads_per_block()
	{
		static int m = -1;
		if(m<0){
			#pragma omp target teams map(tofrom:m)
			if(omp_get_team_num()==0)
				m = omp_get_max_threads();
		}
		return m;
	}
	unsigned int max_threads_per_block_dim(int i) { return max_threads_per_block(); }
	unsigned int max_grid_size(int i) { return 65536; }
}

struct Timer {
	double t;
	void start() { t = omp_get_wtime(); }
	void stop() { t = omp_get_wtime() - t; }
	double last() { return t; }
};
typedef Timer host_timer_t;
typedef Timer device_timer_t;

//
// include/tune_key.h
//
struct TuneKey {

static const int volume_n = 32;
	static const int name_n = 512;
	static const int aux_n = 256;
	char volume[volume_n];
	char name[name_n];

TuneKey() { }
	TuneKey(const char v[], const char n[]) {
		strcpy(volume, v);
		strcpy(name, n);
	} 
/*
	bool operator<(const TuneKey &other) const {
		int vc = std::strcmp(volume, other.volume);
		if (vc < 0) {
			return true;
		} else if (vc == 0) {
			int nc = std::strcmp(name, other.name);
			if (nc < 0) {
				return true;
		}
		return false;
	}
*/
	friend std::ostream& operator<<(std::ostream& output, const TuneKey& key)
	{
		output << "volume = " << key.volume << ", ";
		output << "name = " << key.name << ", ";
		return output;
	}

};

//
// include/tune_quda.h
//
struct TuneParam {
	dim3 block;
	dim3 grid;
	float time;
	friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) {
		output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
		output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << ")";
		return output;
	}
};

struct Tunable {
	virtual long long flops() const { return 0; }
	virtual long long bytes() const { return 0; }
	virtual unsigned int minThreads() const { return 1; }
	virtual bool tuneGridDim() const { return true; }
	virtual bool advanceGridDim(TuneParam &param) const
	{
		if (tuneGridDim()) {
			const int step = gridStep();
			param.grid.x += step;
			if (param.grid.x > maxGridSize()) {
				param.grid.x = minGridSize();
				return false;
			} else {
				return true;
			}
		} else {
			return false;
		}
	}

virtual unsigned int maxBlockSize(const TuneParam &param) const { return device::max_threads_per_block() / (param.block.y*param.block.z); }
	virtual unsigned int maxGridSize() const { return 2*device::processor_count(); }
	virtual unsigned int minGridSize() const { return 1; }
	virtual int gridStep() const { return 1; }
	virtual int blockStep() const;
	virtual int blockMin() const;
	virtual void resetBlockDim(TuneParam &param) const {
		if (tuneGridDim()) {
			param.block.x = blockMin();
		} else { // not tuning the grid dimension so have to set a valid grid size
			const auto step = blockStep();
			const auto max_threads = maxBlockSize(param);
			const auto max_blocks = device::max_grid_size(0);

// ensure the blockDim is large enough given the limit on gridDim
			param.block.x = (minThreads() + max_blocks - 1) / max_blocks;
			param.block.x = ((param.block.x+step-1)/step)*step; // round up to nearest step size
			if (param.block.x > max_threads && param.block.y == 1 && param.block.z == 1){
				printf("Local lattice volume is too large for device");
				exit(1);
			}
		}
	}

virtual bool advanceBlockDim(TuneParam &param) const
	{
		const unsigned int max_threads = maxBlockSize(param);
		bool ret;

param.block.x += blockStep();
		unsigned int nthreads = param.block.x * param.block.y * param.block.z;

if (param.block.x > max_threads || nthreads > device::max_threads_per_block()) {
			resetBlockDim(param);
			ret = false;
		} else {
			ret = true;
		}

if (!tuneGridDim()) param.grid.x = (minThreads() + param.block.x - 1) / param.block.x;

return ret;
	}
	char vol[TuneKey::volume_n];

virtual TuneKey tuneKey() const = 0;
	virtual void apply() = 0;
	virtual void preTune() { }
	virtual void postTune() { }
	virtual int tuningIter() const { return 32; }

virtual std::string paramString(const TuneParam &param) const
	{
		std::stringstream ps;
		ps << param;
		return ps.str();
	}

virtual std::string perfString(float time) const
	{
		float gflops = flops() / (1e9 * time);
		float gbytes = bytes() / (1e9 * time);
		std::stringstream ss;
		ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
		ss << gbytes << " GB/s";
		return ss.str();
	}

virtual void initTuneParam(TuneParam &param) const
	{
		const unsigned int max_threads = device::max_threads_per_block_dim(0);
		const unsigned int max_blocks = device::max_grid_size(0);
		const int min_grid_size = minGridSize();
		const int min_block_size = blockMin();

if (tuneGridDim()) {
			param.block = dim3(min_block_size,1,1);
			param.grid = dim3(min_grid_size,1,1);
		} else {
			// find the minimum valid blockDim
			param.block = dim3((minThreads()+max_blocks-1)/max_blocks, 1, 1);
			param.block.x = ((param.block.x+min_block_size-1) / min_block_size) * min_block_size; // round up to the nearest multiple of desired minimum block size
			if (param.block.x > max_threads){
				printf("Local lattice volume is too large for device");
				exit(1);
			}
			param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
		}
	}

virtual void defaultTuneParam(TuneParam &param) const
	{
		initTuneParam(param);
		if (tuneGridDim()) param.grid.x = maxGridSize(); // don't set y and z in case derived initTuneParam has
	}

virtual bool advanceTuneParam(TuneParam &param) const
	{
		return advanceBlockDim(param) || advanceGridDim(param);
	}

};

//
// TARGET specific
//
#define QUDA_RT_CONSTS \
	const dim3\
		blockDim=launch_param.block,\
		gridDim=launch_param.grid,\
		threadIdx(omp_get_thread_num()%launch_param.block.x, (omp_get_thread_num()/launch_param.block.x)%launch_param.block.y, omp_get_thread_num()/(launch_param.block.x*launch_param.block.y)),\
		blockIdx(omp_get_team_num()%launch_param.grid.x, (omp_get_team_num()/launch_param.grid.x)%launch_param.grid.y, omp_get_team_num()/(launch_param.grid.x*launch_param.grid.y))
bool invalid_launch_param(TuneParam &param)
{
	const int gd = param.grid.x*param.grid.y*param.grid.z;
	const int ld = param.block.x*param.block.y*param.block.z;
	int gn = 0, ln = 0;
	#pragma omp target teams num_teams(gd) thread_limit(ld) map(tofrom:gn,ln)
	#pragma omp parallel num_threads(ld)
	{
		if(omp_get_team_num()==0 && omp_get_thread_num()==0){
			gn = omp_get_num_teams();
			ln = omp_get_num_threads();
		}
	}
	return gn!=gd||ln!=ld;
}

struct LaunchParam{
	dim3 block;
	dim3 grid;
};
LaunchParam launch_param;
#pragma omp declare target to(launch_param)

void qudaSetupLaunchParameter(const TuneParam &tp)
{
	launch_param.grid = tp.grid;
	launch_param.block = tp.block;
	#pragma omp target update to(launch_param)
}

struct ColorSpinorField {
	int nColor, nSpin;
	size_t volumeCB;
	size_t length;
	size_t bytes;
	void *v;
	bool alloc;
    char vol_string[TuneKey::volume_n];
	ColorSpinorField(int nColor, int nSpin, size_t volumeCB) :
		nColor(nColor),
		nSpin(nSpin),
		volumeCB(volumeCB),
		length(volumeCB*nColor*nSpin*2),
		bytes(length*sizeof(double)),
		v(omp_target_alloc(bytes, omp_get_default_device())),
		alloc(true)
	{
		snprintf(vol_string, TuneKey::volume_n-1, "%zu", length);
	}
	ColorSpinorField(int nColor, int nSpin, size_t volumeCB, void *v) :
		nColor(nColor),
		nSpin(nSpin),
		volumeCB(volumeCB),
		length(volumeCB*nColor*nSpin*2),
		bytes(length*sizeof(double)),
		v(v),
		alloc(false)
	{
		snprintf(vol_string, TuneKey::volume_n-1, "%zu", length);
	}
	~ColorSpinorField(void)
	{
		if(alloc)
			omp_target_free(v, omp_get_default_device());
	}
	void * V(void) const {return v;}
	size_t Bytes(void) const {return bytes;}
	int VolumeCB(void) const {return volumeCB;}
	int Stride(void) const {return volumeCB;}
	int SiteSubset(void) const {return 1;}
	inline const char *VolString() const { return vol_string; }
};

template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel2D_impl(const Arg &arg)
{
	QUDA_RT_CONSTS;
	Functor<Arg> f(arg);

auto i = threadIdx.x + blockIdx.x * blockDim.x;
	auto j = threadIdx.y + blockIdx.y * blockDim.y;
	if (j >= arg.threads.y) return;

while (i < arg.threads.x) {
		f(i, j);
		if (grid_stride) i += gridDim.x * blockDim.x; else break;
	}
}

template <template <typename> class Functor, typename Arg, bool grid_stride = false>
void Kernel2D(Arg arg)
{
	const int gd = launch_param.grid.x*launch_param.grid.y*launch_param.grid.z;
	const int ld = launch_param.block.x*launch_param.block.y*launch_param.block.z;
	Arg *dparg = (Arg*)omp_target_alloc(sizeof(Arg), omp_get_default_device());
	omp_target_memcpy(dparg, (void *)(&arg), sizeof(Arg), 0, 0, omp_get_default_device(), omp_get_initial_device());
	#pragma omp target teams num_teams(gd) thread_limit(ld) is_device_ptr(dparg)
	#pragma omp parallel num_threads(ld)
	{
		char buffer[sizeof(Arg)];
		memcpy(buffer, (void *)dparg, sizeof(Arg));
		Kernel2D_impl<Functor, Arg, grid_stride>(*(Arg *)buffer);
	}
	omp_target_free(dparg, omp_get_default_device());
}

//
// include/targets/omptarget/tunable_kernel.h
//
struct TunableKernel : public Tunable
{
	template <bool grid_stride, typename Func, typename Arg>
	void launch_device(Func *func, const TuneParam &tp, const Arg &arg)
	{
		qudaSetupLaunchParameter(tp);
		func(arg);
	}
	TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name()); }
};

//
// include/tunable_nd.h
//
template <bool grid_stride>
struct TunableKernel1D_base : public TunableKernel
{
	virtual bool tuneGridDim() const { return grid_stride; }
	TunableKernel1D_base(const ColorSpinorField &field)
	{
		strcpy(vol, field.VolString());
	}
};
template <bool grid_stride = false>
struct TunableKernel2D_base : public TunableKernel1D_base<grid_stride>
{
	mutable unsigned int vector_length_y;
	mutable unsigned int step_y;
	bool tune_block_x;

template <template <typename> class Functor, typename Arg>
	void launch_device(const TuneParam &tp, const Arg &arg)
	{
		const_cast<Arg &>(arg).threads.y = vector_length_y;
		TunableKernel::launch_device<grid_stride>(Kernel2D<Functor, Arg, grid_stride>, tp, arg);
	}

template <template <typename> class Functor, typename Arg>
	void launch(const TuneParam &tp, const Arg &arg)
	{
		launch_device<Functor, Arg>(tp, arg);
	}
	TunableKernel2D_base(const ColorSpinorField &field, unsigned int vector_length_y) :
		TunableKernel1D_base<grid_stride>(field),
		vector_length_y(vector_length_y),
		step_y(1),
		tune_block_x(true)
	{ }

bool advanceBlockDim(TuneParam &param) const
	{
		dim3 block = param.block;
		dim3 grid = param.grid;
		param.block.y = block.y;
		param.grid.y = grid.y;
		bool ret = tune_block_x ? Tunable::advanceBlockDim(param) : false;

if (ret) {
			return true;
		} else { // block.x (spacetime) was reset

auto next = param;
			next.block.y += step_y;

// we can advance spin/block-color since this is valid
			if (param.block.y < vector_length_y && param.block.y < device::max_threads_per_block_dim(1) &&
					param.block.x*(param.block.y+step_y)*param.block.z <= device::max_threads_per_block()) {
				param.block.y += step_y;
				param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
				return true;
			} else { // we have run off the end so let's reset
				param.block.y = step_y;
				param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;

return false;
			}
		}
	}

void initTuneParam(TuneParam &param) const
	{
		Tunable::initTuneParam(param);
		param.block.y = step_y;
		param.grid.y = (vector_length_y + step_y - 1) / step_y;
	}

/** sets default values for when tuning is disabled */
	void defaultTuneParam(TuneParam &param) const
	{
		Tunable::defaultTuneParam(param);
		param.block.y = step_y;
		param.grid.y = (vector_length_y + step_y - 1) / step_y;
	}

void resizeVector(int y) const { vector_length_y = y; }
	void resizeStep(int y) const { step_y = y; }
};

struct TunableKernel2D : public TunableKernel2D_base<false> {
	virtual unsigned int minThreads() const = 0;
	TunableKernel2D(const ColorSpinorField &field, unsigned int vector_length_y) :
		TunableKernel2D_base<false>(field, vector_length_y) {}
};

//
// lib/tune.cpp
//
static bool tuning = false;
TuneParam tuneLaunch(Tunable &tunable)
{
	TuneKey key = tunable.tuneKey();
	static TuneParam param;

if (!tuning) {

TuneParam best_param;
		double best_time;

tuning = true;
		best_time = DBL_MAX;

printf("Tuning %s at vol=%s\n", key.name, key.volume);

device_timer_t timer;

host_timer_t tune_timer;
		tune_timer.start();

tunable.initTuneParam(param);
		while (tuning) {
			printf("About to call tunable.apply block=(%d,%d,%d) grid=(%d,%d,%d)",
				param.block.x, param.block.y, param.block.z, param.grid.x, param.grid.y, param.grid.z);

tunable.apply(); // do initial call in case we need to jit compile for these parameters or if policy tuning

timer.start();
			for (int i = 0; i < tunable.tuningIter(); i++) {
				tunable.apply(); // calls tuneLaunch() again, which simply returns the currently active param
			}
			timer.stop();

float elapsed_time = timer.last() / tunable.tuningIter();
			if (elapsed_time < best_time) {
				best_time = elapsed_time;
				best_param = param;
			}
			printf("...	gives %s (time %.3f ms x %d)\n",
				tunable.perfString(elapsed_time).c_str(), elapsed_time*1e3, tunable.tuningIter());
			do{
				tuning = tunable.advanceTuneParam(param);
			}while(tuning && invalid_launch_param(param));
		}
		printf("Tuned %s giving %s for %s\n", tunable.paramString(best_param).c_str(),
			tunable.perfString(best_time).c_str(), key.name);
		best_param.time = best_time;

param = best_param;
	}

return param;
}
int Tunable::blockStep() const { return device::warp_size(); }
int Tunable::blockMin() const { return device::warp_size(); }

//
// include/targets/generic/load_store.h
//
template <typename VectorType>
inline VectorType vector_load(const void *ptr, int idx)
{
	VectorType value;
	value = reinterpret_cast<const VectorType *>(ptr)[idx];
	return value;
}
template <typename VectorType>
inline void vector_store(void *ptr, int idx, const VectorType &value)
{
	reinterpret_cast<VectorType *>(ptr)[idx] = value;
}

//
// include/convert.h
//
template <typename T1, typename T2>
constexpr void copy(T1 &a, const T2 &b)
{
	a = b;
}
template <typename T1, typename T2>
constexpr void copy_scaled(T1 &a, const T2 &b)
{
	copy(a, b);
}
template <typename T1, typename T2, typename T3>
constexpr void copy_and_scale(T1 &a, const T2 &b, const T3 &)
{
	copy(a, b);
}

//
// include/register_traits.h
//
template<typename> struct mapper { };
template<> struct mapper<double> { typedef double type; };
template <typename Float, int number> struct VectorType;
template <> struct VectorType<double, 1>{typedef double type; };
template <> struct VectorType<double, 2>{typedef double2 type; };
template<bool large_alloc> struct AllocType { };
template<> struct AllocType<true> { typedef size_t type; };
template<> struct AllocType<false> { typedef int type; };

//
// include/color_spinor_field_order.h
//
/**
	 @brief colorspinor_wrapper is an internal class that is used to
	 wrap instances of colorspinor accessors, currying in a specifc
	 location on the field.	The operator() accessors in
	 colorspinor-field accessors return instances to this class,
	 allowing us to then use operator overloading upon this class
	 to interact with the ColorSpinor class.	As a result we can
	 include colorspinor-field accessors directly in ColorSpinor
	 expressions in kernels without having to declare temporaries
	 with explicit calls to the load/save methods in the
	 colorspinor-field accessors.
*/
template <typename Float, typename T>
struct colorspinor_wrapper {
	const T &field;
	const int x_cb;
	const int parity;

/**
		 @brief colorspinor_wrapper constructor
		 @param[in] a colorspinor field accessor we are wrapping
		 @param[in] x_cb checkerboarded space-time index we are accessing
		 @param[in] parity Parity we are accessing
	*/
	inline colorspinor_wrapper<Float, T>(const T &field, int x_cb, int parity) :
			field(field),
			x_cb(x_cb),
			parity(parity)
	{ }

/**
		 @brief Assignment operator with ColorSpinor instance as input
		 @param[in] C ColorSpinor we want to store in this accessor
	*/
	template <typename C> inline void operator=(const C &a) const { field.save(a.data, x_cb, parity); }
};

/**
	 @brief Accessor routine for ColorSpinorFields in native field order.
	 @tparam Float Underlying storage data type of the field
	 @tparam Ns Number of spin components
	 @tparam Nc Number of colors
	 @tparam N Number of real numbers per short vector
	 @tparam spin_project Whether the ghosts are spin projected or not
	 @tparam huge_alloc Template parameter that enables 64-bit
	 pointer arithmetic for huge allocations (e.g., packed set of
	 vectors).	Default is to use 32-bit pointer arithmetic.
 */
template <typename Float, int Ns, int Nc, int N_, bool spin_project = false, bool huge_alloc = false>
struct FloatNOrder {
	static_assert((2 * Ns * Nc) % N_ == 0, "Internal degrees of freedom not divisible by short-vector length");
	static constexpr int length = 2 * Ns * Nc;
	static constexpr int N = N_;
	static constexpr int M = length / N;
	using Accessor = FloatNOrder<Float, Ns, Nc, N, spin_project, huge_alloc>;
	using real = typename mapper<Float>::type;
	using complex = complex<real>;
	using Vector = typename VectorType<Float, N>::type;
	using AllocInt = typename AllocType<huge_alloc>::type;
	using norm_type = float;
	Float *field;
	const AllocInt offset; // offset can be 32-bit or 64-bit
	int volumeCB;
	int stride;
	int nParity;
	void *backup_h; //! host memory for backing up the field when tuning
	size_t bytes;

FloatNOrder(const ColorSpinorField &a, Float *field_ = 0) :
		field(field_ ? field_ : (Float *)a.V()),
		offset(a.Bytes() / (2 * sizeof(Float) * N)),
		volumeCB(a.VolumeCB()),
		stride(a.Stride()),
		nParity(a.SiteSubset()),
		bytes(a.Bytes())
	{ }

inline void load(complex out[length / 2], int x, int parity = 0) const
	{
		real v[length];
		norm_type nrm = 0.0;
		#pragma unroll
		for (int i=0; i<M; i++) {
			// first load from memory
			Vector vecTmp = vector_load<Vector>(field, parity * offset + x + stride * i);
			// now copy into output and scale
			#pragma unroll
			for (int j = 0; j < N; j++) copy_and_scale(v[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm);
		}
		
		#pragma unroll
		for (int i = 0; i < length / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
	}
		
	inline void save(const complex in[length / 2], int x, int parity = 0) const
	{
		real v[length];
		
		#pragma unroll
		for (int i = 0; i < length / 2; i++) {
			v[2 * i + 0] = in[i].real();
			v[2 * i + 1] = in[i].imag();
		}
		
		#pragma unroll
		for (int i=0; i<M; i++) {
			Vector vecTmp;
			// first do scalar copy converting into storage type
			#pragma unroll
			for (int j = 0; j < N; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], v[i * N + j]);
			// second do vectorized copy into memory
			vector_store(field, parity * offset + x + stride * i, vecTmp);
		}
	}
	
	/**
	 @brief This accessor routine returns a colorspinor_wrapper to this object,
	 allowing us to overload various operators for manipulating at
	 the site level interms of matrix operations.
	 @param[in] x_cb Checkerboarded space-time index we are requesting
	 @param[in] parity Parity we are requesting
	 @return Instance of a colorspinor_wrapper that curries in access to
	 this field at the above coordinates.
	*/
	inline auto operator()(int x_cb, int parity) const
	{
		return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
	}
	
	size_t Bytes() const
	{
		return nParity * volumeCB * (Nc * Ns * 2 * sizeof(Float));
	}
};

template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false> struct colorspinor_mapper {
};

// double precision
template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 4, Nc, false, huge_alloc> {
	typedef FloatNOrder<double, 4, Nc, 2, false, huge_alloc> type;
};
template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 4, Nc, true, huge_alloc> {
	typedef FloatNOrder<double, 4, Nc, 2, true, huge_alloc> type;
};
template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 2, Nc, false, huge_alloc> {
	typedef FloatNOrder<double, 2, Nc, 2, false, huge_alloc> type;
};
template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 1, Nc, false, huge_alloc> {
	typedef FloatNOrder<double, 1, Nc, 2, false, huge_alloc> type;
};

//
// include/kernel_helper.h
//
template <bool use_kernel_arg_ = true>
struct kernel_param {
	static constexpr bool use_kernel_arg = use_kernel_arg_;
	dim3 threads; /** number of active threads required */
	constexpr kernel_param() = default;
	constexpr kernel_param(dim3 threads) :
		threads(threads)
	{ }
};

//
// include/color_spinor.h
//
template <typename Float, int Nc, int Ns>
struct ColorSpinor {
	static constexpr int size = Nc * Ns;
	complex<Float> data[size];
	inline ColorSpinor<Float, Nc, Ns>()
	{
#pragma unroll
		for (int i = 0; i < size; i++) { data[i] = 0; }
	}

inline ColorSpinor<Float, Nc, Ns>(const ColorSpinor<Float, Nc, Ns> &a) {
#pragma unroll
		for (int i = 0; i < size; i++) { data[i] = a.data[i]; }
	}

inline ColorSpinor<Float, Nc, Ns>& operator=(const ColorSpinor<Float, Nc, Ns> &a) {
		if (this != &a) {
#pragma unroll
			for (int i = 0; i < size; i++) { data[i] = a.data[i]; }
		}
		return *this;
	}
	template<typename S>
	inline ColorSpinor(const colorspinor_wrapper<Float,S> &a) {
		a.field.load(data, a.x_cb, a.parity);
	}
};

//
// include/kernels/copy_color_spinor.cuh
//
template <typename FloatOut, typename FloatIn, int nSpin_, int nColor_, typename Out, typename In, template <int, int> class Basis_>
struct CopyColorSpinorArg : kernel_param<>  {
	using Basis = Basis_<nSpin_, nColor_>;
	using realOut = typename mapper<FloatOut>::type;
	using realIn = typename mapper<FloatIn>::type;
	static constexpr int nSpin = nSpin_;
	static constexpr int nColor = nColor_;
	Out out;
	const In in;
	const int outParity;
	const int inParity;
	CopyColorSpinorArg(ColorSpinorField &out, const ColorSpinorField &in,
			FloatOut* Out_, FloatIn *In_, float *outNorm, float *inNorm) :
		kernel_param(dim3(in.VolumeCB(), in.SiteSubset(), 1)),
		out(out, Out_),
		in(in, In_),
		outParity(0),
		inParity(0)
	{ }
};

template <int Ns, int Nc>
struct PreserveBasis {
	template <typename FloatOut, typename FloatIn>
	inline void operator()(complex<FloatOut> out[Ns*Nc], const complex<FloatIn> in[Ns*Nc]) const {
		for (int s=0; s<Ns; s++) for (int c=0; c<Nc; c++) out[s*Nc+c] = in[s*Nc+c];
	}
};

template <typename Arg> struct CopyColorSpinor_ {
	const Arg &arg;
	constexpr CopyColorSpinor_(const Arg &arg): arg(arg) {}

inline void operator()(int x_cb, int parity)
	{
		ColorSpinor<typename Arg::realIn, Arg::nColor, Arg::nSpin> in = arg.in(x_cb, (parity+arg.inParity)&1);
		ColorSpinor<typename Arg::realOut, Arg::nColor, Arg::nSpin> out;
		typename Arg::Basis basis;
		basis(out.data, in.data);
		arg.out(x_cb, (parity+arg.outParity)&1) = out;
	}
};

//
// lib/copy_color_spinor.cuh
//
template <int Ns, int Nc, typename Out, typename In, typename param_t>
class CopyColorSpinor : TunableKernel2D {
	using FloatOut = typename std::remove_pointer<typename std::tuple_element<2, param_t>::type>::type;
	using FloatIn = typename std::remove_pointer<typename std::tuple_element<3, param_t>::type>::type;
	template <template <int, int> class Basis> using Arg = CopyColorSpinorArg<FloatOut, FloatIn, Ns, Nc, Out, In, Basis>;
	FloatOut *Out_;
	FloatIn *In_;
	float *outNorm;
	float *inNorm;
	ColorSpinorField &out;
	const ColorSpinorField &in;

unsigned int minThreads() const { return in.VolumeCB(); }

public:
	CopyColorSpinor(ColorSpinorField &out, const ColorSpinorField &in, const param_t &param) :
		TunableKernel2D(in, in.SiteSubset()),
		Out_(std::get<2>(param)),
		In_(std::get<3>(param)),
		outNorm(std::get<4>(param)),
		inNorm(std::get<5>(param)),
		out(out),
		in(in)
	{
		apply();
	}

void apply()
	{
		TuneParam tp = tuneLaunch(*this);
		launch<CopyColorSpinor_>(tp, Arg<PreserveBasis>(out, in, Out_, In_, outNorm, inNorm));
	}

long long bytes() const { return in.Bytes() + out.Bytes(); }
};

constexpr int Ns = 4;
constexpr int Nc = 3;

int
main(int argc, char *argv[])
{
	size_t vol, i, fail;
	double *x, *y;

vol = argc <= 1 ? 1L<<22 : atol(argv[1]);
	x = (double *)omp_target_alloc(sizeof(double)*vol*Ns*Nc*2, omp_get_default_device());

#pragma omp target teams distribute parallel for is_device_ptr(x)
	for(i=0;i<vol*Ns*Nc*2;++i)
		x[i] = (double)i;

ColorSpinorField out(Nc, Ns, vol);
	ColorSpinorField in(Nc, Ns, vol, x);
	using FloatOut = double;
	using FloatIn = double;
	using param_t = std::tuple<ColorSpinorField &, const ColorSpinorField &, FloatOut *, FloatIn *, float *, float *>;
	using O = typename colorspinor_mapper<FloatOut,Ns,Nc>::type;
	using I = typename colorspinor_mapper<FloatIn,Ns,Nc>::type;
	param_t param(out, in, nullptr, nullptr, nullptr, nullptr);
	CopyColorSpinor<Ns, Nc, O, I, param_t>(out, in, param);

fail = 0;
	y = (double*)out.v;
	#pragma omp target teams distribute parallel for reduction(+:fail) is_device_ptr(x,y)
	for(i=0;i<vol*Ns*Nc*2;++i)
		if(x[i]!=y[i])
			fail++;
	if(fail>0)
		printf("%zu elements differ!\n", fail);

omp_target_free(x, omp_get_default_device());

return fail>0;
}