// Compiler Explorer
//
// Source code

#include <stdint.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <time.h>
#include <random>
#include <immintrin.h>

#define _dbg_assert_msg_(...) (void)0

// Counts the leading zero bits of a 32-bit value.
// Only call this with a non-zero argument: __builtin_clz(0) is undefined.
inline uint32_t clz32_nonzero(uint32_t value) {
	const int leading_zeros = __builtin_clz(value);
	return leading_zeros;
}

// Counts the leading zero bits of a 32-bit value.
// Zero is handled explicitly and yields 32, since __builtin_clz(0) is undefined.
inline uint32_t clz32(uint32_t value) {
	return value != 0 ? __builtin_clz(value) : 32;
}

// Overlays a float and a 32-bit unsigned integer on the same storage so
// the functions below can write a float and inspect (or build) its raw
// bit pattern.
union float2int {
	uint32_t i;  // raw bit pattern
	float f;     // the same storage viewed as a float
};

// Returns the 8-bit exponent field (bits 23..30) of a float32 bit
// pattern, still biased (0..255).
inline uint32_t get_uexp(uint32_t x) {
	const uint32_t exponent_field = x & 0x7F800000u;
	return exponent_field >> 23;
}

// Returns the exponent of a float32 bit pattern with the bias of 127
// removed (so 1.0f yields 0).
inline int32_t get_exp(uint32_t x) {
	const uint32_t biased = get_uexp(x);
	return biased - 127;
}

// Returns the 23 fraction bits of a float32 bit pattern with the
// hidden leading 1 (bit 23) always set, regardless of the exponent.
inline int32_t get_mant(uint32_t x) {
	const uint32_t hidden_one = 0x00800000;
	const uint32_t fraction = x & 0x007FFFFF;
	return fraction | hidden_one;
}

// Isolates the sign bit (bit 31) of a float32 bit pattern.
// The result is zero for positive inputs and non-zero for negative ones.
inline int32_t get_sign(uint32_t x) {
	const uint32_t sign_bit = x & 0x80000000;
	return sign_bit;
}

// Scalar reference implementation of a 4-element dot product, computed
// entirely with integer arithmetic on the float32 bit patterns.
//
// a, b: the two 4-element input vectors.
// Returns the sum of a[i]*b[i] as a float built by hand from sign,
// exponent and mantissa:
//   - the NaN pattern 0x7F800001 for the NaN cases detected below
//     (NaN operand, infinity times a zero-exponent value, or
//     infinities of opposite sign),
//   - infinity (exponent 255, mantissa 0) when the result exponent
//     reaches 255,
//   - positive 0.0f when the sum is zero or the exponent is <= 0.
float vfpu_dot_cpp(const float a[4], const float b[4]) {
	// Extra low-order bits carried in each product; they are truncated
	// again before the final rounding step.
	static const int EXTRA_BITS = 2;
	float2int result;
	float2int src[2];

	// Per-lane product: biased exponent, mantissa (with EXTRA_BITS) and sign.
int32_t exps[4];
	int32_t mants[4];
	int32_t signs[4];
	// Starts at 0, so the common exponent used for alignment is never negative.
	int32_t max_exp = 0;
	// Sign of the most recent infinite product, or -1 if none was seen yet.
	int32_t last_inf = -1;

	// Pass 1: multiply each pair of lanes and track the largest exponent.
for (int i = 0; i < 4; i++) {
		src[0].f = a[i];
		src[1].f = b[i];

		// get_mant() always sets the hidden 1, so inputs with a zero
		// exponent field are not special-cased at this point.
int32_t aexp = get_uexp(src[0].i);
		int32_t bexp = get_uexp(src[1].i);
		int32_t amant = get_mant(src[0].i) << EXTRA_BITS;
		int32_t bmant = get_mant(src[1].i) << EXTRA_BITS;

		// Sum of two biased exponents minus one bias = biased product exponent.
exps[i] = aexp + bexp - 127;
		if (aexp == 255) {
			// a[i] is NaN (non-zero fraction), or a[i] is infinite and
			// b[i] has a zero exponent field: INF * 0 = NAN
			// NOTE(review): an infinite a[i] with a NaN b[i] is treated as
			// infinity here, since only bexp == 0 is checked - confirm intended.
			if ((src[0].i & 0x007FFFFF) != 0 || bexp == 0) {
				result.i = 0x7F800001;
				return result.f;
			}
			// Infinite product: mantissa of 1.0, exponent pinned to 255.
			mants[i] = get_mant(0) << EXTRA_BITS;
			exps[i] = 255;
		} else if (bexp == 255) {
			// Same checks with the roles of a[i] and b[i] swapped.
			if ((src[1].i & 0x007FFFFF) != 0 || aexp == 0) {
				result.i = 0x7F800001;
				return result.f;
			}
			mants[i] = get_mant(0) << EXTRA_BITS;
			exps[i] = 255;
		} else {
			// TODO: Adjust precision?
			// 64-bit product of the two widened mantissas, scaled back so
			// that the result again carries EXTRA_BITS extra bits.
			uint64_t adjust = (uint64_t)amant * (uint64_t)bmant;
			mants[i] = (adjust >> (23 + EXTRA_BITS)) & 0x7FFFFFFF;
		}
		// Non-zero when exactly one of the two operands is negative.
		signs[i] = get_sign(src[0].i) ^ get_sign(src[1].i);

if (exps[i] > max_exp) {
			max_exp = exps[i];
		}
		if (exps[i] >= 255) {
			// Infinity minus infinity is not a real number.
			if (last_inf != -1 && signs[i] != last_inf) {
				result.i = 0x7F800001;
				return result.f;
			}
			last_inf = signs[i];
		}
	}

	// Pass 2: align every product to max_exp, apply its sign and accumulate.
int32_t mant_sum = 0;
	for (int i = 0; i < 4; i++) {
		int exp = max_exp - exps[i];
		// A shift by 32 or more would be undefined; such a term contributes nothing.
		if (exp >= 32) {
			mants[i] = 0;
		} else {
			mants[i] >>= exp;
		}
		if (signs[i]) {
			mants[i] = -mants[i];
		}
		mant_sum += mants[i];
	}

	// Convert the signed sum to sign + magnitude.
uint32_t sign_sum = 0;
	if (mant_sum < 0) {
		sign_sum = 0x80000000;
		mant_sum = -mant_sum;
	}

// Truncate off the extra bits now.  We want to zero them for rounding purposes.
	mant_sum >>= EXTRA_BITS;

if (mant_sum == 0 || max_exp <= 0) {
		return 0.0f;
	}

	// Normalize so that the leading 1 lands on bit 23 (8 leading zeros).
	// A negative shift means the sum is too wide and must be shifted right.
int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
	if (shift < 0) {
		// Round to even if we'd shift away a 0.5.
		// round_bit is the highest bit that will be shifted out.
		const uint32_t round_bit = 1 << (-shift - 1);
		if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
			// Round bit set and the bit kept above it is odd: round up.
			// The add may carry into a new top bit, so recompute the shift.
			mant_sum += round_bit;
			shift = (int8_t)clz32_nonzero(mant_sum) - 8;
		} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
			// Round bit set and something below it: more than half, round up.
			mant_sum += round_bit;
			shift = (int8_t)clz32_nonzero(mant_sum) - 8;
		}
		mant_sum >>= -shift;
		max_exp += -shift;
	} else {
		mant_sum <<= shift;
		max_exp -= shift;
	}
	_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);

	// Overflow becomes infinity; underflow becomes (positive) zero.
if (max_exp >= 255) {
		max_exp = 255;
		mant_sum = 0;
	} else if (max_exp <= 0) {
		return 0.0f;
	}

	// Reassemble sign, exponent and the 23 stored mantissa bits.
result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);
	return result.f;
}

// Per-lane unsigned "multiply high": for each of the four 32-bit lanes
// returns ((uint64_t)a[i]*(uint64_t)b[i])>>32.
// _mm_mul_epu32 only multiplies lanes 0 and 2, so the odd lanes are
// first moved into even positions and multiplied separately.
static inline __m128i mulhi32x4(__m128i a, __m128i b) {
	// 64-bit products of lanes 0 and 2.
	const __m128i prod_even = _mm_mul_epu32(a, b);

	// 64-bit products of lanes 1 and 3.
	const __m128i a_odd = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1));
	const __m128i b_odd = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
	const __m128i prod_odd = _mm_mul_epu32(a_odd, b_odd);

	// Move the high 32 bits of each product into lanes 0 and 1.
	const __m128i hi_even = _mm_shuffle_epi32(prod_even, _MM_SHUFFLE(3, 2, 3, 1));
	const __m128i hi_odd = _mm_shuffle_epi32(prod_odd, _MM_SHUFFLE(3, 2, 3, 1));

	// Interleave back into the original lane order: hi0, hi1, hi2, hi3.
	return _mm_unpacklo_epi32(hi_even, hi_odd);
}

// SSE2 version of vfpu_dot_cpp(): a 4-element dot product computed with
// integer arithmetic on the float32 bit patterns. Inputs that are not
// all finite, or whose largest product exponent reaches 255, are handed
// to vfpu_dot_cpp() instead.
//
// a, b: the two 4-element input vectors (loaded unaligned).
// Returns the dot product, or 0.0f when the sum is zero or the result
// exponent is <= 0.
//
// Values of rounding_mode:
//   -1 - detect at runtime
//    0 - assume round-to-nearest-ties-to-even
//    1 - round yourself in integer math
template<int rounding_mode=-1>
static float vfpu_dot_sse2(const float a[4], const float b[4])
{
	// Extra low-order bits carried in each product, as in vfpu_dot_cpp().
	static const int EXTRA_BITS = 2;

	// When true, the final normalization/rounding is done by an
	// int-to-float conversion instead of the integer code path.
bool is_default_rounding_mode = (rounding_mode == 0);
	if(rounding_mode == -1)
	{
		// Probe the current float rounding: under ties-to-even,
		// 1 + 0.5 ulp stays 1 and 1 + 1.5 ulp becomes 1 + 2 ulp.
		// volatile keeps the additions from being folded at compile time.
		volatile float test05 = 5.9604644775390625e-08f;  // 0.5*2^-23
		volatile float test15 = 1.78813934326171875e-07f; // 1.5*2^-23
		const float res15 = 1.0000002384185791015625f;    // 1+2^-22
		test05 += 1.0f;
		test15 += 1.0f;
		is_default_rounding_mode = (test05 == 1.0f && test15 == res15);
	}
	__m128 A = _mm_loadu_ps(a);
	__m128 B = _mm_loadu_ps(b);
	// Extract exponents.
	// Adding the two masked exponent fields and shifting down gives
	// aexp + bexp per lane (0..510), i.e. a doubly-biased product exponent.
	__m128 exp_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
	__m128 eA = _mm_and_ps(A, exp_mask);
	__m128 eB = _mm_and_ps(B, exp_mask);
	__m128i exps = _mm_srli_epi32(_mm_add_epi32(
		_mm_castps_si128(eA),
		_mm_castps_si128(eB)),23);
	// Find maximum exponent, stored as float32 in [1;2),
	// so we can use _mm_max_ps() with normal arguments.
	// (The exponent sum sits in the low mantissa bits of 1.0f, so float
	// ordering matches integer ordering of the sums.)
	__m128 t = _mm_or_ps(_mm_castsi128_ps(exps), _mm_set1_ps(1.0f));
	t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(2, 3, 0, 1))));
	t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(1, 0, 3, 2))));
	// Clamp from below at 127 so that max_exp (below) is never negative,
	// matching the max_exp = 0 starting value in vfpu_dot_cpp().
	t = _mm_max_ps(t, _mm_castsi128_ps(_mm_set1_epi32(0x3F80007F)));
	int32_t mexp = _mm_cvtsi128_si32(_mm_castps_si128(t)) & 511;
	// NOTE: mexp is doubly-biased, same for exps.
	int32_t max_exp = mexp - 127;
	// Fall back on anything weird.
	// x - x is NaN for infinities and NaNs, and NaN != NaN, so the
	// compare masks are all-ones only in lanes holding finite values.
	__m128 finiteA = _mm_sub_ps(A, A);
	__m128 finiteB = _mm_sub_ps(B, B);
	finiteA = _mm_cmpeq_ps(finiteA, finiteA);
	finiteB = _mm_cmpeq_ps(finiteB, finiteB);
	if(max_exp >= 255 || _mm_movemask_ps(_mm_and_ps(finiteA, finiteB)) != 15) return vfpu_dot_cpp(a, b);
	// Extract significands.
	// 23 fraction bits with the hidden 1 (bit 23) always set.
	__m128i mA = _mm_or_si128(_mm_and_si128(_mm_castps_si128(A),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));
	__m128i mB = _mm_or_si128(_mm_and_si128(_mm_castps_si128(B),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));
	// Multiply.
	// NOTE: vfpu_dot does multiplication as
	// ((x<<EXTRA_BITS)*(y<<EXTRA_BITS))>>(23+EXTRA_BITS),
	// here we do (x*y)>>(23-EXTRA_BITS-1),
	// which produces twice the result (neither expression
	// overflows in our case). We need that because our
	// variable-shift scheme (below) must shift by at least 1 bit.
	static const int s = 32-(23 - EXTRA_BITS - 1), s0 = s / 2,s1 = s - s0;
	// We compute ((x*y)>>shift) as
	// (((x*y)<<(32-shift))>>32), which we express as
	// (((x<<s0)*(y<<s1))>>32) (neither shift overflows).
	__m128i m = mulhi32x4(_mm_slli_epi32(mA, s0), _mm_slli_epi32(mB, s1));
	// Shift according to max_exp. Since SSE2 doesn't have
	// variable per-lane shifts, we multiply *again*,
	// specifically, x>>y turns into (x*(1<<(32-y)))>>32.
	// We compute 1<<(32-y) using floating-point casts.
	// NOTE: the cast for 1<<31 produces the correct value,
	// since the _mm_cvttps_epi32 error code just happens
	// to be 0x80000000.
	// So (since we pre-multiplied m by 2), we need
	// (m>>1)>>(mexp-exps),
	// i.e. m>>(mexp+1-exps),
	// i.e. (m<<(32-(mexp+1-exps)))>>32,
	// i.e. (m<<(exps-(mexp-31)))>>32.
	__m128i amounts = _mm_sub_epi32(exps, _mm_set1_epi32(mexp - 31));
	// Clamp by 0. Both zero and negative amounts produce zero,
	// since they correspond to right-shifting by 32 or more bits.
	amounts = _mm_and_si128(amounts, _mm_cmpgt_epi32(amounts, _mm_set1_epi32(0)));
	// Set up multipliers.
	// Adding amounts to the exponent field of 1.0f builds the float
	// 2^amounts, which the truncating conversion turns into an integer.
	__m128i bits = _mm_add_epi32(_mm_set1_epi32(0x3F800000), _mm_slli_epi32(amounts, 23));
	__m128i muls = _mm_cvttps_epi32(_mm_castsi128_ps(bits));
	m = mulhi32x4(m, muls);
	// Extract signs.
	// All-ones in lanes where the sign bits of A and B differ, else zero.
	__m128i signs = _mm_cmpgt_epi32(
			_mm_set1_epi32(0),
			_mm_xor_si128(_mm_castps_si128(A), _mm_castps_si128(B)));
	// Apply signs to m.
	// (m ^ -1) - (-1) negates m; (m ^ 0) - 0 leaves it unchanged.
	m = _mm_sub_epi32(_mm_xor_si128(m, signs), signs);
	// Horizontal sum.
	// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
	__m128i h64 = _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2));
	__m128i s64 = _mm_add_epi32(h64, m);
	__m128i h32 = _mm_shufflelo_epi16(s64, _MM_SHUFFLE(1, 0, 3, 2));
	__m128i s32 = _mm_add_epi32(s64, h32);
	int32_t mant_sum = _mm_cvtsi128_si32(s32);

// The rest is scalar.
	// Convert the signed sum to sign + magnitude.
	uint32_t sign_sum = 0;
	if (mant_sum < 0) {
		sign_sum = 0x80000000;
		mant_sum = -mant_sum;
	}

// Truncate off the extra bits now.  We want to zero them for rounding purposes.
	mant_sum >>= EXTRA_BITS;

if (mant_sum == 0 || max_exp <= 0) {
		return 0.0f;
	}

if(is_default_rounding_mode)
	{
		// Let the int-to-float conversion normalize and round mant_sum,
		// then read the mantissa and exponent back from the result.
		// 0x96 (150 = 127 + 23) is the biased exponent of a value whose
		// leading bit is bit 23, so the difference is the exponent adjustment.
		float2int r;
		r.f = (float)mant_sum;
		mant_sum = (r.i & 0x007FFFFF) | 0x00800000;
		max_exp += (r.i >> 23) - 0x96;
	}
	else
	{
		// Integer normalization and rounding, same steps as vfpu_dot_cpp().
		int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
		if (shift < 0) {
			// Round to even if we'd shift away a 0.5.
			const uint32_t round_bit = 1 << (-shift - 1);
			if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
				// The add may carry into a new top bit, so recompute the shift.
				mant_sum += round_bit;
				shift = (int8_t)clz32_nonzero(mant_sum) - 8;
			} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
				mant_sum += round_bit;
				shift = (int8_t)clz32_nonzero(mant_sum) - 8;
			}
			mant_sum >>= -shift;
			max_exp += -shift;
		} else {
			mant_sum <<= shift;
			max_exp -= shift;
		}
		_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);
	}

	// Overflow becomes infinity; underflow becomes (positive) zero.
if (max_exp >= 255) {
		max_exp = 255;
		mant_sum = 0;
	} else if (max_exp <= 0) {
		return 0.0f;
	}

	// Reassemble sign, exponent and the 23 stored mantissa bits.
float2int result;
	result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);
	return result.f;
}

// Reinterprets a 32-bit pattern as a float by copying its bytes.
static inline float bits2float(uint32_t v)
{
    float value;
    memcpy(&value, &v, sizeof(float));
    return value;
}

// Returns the raw 32-bit pattern of a float by copying its bytes.
// This is the inverse of bits2float().
//
// The return type is uint32_t: returning the pattern as a float would
// convert the integer *value* to float and round away its low bits
// (e.g. 0x3F800001 would come back as 1065353216.0f).
static inline uint32_t float2bits(float v)
{
    uint32_t ret;
    memcpy(&ret,&v,sizeof(ret));
    return ret;
}

// Test driver: checks that both vfpu_dot_sse2 variants agree with
// vfpu_dot_cpp on random inputs, then times each implementation.
// Returns 1 on the first mismatch, 0 otherwise.
// T (correctness iterations) and N (timing repetitions) may be
// overridden with preprocessor definitions; defining PROGRESS enables
// a percentage display during the correctness test.
int main()
{
#ifndef T
    static const int T=10000000;
#endif
#ifndef N
    static const int N=10000;
#endif
    // Number of 4-element vector pairs in the timing buffers.
    static const int M=1024;

    // Fixed seed, so every run sees the same input sequence.
std::mt19937 rng(1);

#if __GNUC__
    // Makeshift GCC-only x86 CPU detection.
    // Output similar to system("cat /proc/cpuinfo | grep 'model name' | uniq");
    // CPUID leaves 0x80000002..0x80000004 each return 16 characters
    // in the four registers; s has one extra byte for the terminator.
    printf("CPU: ");
    for(int i=2;i<5;++i)
    {
        unsigned reg[4]={0x80000000u+i,0,0,0};
        char s[17]={0};
        __asm__ volatile ("cpuid":"+a"(reg[0]),"+b"(reg[1]),"+c"(reg[2]),"+d"(reg[3]));
        memcpy(s,reg,sizeof(reg));
        printf("%s",s);
    }
    printf("\n");
#endif

printf("Testing correctness...");
    for(int i=0;i<T;++i)
    {
        float a[4],b[4];
        for(int j=0;j<4;++j)
        {
            // Arbitrary bit patterns, so NaNs, infinities and tiny
            // values are all possible in the first half of the run.
            uint32_t A=rng(),B=rng();
            if(i>T/2)
            {
                // Second half: keep the sign, fraction and low three
                // exponent bits, force the biased exponent to 128..135
                // so all inputs are finite and of similar magnitude.
                A=(A&0x83FFFFFFu)|0x40000000u;
                B=(B&0x83FFFFFFu)|0x40000000u;
            }
            a[j]=bits2float(A);
            b[j]=bits2float(B);
        }
        float x=vfpu_dot_cpp(a,b);
        float y=vfpu_dot_sse2<0>(a,b);
        float z=vfpu_dot_sse2<1>(a,b);
        // If x,y,z are not all equal - fail,
        // unless they are all NaN.
        // (v==v is false only for NaN, so the first clause means
        // "at least one result is not NaN".)
        if((x==x||y==y||z==z)&&(x!=y||x!=z))
        {
            printf(" FAIL!\n");
            for(int j=0;j<4;++j)
                printf("%19.9g * %19.9g %s\n",a[j],b[j],(j==3?"":"+"));
            printf("vfpu_dot_cpp:     %19.9g\n",x);
            printf("vfpu_dot_sse2<0>: %19.9g\n",y);
            printf("vfpu_dot_sse2<1>: %19.9g\n",z);
            return 1;
        }
#ifdef PROGRESS
        // Refresh the progress line once every 0x100000 iterations.
        if((i&0xFFFFF)==0) {printf("\rTesting correctness... [%7.3f%%]",100.0*(double)i/(double)T); fflush(stdout);}
#endif
    }
#ifdef PROGRESS
    printf("\rTesting correctness... [%7.3f%%] ok.\n",100.0);
#else
    printf(" ok.\n");
#endif

printf("Testing speed.\n");
    // M random vector pairs, reused for every timing pass.
    float a[4*M],b[4*M];
    double t;
    // Results are accumulated into a volatile - presumably so the
    // compiler cannot discard the timed calls.
    volatile float v=0.0f;
    for(int j=0;j<4*M;++j)
    {
        a[j]=bits2float(rng());
        b[j]=bits2float(rng());
    }
    // Each pass below makes N*M calls; the elapsed clock() time is
    // divided by CLOCKS_PER_SEC, N and M and printed in ns per call.
    v=0.0f;
    t=(double)clock();
    for(int i=0;i<N;++i)
        for(int j=0;j<M;++j)
            v+=vfpu_dot_cpp(a+4*j,b+4*j);
    t=(double)clock()-t;
    t=(t/CLOCKS_PER_SEC/N/M);
    printf("vfpu_dot_cpp:       %7.2f ns/call\n",1.0e+9*t);
    // Runtime-detected rounding mode.
    v=0.0f;
    t=(double)clock();
    for(int i=0;i<N;++i)
        for(int j=0;j<M;++j)
            v+=vfpu_dot_sse2<-1>(a+4*j,b+4*j);
    t=(double)clock()-t;
    t=(t/CLOCKS_PER_SEC/N/M);
    printf("vfpu_dot_sse2<-1>:  %7.2f ns/call\n",1.0e+9*t);
    // Rounding via int-to-float conversion.
    v=0.0f;
    t=(double)clock();
    for(int i=0;i<N;++i)
        for(int j=0;j<M;++j)
            v+=vfpu_dot_sse2< 0>(a+4*j,b+4*j);
    t=(double)clock()-t;
    t=(t/CLOCKS_PER_SEC/N/M);
    printf("vfpu_dot_sse2< 0>:  %7.2f ns/call\n",1.0e+9*t);
    // Rounding in integer math.
    v=0.0f;
    t=(double)clock();
    for(int i=0;i<N;++i)
        for(int j=0;j<M;++j)
            v+=vfpu_dot_sse2< 1>(a+4*j,b+4*j);
    t=(double)clock()-t;
    t=(t/CLOCKS_PER_SEC/N/M);
    printf("vfpu_dot_sse2< 1>:  %7.2f ns/call\n",1.0e+9*t);
    return 0;
}