Compiler Explorer
Rust source #2: word-wise copy_forward using unordered atomic loads
#![feature(core_intrinsics)]
#![feature(cfg_target_has_atomic)]
#![allow(internal_features)]

use core::intrinsics::likely;

const WORD_SIZE: usize = core::mem::size_of::<usize>();
const WORD_MASK: usize = WORD_SIZE - 1;

// If the number of bytes involved exceeds this threshold we will opt into word-wise copy.
// The value selected here is max(2 * WORD_SIZE, 16):
// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through
//   word-wise copy.
// * The word-wise copy logic needs to perform some checks, so it has some small overhead. The
//   threshold ensures that even on 32-bit platforms we have copied at least 8 bytes through
//   word-wise copy, so the savings of word-wise copy outweigh the fixed overhead.
const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 { 2 * WORD_SIZE } else { 16 };

#[cfg(feature = "mem-unaligned")]
unsafe fn read_usize_unaligned(x: *const usize) -> usize {
    // Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which
    // is translated to memcpy in LLVM.
    let x_read = (x as *const [u8; core::mem::size_of::<usize>()]).read();
    core::mem::transmute(x_read)
}

#[unsafe(no_mangle)]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
    #[inline(always)]
    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
        let dest_end = dest.wrapping_add(n);
        while dest < dest_end {
            *dest = *src;
            dest = dest.wrapping_add(1);
            src = src.wrapping_add(1);
        }
    }

    #[inline(always)]
    unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let mut src_usize = src as *mut usize;
        let dest_end = dest.wrapping_add(n) as *mut usize;
        while dest_usize < dest_end {
            *dest_usize = *src_usize;
            dest_usize = dest_usize.wrapping_add(1);
            src_usize = src_usize.wrapping_add(1);
        }
    }

    #[inline(always)]
    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let dest_end = dest.wrapping_add(n) as *mut usize;

        // Calculate the misalignment offset and shift needed to reassemble the value.
        let offset = src as usize & WORD_MASK;
        let shift = offset * 8;

        // Realign src
        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
        // This will read (but won't use) bytes out of bounds.
        // cfg needed because not all targets will have atomic loads that can be lowered
        // (e.g. BPF, MSP430), or provided by an external library (e.g. RV32I)
        #[cfg(target_has_atomic_load_store = "ptr")]
        let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);
        #[cfg(not(target_has_atomic_load_store = "ptr"))]
        let mut prev_word = core::ptr::read_volatile(src_aligned);

        while dest_usize < dest_end {
            src_aligned = src_aligned.wrapping_add(1);
            let cur_word = *src_aligned;
            #[cfg(target_endian = "little")]
            let reassembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift);
            #[cfg(target_endian = "big")]
            let reassembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift);
            prev_word = cur_word;

            *dest_usize = reassembled;
            dest_usize = dest_usize.wrapping_add(1);
        }
    }

    if n >= WORD_COPY_THRESHOLD {
        // Align dest
        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
        let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK;
        copy_forward_bytes(dest, src, dest_misalignment);
        dest = dest.wrapping_add(dest_misalignment);
        src = src.wrapping_add(dest_misalignment);
        n -= dest_misalignment;

        let n_words = n & !WORD_MASK;
        let src_misalignment = src as usize & WORD_MASK;
        if likely(src_misalignment == 0) {
            copy_forward_aligned_words(dest, src, n_words);
        } else {
            copy_forward_misaligned_words(dest, src, n_words);
        }
        dest = dest.wrapping_add(n_words);
        src = src.wrapping_add(n_words);
        n -= n_words;
    }
    copy_forward_bytes(dest, src, n);
}
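The heart of copy_forward_misaligned_words is the shift-reassembly step: two adjacent aligned word loads are spliced into the value an unaligned load would have produced. A minimal stand-alone sketch of that arithmetic, assuming a 64-bit little-endian target; reassemble_le is a hypothetical helper name for this demo, not part of the snippet above:

// Stand-alone illustration of the shift-reassembly step from
// copy_forward_misaligned_words above. Assumes a 64-bit little-endian
// target; `reassemble_le` is a hypothetical name used only in this demo.
const WORD_BITS: usize = usize::BITS as usize;

fn reassemble_le(prev_word: usize, cur_word: usize, offset_bytes: usize) -> usize {
    let shift = offset_bytes * 8;
    // `offset` in the real code is always in 1..WORD_SIZE, so these shifts never overflow.
    debug_assert!(shift > 0 && shift < WORD_BITS);
    // Low bytes come from the tail of the previous aligned word,
    // high bytes from the head of the current one.
    prev_word >> shift | cur_word << (WORD_BITS - shift)
}

fn main() {
    // Bytes 0x00..=0x0f laid out in memory; we want the word starting at byte 3.
    let bytes: [u8; 16] = core::array::from_fn(|i| i as u8);
    let words = [
        usize::from_le_bytes(bytes[0..8].try_into().unwrap()),
        usize::from_le_bytes(bytes[8..16].try_into().unwrap()),
    ];
    let got = reassemble_le(words[0], words[1], 3);
    let want = usize::from_le_bytes(bytes[3..11].try_into().unwrap());
    assert_eq!(got, want);
    println!("{got:#018x}"); // 0x0a09080706050403
}

With offset 3 the shift is 24 bits: the low five bytes of the result come from words[0] and the high three from words[1], matching what a single unaligned 8-byte load at address 3 would return.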
Rust source #1: word-wise copy_forward using partial aligned loads
#![feature(core_intrinsics)]
#![allow(internal_features)]

use std::intrinsics::likely;

const WORD_SIZE: usize = core::mem::size_of::<usize>();
const WORD_MASK: usize = WORD_SIZE - 1;

// If the number of bytes involved exceeds this threshold we will opt into word-wise copy.
// The value selected here is max(2 * WORD_SIZE, 16):
// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through
//   word-wise copy.
// * The word-wise copy logic needs to perform some checks, so it has some small overhead. The
//   threshold ensures that even on 32-bit platforms we have copied at least 8 bytes through
//   word-wise copy, so the savings of word-wise copy outweigh the fixed overhead.
const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 { 2 * WORD_SIZE } else { 16 };

#[inline(always)]
unsafe fn load_chunk_aligned<T: Copy>(
    src: *const usize,
    dst: *mut usize,
    load_sz: usize,
    offset: usize,
) -> usize {
    let chunk_sz = core::mem::size_of::<T>();
    if (load_sz & chunk_sz) != 0 {
        *dst.wrapping_byte_add(offset).cast::<T>() = *src.wrapping_byte_add(offset).cast::<T>();
        offset | chunk_sz
    } else {
        offset
    }
}

#[inline(always)]
unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
    debug_assert!(load_sz < WORD_SIZE);
    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
    // (since `load_sz < WORD_SIZE`).
    const { assert!(WORD_SIZE <= 8) };

    let mut i = 0;
    let mut out = 0usize;
    // We load in decreasing order, so the pointers remain sufficiently aligned for the next step.
    i = load_chunk_aligned::<u32>(src, &raw mut out, load_sz, i);
    i = load_chunk_aligned::<u16>(src, &raw mut out, load_sz, i);
    i = load_chunk_aligned::<u8>(src, &raw mut out, load_sz, i);
    debug_assert!(i == load_sz);
    out
}

#[inline(always)]
unsafe fn load_aligned_end_partial(src: *const usize, load_sz: usize) -> usize {
    debug_assert!(load_sz < WORD_SIZE);
    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
    // (since `load_sz < WORD_SIZE`).
    const { assert!(WORD_SIZE <= 8) };

    let mut i = 0;
    let mut out = 0usize;
    // Obtain pointers pointing to the beginning of the range we want to load.
    let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
    let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
    // We load in increasing order, so by the time we reach `u16` things are 2-aligned etc.
    i = load_chunk_aligned::<u8>(src_shifted, out_shifted, load_sz, i);
    i = load_chunk_aligned::<u16>(src_shifted, out_shifted, load_sz, i);
    i = load_chunk_aligned::<u32>(src_shifted, out_shifted, load_sz, i);
    debug_assert!(i == load_sz);
    out
}

#[unsafe(no_mangle)]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
    #[inline(always)]
    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
        let dest_end = dest.wrapping_add(n);
        while dest < dest_end {
            *dest = *src;
            dest = dest.wrapping_add(1);
            src = src.wrapping_add(1);
        }
    }

    #[inline(always)]
    unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let mut src_usize = src as *mut usize;
        let dest_end = dest.wrapping_add(n) as *mut usize;
        while dest_usize < dest_end {
            *dest_usize = *src_usize;
            dest_usize = dest_usize.wrapping_add(1);
            src_usize = src_usize.wrapping_add(1);
        }
    }

    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
    /// `src` *must not* be `usize`-aligned.
    #[inline(always)]
    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
        debug_assert!(n > 0 && n % WORD_SIZE == 0);
        debug_assert!(src.addr() % WORD_SIZE != 0);

        let mut dest_usize = dest as *mut usize;
        let dest_end = dest.wrapping_add(n) as *mut usize;

        // Calculate the misalignment offset and shift needed to reassemble the value.
        // Since `src` is definitely not aligned, `offset` is in the range 1..WORD_SIZE.
        let offset = src as usize & WORD_MASK;
        let shift = offset * 8;

        // Realign src
        let mut src_aligned = src.wrapping_byte_sub(offset) as *mut usize;
        let mut prev_word = load_aligned_end_partial(src_aligned, WORD_SIZE - offset);

        while dest_usize.wrapping_add(1) < dest_end {
            src_aligned = src_aligned.wrapping_add(1);
            let cur_word = *src_aligned;
            let reassembled = if cfg!(target_endian = "little") {
                prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)
            } else {
                prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift)
            };
            prev_word = cur_word;

            *dest_usize = reassembled;
            dest_usize = dest_usize.wrapping_add(1);
        }

        // There's one more element left to go, and we can't use the loop for that as on the
        // `src` side, it is partially out-of-bounds.
        src_aligned = src_aligned.wrapping_add(1);
        let cur_word = load_aligned_partial(src_aligned, offset);
        let reassembled = if cfg!(target_endian = "little") {
            prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)
        } else {
            prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift)
        };
        // prev_word does not matter any more
        *dest_usize = reassembled;
        // dest_usize does not matter any more
    }

    if n >= WORD_COPY_THRESHOLD {
        // Align dest
        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
        let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK;
        copy_forward_bytes(dest, src, dest_misalignment);
        dest = dest.wrapping_add(dest_misalignment);
        src = src.wrapping_add(dest_misalignment);
        n -= dest_misalignment;

        let n_words = n & !WORD_MASK;
        let src_misalignment = src as usize & WORD_MASK;
        if likely(src_misalignment == 0) {
            copy_forward_aligned_words(dest, src, n_words);
        } else {
            copy_forward_misaligned_words(dest, src, n_words);
        }
        dest = dest.wrapping_add(n_words);
        src = src.wrapping_add(n_words);
        n -= n_words;
    }
    copy_forward_bytes(dest, src, n);
}
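Unlike the version above, which relies on unordered atomic loads and reads (but does not use) bytes past the source, this one only ever loads the in-bounds parts of the first and last source words, so it can be exercised directly on heap buffers. A sketch of a test harness, assuming it is compiled on a nightly toolchain in the same crate as the snippet: copy_forward, WORD_SIZE, and WORD_COPY_THRESHOLD are the items defined there; the harness itself is illustrative.

// Hypothetical check of copy_forward against a plain slice comparison, across
// a range of sizes and source misalignments. Assumes the definitions above
// are in scope and the crate builds on nightly (core_intrinsics).
fn main() {
    let src: Vec<u8> = (0..200u8).collect();
    for offset in 0..WORD_SIZE {
        // Sizes below and at/above WORD_COPY_THRESHOLD exercise both the
        // byte-wise and word-wise paths.
        for n in [0, 1, WORD_SIZE, WORD_COPY_THRESHOLD, 100] {
            let mut dst = vec![0u8; n];
            // SAFETY: both ranges are in bounds and do not overlap.
            unsafe { copy_forward(dst.as_mut_ptr(), src.as_ptr().add(offset), n) };
            assert_eq!(&dst[..], &src[offset..offset + n]);
        }
    }
    println!("all copies matched");
}

Stepping offset through 0..WORD_SIZE walks src across every word misalignment, so for the larger sizes both the likely aligned fast path and copy_forward_misaligned_words should be hit.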