// Compiler Explorer
//
// Source code

/// Appends the payload of one continuation byte to a partially decoded value.
///
/// `ch` is shifted left by six bits and the bits of `byte` selected by
/// `CONT_MASK` are OR-ed into the vacated low positions.
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = (byte & CONT_MASK) as u32;
    let shifted = ch << 6;
    shifted | payload
}

/// Extracts the payload bits of the first byte of a multi-byte sequence.
///
/// The mask applied to `byte` is `0x7F >> width`, i.e. the low `7 - width`
/// bits are kept and everything above them is cleared. `width` is expected
/// to be smaller than 8, the bit width of the `u8` mask being shifted.
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    let payload = byte & payload_mask;
    payload as u32
}

/// Mask selecting the low six bits of a byte: the payload carried by a
/// UTF-8 continuation byte.
const CONT_MASK: u8 = 0x3F;

/// Reads the next code point out of a byte iterator, interpreting the bytes
/// as UTF-8, and returns it as a `u32`.
///
/// Returns `None` only when the iterator is exhausted before the first byte.
/// A first byte below 128 is returned unchanged. Otherwise one, two or three
/// further bytes are consumed (depending on whether the first byte is below
/// `0xE0`, below `0xF0`, or `0xF0` and above) and their payload bits are
/// combined with those of the first byte. The bytes are not validated.
///
/// # Safety
///
/// The follow-up bytes are fetched with `unwrap_unchecked`, so `bytes` must
/// yield every byte that the first byte calls for. Running out of input in
/// the middle of a multi-byte sequence is undefined behavior.
pub unsafe fn src<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    // Decode UTF-8
    // An exhausted iterator is the only way to get `None` out of this function.
    let x = *bytes.next()?;
    if x < 128 {
        // Single-byte case: the byte is the code point.
        return Some(x as u32);
    }

    // Multibyte case follows
    // Decode from a byte combination out of: [[[x y] z] w]
    // NOTE: Performance is sensitive to the exact formulation here
    // `init` holds the low 5 bits of the first byte (mask `0x7F >> 2`).
    let init = utf8_first_byte(x, 2);
    // SAFETY: `bytes` produces an UTF-8-like string,
    // so the iterator must produce a value here.
    let y = unsafe { *bytes.next().unwrap_unchecked() };
    // Two-byte result; overwritten below if the sequence turns out to be longer.
    let mut ch = utf8_acc_cont_byte(init, y);
    if x >= 0xE0 {
        // [[x y z] w] case
        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
        // SAFETY: `bytes` produces an UTF-8-like string,
        // so the iterator must produce a value here.
        let z = unsafe { *bytes.next().unwrap_unchecked() };
        // Payload of `y` and `z` combined (12 bits), without the first byte.
        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
        ch = init << 12 | y_z;
        if x >= 0xF0 {
            // [x y z w] case
            // use only the lower 3 bits of `init`
            // SAFETY: `bytes` produces an UTF-8-like string,
            // so the iterator must produce a value here.
            let w = unsafe { *bytes.next().unwrap_unchecked() };
            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
        }
    }

Some(ch)
}

/// Reads the next code point out of a byte iterator, interpreting the bytes
/// as UTF-8, and returns it as a `u32`.
///
/// Returns `None` only when the iterator is exhausted before the first byte.
/// The bytes themselves are not validated.
///
/// # Safety
///
/// When the first byte is `0x80` or above, the follow-up bytes are fetched
/// with `unwrap_unchecked`. `bytes` must therefore yield one more byte for a
/// first byte below `0xE0`, two more for a first byte below `0xF0`, and three
/// more otherwise; running out of input mid-sequence is undefined behavior.
pub unsafe fn tgt<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    /// Bits kept from the first byte of a multi-byte sequence.
    const LEAD_PAYLOAD: u32 = 0x1F;
    /// Bits kept from every following byte.
    const CONT_PAYLOAD: u32 = 0x3F;

    let lead = u32::from(*bytes.next()?);
    if lead < 0x80 {
        // One byte (ASCII): the byte is the code point.
        return Some(lead);
    }

    // SAFETY: `bytes` produces a UTF-8-like string, so a second byte exists.
    let second = u32::from(unsafe { *bytes.next().unwrap_unchecked() });
    // Two-byte value:
    //   (lead & 0x1F) << 6 | (second & 0x3F)
    let mut code = ((lead & LEAD_PAYLOAD) << 6) | (second & CONT_PAYLOAD);

    if lead >= 0xE0 {
        // SAFETY: `bytes` produces a UTF-8-like string, so a third byte exists.
        let third = u32::from(unsafe { *bytes.next().unwrap_unchecked() });
        // Three-byte value:
        //   (lead & 0x1F) << 12 | (second & 0x3F) << 6 | (third & 0x3F)
        code = (code << 6) | (third & CONT_PAYLOAD);

        if lead >= 0xF0 {
            // SAFETY: `bytes` produces a UTF-8-like string, so a fourth byte exists.
            let fourth = u32::from(unsafe { *bytes.next().unwrap_unchecked() });
            // Four-byte value:
            //   ((lead & 0x1F) << 18 | (second & 0x3F) << 12
            //    | (third & 0x3F) << 6 | (fourth & 0x3F)) & 0x3F_FF_FF
            // The final mask drops the marker bit of `lead` that
            // `LEAD_PAYLOAD` let through. 0x1F_FF_FF would be enough (only
            // the 21 lowest bits are wanted); the wider 0x3F_FF_FF is kept on
            // purpose: reportedly it lets x86 pick a `movzx` over an `and`,
            // which has a shorter encoding.
            code = ((code << 6) | (fourth & CONT_PAYLOAD)) & 0x3F_FF_FF;
        }
    }

    Some(code)
}

/// Decodes the first `char` of `s` with [`src`].
///
/// Returns `None` when `s` is empty.
pub fn src1(s: &str) -> Option<char> {
    let mut bytes = s.as_bytes().iter();
    // SAFETY: `s` is a `&str` and therefore valid UTF-8, so the iterator
    // yields every continuation byte that `src` asks for.
    let codepoint = unsafe { src(&mut bytes) };
    // SAFETY: the value was decoded from the valid UTF-8 of a `&str`, so it
    // is a valid `char`.
    codepoint.map(|cp| unsafe { char::from_u32_unchecked(cp) })
}