Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Algol68
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C with Coccinelle
C++ with Coccinelle
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Mojo
Nim
Numba
Nix
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
PTX
Python
Racket
Raku
Ruby
Rust
Sail
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
Triton
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Zig
Javascript
GIMPLE
Ygen
sway
rust source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
mrustc (master)
rustc 1.0.0
rustc 1.1.0
rustc 1.10.0
rustc 1.11.0
rustc 1.12.0
rustc 1.13.0
rustc 1.14.0
rustc 1.15.1
rustc 1.16.0
rustc 1.17.0
rustc 1.18.0
rustc 1.19.0
rustc 1.2.0
rustc 1.20.0
rustc 1.21.0
rustc 1.22.0
rustc 1.23.0
rustc 1.24.0
rustc 1.25.0
rustc 1.26.0
rustc 1.27.0
rustc 1.27.1
rustc 1.28.0
rustc 1.29.0
rustc 1.3.0
rustc 1.30.0
rustc 1.31.0
rustc 1.32.0
rustc 1.33.0
rustc 1.34.0
rustc 1.35.0
rustc 1.36.0
rustc 1.37.0
rustc 1.38.0
rustc 1.39.0
rustc 1.4.0
rustc 1.40.0
rustc 1.41.0
rustc 1.42.0
rustc 1.43.0
rustc 1.44.0
rustc 1.45.0
rustc 1.45.2
rustc 1.46.0
rustc 1.47.0
rustc 1.48.0
rustc 1.49.0
rustc 1.5.0
rustc 1.50.0
rustc 1.51.0
rustc 1.52.0
rustc 1.53.0
rustc 1.54.0
rustc 1.55.0
rustc 1.56.0
rustc 1.57.0
rustc 1.58.0
rustc 1.59.0
rustc 1.6.0
rustc 1.60.0
rustc 1.61.0
rustc 1.62.0
rustc 1.63.0
rustc 1.64.0
rustc 1.65.0
rustc 1.66.0
rustc 1.67.0
rustc 1.68.0
rustc 1.69.0
rustc 1.7.0
rustc 1.70.0
rustc 1.71.0
rustc 1.72.0
rustc 1.73.0
rustc 1.74.0
rustc 1.75.0
rustc 1.76.0
rustc 1.77.0
rustc 1.78.0
rustc 1.79.0
rustc 1.8.0
rustc 1.80.0
rustc 1.81.0
rustc 1.82.0
rustc 1.83.0
rustc 1.84.0
rustc 1.85.0
rustc 1.86.0
rustc 1.87.0
rustc 1.88.0
rustc 1.89.0
rustc 1.9.0
rustc 1.90.0
rustc beta
rustc nightly
rustc-cg-gcc (master)
x86-64 GCCRS (GCC master)
x86-64 GCCRS (GCCRS master)
x86-64 GCCRS 14.1 (GCC assertions)
x86-64 GCCRS 14.1 (GCC)
x86-64 GCCRS 14.2 (GCC assertions)
x86-64 GCCRS 14.2 (GCC)
x86-64 GCCRS 14.3 (GCC assertions)
x86-64 GCCRS 14.3 (GCC)
x86-64 GCCRS 15.1 (GCC assertions)
x86-64 GCCRS 15.1 (GCC)
x86-64 GCCRS 15.2 (GCC assertions)
x86-64 GCCRS 15.2 (GCC)
Options
Source code
#![feature(core_intrinsics)] #![feature(array_chunks)] #[allow(unused)] mod temp_utf8error { //! Defines utf8 error type. use core::error::Error; use core::fmt; /// Errors which can occur when attempting to interpret a sequence of [`u8`] /// as a string. /// /// As such, the `from_utf8` family of functions and methods for both [`String`]s /// and [`&str`]s make use of this error, for example. /// /// [`String`]: ../../std/string/struct.String.html#method.from_utf8 /// [`&str`]: super::from_utf8 /// /// # Examples /// /// This error type’s methods can be used to create functionality /// similar to `String::from_utf8_lossy` without allocating heap memory: /// /// ``` /// fn from_utf8_lossy<F>(mut input: &[u8], mut push: F) where F: FnMut(&str) { /// loop { /// match std::str::from_utf8(input) { /// Ok(valid) => { /// push(valid); /// break /// } /// Err(error) => { /// let (valid, after_valid) = input.split_at(error.valid_up_to()); /// unsafe { /// push(std::str::from_utf8_unchecked(valid)) /// } /// push("\u{FFFD}"); /// /// if let Some(invalid_sequence_length) = error.error_len() { /// input = &after_valid[invalid_sequence_length..] /// } else { /// break /// } /// } /// } /// } /// } /// ``` #[derive(Copy, Eq, PartialEq, Clone)] pub struct Utf8Error { pub(super) valid_up_to: usize, // Use a single value instead of tagged enum `Option<u8>` to make `Result<(), Utf8Error>` fits // in two machine words, so `run_utf8_validation` does not need to returns values on stack on // x86(_64). Register spill is very expensive on `run_utf8_validation` and can give up to 200% // latency penalty on the error path. pub(super) error_len: Utf8ErrorLen, } #[derive(Copy, Eq, PartialEq, Clone)] #[repr(u8)] pub(super) enum Utf8ErrorLen { Eof = 0, One, Two, Three, } impl Utf8Error { /// Returns the index in the given string up to which valid UTF-8 was /// verified. /// /// It is the maximum index such that `from_utf8(&input[..index])` /// would return `Ok(_)`. /// /// # Examples /// /// Basic usage: /// /// ``` /// use std::str; /// /// // some invalid bytes, in a vector /// let sparkle_heart = vec![0, 159, 146, 150]; /// /// // std::str::from_utf8 returns a Utf8Error /// let error = str::from_utf8(&sparkle_heart).unwrap_err(); /// /// // the second byte is invalid here /// assert_eq!(1, error.valid_up_to()); /// ``` #[must_use] #[inline] pub const fn valid_up_to(&self) -> usize { self.valid_up_to } /// Provides more information about the failure: /// /// * `None`: the end of the input was reached unexpectedly. /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input. /// If a byte stream (such as a file or a network socket) is being decoded incrementally, /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks. /// /// * `Some(len)`: an unexpected byte was encountered. /// The length provided is that of the invalid byte sequence /// that starts at the index given by `valid_up_to()`. /// Decoding should resume after that sequence /// (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case of /// lossy decoding. /// /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html #[must_use] #[inline] pub const fn error_len(&self) -> Option<usize> { match self.error_len { Utf8ErrorLen::Eof => None, // FIXME(136972): Direct `match` gives suboptimal codegen involving two table lookups. len => Some(len as usize), } } } impl fmt::Debug for Utf8Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Utf8Error") .field("valid_up_to", &self.valid_up_to) .field("error_len", &self.error_len()) .finish() } } impl fmt::Display for Utf8Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(error_len) = self.error_len() { write!( f, "invalid utf-8 sequence of {} bytes from index {}", error_len, self.valid_up_to ) } else { write!( f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to ) } } } impl Error for Utf8Error { #[allow(deprecated)] fn description(&self) -> &str { "invalid utf-8: corrupt contents" } } /// An error returned when parsing a `bool` using [`from_str`] fails /// /// [`from_str`]: super::FromStr::from_str #[derive(Debug, Clone, PartialEq, Eq)] #[non_exhaustive] pub struct ParseBoolError; impl fmt::Display for ParseBoolError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { "provided string was not `true` or `false`".fmt(f) } } impl Error for ParseBoolError { #[allow(deprecated)] fn description(&self) -> &str { "failed to parse bool" } } } #[allow(unused)] mod temp_validation { //! Operations related to UTF-8 validation. use super::temp_utf8error::Utf8Error; use super::temp_utf8error::Utf8ErrorLen; use core::intrinsics::const_eval_select; /// Returns the initial codepoint accumulator for the first byte. /// The first byte is special, only want bottom 5 bits for width 2, 4 bits /// for width 3, and 3 bits for width 4. #[inline] const fn utf8_first_byte(byte: u8, width: u32) -> u32 { (byte & (0x7F >> width)) as u32 } /// Returns the value of `ch` updated with continuation byte `byte`. #[inline] const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { (ch << 6) | (byte & CONT_MASK) as u32 } /// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the /// bits `10`). #[inline] pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { (byte as i8) < -64 } /// Reads the next code point out of a byte iterator (assuming a /// UTF-8-like encoding). /// /// # Safety /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[inline] pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> { // Decode UTF-8 let x = *bytes.next()?; if x < 128 { return Some(x as u32); } // Multibyte case follows // Decode from a byte combination out of: [[[x y] z] w] // NOTE: Performance is sensitive to the exact formulation here let init = utf8_first_byte(x, 2); // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let y = unsafe { *bytes.next().unwrap_unchecked() }; let mut ch = utf8_acc_cont_byte(init, y); if x >= 0xE0 { // [[x y z] w] case // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let z = unsafe { *bytes.next().unwrap_unchecked() }; let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); ch = init << 12 | y_z; if x >= 0xF0 { // [x y z w] case // use only the lower 3 bits of `init` // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let w = unsafe { *bytes.next().unwrap_unchecked() }; ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } } Some(ch) } /// Reads the last code point out of a byte iterator (assuming a /// UTF-8-like encoding). /// /// # Safety /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[inline] pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32> where I: DoubleEndedIterator<Item = &'a u8>, { // Decode UTF-8 let w = match *bytes.next_back()? { next_byte if next_byte < 128 => return Some(next_byte as u32), back_byte => back_byte, }; // Multibyte case follows // Decode from a byte combination out of: [x [y [z w]]] let mut ch; // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let z = unsafe { *bytes.next_back().unwrap_unchecked() }; ch = utf8_first_byte(z, 2); if utf8_is_cont_byte(z) { // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let y = unsafe { *bytes.next_back().unwrap_unchecked() }; ch = utf8_first_byte(y, 3); if utf8_is_cont_byte(y) { // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let x = unsafe { *bytes.next_back().unwrap_unchecked() }; ch = utf8_first_byte(x, 4); ch = utf8_acc_cont_byte(ch, y); } ch = utf8_acc_cont_byte(ch, z); } ch = utf8_acc_cont_byte(ch, w); Some(ch) } // The shift-based DFA algorithm for UTF-8 validation. // Ref: <https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725> // // In short, we encode DFA transitions in an array `TRANS_TABLE` such that: // ``` // TRANS_TABLE[next_byte] = // OFFSET[target_state1] << OFFSET[source_state1] | // OFFSET[target_state2] << OFFSET[source_state2] | // ... // ``` // Where `OFFSET[]` is a compile-time map from each state to a distinct 0..32 value. // // To execute the DFA: // ``` // let state = OFFSET[initial_state]; // for byte in .. { // state = TRANS_TABLE[byte] >> (state & ((1 << BITS_PER_STATE) - 1)); // } // ``` // By choosing `BITS_PER_STATE = 5` and `state: u32`, we can replace the masking by `wrapping_shr` // and it becomes free on modern ISAs, including x86, x86_64 and ARM. // // ``` // // On x86-64-v3: (more instructions on ordinary x86_64 but with same cycles-per-byte) // // shrx state, qword ptr [TRANS_TABLE + 4 * byte], state // // On aarch64/ARMv8: // // ldr temp, [TRANS_TABLE, byte, lsl 2] // // lsr state, temp, state // state = TRANS_TABLE[byte].wrapping_shr(state); // ``` // // The DFA is directly derived from UTF-8 syntax from the RFC3629: // <https://datatracker.ietf.org/doc/html/rfc3629#section-4>. // We assign S0 as ERROR and S1 as ACCEPT. DFA starts at S1. // Syntax are annotated with DFA states in angle bracket as following: // // UTF8-char = <S1> (UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4) // UTF8-1 = <S1> %x00-7F // UTF8-2 = <S1> %xC2-DF <S2> UTF8-tail // UTF8-3 = <S1> %xE0 <S3> %xA0-BF <S2> UTF8-tail / // <S1> (%xE1-EC / %xEE-EF) <S4> 2( UTF8-tail ) / // <S1> %xED <S5> %x80-9F <S2> UTF8-tail // UTF8-4 = <S1> %xF0 <S6> %x90-BF <S4> 2( UTF8-tail ) / // <S1> %xF1-F3 <S7> UTF8-tail <S4> 2( UTF8-tail ) / // <S1> %xF4 <S8> %x80-8F <S4> 2( UTF8-tail ) // UTF8-tail = %x80-BF # Inlined into above usages. // // You may notice that encoding 9 states with 5bits per state into 32bit seems impossible, // but we exploit overlapping bits to find a possible `OFFSET[]` and `TRANS_TABLE[]` solution. // The SAT solver to find such (minimal) solution is in `./solve_dfa.py`. // The solution is also appended to the end of that file and is verifiable. const BITS_PER_STATE: u32 = 5; const STATE_MASK: u32 = (1 << BITS_PER_STATE) - 1; const STATE_CNT: usize = 9; const ST_ERROR: u32 = OFFSETS[0]; const ST_ACCEPT: u32 = OFFSETS[1]; // See the end of `./solve_dfa.py`. const OFFSETS: [u32; STATE_CNT] = [0, 6, 16, 19, 1, 25, 11, 18, 24]; // Keep the whole table in a single page. #[repr(align(1024))] struct TransitionTable([u32; 256]); static TRANS_TABLE: TransitionTable = { let mut table = [0u32; 256]; let mut b = 0; while b < 256 { // See the end of `./solve_dfa.py`. table[b] = match b as u8 { 0x00..=0x7F => 0x180, 0xC2..=0xDF => 0x400, 0xE0 => 0x4C0, 0xE1..=0xEC | 0xEE..=0xEF => 0x40, 0xED => 0x640, 0xF0 => 0x2C0, 0xF1..=0xF3 => 0x480, 0xF4 => 0x600, 0x80..=0x8F => 0x21060020, 0x90..=0x9F => 0x20060820, 0xA0..=0xBF => 0x860820, 0xC0..=0xC1 | 0xF5..=0xFF => 0x0, }; b += 1; } TransitionTable(table) }; #[inline(always)] const fn next_state(st: u32, byte: u8) -> u32 { TRANS_TABLE.0[byte as usize].wrapping_shr(st) } /// Check if `byte` is a valid UTF-8 first byte, assuming it must be a valid first or /// continuation byte. #[inline(always)] const fn is_utf8_first_byte(byte: u8) -> bool { byte as i8 >= 0b1100_0000u8 as i8 } /// # Safety /// The caller must ensure `bytes[..i]` is a valid UTF-8 prefix and `st` is the DFA state after /// executing on `bytes[..i]`. #[inline] const unsafe fn resolve_error_location(st: u32, bytes: &[u8], i: usize) -> Utf8Error { // There are two cases: // 1. [valid UTF-8..] | *here // The previous state must be ACCEPT for the case 1, and `valid_up_to = i`. // 2. [valid UTF-8..] | valid first byte, [valid continuation byte...], *here // `valid_up_to` is at the latest non-continuation byte, which must exist and // be in range `(i-3)..i`. let (valid_up_to, error_len) = if st & STATE_MASK == ST_ACCEPT { (i, Utf8ErrorLen::One) // SAFETY: UTF-8 first byte must exist if we are in an intermediate state. // We use pointer here because `get_unchecked` is not const fn. } else if is_utf8_first_byte(unsafe { bytes.as_ptr().add(i - 1).read() }) { (i - 1, Utf8ErrorLen::One) // SAFETY: Same as above. } else if is_utf8_first_byte(unsafe { bytes.as_ptr().add(i - 2).read() }) { (i - 2, Utf8ErrorLen::Two) } else { (i - 3, Utf8ErrorLen::Three) }; Utf8Error { valid_up_to, error_len, } } // The simpler but slower algorithm to run DFA with error handling. // Returns the final state after execution on the whole slice. // // # Safety // The caller must ensure `bytes[..i]` is a valid UTF-8 prefix and `st` is the DFA state after // executing on `bytes[..i]`. #[inline] const unsafe fn run_with_error_handling( mut st: u32, bytes: &[u8], mut i: usize, ) -> Result<u32, Utf8Error> { while i < bytes.len() { let new_st = next_state(st, bytes[i]); if new_st & STATE_MASK == ST_ERROR { // SAFETY: Guaranteed by the caller. return Err(unsafe { resolve_error_location(st, bytes, i) }); } st = new_st; i += 1; } Ok(st) } /// Walks through `v` checking that it's a valid UTF-8 sequence, /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. pub(super) fn run_utf8_validation(bytes: &[u8]) -> Result<(), Utf8Error> { const_eval_select((bytes,), run_utf8_validation_const, run_utf8_validation_rt) } #[inline] const fn run_utf8_validation_const(bytes: &[u8]) -> Result<(), Utf8Error> { // SAFETY: Start at empty string with valid state ACCEPT. match unsafe { run_with_error_handling(ST_ACCEPT, bytes, 0) } { Err(err) => Err(err), Ok(st) if st & STATE_MASK == ST_ACCEPT => Ok(()), Ok(st) => { // SAFETY: `st` is the last state after execution without encountering any error. let mut err = unsafe { resolve_error_location(st, bytes, bytes.len()) }; err.error_len = Utf8ErrorLen::Eof; Err(err) } } } #[inline] fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> { const MAIN_CHUNK_SIZE: usize = 16; const ASCII_CHUNK_SIZE: usize = 16; const { assert!(ASCII_CHUNK_SIZE % MAIN_CHUNK_SIZE == 0) }; let mut i = bytes.len() % MAIN_CHUNK_SIZE; // SAFETY: Start at initial state ACCEPT. let mut st = unsafe { run_with_error_handling(ST_ACCEPT, &bytes[..i], 0)? }; while i < bytes.len() { // Fast path: if the current state is ACCEPT, we can skip to the next non-ASCII chunk. // We also did a quick inspection on the first byte to avoid getting into this path at all // when handling strings with almost no ASCII, eg. Chinese scripts. // SAFETY: `i` is in bound. if st & STATE_MASK == ST_ACCEPT && unsafe { bytes.get_unchecked(i).is_ascii() } { // SAFETY: `i` is in bound. let rest = unsafe { bytes.get_unchecked(i..) }; let mut ascii_chunks = rest.array_chunks::<ASCII_CHUNK_SIZE>(); let ascii_rest_chunk_cnt = ascii_chunks.len(); let pos = ascii_chunks .position(|chunk| { // NB. Always traverse the whole chunk instead of `.all()`, to persuade LLVM to // vectorize this check. // We also do not use `<[u8]>::is_ascii` which is unnecessarily complex here. #[expect(clippy::unnecessary_fold)] let all_ascii = chunk.iter().fold(true, |acc, b| acc && b.is_ascii()); !all_ascii }) .unwrap_or(ascii_rest_chunk_cnt); i += pos * ASCII_CHUNK_SIZE; if i >= bytes.len() { break; } } // SAFETY: `i` and `i + MAIN_CHUNK_SIZE` are in bound by loop invariant. let chunk = unsafe { &*bytes.as_ptr().add(i).cast::<[u8; MAIN_CHUNK_SIZE]>() }; let mut new_st = st; for &b in chunk { new_st = next_state(new_st, b); } if new_st & STATE_MASK == ST_ERROR { // SAFETY: `st` is the last state after executing `bytes[..i]` without encountering any error. // And we know the next chunk must fail the validation. return Err(unsafe { run_with_error_handling(st, bytes, i).unwrap_err_unchecked() }); } st = new_st; i += MAIN_CHUNK_SIZE; } if st & STATE_MASK != ST_ACCEPT { // SAFETY: Same as above. let mut err = unsafe { resolve_error_location(st, bytes, bytes.len()) }; err.error_len = Utf8ErrorLen::Eof; return Err(err); } Ok(()) } // https://tools.ietf.org/html/rfc3629 const UTF8_CHAR_WIDTH: &[u8; 256] = &[ // 1 2 3 4 5 6 7 8 9 A B C D E F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F ]; /// Given a first byte, determines how many bytes are in this UTF-8 character. #[must_use] #[inline] pub const fn utf8_char_width(b: u8) -> usize { UTF8_CHAR_WIDTH[b as usize] as usize } /// Mask of the value bits of a continuation byte. const CONT_MASK: u8 = 0b0011_1111; } pub fn main() { let v = std::env::args().nth(1); let _s = if let Some(ref s) = v { s.as_bytes() } else { std::hint::black_box( [ b'a', b' ', b'r', b'a', b'n', b'd', b'o', b'm', b' ', b's', b't', b'r', b'i', b'n', b'g', b' ', b't', b'o', b' ', b'v', b'a', b'l', b'i', b'd', b'a', b't', b'e', b'\xff', b't', b'e', b'\x00', ] .as_slice(), ) }; let t = std::time::Instant::now(); let mut cnt = 0; for _ in 0..1000000 { if std::str::from_utf8(_s).is_ok() { cnt += 1; } } println!("old implementation: {:?} {}", t.elapsed(), cnt); let t = std::time::Instant::now(); let mut cnt = 0; for _ in 0..1000000 { if temp_validation::run_utf8_validation(_s).is_ok() { cnt += 1; } } println!("new implementation: {:?} {}", t.elapsed(), cnt); }
zig source #2
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
zig 0.10.0
zig 0.11.0
zig 0.12.0
zig 0.12.1
zig 0.13.0
zig 0.14.0
zig 0.14.1
zig 0.15.1
zig 0.2.0
zig 0.3.0
zig 0.4.0
zig 0.5.0
zig 0.6.0
zig 0.7.0
zig 0.7.1
zig 0.8.0
zig 0.9.0
zig trunk
Options
Source code
const std = @import("std"); const Surrogates = enum { cannot_encode_surrogate_half, can_encode_surrogate_half, }; pub fn utf8ValidateSlice(input: []const u8) bool { return utf8ValidateSliceImpl(input, .cannot_encode_surrogate_half); } fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool { const DFA = struct { const ByteClass = enum { // base ASCII ascii, // continuation bytes cont1, cont2, cont3, // starting bytes of 2-byte codepoint two1, two2, // starting bytes of 3-byte codepoint three1, three2, three3, // starting bytes of 4-byte codepoint four1, four2, four3, four4, }; pub fn byte_class(byte: u8) ByteClass { return switch (byte) { 0x00...0x7f => .ascii, 0x80...0x8f => .cont1, 0x90...0x9f => .cont2, 0xa0...0xbf => .cont3, 0xc0...0xc1 => .two1, 0xc2...0xdf => .two2, 0xe0...0xe0 => .three1, 0xe1...0xec => .three2, 0xed...0xed => .three3, 0xee...0xef => .three2, 0xf0...0xf0 => .four1, 0xf1...0xf3 => .four2, 0xf4...0xf4 => .four3, 0xf5...0xff => .four4, }; } const State = enum { ok, one, two1, two2, three1, three2, fail, }; fn offset_from_state(state: State) u5 { return switch (state) { .ok => 8, .one => 13, .two1 => 23, .two2 => 18, .three1 => 3, .three2 => 28, .fail => 0, }; } fn state_from_offset(offset: u5) State { return switch (offset) { 8 => .ok, 13 => .one, 23 => .two1, 18 => .two2, 3 => .three1, 28 => .three2, 0 => .fail, else => unreachable, }; } const start: State = .ok; const accept: []State = .{.ok}; const fail: ?State = .fail; fn step(byte: u8, state: State) State { const class = byte_class(byte); return switch (state) { .ok => switch (class) { .ascii => .ok, .cont1 => .one, .cont2 => .one, .cont3 => .one, else => .fail, }, .one => switch (class) { .two2 => .ok, .cont1 => .two1, .cont2 => .two1, .cont3 => .two2, else => .fail, }, .two1 => switch (class) { .three2 => .ok, .three3 => .ok, .cont1 => .three1, .cont2 => .three2, .cont3 => .three2, else => .fail, }, .two2 => switch (class) { .three1 => .ok, .three2 => .ok, .three3 => switch (surrogates) { .cannot_encode_surrogate_half => .fail, .can_encode_surrogate_half => .ok, }, .cont1 => .three1, .cont2 => .three2, .cont3 => .three2, else => .fail, }, .three1 => switch (class) { .four2 => .ok, .four3 => .ok, else => .fail, }, .three2 => switch (class) { .four1 => .ok, .four2 => .ok, else => .fail, }, .fail => .fail, }; } const shift_table = blk: { @setEvalBranchQuota(30000); var t = [_]u32{0} ** 256; for (&t, 0..) |*r, c| { for (std.enums.values(State)) |s| { r.* |= @truncate(@as(u32, offset_from_state(step(c, s))) << offset_from_state(s)); } // Make sure the states didn't overlap and destroy themselves for (std.enums.values(State)) |s| { std.debug.assert(@as(u5, @truncate(r.* >> offset_from_state(s))) == offset_from_state(step(c, s))); } } break :blk t; }; }; var remaining = input; if (std.simd.suggestVectorLength(u8)) |chunk_len| { const Chunk = @Vector(chunk_len, u8); // Fast path. Check for and skip ASCII characters at the start of the input. while (remaining.len >= chunk_len) { const chunk: Chunk = remaining[0..chunk_len].*; const mask: Chunk = @splat(0x80); if (@reduce(.Or, chunk & mask == mask)) { // found a non ASCII byte break; } remaining = remaining[chunk_len..]; } } var state: u32 = DFA.offset_from_state(DFA.State.ok); // Manually unrolled to insert early return. const UNROLL = 8; while (remaining.len > UNROLL) { for (0..UNROLL) |i| { const byte = remaining[remaining.len - 1 - i]; state = DFA.shift_table[byte] >> @truncate(state); } remaining = remaining[0 .. remaining.len - UNROLL]; if (@as(u5, @truncate(state)) == DFA.offset_from_state(DFA.State.fail)) { return false; } } for (0..remaining.len) |i| { const byte = remaining[remaining.len - 1 - i]; state = DFA.shift_table[byte] >> @truncate(state); } return @as(u5, @truncate(state)) == DFA.offset_from_state(DFA.State.ok); } pub fn main() !void { var args = try std.process.argsWithAllocator(std.heap.c_allocator); defer args.deinit(); _ = args.next(); const v = if (args.next()) |arg| arg else @as([:0]const u8, "a random string to validate\xffte\x00"); var t = try std.time.Timer.start(); var cnt: usize = 0; for (0..1000000) |_| { if (std.unicode.utf8ValidateSlice(v)) { cnt += 1; } } var elapsed = t.lap(); std.debug.print("old implementation: {} {}\n", .{ std.fmt.fmtDuration(elapsed), cnt }); cnt = 0; for (0..1000000) |_| { if (utf8ValidateSlice(v)) { cnt += 1; } } elapsed = t.read(); std.debug.print("new implementation: {} {}\n", .{ std.fmt.fmtDuration(elapsed), cnt }); }
Become a Patron
Sponsor on GitHub
Donate via PayPal
Compiler Explorer Shop
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
Statistics
Changelog
Version tree