diff options
Diffstat (limited to 'vendor/utf8parse/src')
-rw-r--r-- | vendor/utf8parse/src/lib.rs | 132 | ||||
-rw-r--r-- | vendor/utf8parse/src/types.rs | 105 |
2 files changed, 0 insertions, 237 deletions
diff --git a/vendor/utf8parse/src/lib.rs b/vendor/utf8parse/src/lib.rs deleted file mode 100644 index 093de81..0000000 --- a/vendor/utf8parse/src/lib.rs +++ /dev/null @@ -1,132 +0,0 @@ -//! A table-driven UTF-8 Parser -//! -//! This module implements a table-driven UTF-8 parser which should -//! theoretically contain the minimal number of branches (1). The only branch is -//! on the `Action` returned from unpacking a transition. -#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)] -#![cfg_attr(all(feature = "nightly", test), feature(test))] -#![no_std] - -use core::char; - -mod types; - -use types::{Action, State}; - -/// Handles codepoint and invalid sequence events from the parser. -pub trait Receiver { - /// Called whenever a codepoint is parsed successfully - fn codepoint(&mut self, _: char); - - /// Called when an invalid_sequence is detected - fn invalid_sequence(&mut self); -} - -/// A parser for Utf8 Characters -/// -/// Repeatedly call `advance` with bytes to emit Utf8 characters -#[derive(Clone, Default, PartialEq, Eq, Debug)] -pub struct Parser { - point: u32, - state: State, -} - -/// Continuation bytes are masked with this value. -const CONTINUATION_MASK: u8 = 0b0011_1111; - -impl Parser { - /// Create a new Parser - pub fn new() -> Parser { - Parser { point: 0, state: State::Ground } - } - - /// Advance the parser - /// - /// The provider receiver will be called whenever a codepoint is completed or an invalid - /// sequence is detected. 
- pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) - where - R: Receiver, - { - let (state, action) = self.state.advance(byte); - self.perform_action(receiver, byte, action); - self.state = state; - } - - fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) - where - R: Receiver, - { - match action { - Action::InvalidSequence => { - self.point = 0; - receiver.invalid_sequence(); - }, - Action::EmitByte => { - receiver.codepoint(byte as char); - }, - Action::SetByte1 => { - let point = self.point | ((byte & CONTINUATION_MASK) as u32); - let c = unsafe { char::from_u32_unchecked(point) }; - self.point = 0; - - receiver.codepoint(c); - }, - Action::SetByte2 => { - self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; - }, - Action::SetByte2Top => { - self.point |= ((byte & 0b0001_1111) as u32) << 6; - }, - Action::SetByte3 => { - self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; - }, - Action::SetByte3Top => { - self.point |= ((byte & 0b0000_1111) as u32) << 12; - }, - Action::SetByte4 => { - self.point |= ((byte & 0b0000_0111) as u32) << 18; - }, - } - } -} - -#[cfg(all(feature = "nightly", test))] -mod benches { - extern crate std; - extern crate test; - - use super::{Parser, Receiver}; - - use self::test::{black_box, Bencher}; - - static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt"); - - impl Receiver for () { - fn codepoint(&mut self, c: char) { - black_box(c); - } - - fn invalid_sequence(&mut self) {} - } - - #[bench] - fn parse_bench_utf8_demo(b: &mut Bencher) { - let mut parser = Parser::new(); - - b.iter(|| { - for byte in UTF8_DEMO { - parser.advance(&mut (), *byte); - } - }) - } - - #[bench] - fn std_string_parse_utf8(b: &mut Bencher) { - b.iter(|| { - for c in std::str::from_utf8(UTF8_DEMO).unwrap().chars() { - black_box(c); - } - }); - } -} diff --git a/vendor/utf8parse/src/types.rs b/vendor/utf8parse/src/types.rs deleted file mode 100644 index f57a94d..0000000 --- a/vendor/utf8parse/src/types.rs +++ 
// /dev/null @@ -1,105 +0,0 @@
//! Types supporting the UTF-8 parser

/// Action to take when receiving a byte
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; sequence is invalid
    InvalidSequence = 0,
    /// Received valid 7-bit ASCII byte which can be directly emitted.
    EmitByte = 1,
    /// Set the bottom (last) continuation byte; this completes a sequence.
    SetByte1 = 2,
    /// Set the 2nd-from-last byte when it is a continuation byte
    SetByte2 = 3,
    /// Set the 2nd-from-last byte when it is the lead byte of a two byte sequence
    SetByte2Top = 4,
    /// Set the 3rd-from-last byte when it is a continuation byte
    SetByte3 = 5,
    /// Set the 3rd-from-last byte when it is the lead byte of a three byte sequence
    SetByte3Top = 6,
    /// Set the top (lead) byte of a four byte sequence.
    SetByte4 = 7,
}

/// States the parser can be in.
///
/// There is a state for each initial input of the 3 and 4 byte sequences since
/// the following bytes are subject to different conditions than a tail byte.
// Variant names embed the lead byte in lowercase hex (e.g. `U3_2_e0`), which is
// not camel case.
#[allow(non_camel_case_types)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum State {
    /// Ground state; expect anything
    Ground = 0,
    /// 3 tail bytes
    Tail3 = 1,
    /// 2 tail bytes
    Tail2 = 2,
    /// 1 tail byte
    Tail1 = 3,
    /// UTF8-3 starting with E0
    U3_2_e0 = 4,
    /// UTF8-3 starting with ED
    U3_2_ed = 5,
    /// UTF8-4 starting with F0
    Utf8_4_3_f0 = 6,
    /// UTF8-4 starting with F4
    Utf8_4_3_f4 = 7,
}

impl Default for State {
    /// The parser starts out in the ground state.
    fn default() -> State {
        State::Ground
    }
}

impl State {
    /// Advance the parser state.
    ///
    /// This takes the current state and input byte into consideration, to determine the next state
    /// and any action that should be taken.
    ///
    /// Any byte that is not acceptable in the current state yields
    /// `(State::Ground, Action::InvalidSequence)`.
    #[inline]
    pub fn advance(self, byte: u8) -> (State, Action) {
        match self {
            State::Ground => match byte {
                0x00..=0x7f => (State::Ground, Action::EmitByte),
                // 0xc0 and 0xc1 are absent: they fall through to the invalid arm.
                0xc2..=0xdf => (State::Tail1, Action::SetByte2Top),
                // E0 and ED get dedicated states because their second byte is
                // restricted to a narrower range than a plain tail byte.
                0xe0 => (State::U3_2_e0, Action::SetByte3Top),
                0xed => (State::U3_2_ed, Action::SetByte3Top),
                0xe1..=0xec | 0xee..=0xef => (State::Tail2, Action::SetByte3Top),
                // Likewise F0 and F4 restrict the second byte of a four byte sequence.
                0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
                0xf1..=0xf3 => (State::Tail3, Action::SetByte4),
                0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::U3_2_e0 => match byte {
                0xa0..=0xbf => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::U3_2_ed => match byte {
                0x80..=0x9f => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Utf8_4_3_f0 => match byte {
                0x90..=0xbf => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Utf8_4_3_f4 => match byte {
                0x80..=0x8f => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail3 => match byte {
                0x80..=0xbf => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail2 => match byte {
                0x80..=0xbf => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail1 => match byte {
                0x80..=0xbf => (State::Ground, Action::SetByte1),
                _ => (State::Ground, Action::InvalidSequence),
            },
        }
    }
}