diff options
Diffstat (limited to 'vendor/utf8parse/src/lib.rs')
-rw-r--r-- | vendor/utf8parse/src/lib.rs | 132 |
1 files changed, 132 insertions, 0 deletions
diff --git a/vendor/utf8parse/src/lib.rs b/vendor/utf8parse/src/lib.rs new file mode 100644 index 0000000..093de81 --- /dev/null +++ b/vendor/utf8parse/src/lib.rs @@ -0,0 +1,132 @@ +//! A table-driven UTF-8 Parser +//! +//! This module implements a table-driven UTF-8 parser which should +//! theoretically contain the minimal number of branches (1). The only branch is +//! on the `Action` returned from unpacking a transition. +#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)] +#![cfg_attr(all(feature = "nightly", test), feature(test))] +#![no_std] + +use core::char; + +mod types; + +use types::{Action, State}; + +/// Handles codepoint and invalid sequence events from the parser. +pub trait Receiver { + /// Called whenever a codepoint is parsed successfully + fn codepoint(&mut self, _: char); + + /// Called when an invalid_sequence is detected + fn invalid_sequence(&mut self); +} + +/// A parser for Utf8 Characters +/// +/// Repeatedly call `advance` with bytes to emit Utf8 characters +#[derive(Clone, Default, PartialEq, Eq, Debug)] +pub struct Parser { + point: u32, + state: State, +} + +/// Continuation bytes are masked with this value. +const CONTINUATION_MASK: u8 = 0b0011_1111; + +impl Parser { + /// Create a new Parser + pub fn new() -> Parser { + Parser { point: 0, state: State::Ground } + } + + /// Advance the parser + /// + /// The provider receiver will be called whenever a codepoint is completed or an invalid + /// sequence is detected. + pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) + where + R: Receiver, + { + let (state, action) = self.state.advance(byte); + self.perform_action(receiver, byte, action); + self.state = state; + } + + fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) + where + R: Receiver, + { + match action { + Action::InvalidSequence => { + self.point = 0; + receiver.invalid_sequence(); + }, + Action::EmitByte => { + receiver.codepoint(byte as char); + }, + Action::SetByte1 => { + let point = self.point | ((byte & CONTINUATION_MASK) as u32); + let c = unsafe { char::from_u32_unchecked(point) }; + self.point = 0; + + receiver.codepoint(c); + }, + Action::SetByte2 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; + }, + Action::SetByte2Top => { + self.point |= ((byte & 0b0001_1111) as u32) << 6; + }, + Action::SetByte3 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; + }, + Action::SetByte3Top => { + self.point |= ((byte & 0b0000_1111) as u32) << 12; + }, + Action::SetByte4 => { + self.point |= ((byte & 0b0000_0111) as u32) << 18; + }, + } + } +} + +#[cfg(all(feature = "nightly", test))] +mod benches { + extern crate std; + extern crate test; + + use super::{Parser, Receiver}; + + use self::test::{black_box, Bencher}; + + static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt"); + + impl Receiver for () { + fn codepoint(&mut self, c: char) { + black_box(c); + } + + fn invalid_sequence(&mut self) {} + } + + #[bench] + fn parse_bench_utf8_demo(b: &mut Bencher) { + let mut parser = Parser::new(); + + b.iter(|| { + for byte in UTF8_DEMO { + parser.advance(&mut (), *byte); + } + }) + } + + #[bench] + fn std_string_parse_utf8(b: &mut Bencher) { + b.iter(|| { + for c in std::str::from_utf8(UTF8_DEMO).unwrap().chars() { + black_box(c); + } + }); + } +} |