diff options
Diffstat (limited to 'vendor/encode_unicode/src/decoding_iterators.rs')
-rw-r--r-- | vendor/encode_unicode/src/decoding_iterators.rs | 494 |
1 files changed, 494 insertions, 0 deletions
diff --git a/vendor/encode_unicode/src/decoding_iterators.rs b/vendor/encode_unicode/src/decoding_iterators.rs new file mode 100644 index 0000000..4ef4125 --- /dev/null +++ b/vendor/encode_unicode/src/decoding_iterators.rs @@ -0,0 +1,494 @@ +/* Copyright 2018 The encode_unicode Developers + * + * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or + * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or + * http://opensource.org/licenses/MIT>, at your option. This file may not be + * copied, modified, or distributed except according to those terms. + */ + +//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail. +//! +//! To be predictable, all errors consume one element each. +//! +//! The iterator adaptors produce neither offset nor element length to work +//! well with other adaptors, +//! while the slice iterators yield both to make more advanced use cases easy. + +use errors::{InvalidUtf8Slice, InvalidUtf16FirstUnit, Utf16PairError}; +use errors::InvalidUtf8Slice::*; +use errors::InvalidUtf8::*; +use errors::InvalidUtf8FirstByte::*; +use errors::InvalidUtf16Slice::*; +use errors::InvalidCodepoint::*; +use errors::Utf16PairError::*; +use utf8_char::Utf8Char; +use utf16_char::Utf16Char; +use traits::U16UtfExt; +extern crate core; +use self::core::borrow::Borrow; +use self::core::fmt::{self, Debug}; +use self::core::iter::Chain; +use self::core::option; + + +/// Decodes UTF-8 characters from a byte iterator into `Utf8Char`s. +/// +/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars) +/// for examples and error handling. +#[derive(Clone, Default)] +pub struct Utf8CharMerger<B:Borrow<u8>, I:Iterator<Item=B>> { + iter: I, + /// number of bytes that were read before an error was detected + after_err_leftover: u8, + /// stack because it simplifies popping. + after_err_stack: [u8; 3], +} +impl<B:Borrow<u8>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>> +From<T> for Utf8CharMerger<B, I> { + fn from(t: T) -> Self { + Utf8CharMerger { + iter: t.into_iter(), + after_err_leftover: 0, + after_err_stack: [0; 3], + } + } +} +impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> { + /// Extract the inner iterator. + /// + /// If the last item produced by `.next()` was an `Err`, + /// up to three following bytes might be missing. + /// The exact number of missing bytes for each error type should not be relied on. + /// + /// # Examples + /// + /// Three bytes swallowed: + /// ``` + /// # use encode_unicode::IterExt; + /// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars(); + /// assert!(merger.next().unwrap().is_err()); + /// let mut inner: std::slice::Iter<u8> = merger.into_inner(); + /// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared + /// ``` + /// + /// All bytes present: + /// ``` + /// # use encode_unicode::IterExt; + /// let mut merger = b"\xb0FS".iter().to_utf8chars(); + /// assert!(merger.next().unwrap().is_err()); + /// assert_eq!(merger.into_inner().next(), Some(&b'F')); + /// ``` + /// + /// Two bytes missing: + /// ``` + /// # use encode_unicode::IterExt; + /// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars(); + /// assert!(merger.next().unwrap().is_err()); + /// assert_eq!(merger.into_inner().next(), Some(&b'F')); + /// ``` + pub fn into_inner(self) -> I { + self.iter + } + + fn save(&mut self, bytes: &[u8;4], len: usize) { + // forget bytes[0] and push the others onto self.after_err_stack (in reverse). + for &after_err in bytes[1..len].iter().rev() { + self.after_err_stack[self.after_err_leftover as usize] = after_err; + self.after_err_leftover += 1; + } + } + /// Reads len-1 bytes into bytes[1..] + fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),InvalidUtf8Slice> { + // This is the only function that pushes onto after_err_stack, + // and it checks that all bytes are continuation bytes before fetching the next one. + // Therefore only the last byte retrieved can be a non-continuation byte. + // That last byte is also the last to be retrieved from after_err. + // + // Before this function is called, there has been retrieved at least one byte. + // If that byte was a continuation byte, next() produces an error + // and won't call this function. + // Therefore, we know that after_err is empty at this point. + // This means that we can use self.iter directly, and knows where to start pushing + debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack); + for i in 1..len { + if let Some(extra) = self.iter.next() { + let extra = *extra.borrow(); + bytes[i] = extra; + if extra & 0b1100_0000 != 0b1000_0000 { + // not a continuation byte + self.save(bytes, i+1); + return Err(InvalidUtf8Slice::Utf8(NotAContinuationByte(i))) + } + } else { + self.save(bytes, i); + return Err(TooShort(len)); + } + } + Ok(()) + } +} +impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> { + type Item = Result<Utf8Char,InvalidUtf8Slice>; + fn next(&mut self) -> Option<Self::Item> { + let first: u8; + if self.after_err_leftover != 0 { + self.after_err_leftover -= 1; + first = self.after_err_stack[self.after_err_leftover as usize]; + } else if let Some(next) = self.iter.next() { + first = *next.borrow(); + } else { + return None; + } + + unsafe { + let mut bytes = [first, 0, 0, 0]; + let ok = match first { + 0b0000_0000...0b0111_1111 => {/*1 and */Ok(())}, + 0b1100_0010...0b1101_1111 => {//2 and not overlong + self.extra(&mut bytes, 2) // no extra validation required + }, + 0b1110_0000...0b1110_1111 => {//3 + if let Err(e) = self.extra(&mut bytes, 3) { + Err(e) + } else if bytes[0] == 0b1110_0000 && bytes[1] <= 0b10_011111 { + self.save(&bytes, 3); + Err(Utf8(OverLong)) + } else if bytes[0] == 0b1110_1101 && bytes[1] & 0b11_100000 == 0b10_100000 { + self.save(&bytes, 3); + Err(Codepoint(Utf16Reserved)) + } else { + Ok(()) + } + }, + 0b1111_0000...0b1111_0100 => {//4 + if let Err(e) = self.extra(&mut bytes, 4) { + Err(e) + } else if bytes[0] == 0b11110_000 && bytes[1] <= 0b10_001111 { + self.save(&bytes, 4); + Err(InvalidUtf8Slice::Utf8(OverLong)) + } else if bytes[0] == 0b11110_100 && bytes[1] > 0b10_001111 { + self.save(&bytes, 4); + Err(InvalidUtf8Slice::Codepoint(TooHigh)) + } else { + Ok(()) + } + }, + 0b1000_0000...0b1011_1111 => {// continuation byte + Err(Utf8(FirstByte(ContinuationByte))) + }, + 0b1100_0000...0b1100_0001 => {// 2 and overlong + Err(Utf8(OverLong)) + }, + 0b1111_0101...0b1111_0111 => {// 4 and too high codepoint + Err(Codepoint(TooHigh)) + }, + 0b1111_1000...0b1111_1111 => { + Err(Utf8(FirstByte(TooLongSeqence))) + }, + _ => unreachable!("all possible byte values should be covered") + }; + Some(ok.map(|()| Utf8Char::from_array_unchecked(bytes) )) + } + } + fn size_hint(&self) -> (usize,Option<usize>) { + let (iter_min, iter_max) = self.iter.size_hint(); + // cannot be exact, so KISS + let min = iter_min / 4; // don't bother rounding up or accounting for after_err + // handle edge case of max > usize::MAX-3 just in case. + // Using wrapping_add() wouldn't violate any API contract as the trait isn't unsafe. + let max = iter_max.and_then(|max| { + max.checked_add(self.after_err_leftover as usize) + }); + (min, max) + } +} +impl<B:Borrow<u8>, I:Iterator<Item=B>+Debug> Debug for Utf8CharMerger<B,I> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + let mut in_order = [0u8; 3]; + for i in 0..self.after_err_leftover as usize { + in_order[i] = self.after_err_stack[self.after_err_leftover as usize - i - 1]; + } + fmtr.debug_struct("Utf8CharMerger") + .field("buffered", &&in_order[..self.after_err_leftover as usize]) + .field("inner", &self.iter) + .finish() + } +} + + +/// An [`Utf8CharMerger`](struct.Utf8CharMerger.html) that also produces +/// offsets and lengths, but can only iterate over slices. +/// +/// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices) +/// for examples and error handling. +#[derive(Clone, Default)] +pub struct Utf8CharDecoder<'a> { + slice: &'a[u8], + index: usize, +} +impl<'a> From<&'a[u8]> for Utf8CharDecoder<'a> { + fn from(s: &[u8]) -> Utf8CharDecoder { + Utf8CharDecoder { slice: s, index: 0 } + } +} +impl<'a> Utf8CharDecoder<'a> { + /// Extract the remainder of the source slice. + /// + /// # Examples + /// + /// Unlike `Utf8CharMerger::into_inner()`, bytes directly after an error + /// are never swallowed: + /// ``` + /// # use encode_unicode::SliceExt; + /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices(); + /// assert!(iter.next().unwrap().1.is_err()); + /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS"); + /// ``` + pub fn as_slice(&self) -> &'a[u8] { + &self.slice[self.index..] + } +} +impl<'a> Iterator for Utf8CharDecoder<'a> { + type Item = (usize, Result<Utf8Char,InvalidUtf8Slice>, usize); + fn next(&mut self) -> Option<Self::Item> { + let start = self.index; + match Utf8Char::from_slice_start(&self.slice[self.index..]) { + Ok((u8c, len)) => { + self.index += len; + Some((start, Ok(u8c), len)) + }, + Err(TooShort(1)) => None, + Err(e) => { + self.index += 1; + Some((start, Err(e), 1)) + } + } + } + #[inline] + fn size_hint(&self) -> (usize,Option<usize>) { + let bytes = self.slice.len() - self.index; + // Cannot be exact, so KISS and don't bother rounding up. + // The slice is unlikely be full of 4-byte codepoints, so buffers + // allocated with the lower bound will have to be grown anyway. + (bytes/4, Some(bytes)) + } +} +impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> { + fn next_back(&mut self) -> Option<Self::Item> { + if self.index < self.slice.len() { + let extras = self.slice.iter() + .rev() + .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 ) + .count(); + let starts = self.slice.len() - (extras+1); + match Utf8Char::from_slice_start(&self.slice[starts..]) { + Ok((u8c,len)) if len == 1+extras => { + self.slice = &self.slice[..starts]; + Some((starts, Ok(u8c), len)) + }, + // This enures errors for every byte in both directions, + // but means overlong and codepoint errors will be turned into + // tooshort errors. + Err(e) if extras == 0 => { + self.slice = &self.slice[..self.slice.len()-1]; + Some((self.slice.len()-1, Err(e), 1)) + }, + _ => { + self.slice = &self.slice[..self.slice.len()-1]; + Some((self.slice.len()-1, Err(Utf8(FirstByte(ContinuationByte))), 1)) + }, + } + } else { + None + } + } +} +impl<'a> Debug for Utf8CharDecoder<'a> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + write!(fmtr, "Utf8CharDecoder {{ bytes[{}..]: {:?} }}", self.index, self.as_slice()) + } +} + + + +/// Decodes UTF-16 characters from a `u16` iterator into `Utf16Char`s. +/// +/// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars) +/// for examples and error handling. +#[derive(Clone, Default)] +pub struct Utf16CharMerger<B:Borrow<u16>, I:Iterator<Item=B>> { + iter: I, + /// Used when a trailing surrogate was expected, the u16 can be any value. + prev: Option<B>, +} +impl<B:Borrow<u16>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>> +From<T> for Utf16CharMerger<B,I> { + fn from(t: T) -> Self { + Utf16CharMerger { iter: t.into_iter(), prev: None } + } +} +impl<B:Borrow<u16>, I:Iterator<Item=B>> Utf16CharMerger<B,I> { + /// Extract the inner iterator. + /// + /// If the last item produced was an `Err`, the first unit might be missing. + /// + /// # Examples + /// + /// Unit right after an error missing + /// ``` + /// # use encode_unicode::IterExt; + /// # use encode_unicode::error::Utf16PairError; + /// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars(); + /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate))); + /// let mut inner: std::slice::Iter<u16> = merger.into_inner(); + /// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger + /// ``` + /// + /// Error that doesn't swallow any units + /// ``` + /// # use encode_unicode::IterExt; + /// # use encode_unicode::error::Utf16PairError; + /// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars(); + /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate))); + /// let mut inner: std::slice::Iter<u16> = merger.into_inner(); + /// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed + /// ``` + pub fn into_inner(self) -> I { + self.iter + } + /// Returns an iterator over the remaining units. + /// Unlike `into_inner()` this will never drop any units. + /// + /// The exact type of the returned iterator should not be depended on. + /// + /// # Examples + /// + /// ``` + /// # use encode_unicode::IterExt; + /// # use encode_unicode::error::Utf16PairError; + /// let slice = [0xd901, 'F' as u16, 'S' as u16]; + /// let mut merger = slice.iter().to_utf16chars(); + /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate))); + /// let mut remaining = merger.into_remaining_units(); + /// assert_eq!(remaining.next(), Some('F' as u16).as_ref()); + /// ``` + pub fn into_remaining_units(self) -> Chain<option::IntoIter<B>,I> { + self.prev.into_iter().chain(self.iter) + } +} +impl<B:Borrow<u16>, I:Iterator<Item=B>> Iterator for Utf16CharMerger<B,I> { + type Item = Result<Utf16Char,Utf16PairError>; + fn next(&mut self) -> Option<Self::Item> { + let first = self.prev.take().or_else(|| self.iter.next() ); + first.map(|first| unsafe { + match first.borrow().utf16_needs_extra_unit() { + Ok(false) => Ok(Utf16Char::from_array_unchecked([*first.borrow(), 0])), + Ok(true) => match self.iter.next() { + Some(second) => match second.borrow().utf16_needs_extra_unit() { + Err(InvalidUtf16FirstUnit) => Ok(Utf16Char::from_tuple_unchecked(( + *first.borrow(), + Some(*second.borrow()) + ))), + Ok(_) => { + self.prev = Some(second); + Err(Utf16PairError::UnmatchedLeadingSurrogate) + } + }, + None => Err(Utf16PairError::Incomplete) + }, + Err(InvalidUtf16FirstUnit) => Err(Utf16PairError::UnexpectedTrailingSurrogate), + } + }) + } + fn size_hint(&self) -> (usize,Option<usize>) { + let (iter_min, iter_max) = self.iter.size_hint(); + // cannot be exact, so KISS + let min = iter_min / 2; // don't bother rounding up or accounting for self.prev + let max = match (iter_max, &self.prev) { + (Some(max), &Some(_)) => max.checked_add(1), + (max, _) => max, + }; + (min, max) + } +} +impl<B:Borrow<u16>, I:Iterator<Item=B>+Debug> Debug for Utf16CharMerger<B,I> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + fmtr.debug_struct("Utf16CharMerger") + .field("buffered", &self.prev.as_ref().map(|b| *b.borrow() )) + .field("inner", &self.iter) + .finish() + } +} + + +/// An [`Utf16CharMerger`](struct.Utf16CharMerger.html) that also produces +/// offsets and lengths, but can only iterate over slices. +/// +/// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices) +/// for examples and error handling. +#[derive(Clone, Default)] +pub struct Utf16CharDecoder<'a> { + slice: &'a[u16], + index: usize, +} +impl<'a> From<&'a[u16]> for Utf16CharDecoder<'a> { + fn from(s: &'a[u16]) -> Self { + Utf16CharDecoder{ slice: s, index: 0 } + } +} +impl<'a> Utf16CharDecoder<'a> { + /// Extract the remainder of the source slice. + /// + /// # Examples + /// + /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed: + /// ``` + /// # use encode_unicode::SliceExt; + /// # use encode_unicode::error::Utf16PairError; + /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices(); + /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1))); + /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]); + /// ``` + pub fn as_slice(&self) -> &[u16] { + &self.slice[self.index..] + } +} +impl<'a> Iterator for Utf16CharDecoder<'a> { + type Item = (usize,Result<Utf16Char,Utf16PairError>,usize); + #[inline] + fn next(&mut self) -> Option<Self::Item> { + let start = self.index; + match Utf16Char::from_slice_start(self.as_slice()) { + Ok((u16c,len)) => { + self.index += len; + Some((start, Ok(u16c), len)) + }, + Err(EmptySlice) => None, + Err(FirstLowSurrogate) => { + self.index += 1; + Some((start, Err(UnexpectedTrailingSurrogate), 1)) + }, + Err(SecondNotLowSurrogate) => { + self.index += 1; + Some((start, Err(UnmatchedLeadingSurrogate), 1)) + }, + Err(MissingSecond) => { + self.index = self.slice.len(); + Some((start, Err(Incomplete), 1)) + } + } + } + #[inline] + fn size_hint(&self) -> (usize,Option<usize>) { + let units = self.slice.len() - self.index; + // Cannot be exact, so KISS and don't bother rounding up. + // The slice is unlikely be full of surrogate pairs, so buffers + // allocated with the lower bound will have to be grown anyway. + (units/2, Some(units)) + } +} +impl<'a> Debug for Utf16CharDecoder<'a> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + write!(fmtr, "Utf16CharDecoder {{ units[{}..]: {:?} }}", self.index, self.as_slice()) + } +} |