aboutsummaryrefslogtreecommitdiff
path: root/vendor/encode_unicode/src/decoding_iterators.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/encode_unicode/src/decoding_iterators.rs')
-rw-r--r--vendor/encode_unicode/src/decoding_iterators.rs494
1 files changed, 0 insertions, 494 deletions
diff --git a/vendor/encode_unicode/src/decoding_iterators.rs b/vendor/encode_unicode/src/decoding_iterators.rs
deleted file mode 100644
index 4ef4125..0000000
--- a/vendor/encode_unicode/src/decoding_iterators.rs
+++ /dev/null
@@ -1,494 +0,0 @@
-/* Copyright 2018 The encode_unicode Developers
- *
- * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
- * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
- * http://opensource.org/licenses/MIT>, at your option. This file may not be
- * copied, modified, or distributed except according to those terms.
- */
-
-//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail.
-//!
-//! To be predictable, all errors consume one element each.
-//!
-//! The iterator adaptors produce neither offset nor element length to work
-//! well with other adaptors,
-//! while the slice iterators yield both to make more advanced use cases easy.
-
-use errors::{InvalidUtf8Slice, InvalidUtf16FirstUnit, Utf16PairError};
-use errors::InvalidUtf8Slice::*;
-use errors::InvalidUtf8::*;
-use errors::InvalidUtf8FirstByte::*;
-use errors::InvalidUtf16Slice::*;
-use errors::InvalidCodepoint::*;
-use errors::Utf16PairError::*;
-use utf8_char::Utf8Char;
-use utf16_char::Utf16Char;
-use traits::U16UtfExt;
-extern crate core;
-use self::core::borrow::Borrow;
-use self::core::fmt::{self, Debug};
-use self::core::iter::Chain;
-use self::core::option;
-
-
-/// Iterator adaptor that assembles `Utf8Char`s from an iterator over bytes.
-///
-/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars)
-/// for examples and error handling.
-#[derive(Clone, Default)]
-pub struct Utf8CharMerger<B, I>
-where
-    B: Borrow<u8>,
-    I: Iterator<Item = B>,
-{
-    /// The wrapped byte source.
-    iter: I,
-    /// How many bytes of `after_err_stack` are in use, i.e. the number of
-    /// bytes that were read before an error was detected and that still
-    /// have to be examined again.
-    after_err_leftover: u8,
-    /// Storage for those bytes; organised as a stack because that makes
-    /// popping the next byte trivial.
-    after_err_stack: [u8; 3],
-}
-impl<B, I, T> From<T> for Utf8CharMerger<B, I>
-where
-    B: Borrow<u8>,
-    I: Iterator<Item = B>,
-    T: IntoIterator<IntoIter = I, Item = B>,
-{
-    /// Wraps anything that can be iterated as bytes; the buffer of bytes
-    /// left over after an error starts out empty.
-    fn from(t: T) -> Self {
-        let source = t.into_iter();
-        Utf8CharMerger { iter: source, after_err_leftover: 0, after_err_stack: [0; 3] }
-    }
-}
-impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> {
-    /// Consumes the merger and hands back the wrapped iterator.
-    ///
-    /// Bytes that were fetched while decoding a sequence that then turned out
-    /// to be invalid are buffered inside the merger and are not given back,
-    /// so if the last item produced by `.next()` was an `Err`,
-    /// up to three following bytes might be missing.
-    /// The exact number of missing bytes for each error type should not be relied on.
-    ///
-    /// # Examples
-    ///
-    /// Three bytes swallowed:
-    /// ```
-    /// # use encode_unicode::IterExt;
-    /// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars();
-    /// assert!(merger.next().unwrap().is_err());
-    /// let mut inner: std::slice::Iter<u8> = merger.into_inner();
-    /// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared
-    /// ```
-    ///
-    /// All bytes present:
-    /// ```
-    /// # use encode_unicode::IterExt;
-    /// let mut merger = b"\xb0FS".iter().to_utf8chars();
-    /// assert!(merger.next().unwrap().is_err());
-    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
-    /// ```
-    ///
-    /// Two bytes missing:
-    /// ```
-    /// # use encode_unicode::IterExt;
-    /// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars();
-    /// assert!(merger.next().unwrap().is_err());
-    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
-    /// ```
-    pub fn into_inner(self) -> I {
-        let Utf8CharMerger { iter, .. } = self;
-        iter
-    }
-
-    /// Buffers `bytes[1..len]` so that `next()` examines them again.
-    ///
-    /// `bytes[0]` is dropped because the error being reported accounts for it.
-    fn save(&mut self, bytes: &[u8;4], len: usize) {
-        let pending = &bytes[1..len];
-        // Push back-to-front so that popping yields the bytes in their original order.
-        for k in (0..pending.len()).rev() {
-            let slot = self.after_err_leftover as usize;
-            self.after_err_stack[slot] = pending[k];
-            self.after_err_leftover += 1;
-        }
-    }
-
-    /// Fetches the `len-1` bytes that should follow the lead byte into `bytes[1..len]`.
-    ///
-    /// Fails with `TooShort(len)` if the inner iterator runs out, or with
-    /// `NotAContinuationByte(i)` as soon as the byte at position `i` is not a
-    /// continuation byte. In both cases the bytes fetched so far are buffered
-    /// through `save()` before returning.
-    fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),InvalidUtf8Slice> {
-        // Why the buffer must be empty here:
-        // every byte is checked before the next one is fetched, so among the
-        // bytes that get buffered only the one fetched last can be something
-        // other than a continuation byte, and that byte is also the last one
-        // to be popped again.
-        // next() reports a continuation byte in first position as an error
-        // without calling this function, so by the time a byte that can start
-        // a sequence has been popped, nothing else is left in the buffer.
-        // That lets this function read from self.iter directly, and save()
-        // start pushing at the bottom of the stack.
-        debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack);
-        for i in 1..len {
-            let extra: u8 = match self.iter.next() {
-                Some(item) => *item.borrow(),
-                None => {
-                    // ran out of input: bytes[i] was never written
-                    self.save(bytes, i);
-                    return Err(TooShort(len));
-                }
-            };
-            bytes[i] = extra;
-            let is_continuation = extra & 0b1100_0000 == 0b1000_0000;
-            if !is_continuation {
-                // keep the offending byte too, it might start a new sequence
-                self.save(bytes, i+1);
-                return Err(InvalidUtf8Slice::Utf8(NotAContinuationByte(i)));
-            }
-        }
-        Ok(())
-    }
-}
-impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> {
-    type Item = Result<Utf8Char,InvalidUtf8Slice>;
-
-    /// Decodes the next character, or reports why the next byte cannot start one.
-    ///
-    /// Bytes buffered after an earlier error are examined before the inner
-    /// iterator is asked for more. Every error accounts for a single byte;
-    /// any further bytes fetched while detecting it are buffered again.
-    fn next(&mut self) -> Option<Self::Item> {
-        let first: u8 = if self.after_err_leftover != 0 {
-            self.after_err_leftover -= 1;
-            self.after_err_stack[self.after_err_leftover as usize]
-        } else {
-            match self.iter.next() {
-                Some(item) => *item.borrow(),
-                None => return None,
-            }
-        };
-
-        let mut bytes = [first, 0, 0, 0];
-        // The arms are listed in ascending order of the first byte.
-        let checked = match first {
-            // ASCII: a complete character on its own
-            0x00...0x7f => Ok(()),
-            // a continuation byte cannot start a sequence
-            0x80...0xbf => Err(Utf8(FirstByte(ContinuationByte))),
-            // two-byte lead that could only encode an overlong value
-            0xc0...0xc1 => Err(Utf8(OverLong)),
-            // two-byte sequence; no validation beyond the continuation byte required
-            0xc2...0xdf => self.extra(&mut bytes, 2),
-            // three-byte sequence
-            0xe0...0xef => match self.extra(&mut bytes, 3) {
-                Err(e) => Err(e),
-                Ok(()) => {
-                    let (lead, second) = (bytes[0], bytes[1]);
-                    if lead == 0xe0 && second <= 0x9f {
-                        self.save(&bytes, 3);
-                        Err(Utf8(OverLong))
-                    } else if lead == 0xed && second & 0xe0 == 0xa0 {
-                        // would decode to a UTF-16 surrogate
-                        self.save(&bytes, 3);
-                        Err(Codepoint(Utf16Reserved))
-                    } else {
-                        Ok(())
-                    }
-                }
-            },
-            // four-byte sequence
-            0xf0...0xf4 => match self.extra(&mut bytes, 4) {
-                Err(e) => Err(e),
-                Ok(()) => {
-                    let (lead, second) = (bytes[0], bytes[1]);
-                    if lead == 0xf0 && second <= 0x8f {
-                        self.save(&bytes, 4);
-                        Err(Utf8(OverLong))
-                    } else if lead == 0xf4 && second > 0x8f {
-                        self.save(&bytes, 4);
-                        Err(Codepoint(TooHigh))
-                    } else {
-                        Ok(())
-                    }
-                }
-            },
-            // four-byte lead that can only encode a too high codepoint
-            0xf5...0xf7 => Err(Codepoint(TooHigh)),
-            // lead byte announcing a sequence of more than four bytes
-            0xf8...0xff => Err(Utf8(FirstByte(TooLongSeqence))),
-            _ => unreachable!("all possible byte values should be covered")
-        };
-        // SAFETY: `checked` is only Ok after the lead byte and all of its
-        // continuation bytes passed the checks above, and unused positions of
-        // `bytes` are still zero (presumably what `from_array_unchecked`
-        // requires; confirm against its documentation).
-        Some(checked.map(|()| unsafe { Utf8Char::from_array_unchecked(bytes) }))
-    }
-
-    /// Lower bound: a quarter of the inner lower bound, since one character
-    /// takes at most four bytes. Upper bound: the inner upper bound plus the
-    /// buffered bytes, since every byte produces at most one item.
-    fn size_hint(&self) -> (usize,Option<usize>) {
-        let buffered = self.after_err_leftover as usize;
-        let (lower, upper) = self.iter.size_hint();
-        // Cannot be exact, so keep it simple: no rounding up and the buffered
-        // bytes are ignored for the lower bound.
-        // checked_add() turns an overflowing upper bound into "unknown".
-        let upper = match upper {
-            Some(n) => n.checked_add(buffered),
-            None => None,
-        };
-        (lower / 4, upper)
-    }
-}
-impl<B:Borrow<u8>, I:Iterator<Item=B>+Debug> Debug for Utf8CharMerger<B,I> {
-    /// Shows the buffered bytes in the order they will be examined,
-    /// followed by the inner iterator.
-    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
-        let buffered = self.after_err_leftover as usize;
-        let mut in_order = [0u8; 3];
-        // The top of the stack is the byte that is examined next, so reverse it.
-        let popped_first = self.after_err_stack[..buffered].iter().rev();
-        for (dst, &src) in in_order.iter_mut().zip(popped_first) {
-            *dst = src;
-        }
-        fmtr.debug_struct("Utf8CharMerger")
-            .field("buffered", &&in_order[..buffered])
-            .field("inner", &self.iter)
-            .finish()
-    }
-}
-
-
-/// An [`Utf8CharMerger`](struct.Utf8CharMerger.html) that also produces
-/// offsets and lengths, but can only iterate over slices.
-///
-/// Items are `(offset, result, length)` tuples.
-///
-/// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices)
-/// for examples and error handling.
-#[derive(Clone, Default)]
-pub struct Utf8CharDecoder<'a> {
- /// The bytes being decoded; `next_back()` shortens this from the end.
- slice: &'a[u8],
- /// Offset into `slice` of the first byte `next()` has not consumed yet.
- index: usize,
-}
-impl<'a> From<&'a [u8]> for Utf8CharDecoder<'a> {
-    /// Starts decoding at the beginning of `s`.
-    fn from(s: &'a [u8]) -> Self {
-        Utf8CharDecoder { index: 0, slice: s }
-    }
-}
-impl<'a> Utf8CharDecoder<'a> {
-    /// Returns the part of the source slice that has not been consumed yet.
-    ///
-    /// # Examples
-    ///
-    /// Unlike `Utf8CharMerger::into_inner()`, bytes directly after an error
-    /// are never swallowed:
-    /// ```
-    /// # use encode_unicode::SliceExt;
-    /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices();
-    /// assert!(iter.next().unwrap().1.is_err());
-    /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS");
-    /// ```
-    pub fn as_slice(&self) -> &'a[u8] {
-        let (_consumed, rest) = self.slice.split_at(self.index);
-        rest
-    }
-}
-impl<'a> Iterator for Utf8CharDecoder<'a> {
-    type Item = (usize, Result<Utf8Char,InvalidUtf8Slice>, usize);
-
-    /// Decodes the character starting at the current offset.
-    ///
-    /// Yields `(offset, result, length)`; an error always has length one,
-    /// so decoding resumes at the byte right after the one that failed.
-    fn next(&mut self) -> Option<Self::Item> {
-        let start = self.index;
-        let (result, consumed) = match Utf8Char::from_slice_start(&self.slice[start..]) {
-            // treated as the end of input
-            Err(TooShort(1)) => return None,
-            Ok((u8c, len)) => (Ok(u8c), len),
-            Err(e) => (Err(e), 1),
-        };
-        self.index = start + consumed;
-        Some((start, result, consumed))
-    }
-
-    /// Between a quarter of the remaining bytes and all of them.
-    #[inline]
-    fn size_hint(&self) -> (usize,Option<usize>) {
-        let left = self.slice.len() - self.index;
-        // Cannot be exact, so keep it simple and don't bother rounding up.
-        // The slice is unlikely to be full of 4-byte codepoints, so buffers
-        // allocated with the lower bound will have to be grown anyway.
-        let lower = left / 4;
-        (lower, Some(left))
-    }
-}
-impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> {
-    /// Decodes the character that ends the not yet consumed part of the slice.
-    ///
-    /// Yields `(offset, result, length)` like `next()`. Only bytes at or after
-    /// the front offset are looked at. An error always covers exactly the last
-    /// remaining byte, and its offset is the index of that byte in the
-    /// original slice.
-    fn next_back(&mut self) -> Option<Self::Item> {
-        // Copy the reference out so the sub-slices below don't borrow `self`.
-        let slice: &'a [u8] = self.slice;
-        // Never look at bytes that next() has already consumed.
-        let remaining = &slice[self.index..];
-        if remaining.is_empty() {
-            return None;
-        }
-        // Index of the last remaining byte; cannot underflow because
-        // `remaining` (and therefore `slice`) is non-empty.
-        let last = slice.len() - 1;
-        let extras = remaining.iter()
-            .rev()
-            .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
-            .count();
-        // If every remaining byte is a continuation byte there is no possible
-        // lead byte, and `last - extras` would point before the front offset.
-        if extras < remaining.len() {
-            let starts = last - extras;
-            match Utf8Char::from_slice_start(&slice[starts..]) {
-                Ok((u8c,len)) if len == 1+extras => {
-                    self.slice = &slice[..starts];
-                    return Some((starts, Ok(u8c), len));
-                },
-                // This ensures errors for every byte in both directions,
-                // but means overlong and codepoint errors will be turned into
-                // tooshort errors.
-                Err(e) if extras == 0 => {
-                    self.slice = &slice[..last];
-                    return Some((last, Err(e), 1));
-                },
-                // Wrong number of continuation bytes, or an invalid sequence:
-                // fall through and report only the trailing continuation byte.
-                _ => {},
-            }
-        }
-        self.slice = &slice[..last];
-        Some((last, Err(Utf8(FirstByte(ContinuationByte))), 1))
-    }
-}
-impl<'a> Debug for Utf8CharDecoder<'a> {
-    /// Prints the current offset together with the bytes that remain.
-    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
-        let offset = self.index;
-        let rest = self.as_slice();
-        fmtr.write_fmt(format_args!("Utf8CharDecoder {{ bytes[{}..]: {:?} }}", offset, rest))
-    }
-}
-
-
-
-/// Iterator adaptor that assembles `Utf16Char`s from an iterator over `u16` units.
-///
-/// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars)
-/// for examples and error handling.
-#[derive(Clone, Default)]
-pub struct Utf16CharMerger<B, I>
-where
-    B: Borrow<u16>,
-    I: Iterator<Item = B>,
-{
-    /// The wrapped unit source.
-    iter: I,
-    /// A unit that was fetched where a trailing surrogate was expected but
-    /// turned out not to be one; it can hold any value and is examined
-    /// first by the following call to `next()`.
-    prev: Option<B>,
-}
-impl<B, I, T> From<T> for Utf16CharMerger<B, I>
-where
-    B: Borrow<u16>,
-    I: Iterator<Item = B>,
-    T: IntoIterator<IntoIter = I, Item = B>,
-{
-    /// Wraps anything that can be iterated as `u16` units; no unit is buffered initially.
-    fn from(t: T) -> Self {
-        let source = t.into_iter();
-        Utf16CharMerger { prev: None, iter: source }
-    }
-}
-impl<B:Borrow<u16>, I:Iterator<Item=B>> Utf16CharMerger<B,I> {
-    /// Consumes the merger and hands back the wrapped iterator.
-    ///
-    /// A unit buffered by the merger is dropped, so if the last item produced
-    /// was an `Err`, the first unit might be missing.
-    ///
-    /// # Examples
-    ///
-    /// Unit right after an error missing
-    /// ```
-    /// # use encode_unicode::IterExt;
-    /// # use encode_unicode::error::Utf16PairError;
-    /// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars();
-    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
-    /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
-    /// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger
-    /// ```
-    ///
-    /// Error that doesn't swallow any units
-    /// ```
-    /// # use encode_unicode::IterExt;
-    /// # use encode_unicode::error::Utf16PairError;
-    /// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars();
-    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate)));
-    /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
-    /// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed
-    /// ```
-    pub fn into_inner(self) -> I {
-        let Utf16CharMerger { iter, .. } = self;
-        iter
-    }
-
-    /// Consumes the merger and returns an iterator over all remaining units:
-    /// the buffered unit, if there is one, followed by the inner iterator.
-    /// Unlike `into_inner()` this will never drop any units.
-    ///
-    /// The exact type of the returned iterator should not be depended on.
-    ///
-    /// # Examples
-    ///
-    /// ```
-    /// # use encode_unicode::IterExt;
-    /// # use encode_unicode::error::Utf16PairError;
-    /// let slice = [0xd901, 'F' as u16, 'S' as u16];
-    /// let mut merger = slice.iter().to_utf16chars();
-    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
-    /// let mut remaining = merger.into_remaining_units();
-    /// assert_eq!(remaining.next(), Some('F' as u16).as_ref());
-    /// ```
-    pub fn into_remaining_units(self) -> Chain<option::IntoIter<B>,I> {
-        let Utf16CharMerger { iter, prev } = self;
-        prev.into_iter().chain(iter)
-    }
-}
-impl<B:Borrow<u16>, I:Iterator<Item=B>> Iterator for Utf16CharMerger<B,I> {
-    type Item = Result<Utf16Char,Utf16PairError>;
-
-    /// Decodes the next character from one or two units.
-    ///
-    /// A unit buffered by the previous call is examined before the inner
-    /// iterator is asked for more. When a leading surrogate is followed by a
-    /// unit that is not a trailing surrogate, that unit is buffered.
-    fn next(&mut self) -> Option<Self::Item> {
-        let first = match self.prev.take() {
-            Some(unit) => unit,
-            None => match self.iter.next() {
-                Some(unit) => unit,
-                None => return None,
-            },
-        };
-        let lead: u16 = *first.borrow();
-        let decoded = match lead.utf16_needs_extra_unit() {
-            Err(InvalidUtf16FirstUnit) => Err(Utf16PairError::UnexpectedTrailingSurrogate),
-            // SAFETY: `utf16_needs_extra_unit()` reported that `lead` needs no
-            // second unit (presumably what `from_array_unchecked` requires
-            // together with the zero padding; confirm against its documentation).
-            Ok(false) => Ok(unsafe { Utf16Char::from_array_unchecked([lead, 0]) }),
-            Ok(true) => match self.iter.next() {
-                None => Err(Utf16PairError::Incomplete),
-                Some(second) => {
-                    let trail: u16 = *second.borrow();
-                    match trail.utf16_needs_extra_unit() {
-                        // Not usable as a first unit, which is exactly what
-                        // the second unit of a pair has to be.
-                        // SAFETY: `lead` asked for a second unit and `trail`
-                        // was rejected as a first unit (presumably sufficient
-                        // for `from_tuple_unchecked`; confirm against its
-                        // documentation).
-                        Err(InvalidUtf16FirstUnit) => Ok(unsafe {
-                            Utf16Char::from_tuple_unchecked((lead, Some(trail)))
-                        }),
-                        Ok(_) => {
-                            // Could start a character itself, so look at it again next time.
-                            self.prev = Some(second);
-                            Err(Utf16PairError::UnmatchedLeadingSurrogate)
-                        }
-                    }
-                }
-            },
-        };
-        Some(decoded)
-    }
-
-    /// Lower bound: half the inner lower bound, since one character takes at
-    /// most two units. Upper bound: the inner upper bound, plus one if a unit
-    /// is buffered.
-    fn size_hint(&self) -> (usize,Option<usize>) {
-        let (lower, upper) = self.iter.size_hint();
-        // Cannot be exact, so keep it simple: no rounding up and the buffered
-        // unit is ignored for the lower bound.
-        let upper = if self.prev.is_some() {
-            upper.and_then(|n| n.checked_add(1))
-        } else {
-            upper
-        };
-        (lower / 2, upper)
-    }
-}
-impl<B:Borrow<u16>, I:Iterator<Item=B>+Debug> Debug for Utf16CharMerger<B,I> {
-    /// Shows the value of the buffered unit (if any) and the inner iterator.
-    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
-        let buffered: Option<u16> = match self.prev {
-            Some(ref unit) => Some(*unit.borrow()),
-            None => None,
-        };
-        fmtr.debug_struct("Utf16CharMerger")
-            .field("buffered", &buffered)
-            .field("inner", &self.iter)
-            .finish()
-    }
-}
-
-
-/// An [`Utf16CharMerger`](struct.Utf16CharMerger.html) that also produces
-/// offsets and lengths, but can only iterate over slices.
-///
-/// Items are `(offset, result, length)` tuples.
-///
-/// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices)
-/// for examples and error handling.
-#[derive(Clone, Default)]
-pub struct Utf16CharDecoder<'a> {
- /// The units being decoded.
- slice: &'a[u16],
- /// Offset into `slice` of the first unit `next()` has not consumed yet.
- index: usize,
-}
-impl<'a> From<&'a [u16]> for Utf16CharDecoder<'a> {
-    /// Starts decoding at the beginning of `s`.
-    fn from(s: &'a [u16]) -> Self {
-        Utf16CharDecoder { index: 0, slice: s }
-    }
-}
-impl<'a> Utf16CharDecoder<'a> {
-    /// Returns the part of the source slice that has not been consumed yet.
-    ///
-    /// The returned slice borrows from the original source (lifetime `'a`),
-    /// not from the iterator, so it can outlive the iterator, just like the
-    /// slice returned by `Utf8CharDecoder::as_slice()`.
-    ///
-    /// # Examples
-    ///
-    /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed:
-    /// ```
-    /// # use encode_unicode::SliceExt;
-    /// # use encode_unicode::error::Utf16PairError;
-    /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices();
-    /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1)));
-    /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]);
-    /// ```
-    pub fn as_slice(&self) -> &'a [u16] {
-        &self.slice[self.index..]
-    }
-}
-impl<'a> Iterator for Utf16CharDecoder<'a> {
-    type Item = (usize,Result<Utf16Char,Utf16PairError>,usize);
-
-    /// Decodes the character starting at the current offset.
-    ///
-    /// Yields `(offset, result, length)`; an error always has length one.
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        let start = self.index;
-        let (pair_error, resume_at) = match Utf16Char::from_slice_start(self.as_slice()) {
-            Ok((u16c,len)) => {
-                self.index = start + len;
-                return Some((start, Ok(u16c), len));
-            },
-            Err(EmptySlice) => return None,
-            Err(FirstLowSurrogate) => (UnexpectedTrailingSurrogate, start + 1),
-            Err(SecondNotLowSurrogate) => (UnmatchedLeadingSurrogate, start + 1),
-            // nothing follows the leading surrogate: the slice is exhausted
-            Err(MissingSecond) => (Incomplete, self.slice.len()),
-        };
-        self.index = resume_at;
-        Some((start, Err(pair_error), 1))
-    }
-
-    /// Between half of the remaining units and all of them.
-    #[inline]
-    fn size_hint(&self) -> (usize,Option<usize>) {
-        let left = self.slice.len() - self.index;
-        // Cannot be exact, so keep it simple and don't bother rounding up.
-        // The slice is unlikely to be full of surrogate pairs, so buffers
-        // allocated with the lower bound will have to be grown anyway.
-        let lower = left / 2;
-        (lower, Some(left))
-    }
-}
-impl<'a> Debug for Utf16CharDecoder<'a> {
-    /// Prints the current offset together with the units that remain.
-    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
-        let offset = self.index;
-        let rest = self.as_slice();
-        fmtr.write_fmt(format_args!("Utf16CharDecoder {{ units[{}..]: {:?} }}", offset, rest))
-    }
-}