diff options
Diffstat (limited to 'vendor/encode_unicode/src/utf8_iterators.rs')
-rw-r--r-- | vendor/encode_unicode/src/utf8_iterators.rs | 352 |
1 files changed, 0 insertions, 352 deletions
diff --git a/vendor/encode_unicode/src/utf8_iterators.rs b/vendor/encode_unicode/src/utf8_iterators.rs deleted file mode 100644 index 891d729..0000000 --- a/vendor/encode_unicode/src/utf8_iterators.rs +++ /dev/null @@ -1,352 +0,0 @@ -/* Copyright 2016 The encode_unicode Developers - * - * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or - * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or - * http://opensource.org/licenses/MIT>, at your option. This file may not be - * copied, modified, or distributed except according to those terms. - */ - -use utf8_char::Utf8Char; -use errors::EmptyStrError; -extern crate core; -use self::core::{mem, u32, u64}; -use self::core::ops::Not; -use self::core::fmt; -use self::core::borrow::Borrow; -#[cfg(feature="std")] -use std::io::{Read, Error as ioError}; - - - -/// Read or iterate over the bytes of the UTF-8 representation of a codepoint. -#[derive(Clone)] -pub struct Utf8Iterator (u32); - -impl From<Utf8Char> for Utf8Iterator { - fn from(uc: Utf8Char) -> Self { - let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) }); - // uses u64 because shifting an u32 by 32 bits is a no-op. - let unused_set = (u64::MAX << uc.len() as u64*8) as u32; - Utf8Iterator(used | unused_set) - } -} -impl From<char> for Utf8Iterator { - fn from(c: char) -> Self { - Self::from(Utf8Char::from(c)) - } -} -impl Iterator for Utf8Iterator { - type Item=u8; - fn next(&mut self) -> Option<u8> { - let next = self.0 as u8; - if next == 0xff { - None - } else { - self.0 = (self.0 >> 8) | 0xff_00_00_00; - Some(next) - } - } - fn size_hint(&self) -> (usize, Option<usize>) { - (self.len(), Some(self.len())) - } -} -impl ExactSizeIterator for Utf8Iterator { - fn len(&self) -> usize {// not straightforward, but possible - let unused_bytes = self.0.not().leading_zeros() / 8; - 4 - unused_bytes as usize - } -} -#[cfg(feature="std")] -impl Read for Utf8Iterator { - /// Always returns Ok - fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> { - // Cannot call self.next() until I know I can write the result. - for (i, dst) in buf.iter_mut().enumerate() { - match self.next() { - Some(b) => *dst = b, - None => return Ok(i), - } - } - Ok(buf.len()) - } -} -impl fmt::Debug for Utf8Iterator { - fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { - let mut content = [0; 4]; - let mut i = 0; - for b in self.clone() { - content[i] = b; - i += 1; - } - write!(fmtr, "{:?}", &content[..i]) - } -} - - - -/// Converts an iterator of `Utf8Char` (or `&Utf8Char`) -/// to an iterator of `u8`s. -/// Is equivalent to calling `.flat_map()` on the original iterator, -/// but the returned iterator is ~40% faster. -/// -/// The iterator also implements `Read` (if the `std` feature isn't disabled). -/// Reading will never produce an error, and calls to `.read()` and `.next()` -/// can be mixed. -/// -/// The exact number of bytes cannot be known in advance, but `size_hint()` -/// gives the possible range. -/// (min: all remaining characters are ASCII, max: all require four bytes) -/// -/// # Examples -/// -/// From iterator of values: -/// -/// ``` -/// use encode_unicode::{iter_bytes, CharExt}; -/// -/// let iterator = "foo".chars().map(|c| c.to_utf8() ); -/// let mut bytes = [0; 4]; -/// for (u,dst) in iter_bytes(iterator).zip(&mut bytes) {*dst=u;} -/// assert_eq!(&bytes, b"foo\0"); -/// ``` -/// -/// From iterator of references: -/// -#[cfg_attr(feature="std", doc=" ```")] -#[cfg_attr(not(feature="std"), doc=" ```no_compile")] -/// use encode_unicode::{iter_bytes, CharExt, Utf8Char}; -/// -/// let chars: Vec<Utf8Char> = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect(); -/// let bytes: Vec<u8> = iter_bytes(&chars).collect(); -/// let flat_map: Vec<u8> = chars.iter().flat_map(|u8c| *u8c ).collect(); -/// assert_eq!(bytes, flat_map); -/// ``` -/// -/// `Read`ing from it: -/// -#[cfg_attr(feature="std", doc=" ```")] -#[cfg_attr(not(feature="std"), doc=" ```no_compile")] -/// use encode_unicode::{iter_bytes, CharExt}; -/// use std::io::Read; -/// -/// let s = "Ååh‽"; -/// assert_eq!(s.len(), 8); -/// let mut buf = [b'E'; 9]; -/// let mut reader = iter_bytes(s.chars().map(|c| c.to_utf8() )); -/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8); -/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0); -/// assert_eq!(&buf[..8], s.as_bytes()); -/// assert_eq!(buf[8], b'E'); -/// ``` -pub fn iter_bytes<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>> -(iterable: I) -> Utf8CharSplitter<U, I::IntoIter> { - Utf8CharSplitter{ inner: iterable.into_iter(), prev: 0 } -} - -/// The iterator type returned by `iter_bytes()` -/// -/// See its documentation for details. -#[derive(Clone)] -pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> { - inner: I, - prev: u32, -} -impl<I:Iterator<Item=Utf8Char>> From<I> for Utf8CharSplitter<Utf8Char,I> { - /// A less generic constructor than `iter_bytes()` - fn from(iter: I) -> Self { - iter_bytes(iter) - } -} -impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> { - /// Extracts the source iterator. - /// - /// Note that `iter_bytes(iter.into_inner())` is not a no-op: - /// If the last returned byte from `next()` was not an ASCII by, - /// the remaining bytes of that codepoint is lost. - pub fn into_inner(self) -> I { - self.inner - } -} -impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> { - type Item = u8; - fn next(&mut self) -> Option<Self::Item> { - if self.prev == 0 { - self.inner.next().map(|u8c| { - let array = u8c.borrow().to_array().0; - self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8; - array[0] - }) - } else { - let next = self.prev as u8; - self.prev >>= 8; - Some(next) - } - } - fn size_hint(&self) -> (usize,Option<usize>) { - // Doesn't need to handle unlikely overflows correctly because - // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) - let (min, max) = self.inner.size_hint(); - let add = 4 - (self.prev.leading_zeros() / 8) as usize; - (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) )) - } -} -#[cfg(feature="std")] -impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Read for Utf8CharSplitter<U,I> { - /// Always returns `Ok` - fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> { - let mut i = 0; - // write remaining bytes of previous codepoint - while self.prev != 0 && i < buf.len() { - buf[i] = self.prev as u8; - self.prev >>= 8; - i += 1; - } - // write whole characters - while i < buf.len() { - let bytes = match self.inner.next() { - Some(u8c) => u8c.borrow().to_array().0, - None => break - }; - buf[i] = bytes[0]; - i += 1; - if bytes[1] != 0 { - let len = bytes[0].not().leading_zeros() as usize; - let mut written = 1; - while written < len { - if i < buf.len() { - buf[i] = bytes[written]; - i += 1; - written += 1; - } else { - let bytes_as_u32 = unsafe{ u32::from_le(mem::transmute(bytes)) }; - self.prev = bytes_as_u32 >> (8*written); - return Ok(i); - } - } - } - } - Ok(i) - } -} - - - -/// An iterator over the `Utf8Char` of a string slice, and their positions. -/// -/// This struct is created by the `utf8char_indices() method from [`StrExt`] trait. See its documentation for more. -#[derive(Clone)] -pub struct Utf8CharIndices<'a>{ - str: &'a str, - index: usize, -} -impl<'a> From<&'a str> for Utf8CharIndices<'a> { - fn from(s: &str) -> Utf8CharIndices { - Utf8CharIndices{str: s, index: 0} - } -} -impl<'a> Utf8CharIndices<'a> { - /// Extract the remainder of the source `str`. - /// - /// # Examples - /// - /// ``` - /// use encode_unicode::{StrExt, Utf8Char}; - /// let mut iter = "abc".utf8char_indices(); - /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c')))); - /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a')))); - /// assert_eq!(iter.as_str(), "b"); - /// ``` - pub fn as_str(&self) -> &'a str { - &self.str[self.index..] - } -} -impl<'a> Iterator for Utf8CharIndices<'a> { - type Item = (usize,Utf8Char); - fn next(&mut self) -> Option<(usize,Utf8Char)> { - match Utf8Char::from_str_start(&self.str[self.index..]) { - Ok((u8c, len)) => { - let item = (self.index, u8c); - self.index += len; - Some(item) - }, - Err(EmptyStrError) => None - } - } - fn size_hint(&self) -> (usize,Option<usize>) { - let len = self.str.len() - self.index; - // For len+3 to overflow, the slice must fill all but two bytes of - // addressable memory, and size_hint() doesn't need to be correct. - (len.wrapping_add(3)/4, Some(len)) - } -} -impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> { - fn next_back(&mut self) -> Option<(usize,Utf8Char)> { - // Cannot refactor out the unwrap without switching to ::from_slice() - // since slicing the str panics if not on a boundary. - if self.index < self.str.len() { - let rev = self.str.bytes().rev(); - let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); - let starts = self.str.len() - len; - let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap(); - self.str = &self.str[..starts]; - Some((starts, u8c)) - } else { - None - } - } -} -impl<'a> fmt::Debug for Utf8CharIndices<'a> { - fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { - fmtr.debug_tuple("Utf8CharIndices") - .field(&self.index) - .field(&self.as_str()) - .finish() - } -} - - -/// An iterator over the codepoints in a `str` represented as `Utf8Char`. -#[derive(Clone)] -pub struct Utf8Chars<'a>(Utf8CharIndices<'a>); -impl<'a> From<&'a str> for Utf8Chars<'a> { - fn from(s: &str) -> Utf8Chars { - Utf8Chars(Utf8CharIndices::from(s)) - } -} -impl<'a> Utf8Chars<'a> { - /// Extract the remainder of the source `str`. - /// - /// # Examples - /// - /// ``` - /// use encode_unicode::{StrExt, Utf8Char}; - /// let mut iter = "abc".utf8chars(); - /// assert_eq!(iter.next(), Some(Utf8Char::from('a'))); - /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c'))); - /// assert_eq!(iter.as_str(), "b"); - /// ``` - pub fn as_str(&self) -> &'a str { - self.0.as_str() - } -} -impl<'a> Iterator for Utf8Chars<'a> { - type Item = Utf8Char; - fn next(&mut self) -> Option<Utf8Char> { - self.0.next().map(|(_,u8c)| u8c ) - } - fn size_hint(&self) -> (usize,Option<usize>) { - self.0.size_hint() - } -} -impl<'a> DoubleEndedIterator for Utf8Chars<'a> { - fn next_back(&mut self) -> Option<Utf8Char> { - self.0.next_back().map(|(_,u8c)| u8c ) - } -} -impl<'a> fmt::Debug for Utf8Chars<'a> { - fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { - fmtr.debug_tuple("Utf8CharIndices") - .field(&self.as_str()) - .finish() - } -} |