From 1b6a04ca5504955c571d1c97504fb45ea0befee4 Mon Sep 17 00:00:00 2001 From: Valentin Popov Date: Mon, 8 Jan 2024 01:21:28 +0400 Subject: Initial vendor packages Signed-off-by: Valentin Popov --- vendor/encode_unicode/src/utf8_iterators.rs | 352 ++++++++++++++++++++++++++++ 1 file changed, 352 insertions(+) create mode 100644 vendor/encode_unicode/src/utf8_iterators.rs (limited to 'vendor/encode_unicode/src/utf8_iterators.rs') diff --git a/vendor/encode_unicode/src/utf8_iterators.rs b/vendor/encode_unicode/src/utf8_iterators.rs new file mode 100644 index 0000000..891d729 --- /dev/null +++ b/vendor/encode_unicode/src/utf8_iterators.rs @@ -0,0 +1,352 @@ +/* Copyright 2016 The encode_unicode Developers + * + * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be + * copied, modified, or distributed except according to those terms. + */ + +use utf8_char::Utf8Char; +use errors::EmptyStrError; +extern crate core; +use self::core::{mem, u32, u64}; +use self::core::ops::Not; +use self::core::fmt; +use self::core::borrow::Borrow; +#[cfg(feature="std")] +use std::io::{Read, Error as ioError}; + + + +/// Read or iterate over the bytes of the UTF-8 representation of a codepoint. +#[derive(Clone)] +pub struct Utf8Iterator (u32); + +impl From for Utf8Iterator { + fn from(uc: Utf8Char) -> Self { + let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) }); + // uses u64 because shifting an u32 by 32 bits is a no-op. + let unused_set = (u64::MAX << uc.len() as u64*8) as u32; + Utf8Iterator(used | unused_set) + } +} +impl From for Utf8Iterator { + fn from(c: char) -> Self { + Self::from(Utf8Char::from(c)) + } +} +impl Iterator for Utf8Iterator { + type Item=u8; + fn next(&mut self) -> Option { + let next = self.0 as u8; + if next == 0xff { + None + } else { + self.0 = (self.0 >> 8) | 0xff_00_00_00; + Some(next) + } + } + fn size_hint(&self) -> (usize, Option) { + (self.len(), Some(self.len())) + } +} +impl ExactSizeIterator for Utf8Iterator { + fn len(&self) -> usize {// not straightforward, but possible + let unused_bytes = self.0.not().leading_zeros() / 8; + 4 - unused_bytes as usize + } +} +#[cfg(feature="std")] +impl Read for Utf8Iterator { + /// Always returns Ok + fn read(&mut self, buf: &mut[u8]) -> Result { + // Cannot call self.next() until I know I can write the result. + for (i, dst) in buf.iter_mut().enumerate() { + match self.next() { + Some(b) => *dst = b, + None => return Ok(i), + } + } + Ok(buf.len()) + } +} +impl fmt::Debug for Utf8Iterator { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + let mut content = [0; 4]; + let mut i = 0; + for b in self.clone() { + content[i] = b; + i += 1; + } + write!(fmtr, "{:?}", &content[..i]) + } +} + + + +/// Converts an iterator of `Utf8Char` (or `&Utf8Char`) +/// to an iterator of `u8`s. +/// Is equivalent to calling `.flat_map()` on the original iterator, +/// but the returned iterator is ~40% faster. +/// +/// The iterator also implements `Read` (if the `std` feature isn't disabled). +/// Reading will never produce an error, and calls to `.read()` and `.next()` +/// can be mixed. +/// +/// The exact number of bytes cannot be known in advance, but `size_hint()` +/// gives the possible range. +/// (min: all remaining characters are ASCII, max: all require four bytes) +/// +/// # Examples +/// +/// From iterator of values: +/// +/// ``` +/// use encode_unicode::{iter_bytes, CharExt}; +/// +/// let iterator = "foo".chars().map(|c| c.to_utf8() ); +/// let mut bytes = [0; 4]; +/// for (u,dst) in iter_bytes(iterator).zip(&mut bytes) {*dst=u;} +/// assert_eq!(&bytes, b"foo\0"); +/// ``` +/// +/// From iterator of references: +/// +#[cfg_attr(feature="std", doc=" ```")] +#[cfg_attr(not(feature="std"), doc=" ```no_compile")] +/// use encode_unicode::{iter_bytes, CharExt, Utf8Char}; +/// +/// let chars: Vec = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect(); +/// let bytes: Vec = iter_bytes(&chars).collect(); +/// let flat_map: Vec = chars.iter().flat_map(|u8c| *u8c ).collect(); +/// assert_eq!(bytes, flat_map); +/// ``` +/// +/// `Read`ing from it: +/// +#[cfg_attr(feature="std", doc=" ```")] +#[cfg_attr(not(feature="std"), doc=" ```no_compile")] +/// use encode_unicode::{iter_bytes, CharExt}; +/// use std::io::Read; +/// +/// let s = "Ååh‽"; +/// assert_eq!(s.len(), 8); +/// let mut buf = [b'E'; 9]; +/// let mut reader = iter_bytes(s.chars().map(|c| c.to_utf8() )); +/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8); +/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0); +/// assert_eq!(&buf[..8], s.as_bytes()); +/// assert_eq!(buf[8], b'E'); +/// ``` +pub fn iter_bytes, I:IntoIterator> +(iterable: I) -> Utf8CharSplitter { + Utf8CharSplitter{ inner: iterable.into_iter(), prev: 0 } +} + +/// The iterator type returned by `iter_bytes()` +/// +/// See its documentation for details. +#[derive(Clone)] +pub struct Utf8CharSplitter, I:Iterator> { + inner: I, + prev: u32, +} +impl> From for Utf8CharSplitter { + /// A less generic constructor than `iter_bytes()` + fn from(iter: I) -> Self { + iter_bytes(iter) + } +} +impl, I:Iterator> Utf8CharSplitter { + /// Extracts the source iterator. + /// + /// Note that `iter_bytes(iter.into_inner())` is not a no-op: + /// If the last returned byte from `next()` was not an ASCII by, + /// the remaining bytes of that codepoint is lost. + pub fn into_inner(self) -> I { + self.inner + } +} +impl, I:Iterator> Iterator for Utf8CharSplitter { + type Item = u8; + fn next(&mut self) -> Option { + if self.prev == 0 { + self.inner.next().map(|u8c| { + let array = u8c.borrow().to_array().0; + self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8; + array[0] + }) + } else { + let next = self.prev as u8; + self.prev >>= 8; + Some(next) + } + } + fn size_hint(&self) -> (usize,Option) { + // Doesn't need to handle unlikely overflows correctly because + // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) + let (min, max) = self.inner.size_hint(); + let add = 4 - (self.prev.leading_zeros() / 8) as usize; + (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) )) + } +} +#[cfg(feature="std")] +impl, I:Iterator> Read for Utf8CharSplitter { + /// Always returns `Ok` + fn read(&mut self, buf: &mut[u8]) -> Result { + let mut i = 0; + // write remaining bytes of previous codepoint + while self.prev != 0 && i < buf.len() { + buf[i] = self.prev as u8; + self.prev >>= 8; + i += 1; + } + // write whole characters + while i < buf.len() { + let bytes = match self.inner.next() { + Some(u8c) => u8c.borrow().to_array().0, + None => break + }; + buf[i] = bytes[0]; + i += 1; + if bytes[1] != 0 { + let len = bytes[0].not().leading_zeros() as usize; + let mut written = 1; + while written < len { + if i < buf.len() { + buf[i] = bytes[written]; + i += 1; + written += 1; + } else { + let bytes_as_u32 = unsafe{ u32::from_le(mem::transmute(bytes)) }; + self.prev = bytes_as_u32 >> (8*written); + return Ok(i); + } + } + } + } + Ok(i) + } +} + + + +/// An iterator over the `Utf8Char` of a string slice, and their positions. +/// +/// This struct is created by the `utf8char_indices() method from [`StrExt`] trait. See its documentation for more. +#[derive(Clone)] +pub struct Utf8CharIndices<'a>{ + str: &'a str, + index: usize, +} +impl<'a> From<&'a str> for Utf8CharIndices<'a> { + fn from(s: &str) -> Utf8CharIndices { + Utf8CharIndices{str: s, index: 0} + } +} +impl<'a> Utf8CharIndices<'a> { + /// Extract the remainder of the source `str`. + /// + /// # Examples + /// + /// ``` + /// use encode_unicode::{StrExt, Utf8Char}; + /// let mut iter = "abc".utf8char_indices(); + /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c')))); + /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a')))); + /// assert_eq!(iter.as_str(), "b"); + /// ``` + pub fn as_str(&self) -> &'a str { + &self.str[self.index..] + } +} +impl<'a> Iterator for Utf8CharIndices<'a> { + type Item = (usize,Utf8Char); + fn next(&mut self) -> Option<(usize,Utf8Char)> { + match Utf8Char::from_str_start(&self.str[self.index..]) { + Ok((u8c, len)) => { + let item = (self.index, u8c); + self.index += len; + Some(item) + }, + Err(EmptyStrError) => None + } + } + fn size_hint(&self) -> (usize,Option) { + let len = self.str.len() - self.index; + // For len+3 to overflow, the slice must fill all but two bytes of + // addressable memory, and size_hint() doesn't need to be correct. + (len.wrapping_add(3)/4, Some(len)) + } +} +impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> { + fn next_back(&mut self) -> Option<(usize,Utf8Char)> { + // Cannot refactor out the unwrap without switching to ::from_slice() + // since slicing the str panics if not on a boundary. + if self.index < self.str.len() { + let rev = self.str.bytes().rev(); + let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); + let starts = self.str.len() - len; + let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap(); + self.str = &self.str[..starts]; + Some((starts, u8c)) + } else { + None + } + } +} +impl<'a> fmt::Debug for Utf8CharIndices<'a> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + fmtr.debug_tuple("Utf8CharIndices") + .field(&self.index) + .field(&self.as_str()) + .finish() + } +} + + +/// An iterator over the codepoints in a `str` represented as `Utf8Char`. +#[derive(Clone)] +pub struct Utf8Chars<'a>(Utf8CharIndices<'a>); +impl<'a> From<&'a str> for Utf8Chars<'a> { + fn from(s: &str) -> Utf8Chars { + Utf8Chars(Utf8CharIndices::from(s)) + } +} +impl<'a> Utf8Chars<'a> { + /// Extract the remainder of the source `str`. + /// + /// # Examples + /// + /// ``` + /// use encode_unicode::{StrExt, Utf8Char}; + /// let mut iter = "abc".utf8chars(); + /// assert_eq!(iter.next(), Some(Utf8Char::from('a'))); + /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c'))); + /// assert_eq!(iter.as_str(), "b"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} +impl<'a> Iterator for Utf8Chars<'a> { + type Item = Utf8Char; + fn next(&mut self) -> Option { + self.0.next().map(|(_,u8c)| u8c ) + } + fn size_hint(&self) -> (usize,Option) { + self.0.size_hint() + } +} +impl<'a> DoubleEndedIterator for Utf8Chars<'a> { + fn next_back(&mut self) -> Option { + self.0.next_back().map(|(_,u8c)| u8c ) + } +} +impl<'a> fmt::Debug for Utf8Chars<'a> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + fmtr.debug_tuple("Utf8CharIndices") + .field(&self.as_str()) + .finish() + } +} -- cgit v1.2.3