From 1b6a04ca5504955c571d1c97504fb45ea0befee4 Mon Sep 17 00:00:00 2001 From: Valentin Popov Date: Mon, 8 Jan 2024 01:21:28 +0400 Subject: Initial vendor packages Signed-off-by: Valentin Popov --- vendor/encode_unicode/src/utf16_iterators.rs | 270 +++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 vendor/encode_unicode/src/utf16_iterators.rs (limited to 'vendor/encode_unicode/src/utf16_iterators.rs') diff --git a/vendor/encode_unicode/src/utf16_iterators.rs b/vendor/encode_unicode/src/utf16_iterators.rs new file mode 100644 index 0000000..7adb5ac --- /dev/null +++ b/vendor/encode_unicode/src/utf16_iterators.rs @@ -0,0 +1,270 @@ +/* Copyright 2016 The encode_unicode Developers + * + * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be + * copied, modified, or distributed except according to those terms. + */ + +use traits::CharExt; +use utf16_char::Utf16Char; +use errors::EmptyStrError; +extern crate core; +use self::core::fmt; +use self::core::borrow::Borrow; + +// Invalid values that says the field is consumed or empty. +const FIRST_USED: u16 = 0x_dc_00; +const SECOND_USED: u16 = 0; + +/// Iterate over the units of the UTF-16 representation of a codepoint. +#[derive(Clone)] +pub struct Utf16Iterator { + first: u16, + second: u16, +} +impl From for Utf16Iterator { + fn from(c: char) -> Self { + let (first, second) = c.to_utf16_tuple(); + Utf16Iterator{ first: first, second: second.unwrap_or(SECOND_USED) } + } +} +impl From for Utf16Iterator { + fn from(uc: Utf16Char) -> Self { + let (first, second) = uc.to_tuple(); + Utf16Iterator{ first: first, second: second.unwrap_or(SECOND_USED) } + } +} +impl Iterator for Utf16Iterator { + type Item=u16; + fn next(&mut self) -> Option { + match (self.first, self.second) { + (FIRST_USED, SECOND_USED) => { None }, + (FIRST_USED, second ) => {self.second = SECOND_USED; Some(second)}, + (first , _ ) => {self.first = FIRST_USED; Some(first )}, + } + } + fn size_hint(&self) -> (usize, Option) { + (self.len(), Some(self.len())) + } +} +impl ExactSizeIterator for Utf16Iterator { + fn len(&self) -> usize { + (if self.first == FIRST_USED {0} else {1}) + + (if self.second == SECOND_USED {0} else {1}) + } +} +impl fmt::Debug for Utf16Iterator { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + let mut clone = self.clone(); + match (clone.next(), clone.next()) { + (Some(one), None) => write!(fmtr, "[{}]", one), + (Some(a), Some(b)) => write!(fmtr, "[{}, {}]", a, b), + (None, _) => write!(fmtr, "[]"), + } + } +} + + + +/// Converts an iterator of `Utf16Char` (or `&Utf16Char`) +/// to an iterator of `u16`s. +/// Is equivalent to calling `.flat_map()` on the original iterator, +/// but the returned iterator is about twice as fast. +/// +/// The exact number of units cannot be known in advance, but `size_hint()` +/// gives the possible range. +/// +/// # Examples +/// +/// From iterator of values: +/// +/// ``` +/// use encode_unicode::{iter_units, CharExt}; +/// +/// let iterator = "foo".chars().map(|c| c.to_utf16() ); +/// let mut units = [0; 4]; +/// for (u,dst) in iter_units(iterator).zip(&mut units) {*dst=u;} +/// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]); +/// ``` +/// +/// From iterator of references: +/// +#[cfg_attr(feature="std", doc=" ```")] +#[cfg_attr(not(feature="std"), doc=" ```no_compile")] +/// use encode_unicode::{iter_units, CharExt, Utf16Char}; +/// +/// // (💣 takes two units) +/// let chars: Vec = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect(); +/// let units: Vec = iter_units(&chars).collect(); +/// let flat_map: Vec = chars.iter().flat_map(|u16c| *u16c ).collect(); +/// assert_eq!(units, flat_map); +/// ``` +pub fn iter_units, I:IntoIterator> +(iterable: I) -> Utf16CharSplitter { + Utf16CharSplitter{ inner: iterable.into_iter(), prev_second: 0 } +} + +/// The iterator type returned by `iter_units()` +#[derive(Clone)] +pub struct Utf16CharSplitter, I:Iterator> { + inner: I, + prev_second: u16, +} +impl> From for Utf16CharSplitter { + /// A less generic constructor than `iter_units()` + fn from(iter: I) -> Self { + iter_units(iter) + } +} +impl, I:Iterator> Utf16CharSplitter { + /// Extracts the source iterator. + /// + /// Note that `iter_units(iter.into_inner())` is not a no-op: + /// If the last returned unit from `next()` was a leading surrogate, + /// the trailing surrogate is lost. + pub fn into_inner(self) -> I { + self.inner + } +} +impl, I:Iterator> Iterator for Utf16CharSplitter { + type Item = u16; + fn next(&mut self) -> Option { + if self.prev_second == 0 { + self.inner.next().map(|u16c| { + let units = u16c.borrow().to_array(); + self.prev_second = units[1]; + units[0] + }) + } else { + let prev_second = self.prev_second; + self.prev_second = 0; + Some(prev_second) + } + } + fn size_hint(&self) -> (usize,Option) { + // Doesn't need to handle unlikely overflows correctly because + // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) + let (min, max) = self.inner.size_hint(); + let add = if self.prev_second == 0 {0} else {1}; + (min.wrapping_add(add), max.map(|max| max.wrapping_mul(2).wrapping_add(add) )) + } +} + + + +/// An iterator over the codepoints in a `str` represented as `Utf16Char`. +#[derive(Clone)] +pub struct Utf16CharIndices<'a>{ + str: &'a str, + index: usize, +} +impl<'a> From<&'a str> for Utf16CharIndices<'a> { + fn from(s: &str) -> Utf16CharIndices { + Utf16CharIndices{str: s, index: 0} + } +} +impl<'a> Utf16CharIndices<'a> { + /// Extract the remainder of the source `str`. + /// + /// # Examples + /// + /// ``` + /// use encode_unicode::{StrExt, Utf16Char}; + /// let mut iter = "abc".utf16char_indices(); + /// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c')))); + /// assert_eq!(iter.next(), Some((0, Utf16Char::from('a')))); + /// assert_eq!(iter.as_str(), "b"); + /// ``` + pub fn as_str(&self) -> &'a str { + &self.str[self.index..] + } +} +impl<'a> Iterator for Utf16CharIndices<'a> { + type Item = (usize,Utf16Char); + fn next(&mut self) -> Option<(usize,Utf16Char)> { + match Utf16Char::from_str_start(&self.str[self.index..]) { + Ok((u16c, bytes)) => { + let item = (self.index, u16c); + self.index += bytes; + Some(item) + }, + Err(EmptyStrError) => None + } + } + fn size_hint(&self) -> (usize,Option) { + let len = self.str.len() - self.index; + // For len+3 to overflow, the slice must fill all but two bytes of + // addressable memory, and size_hint() doesn't need to be correct. + (len.wrapping_add(3)/4, Some(len)) + } +} +impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> { + fn next_back(&mut self) -> Option<(usize,Utf16Char)> { + if self.index < self.str.len() { + let rev = self.str.bytes().rev(); + let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); + let starts = self.str.len() - len; + let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap(); + self.str = &self.str[..starts]; + Some((starts, u16c)) + } else { + None + } + } +} +impl<'a> fmt::Debug for Utf16CharIndices<'a> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + fmtr.debug_tuple("Utf16CharIndices") + .field(&self.index) + .field(&self.as_str()) + .finish() + } +} + + +/// An iterator over the codepoints in a `str` represented as `Utf16Char`. +#[derive(Clone)] +pub struct Utf16Chars<'a>(Utf16CharIndices<'a>); +impl<'a> From<&'a str> for Utf16Chars<'a> { + fn from(s: &str) -> Utf16Chars { + Utf16Chars(Utf16CharIndices::from(s)) + } +} +impl<'a> Utf16Chars<'a> { + /// Extract the remainder of the source `str`. + /// + /// # Examples + /// + /// ``` + /// use encode_unicode::{StrExt, Utf16Char}; + /// let mut iter = "abc".utf16chars(); + /// assert_eq!(iter.next(), Some(Utf16Char::from('a'))); + /// assert_eq!(iter.next_back(), Some(Utf16Char::from('c'))); + /// assert_eq!(iter.as_str(), "b"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.0.as_str() + } +} +impl<'a> Iterator for Utf16Chars<'a> { + type Item = Utf16Char; + fn next(&mut self) -> Option { + self.0.next().map(|(_,u16c)| u16c ) + } + fn size_hint(&self) -> (usize,Option) { + self.0.size_hint() + } +} +impl<'a> DoubleEndedIterator for Utf16Chars<'a> { + fn next_back(&mut self) -> Option { + self.0.next_back().map(|(_,u16c)| u16c ) + } +} +impl<'a> fmt::Debug for Utf16Chars<'a> { + fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { + fmtr.debug_tuple("Utf16Chars") + .field(&self.as_str()) + .finish() + } +} -- cgit v1.2.3