/* Copyright 2016 The encode_unicode Developers * * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be * copied, modified, or distributed except according to those terms. */ use traits::CharExt; use utf16_char::Utf16Char; use errors::EmptyStrError; extern crate core; use self::core::fmt; use self::core::borrow::Borrow; // Invalid values that says the field is consumed or empty. const FIRST_USED: u16 = 0x_dc_00; const SECOND_USED: u16 = 0; /// Iterate over the units of the UTF-16 representation of a codepoint. #[derive(Clone)] pub struct Utf16Iterator { first: u16, second: u16, } impl From for Utf16Iterator { fn from(c: char) -> Self { let (first, second) = c.to_utf16_tuple(); Utf16Iterator{ first: first, second: second.unwrap_or(SECOND_USED) } } } impl From for Utf16Iterator { fn from(uc: Utf16Char) -> Self { let (first, second) = uc.to_tuple(); Utf16Iterator{ first: first, second: second.unwrap_or(SECOND_USED) } } } impl Iterator for Utf16Iterator { type Item=u16; fn next(&mut self) -> Option { match (self.first, self.second) { (FIRST_USED, SECOND_USED) => { None }, (FIRST_USED, second ) => {self.second = SECOND_USED; Some(second)}, (first , _ ) => {self.first = FIRST_USED; Some(first )}, } } fn size_hint(&self) -> (usize, Option) { (self.len(), Some(self.len())) } } impl ExactSizeIterator for Utf16Iterator { fn len(&self) -> usize { (if self.first == FIRST_USED {0} else {1}) + (if self.second == SECOND_USED {0} else {1}) } } impl fmt::Debug for Utf16Iterator { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { let mut clone = self.clone(); match (clone.next(), clone.next()) { (Some(one), None) => write!(fmtr, "[{}]", one), (Some(a), Some(b)) => write!(fmtr, "[{}, {}]", a, b), (None, _) => write!(fmtr, "[]"), } } } /// Converts an iterator of `Utf16Char` (or `&Utf16Char`) /// to an iterator of `u16`s. /// Is equivalent to calling `.flat_map()` on the original iterator, /// but the returned iterator is about twice as fast. /// /// The exact number of units cannot be known in advance, but `size_hint()` /// gives the possible range. /// /// # Examples /// /// From iterator of values: /// /// ``` /// use encode_unicode::{iter_units, CharExt}; /// /// let iterator = "foo".chars().map(|c| c.to_utf16() ); /// let mut units = [0; 4]; /// for (u,dst) in iter_units(iterator).zip(&mut units) {*dst=u;} /// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]); /// ``` /// /// From iterator of references: /// #[cfg_attr(feature="std", doc=" ```")] #[cfg_attr(not(feature="std"), doc=" ```no_compile")] /// use encode_unicode::{iter_units, CharExt, Utf16Char}; /// /// // (💣 takes two units) /// let chars: Vec = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect(); /// let units: Vec = iter_units(&chars).collect(); /// let flat_map: Vec = chars.iter().flat_map(|u16c| *u16c ).collect(); /// assert_eq!(units, flat_map); /// ``` pub fn iter_units, I:IntoIterator> (iterable: I) -> Utf16CharSplitter { Utf16CharSplitter{ inner: iterable.into_iter(), prev_second: 0 } } /// The iterator type returned by `iter_units()` #[derive(Clone)] pub struct Utf16CharSplitter, I:Iterator> { inner: I, prev_second: u16, } impl> From for Utf16CharSplitter { /// A less generic constructor than `iter_units()` fn from(iter: I) -> Self { iter_units(iter) } } impl, I:Iterator> Utf16CharSplitter { /// Extracts the source iterator. /// /// Note that `iter_units(iter.into_inner())` is not a no-op: /// If the last returned unit from `next()` was a leading surrogate, /// the trailing surrogate is lost. pub fn into_inner(self) -> I { self.inner } } impl, I:Iterator> Iterator for Utf16CharSplitter { type Item = u16; fn next(&mut self) -> Option { if self.prev_second == 0 { self.inner.next().map(|u16c| { let units = u16c.borrow().to_array(); self.prev_second = units[1]; units[0] }) } else { let prev_second = self.prev_second; self.prev_second = 0; Some(prev_second) } } fn size_hint(&self) -> (usize,Option) { // Doesn't need to handle unlikely overflows correctly because // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) let (min, max) = self.inner.size_hint(); let add = if self.prev_second == 0 {0} else {1}; (min.wrapping_add(add), max.map(|max| max.wrapping_mul(2).wrapping_add(add) )) } } /// An iterator over the codepoints in a `str` represented as `Utf16Char`. #[derive(Clone)] pub struct Utf16CharIndices<'a>{ str: &'a str, index: usize, } impl<'a> From<&'a str> for Utf16CharIndices<'a> { fn from(s: &str) -> Utf16CharIndices { Utf16CharIndices{str: s, index: 0} } } impl<'a> Utf16CharIndices<'a> { /// Extract the remainder of the source `str`. /// /// # Examples /// /// ``` /// use encode_unicode::{StrExt, Utf16Char}; /// let mut iter = "abc".utf16char_indices(); /// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c')))); /// assert_eq!(iter.next(), Some((0, Utf16Char::from('a')))); /// assert_eq!(iter.as_str(), "b"); /// ``` pub fn as_str(&self) -> &'a str { &self.str[self.index..] } } impl<'a> Iterator for Utf16CharIndices<'a> { type Item = (usize,Utf16Char); fn next(&mut self) -> Option<(usize,Utf16Char)> { match Utf16Char::from_str_start(&self.str[self.index..]) { Ok((u16c, bytes)) => { let item = (self.index, u16c); self.index += bytes; Some(item) }, Err(EmptyStrError) => None } } fn size_hint(&self) -> (usize,Option) { let len = self.str.len() - self.index; // For len+3 to overflow, the slice must fill all but two bytes of // addressable memory, and size_hint() doesn't need to be correct. (len.wrapping_add(3)/4, Some(len)) } } impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> { fn next_back(&mut self) -> Option<(usize,Utf16Char)> { if self.index < self.str.len() { let rev = self.str.bytes().rev(); let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); let starts = self.str.len() - len; let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap(); self.str = &self.str[..starts]; Some((starts, u16c)) } else { None } } } impl<'a> fmt::Debug for Utf16CharIndices<'a> { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { fmtr.debug_tuple("Utf16CharIndices") .field(&self.index) .field(&self.as_str()) .finish() } } /// An iterator over the codepoints in a `str` represented as `Utf16Char`. #[derive(Clone)] pub struct Utf16Chars<'a>(Utf16CharIndices<'a>); impl<'a> From<&'a str> for Utf16Chars<'a> { fn from(s: &str) -> Utf16Chars { Utf16Chars(Utf16CharIndices::from(s)) } } impl<'a> Utf16Chars<'a> { /// Extract the remainder of the source `str`. /// /// # Examples /// /// ``` /// use encode_unicode::{StrExt, Utf16Char}; /// let mut iter = "abc".utf16chars(); /// assert_eq!(iter.next(), Some(Utf16Char::from('a'))); /// assert_eq!(iter.next_back(), Some(Utf16Char::from('c'))); /// assert_eq!(iter.as_str(), "b"); /// ``` pub fn as_str(&self) -> &'a str { self.0.as_str() } } impl<'a> Iterator for Utf16Chars<'a> { type Item = Utf16Char; fn next(&mut self) -> Option { self.0.next().map(|(_,u16c)| u16c ) } fn size_hint(&self) -> (usize,Option) { self.0.size_hint() } } impl<'a> DoubleEndedIterator for Utf16Chars<'a> { fn next_back(&mut self) -> Option { self.0.next_back().map(|(_,u16c)| u16c ) } } impl<'a> fmt::Debug for Utf16Chars<'a> { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { fmtr.debug_tuple("Utf16Chars") .field(&self.as_str()) .finish() } }