/* Copyright 2016 The encode_unicode Developers * * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be * copied, modified, or distributed except according to those terms. */ use utf8_char::Utf8Char; use errors::EmptyStrError; extern crate core; use self::core::{mem, u32, u64}; use self::core::ops::Not; use self::core::fmt; use self::core::borrow::Borrow; #[cfg(feature="std")] use std::io::{Read, Error as ioError}; /// Read or iterate over the bytes of the UTF-8 representation of a codepoint. #[derive(Clone)] pub struct Utf8Iterator (u32); impl From for Utf8Iterator { fn from(uc: Utf8Char) -> Self { let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) }); // uses u64 because shifting an u32 by 32 bits is a no-op. let unused_set = (u64::MAX << uc.len() as u64*8) as u32; Utf8Iterator(used | unused_set) } } impl From for Utf8Iterator { fn from(c: char) -> Self { Self::from(Utf8Char::from(c)) } } impl Iterator for Utf8Iterator { type Item=u8; fn next(&mut self) -> Option { let next = self.0 as u8; if next == 0xff { None } else { self.0 = (self.0 >> 8) | 0xff_00_00_00; Some(next) } } fn size_hint(&self) -> (usize, Option) { (self.len(), Some(self.len())) } } impl ExactSizeIterator for Utf8Iterator { fn len(&self) -> usize {// not straightforward, but possible let unused_bytes = self.0.not().leading_zeros() / 8; 4 - unused_bytes as usize } } #[cfg(feature="std")] impl Read for Utf8Iterator { /// Always returns Ok fn read(&mut self, buf: &mut[u8]) -> Result { // Cannot call self.next() until I know I can write the result. for (i, dst) in buf.iter_mut().enumerate() { match self.next() { Some(b) => *dst = b, None => return Ok(i), } } Ok(buf.len()) } } impl fmt::Debug for Utf8Iterator { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { let mut content = [0; 4]; let mut i = 0; for b in self.clone() { content[i] = b; i += 1; } write!(fmtr, "{:?}", &content[..i]) } } /// Converts an iterator of `Utf8Char` (or `&Utf8Char`) /// to an iterator of `u8`s. /// Is equivalent to calling `.flat_map()` on the original iterator, /// but the returned iterator is ~40% faster. /// /// The iterator also implements `Read` (if the `std` feature isn't disabled). /// Reading will never produce an error, and calls to `.read()` and `.next()` /// can be mixed. /// /// The exact number of bytes cannot be known in advance, but `size_hint()` /// gives the possible range. /// (min: all remaining characters are ASCII, max: all require four bytes) /// /// # Examples /// /// From iterator of values: /// /// ``` /// use encode_unicode::{iter_bytes, CharExt}; /// /// let iterator = "foo".chars().map(|c| c.to_utf8() ); /// let mut bytes = [0; 4]; /// for (u,dst) in iter_bytes(iterator).zip(&mut bytes) {*dst=u;} /// assert_eq!(&bytes, b"foo\0"); /// ``` /// /// From iterator of references: /// #[cfg_attr(feature="std", doc=" ```")] #[cfg_attr(not(feature="std"), doc=" ```no_compile")] /// use encode_unicode::{iter_bytes, CharExt, Utf8Char}; /// /// let chars: Vec = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect(); /// let bytes: Vec = iter_bytes(&chars).collect(); /// let flat_map: Vec = chars.iter().flat_map(|u8c| *u8c ).collect(); /// assert_eq!(bytes, flat_map); /// ``` /// /// `Read`ing from it: /// #[cfg_attr(feature="std", doc=" ```")] #[cfg_attr(not(feature="std"), doc=" ```no_compile")] /// use encode_unicode::{iter_bytes, CharExt}; /// use std::io::Read; /// /// let s = "Ååh‽"; /// assert_eq!(s.len(), 8); /// let mut buf = [b'E'; 9]; /// let mut reader = iter_bytes(s.chars().map(|c| c.to_utf8() )); /// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8); /// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0); /// assert_eq!(&buf[..8], s.as_bytes()); /// assert_eq!(buf[8], b'E'); /// ``` pub fn iter_bytes, I:IntoIterator> (iterable: I) -> Utf8CharSplitter { Utf8CharSplitter{ inner: iterable.into_iter(), prev: 0 } } /// The iterator type returned by `iter_bytes()` /// /// See its documentation for details. #[derive(Clone)] pub struct Utf8CharSplitter, I:Iterator> { inner: I, prev: u32, } impl> From for Utf8CharSplitter { /// A less generic constructor than `iter_bytes()` fn from(iter: I) -> Self { iter_bytes(iter) } } impl, I:Iterator> Utf8CharSplitter { /// Extracts the source iterator. /// /// Note that `iter_bytes(iter.into_inner())` is not a no-op: /// If the last returned byte from `next()` was not an ASCII by, /// the remaining bytes of that codepoint is lost. pub fn into_inner(self) -> I { self.inner } } impl, I:Iterator> Iterator for Utf8CharSplitter { type Item = u8; fn next(&mut self) -> Option { if self.prev == 0 { self.inner.next().map(|u8c| { let array = u8c.borrow().to_array().0; self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8; array[0] }) } else { let next = self.prev as u8; self.prev >>= 8; Some(next) } } fn size_hint(&self) -> (usize,Option) { // Doesn't need to handle unlikely overflows correctly because // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) let (min, max) = self.inner.size_hint(); let add = 4 - (self.prev.leading_zeros() / 8) as usize; (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) )) } } #[cfg(feature="std")] impl, I:Iterator> Read for Utf8CharSplitter { /// Always returns `Ok` fn read(&mut self, buf: &mut[u8]) -> Result { let mut i = 0; // write remaining bytes of previous codepoint while self.prev != 0 && i < buf.len() { buf[i] = self.prev as u8; self.prev >>= 8; i += 1; } // write whole characters while i < buf.len() { let bytes = match self.inner.next() { Some(u8c) => u8c.borrow().to_array().0, None => break }; buf[i] = bytes[0]; i += 1; if bytes[1] != 0 { let len = bytes[0].not().leading_zeros() as usize; let mut written = 1; while written < len { if i < buf.len() { buf[i] = bytes[written]; i += 1; written += 1; } else { let bytes_as_u32 = unsafe{ u32::from_le(mem::transmute(bytes)) }; self.prev = bytes_as_u32 >> (8*written); return Ok(i); } } } } Ok(i) } } /// An iterator over the `Utf8Char` of a string slice, and their positions. /// /// This struct is created by the `utf8char_indices() method from [`StrExt`] trait. See its documentation for more. #[derive(Clone)] pub struct Utf8CharIndices<'a>{ str: &'a str, index: usize, } impl<'a> From<&'a str> for Utf8CharIndices<'a> { fn from(s: &str) -> Utf8CharIndices { Utf8CharIndices{str: s, index: 0} } } impl<'a> Utf8CharIndices<'a> { /// Extract the remainder of the source `str`. /// /// # Examples /// /// ``` /// use encode_unicode::{StrExt, Utf8Char}; /// let mut iter = "abc".utf8char_indices(); /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c')))); /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a')))); /// assert_eq!(iter.as_str(), "b"); /// ``` pub fn as_str(&self) -> &'a str { &self.str[self.index..] } } impl<'a> Iterator for Utf8CharIndices<'a> { type Item = (usize,Utf8Char); fn next(&mut self) -> Option<(usize,Utf8Char)> { match Utf8Char::from_str_start(&self.str[self.index..]) { Ok((u8c, len)) => { let item = (self.index, u8c); self.index += len; Some(item) }, Err(EmptyStrError) => None } } fn size_hint(&self) -> (usize,Option) { let len = self.str.len() - self.index; // For len+3 to overflow, the slice must fill all but two bytes of // addressable memory, and size_hint() doesn't need to be correct. (len.wrapping_add(3)/4, Some(len)) } } impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> { fn next_back(&mut self) -> Option<(usize,Utf8Char)> { // Cannot refactor out the unwrap without switching to ::from_slice() // since slicing the str panics if not on a boundary. if self.index < self.str.len() { let rev = self.str.bytes().rev(); let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); let starts = self.str.len() - len; let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap(); self.str = &self.str[..starts]; Some((starts, u8c)) } else { None } } } impl<'a> fmt::Debug for Utf8CharIndices<'a> { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { fmtr.debug_tuple("Utf8CharIndices") .field(&self.index) .field(&self.as_str()) .finish() } } /// An iterator over the codepoints in a `str` represented as `Utf8Char`. #[derive(Clone)] pub struct Utf8Chars<'a>(Utf8CharIndices<'a>); impl<'a> From<&'a str> for Utf8Chars<'a> { fn from(s: &str) -> Utf8Chars { Utf8Chars(Utf8CharIndices::from(s)) } } impl<'a> Utf8Chars<'a> { /// Extract the remainder of the source `str`. /// /// # Examples /// /// ``` /// use encode_unicode::{StrExt, Utf8Char}; /// let mut iter = "abc".utf8chars(); /// assert_eq!(iter.next(), Some(Utf8Char::from('a'))); /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c'))); /// assert_eq!(iter.as_str(), "b"); /// ``` pub fn as_str(&self) -> &'a str { self.0.as_str() } } impl<'a> Iterator for Utf8Chars<'a> { type Item = Utf8Char; fn next(&mut self) -> Option { self.0.next().map(|(_,u8c)| u8c ) } fn size_hint(&self) -> (usize,Option) { self.0.size_hint() } } impl<'a> DoubleEndedIterator for Utf8Chars<'a> { fn next_back(&mut self) -> Option { self.0.next_back().map(|(_,u8c)| u8c ) } } impl<'a> fmt::Debug for Utf8Chars<'a> { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { fmtr.debug_tuple("Utf8CharIndices") .field(&self.as_str()) .finish() } }