diff options
Diffstat (limited to 'vendor/textwrap/src/word_separators.rs')
-rw-r--r-- | vendor/textwrap/src/word_separators.rs | 428 |
1 files changed, 0 insertions, 428 deletions
diff --git a/vendor/textwrap/src/word_separators.rs b/vendor/textwrap/src/word_separators.rs deleted file mode 100644 index 25adf31..0000000 --- a/vendor/textwrap/src/word_separators.rs +++ /dev/null @@ -1,428 +0,0 @@ -//! Functionality for finding words. -//! -//! In order to wrap text, we need to know where the legal break -//! points are, i.e., where the words of the text are. This means that -//! we need to define what a "word" is. -//! -//! A simple approach is to simply split the text on whitespace, but -//! this does not work for East-Asian languages such as Chinese or -//! Japanese where there are no spaces between words. Breaking a long -//! sequence of emojis is another example where line breaks might be -//! wanted even if there are no whitespace to be found. -//! -//! The [`WordSeparator`] trait is responsible for determining where -//! there words are in a line of text. Please refer to the trait and -//! the structs which implement it for more information. - -#[cfg(feature = "unicode-linebreak")] -use crate::core::skip_ansi_escape_sequence; -use crate::core::Word; - -/// Describes where words occur in a line of text. -/// -/// The simplest approach is say that words are separated by one or -/// more ASCII spaces (`' '`). This works for Western languages -/// without emojis. A more complex approach is to use the Unicode line -/// breaking algorithm, which finds break points in non-ASCII text. -/// -/// The line breaks occur between words, please see -/// [`WordSplitter`](crate::WordSplitter) for options of how to handle -/// hyphenation of individual words. -/// -/// # Examples -/// -/// ``` -/// use textwrap::core::Word; -/// use textwrap::WordSeparator::AsciiSpace; -/// -/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>(); -/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]); -/// ``` -#[derive(Clone, Copy)] -pub enum WordSeparator { - /// Find words by splitting on runs of `' '` characters. - /// - /// # Examples - /// - /// ``` - /// use textwrap::core::Word; - /// use textwrap::WordSeparator::AsciiSpace; - /// - /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>(); - /// assert_eq!(words, vec![Word::from("Hello "), - /// Word::from("World!")]); - /// ``` - AsciiSpace, - - /// Split `line` into words using Unicode break properties. - /// - /// This word separator uses the Unicode line breaking algorithm - /// described in [Unicode Standard Annex - /// #14](https://www.unicode.org/reports/tr14/) to find legal places - /// to break lines. There is a small difference in that the U+002D - /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break: - /// to allow a line break at a hyphen, use - /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter). - /// Soft hyphens are not currently supported. - /// - /// # Examples - /// - /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line - /// breaking algorithm will find line break opportunities between - /// some characters with no intervening whitespace: - /// - /// ``` - /// #[cfg(feature = "unicode-linebreak")] { - /// use textwrap::core::Word; - /// use textwrap::WordSeparator::UnicodeBreakProperties; - /// - /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(), - /// vec![Word::from("Emojis: "), - /// Word::from("😂"), - /// Word::from("😍")]); - /// - /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(), - /// vec![Word::from("CJK: "), - /// Word::from("你"), - /// Word::from("好")]); - /// } - /// ``` - /// - /// A U+2060 (Word Joiner) character can be inserted if you want to - /// manually override the defaults and keep the characters together: - /// - /// ``` - /// #[cfg(feature = "unicode-linebreak")] { - /// use textwrap::core::Word; - /// use textwrap::WordSeparator::UnicodeBreakProperties; - /// - /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(), - /// vec![Word::from("Emojis: "), - /// Word::from("😂\u{2060}😍")]); - /// } - /// ``` - /// - /// The Unicode line breaking algorithm will also automatically - /// suppress break breaks around certain punctuation characters:: - /// - /// ``` - /// #[cfg(feature = "unicode-linebreak")] { - /// use textwrap::core::Word; - /// use textwrap::WordSeparator::UnicodeBreakProperties; - /// - /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(), - /// vec![Word::from("[ foo ] "), - /// Word::from("bar !")]); - /// } - /// ``` - #[cfg(feature = "unicode-linebreak")] - UnicodeBreakProperties, - - /// Find words using a custom word separator - Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>), -} - -impl std::fmt::Debug for WordSeparator { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - WordSeparator::AsciiSpace => f.write_str("AsciiSpace"), - #[cfg(feature = "unicode-linebreak")] - WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"), - WordSeparator::Custom(_) => f.write_str("Custom(...)"), - } - } -} - -impl WordSeparator { - // This function should really return impl Iterator<Item = Word>, but - // this isn't possible until Rust supports higher-kinded types: - // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md - /// Find all words in `line`. - pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { - match self { - WordSeparator::AsciiSpace => find_words_ascii_space(line), - #[cfg(feature = "unicode-linebreak")] - WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line), - WordSeparator::Custom(func) => func(line), - } - } -} - -fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { - let mut start = 0; - let mut in_whitespace = false; - let mut char_indices = line.char_indices(); - - Box::new(std::iter::from_fn(move || { - // for (idx, ch) in char_indices does not work, gives this - // error: - // - // > cannot move out of `char_indices`, a captured variable in - // > an `FnMut` closure - #[allow(clippy::while_let_on_iterator)] - while let Some((idx, ch)) = char_indices.next() { - if in_whitespace && ch != ' ' { - let word = Word::from(&line[start..idx]); - start = idx; - in_whitespace = ch == ' '; - return Some(word); - } - - in_whitespace = ch == ' '; - } - - if start < line.len() { - let word = Word::from(&line[start..]); - start = line.len(); - return Some(word); - } - - None - })) -} - -// Strip all ANSI escape sequences from `text`. -#[cfg(feature = "unicode-linebreak")] -fn strip_ansi_escape_sequences(text: &str) -> String { - let mut result = String::with_capacity(text.len()); - - let mut chars = text.chars(); - while let Some(ch) = chars.next() { - if skip_ansi_escape_sequence(ch, &mut chars) { - continue; - } - result.push(ch); - } - - result -} - -/// Soft hyphen, also knows as a “shy hyphen”. Should show up as ‘-’ -/// if a line is broken at this point, and otherwise be invisible. -/// Textwrap does not currently support breaking words at soft -/// hyphens. -#[cfg(feature = "unicode-linebreak")] -const SHY: char = '\u{00ad}'; - -/// Find words in line. ANSI escape sequences are ignored in `line`. -#[cfg(feature = "unicode-linebreak")] -fn find_words_unicode_break_properties<'a>( - line: &'a str, -) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { - // Construct an iterator over (original index, stripped index) - // tuples. We find the Unicode linebreaks on a stripped string, - // but we need the original indices so we can form words based on - // the original string. - let mut last_stripped_idx = 0; - let mut char_indices = line.char_indices(); - let mut idx_map = std::iter::from_fn(move || match char_indices.next() { - Some((orig_idx, ch)) => { - let stripped_idx = last_stripped_idx; - if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) { - last_stripped_idx += ch.len_utf8(); - } - Some((orig_idx, stripped_idx)) - } - None => None, - }); - - let stripped = strip_ansi_escape_sequences(line); - let mut opportunities = unicode_linebreak::linebreaks(&stripped) - .filter(|(idx, _)| { - #[allow(clippy::match_like_matches_macro)] - match &stripped[..*idx].chars().next_back() { - // We suppress breaks at ‘-’ since we want to control - // this via the WordSplitter. - Some('-') => false, - // Soft hyphens are currently not supported since we - // require all `Word` fragments to be continuous in - // the input string. - Some(SHY) => false, - // Other breaks should be fine! - _ => true, - } - }) - .collect::<Vec<_>>() - .into_iter(); - - // Remove final break opportunity, we will add it below using - // &line[start..]; This ensures that we correctly include a - // trailing ANSI escape sequence. - opportunities.next_back(); - - let mut start = 0; - Box::new(std::iter::from_fn(move || { - #[allow(clippy::while_let_on_iterator)] - while let Some((idx, _)) = opportunities.next() { - if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) { - let word = Word::from(&line[start..orig_idx]); - start = orig_idx; - return Some(word); - } - } - - if start < line.len() { - let word = Word::from(&line[start..]); - start = line.len(); - return Some(word); - } - - None - })) -} - -#[cfg(test)] -mod tests { - use super::WordSeparator::*; - use super::*; - - // Like assert_eq!, but the left expression is an iterator. - macro_rules! assert_iter_eq { - ($left:expr, $right:expr) => { - assert_eq!($left.collect::<Vec<_>>(), $right); - }; - } - - fn to_words<'a>(words: Vec<&'a str>) -> Vec<Word<'a>> { - words.into_iter().map(|w: &str| Word::from(&w)).collect() - } - - macro_rules! test_find_words { - ($ascii_name:ident, - $unicode_name:ident, - $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => { - #[test] - fn $ascii_name() { - $( - let expected_words = to_words($ascii_words.to_vec()); - let actual_words = WordSeparator::AsciiSpace - .find_words($line) - .collect::<Vec<_>>(); - assert_eq!(actual_words, expected_words, "Line: {:?}", $line); - )+ - } - - #[test] - #[cfg(feature = "unicode-linebreak")] - fn $unicode_name() { - $( - let expected_words = to_words($unicode_words.to_vec()); - let actual_words = WordSeparator::UnicodeBreakProperties - .find_words($line) - .collect::<Vec<_>>(); - assert_eq!(actual_words, expected_words, "Line: {:?}", $line); - )+ - } - }; - } - - test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]); - - test_find_words!( - ascii_single_word, - unicode_single_word, - ["foo", ["foo"], ["foo"]] - ); - - test_find_words!( - ascii_two_words, - unicode_two_words, - ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] - ); - - test_find_words!( - ascii_multiple_words, - unicode_multiple_words, - ["foo bar", ["foo ", "bar"], ["foo ", "bar"]], - ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]] - ); - - test_find_words!( - ascii_only_whitespace, - unicode_only_whitespace, - [" ", [" "], [" "]], - [" ", [" "], [" "]] - ); - - test_find_words!( - ascii_inter_word_whitespace, - unicode_inter_word_whitespace, - ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] - ); - - test_find_words!( - ascii_trailing_whitespace, - unicode_trailing_whitespace, - ["foo ", ["foo "], ["foo "]] - ); - - test_find_words!( - ascii_leading_whitespace, - unicode_leading_whitespace, - [" foo", [" ", "foo"], [" ", "foo"]] - ); - - test_find_words!( - ascii_multi_column_char, - unicode_multi_column_char, - ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠 - ); - - test_find_words!( - ascii_hyphens, - unicode_hyphens, - ["foo-bar", ["foo-bar"], ["foo-bar"]], - ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]], - ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]], - ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]] - ); - - test_find_words!( - ascii_newline, - unicode_newline, - ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]] - ); - - test_find_words!( - ascii_tab, - unicode_tab, - ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]] - ); - - test_find_words!( - ascii_non_breaking_space, - unicode_non_breaking_space, - ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]] - ); - - #[test] - #[cfg(unix)] - fn find_words_colored_text() { - use termion::color::{Blue, Fg, Green, Reset}; - - let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset)); - let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset)); - assert_iter_eq!( - AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)), - vec![Word::from(&green_hello), Word::from(&blue_world)] - ); - - #[cfg(feature = "unicode-linebreak")] - assert_iter_eq!( - UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)), - vec![Word::from(&green_hello), Word::from(&blue_world)] - ); - } - - #[test] - fn find_words_color_inside_word() { - let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz"; - assert_iter_eq!(AsciiSpace.find_words(&text), vec![Word::from(text)]); - - #[cfg(feature = "unicode-linebreak")] - assert_iter_eq!( - UnicodeBreakProperties.find_words(&text), - vec![Word::from(text)] - ); - } -} |