diff options
Diffstat (limited to 'vendor/textwrap/src/word_separators.rs')
-rw-r--r-- | vendor/textwrap/src/word_separators.rs | 428 |
1 files changed, 428 insertions, 0 deletions
diff --git a/vendor/textwrap/src/word_separators.rs b/vendor/textwrap/src/word_separators.rs new file mode 100644 index 0000000..25adf31 --- /dev/null +++ b/vendor/textwrap/src/word_separators.rs @@ -0,0 +1,428 @@ +//! Functionality for finding words. +//! +//! In order to wrap text, we need to know where the legal break +//! points are, i.e., where the words of the text are. This means that +//! we need to define what a "word" is. +//! +//! A simple approach is to simply split the text on whitespace, but +//! this does not work for East-Asian languages such as Chinese or +//! Japanese where there are no spaces between words. Breaking a long +//! sequence of emojis is another example where line breaks might be +//! wanted even if there are no whitespace to be found. +//! +//! The [`WordSeparator`] trait is responsible for determining where +//! there words are in a line of text. Please refer to the trait and +//! the structs which implement it for more information. + +#[cfg(feature = "unicode-linebreak")] +use crate::core::skip_ansi_escape_sequence; +use crate::core::Word; + +/// Describes where words occur in a line of text. +/// +/// The simplest approach is say that words are separated by one or +/// more ASCII spaces (`' '`). This works for Western languages +/// without emojis. A more complex approach is to use the Unicode line +/// breaking algorithm, which finds break points in non-ASCII text. +/// +/// The line breaks occur between words, please see +/// [`WordSplitter`](crate::WordSplitter) for options of how to handle +/// hyphenation of individual words. +/// +/// # Examples +/// +/// ``` +/// use textwrap::core::Word; +/// use textwrap::WordSeparator::AsciiSpace; +/// +/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>(); +/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]); +/// ``` +#[derive(Clone, Copy)] +pub enum WordSeparator { + /// Find words by splitting on runs of `' '` characters. + /// + /// # Examples + /// + /// ``` + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::AsciiSpace; + /// + /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>(); + /// assert_eq!(words, vec![Word::from("Hello "), + /// Word::from("World!")]); + /// ``` + AsciiSpace, + + /// Split `line` into words using Unicode break properties. + /// + /// This word separator uses the Unicode line breaking algorithm + /// described in [Unicode Standard Annex + /// #14](https://www.unicode.org/reports/tr14/) to find legal places + /// to break lines. There is a small difference in that the U+002D + /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break: + /// to allow a line break at a hyphen, use + /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter). + /// Soft hyphens are not currently supported. + /// + /// # Examples + /// + /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line + /// breaking algorithm will find line break opportunities between + /// some characters with no intervening whitespace: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(), + /// vec![Word::from("Emojis: "), + /// Word::from("😂"), + /// Word::from("😍")]); + /// + /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(), + /// vec![Word::from("CJK: "), + /// Word::from("你"), + /// Word::from("好")]); + /// } + /// ``` + /// + /// A U+2060 (Word Joiner) character can be inserted if you want to + /// manually override the defaults and keep the characters together: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(), + /// vec![Word::from("Emojis: "), + /// Word::from("😂\u{2060}😍")]); + /// } + /// ``` + /// + /// The Unicode line breaking algorithm will also automatically + /// suppress break breaks around certain punctuation characters:: + /// + /// ``` + /// #[cfg(feature = "unicode-linebreak")] { + /// use textwrap::core::Word; + /// use textwrap::WordSeparator::UnicodeBreakProperties; + /// + /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(), + /// vec![Word::from("[ foo ] "), + /// Word::from("bar !")]); + /// } + /// ``` + #[cfg(feature = "unicode-linebreak")] + UnicodeBreakProperties, + + /// Find words using a custom word separator + Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>), +} + +impl std::fmt::Debug for WordSeparator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WordSeparator::AsciiSpace => f.write_str("AsciiSpace"), + #[cfg(feature = "unicode-linebreak")] + WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"), + WordSeparator::Custom(_) => f.write_str("Custom(...)"), + } + } +} + +impl WordSeparator { + // This function should really return impl Iterator<Item = Word>, but + // this isn't possible until Rust supports higher-kinded types: + // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md + /// Find all words in `line`. + pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { + match self { + WordSeparator::AsciiSpace => find_words_ascii_space(line), + #[cfg(feature = "unicode-linebreak")] + WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line), + WordSeparator::Custom(func) => func(line), + } + } +} + +fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { + let mut start = 0; + let mut in_whitespace = false; + let mut char_indices = line.char_indices(); + + Box::new(std::iter::from_fn(move || { + // for (idx, ch) in char_indices does not work, gives this + // error: + // + // > cannot move out of `char_indices`, a captured variable in + // > an `FnMut` closure + #[allow(clippy::while_let_on_iterator)] + while let Some((idx, ch)) = char_indices.next() { + if in_whitespace && ch != ' ' { + let word = Word::from(&line[start..idx]); + start = idx; + in_whitespace = ch == ' '; + return Some(word); + } + + in_whitespace = ch == ' '; + } + + if start < line.len() { + let word = Word::from(&line[start..]); + start = line.len(); + return Some(word); + } + + None + })) +} + +// Strip all ANSI escape sequences from `text`. +#[cfg(feature = "unicode-linebreak")] +fn strip_ansi_escape_sequences(text: &str) -> String { + let mut result = String::with_capacity(text.len()); + + let mut chars = text.chars(); + while let Some(ch) = chars.next() { + if skip_ansi_escape_sequence(ch, &mut chars) { + continue; + } + result.push(ch); + } + + result +} + +/// Soft hyphen, also knows as a “shy hyphen”. Should show up as ‘-’ +/// if a line is broken at this point, and otherwise be invisible. +/// Textwrap does not currently support breaking words at soft +/// hyphens. +#[cfg(feature = "unicode-linebreak")] +const SHY: char = '\u{00ad}'; + +/// Find words in line. ANSI escape sequences are ignored in `line`. +#[cfg(feature = "unicode-linebreak")] +fn find_words_unicode_break_properties<'a>( + line: &'a str, +) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { + // Construct an iterator over (original index, stripped index) + // tuples. We find the Unicode linebreaks on a stripped string, + // but we need the original indices so we can form words based on + // the original string. + let mut last_stripped_idx = 0; + let mut char_indices = line.char_indices(); + let mut idx_map = std::iter::from_fn(move || match char_indices.next() { + Some((orig_idx, ch)) => { + let stripped_idx = last_stripped_idx; + if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) { + last_stripped_idx += ch.len_utf8(); + } + Some((orig_idx, stripped_idx)) + } + None => None, + }); + + let stripped = strip_ansi_escape_sequences(line); + let mut opportunities = unicode_linebreak::linebreaks(&stripped) + .filter(|(idx, _)| { + #[allow(clippy::match_like_matches_macro)] + match &stripped[..*idx].chars().next_back() { + // We suppress breaks at ‘-’ since we want to control + // this via the WordSplitter. + Some('-') => false, + // Soft hyphens are currently not supported since we + // require all `Word` fragments to be continuous in + // the input string. + Some(SHY) => false, + // Other breaks should be fine! + _ => true, + } + }) + .collect::<Vec<_>>() + .into_iter(); + + // Remove final break opportunity, we will add it below using + // &line[start..]; This ensures that we correctly include a + // trailing ANSI escape sequence. + opportunities.next_back(); + + let mut start = 0; + Box::new(std::iter::from_fn(move || { + #[allow(clippy::while_let_on_iterator)] + while let Some((idx, _)) = opportunities.next() { + if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) { + let word = Word::from(&line[start..orig_idx]); + start = orig_idx; + return Some(word); + } + } + + if start < line.len() { + let word = Word::from(&line[start..]); + start = line.len(); + return Some(word); + } + + None + })) +} + +#[cfg(test)] +mod tests { + use super::WordSeparator::*; + use super::*; + + // Like assert_eq!, but the left expression is an iterator. + macro_rules! assert_iter_eq { + ($left:expr, $right:expr) => { + assert_eq!($left.collect::<Vec<_>>(), $right); + }; + } + + fn to_words<'a>(words: Vec<&'a str>) -> Vec<Word<'a>> { + words.into_iter().map(|w: &str| Word::from(&w)).collect() + } + + macro_rules! test_find_words { + ($ascii_name:ident, + $unicode_name:ident, + $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => { + #[test] + fn $ascii_name() { + $( + let expected_words = to_words($ascii_words.to_vec()); + let actual_words = WordSeparator::AsciiSpace + .find_words($line) + .collect::<Vec<_>>(); + assert_eq!(actual_words, expected_words, "Line: {:?}", $line); + )+ + } + + #[test] + #[cfg(feature = "unicode-linebreak")] + fn $unicode_name() { + $( + let expected_words = to_words($unicode_words.to_vec()); + let actual_words = WordSeparator::UnicodeBreakProperties + .find_words($line) + .collect::<Vec<_>>(); + assert_eq!(actual_words, expected_words, "Line: {:?}", $line); + )+ + } + }; + } + + test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]); + + test_find_words!( + ascii_single_word, + unicode_single_word, + ["foo", ["foo"], ["foo"]] + ); + + test_find_words!( + ascii_two_words, + unicode_two_words, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] + ); + + test_find_words!( + ascii_multiple_words, + unicode_multiple_words, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]], + ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]] + ); + + test_find_words!( + ascii_only_whitespace, + unicode_only_whitespace, + [" ", [" "], [" "]], + [" ", [" "], [" "]] + ); + + test_find_words!( + ascii_inter_word_whitespace, + unicode_inter_word_whitespace, + ["foo bar", ["foo ", "bar"], ["foo ", "bar"]] + ); + + test_find_words!( + ascii_trailing_whitespace, + unicode_trailing_whitespace, + ["foo ", ["foo "], ["foo "]] + ); + + test_find_words!( + ascii_leading_whitespace, + unicode_leading_whitespace, + [" foo", [" ", "foo"], [" ", "foo"]] + ); + + test_find_words!( + ascii_multi_column_char, + unicode_multi_column_char, + ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠 + ); + + test_find_words!( + ascii_hyphens, + unicode_hyphens, + ["foo-bar", ["foo-bar"], ["foo-bar"]], + ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]], + ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]], + ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]] + ); + + test_find_words!( + ascii_newline, + unicode_newline, + ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]] + ); + + test_find_words!( + ascii_tab, + unicode_tab, + ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]] + ); + + test_find_words!( + ascii_non_breaking_space, + unicode_non_breaking_space, + ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]] + ); + + #[test] + #[cfg(unix)] + fn find_words_colored_text() { + use termion::color::{Blue, Fg, Green, Reset}; + + let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset)); + let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset)); + assert_iter_eq!( + AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)), + vec![Word::from(&green_hello), Word::from(&blue_world)] + ); + + #[cfg(feature = "unicode-linebreak")] + assert_iter_eq!( + UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)), + vec![Word::from(&green_hello), Word::from(&blue_world)] + ); + } + + #[test] + fn find_words_color_inside_word() { + let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz"; + assert_iter_eq!(AsciiSpace.find_words(&text), vec![Word::from(text)]); + + #[cfg(feature = "unicode-linebreak")] + assert_iter_eq!( + UnicodeBreakProperties.find_words(&text), + vec![Word::from(text)] + ); + } +} |