diff options
Diffstat (limited to 'vendor/memchr/src/memmem/mod.rs')
-rw-r--r-- | vendor/memchr/src/memmem/mod.rs | 737 |
1 files changed, 737 insertions, 0 deletions
diff --git a/vendor/memchr/src/memmem/mod.rs b/vendor/memchr/src/memmem/mod.rs new file mode 100644 index 0000000..4f04943 --- /dev/null +++ b/vendor/memchr/src/memmem/mod.rs @@ -0,0 +1,737 @@ +/*! +This module provides forward and reverse substring search routines. + +Unlike the standard library's substring search routines, these work on +arbitrary bytes. For all non-empty needles, these routines will report exactly +the same values as the corresponding routines in the standard library. For +the empty needle, the standard library reports matches only at valid UTF-8 +boundaries, where as these routines will report matches at every position. + +Other than being able to work on arbitrary bytes, the primary reason to prefer +these routines over the standard library routines is that these will generally +be faster. In some cases, significantly so. + +# Example: iterating over substring matches + +This example shows how to use [`find_iter`] to find occurrences of a substring +in a haystack. + +``` +use memchr::memmem; + +let haystack = b"foo bar foo baz foo"; + +let mut it = memmem::find_iter(haystack, "foo"); +assert_eq!(Some(0), it.next()); +assert_eq!(Some(8), it.next()); +assert_eq!(Some(16), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: iterating over substring matches in reverse + +This example shows how to use [`rfind_iter`] to find occurrences of a substring +in a haystack starting from the end of the haystack. + +**NOTE:** This module does not implement double ended iterators, so reverse +searches aren't done by calling `rev` on a forward iterator. + +``` +use memchr::memmem; + +let haystack = b"foo bar foo baz foo"; + +let mut it = memmem::rfind_iter(haystack, "foo"); +assert_eq!(Some(16), it.next()); +assert_eq!(Some(8), it.next()); +assert_eq!(Some(0), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: repeating a search for the same needle + +It may be possible for the overhead of constructing a substring searcher to be +measurable in some workloads. In cases where the same needle is used to search +many haystacks, it is possible to do construction once and thus to avoid it for +subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for +reverse searches). + +``` +use memchr::memmem; + +let finder = memmem::Finder::new("foo"); + +assert_eq!(Some(4), finder.find(b"baz foo quux")); +assert_eq!(None, finder.find(b"quux baz bar")); +``` +*/ + +pub use crate::memmem::searcher::PrefilterConfig as Prefilter; + +// This is exported here for use in the crate::arch::all::twoway +// implementation. This is essentially an abstraction breaker. Namely, the +// public API of twoway doesn't support providing a prefilter, but its crate +// internal API does. The main reason for this is that I didn't want to do the +// API design required to support it without a concrete use case. +pub(crate) use crate::memmem::searcher::Pre; + +use crate::{ + arch::all::{ + packedpair::{DefaultFrequencyRank, HeuristicFrequencyRank}, + rabinkarp, + }, + cow::CowBytes, + memmem::searcher::{PrefilterState, Searcher, SearcherRev}, +}; + +mod searcher; + +/// Returns an iterator over all non-overlapping occurrences of a substring in +/// a haystack. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar foo baz foo"; +/// let mut it = memmem::find_iter(haystack, b"foo"); +/// assert_eq!(Some(0), it.next()); +/// assert_eq!(Some(8), it.next()); +/// assert_eq!(Some(16), it.next()); +/// assert_eq!(None, it.next()); +/// ``` +#[inline] +pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( + haystack: &'h [u8], + needle: &'n N, +) -> FindIter<'h, 'n> { + FindIter::new(haystack, Finder::new(needle)) +} + +/// Returns a reverse iterator over all non-overlapping occurrences of a +/// substring in a haystack. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar foo baz foo"; +/// let mut it = memmem::rfind_iter(haystack, b"foo"); +/// assert_eq!(Some(16), it.next()); +/// assert_eq!(Some(8), it.next()); +/// assert_eq!(Some(0), it.next()); +/// assert_eq!(None, it.next()); +/// ``` +#[inline] +pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( + haystack: &'h [u8], + needle: &'n N, +) -> FindRevIter<'h, 'n> { + FindRevIter::new(haystack, FinderRev::new(needle)) +} + +/// Returns the index of the first occurrence of the given needle. +/// +/// Note that if you're are searching for the same needle in many different +/// small haystacks, it may be faster to initialize a [`Finder`] once, +/// and reuse it for each search. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar baz"; +/// assert_eq!(Some(0), memmem::find(haystack, b"foo")); +/// assert_eq!(Some(4), memmem::find(haystack, b"bar")); +/// assert_eq!(None, memmem::find(haystack, b"quux")); +/// ``` +#[inline] +pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> { + if haystack.len() < 64 { + rabinkarp::Finder::new(needle).find(haystack, needle) + } else { + Finder::new(needle).find(haystack) + } +} + +/// Returns the index of the last occurrence of the given needle. +/// +/// Note that if you're are searching for the same needle in many different +/// small haystacks, it may be faster to initialize a [`FinderRev`] once, +/// and reuse it for each search. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar baz"; +/// assert_eq!(Some(0), memmem::rfind(haystack, b"foo")); +/// assert_eq!(Some(4), memmem::rfind(haystack, b"bar")); +/// assert_eq!(Some(8), memmem::rfind(haystack, b"ba")); +/// assert_eq!(None, memmem::rfind(haystack, b"quux")); +/// ``` +#[inline] +pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { + if haystack.len() < 64 { + rabinkarp::FinderRev::new(needle).rfind(haystack, needle) + } else { + FinderRev::new(needle).rfind(haystack) + } +} + +/// An iterator over non-overlapping substring matches. +/// +/// Matches are reported by the byte offset at which they begin. +/// +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. +#[derive(Debug, Clone)] +pub struct FindIter<'h, 'n> { + haystack: &'h [u8], + prestate: PrefilterState, + finder: Finder<'n>, + pos: usize, +} + +impl<'h, 'n> FindIter<'h, 'n> { + #[inline(always)] + pub(crate) fn new( + haystack: &'h [u8], + finder: Finder<'n>, + ) -> FindIter<'h, 'n> { + let prestate = PrefilterState::new(); + FindIter { haystack, prestate, finder, pos: 0 } + } + + /// Convert this iterator into its owned variant, such that it no longer + /// borrows the finder and needle. + /// + /// If this is already an owned iterator, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `alloc` feature is enabled. + #[cfg(feature = "alloc")] + #[inline] + pub fn into_owned(self) -> FindIter<'h, 'static> { + FindIter { + haystack: self.haystack, + prestate: self.prestate, + finder: self.finder.into_owned(), + pos: self.pos, + } + } +} + +impl<'h, 'n> Iterator for FindIter<'h, 'n> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + let needle = self.finder.needle(); + let haystack = self.haystack.get(self.pos..)?; + let idx = + self.finder.searcher.find(&mut self.prestate, haystack, needle)?; + + let pos = self.pos + idx; + self.pos = pos + needle.len().max(1); + + Some(pos) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + // The largest possible number of non-overlapping matches is the + // quotient of the haystack and the needle (or the length of the + // haystack, if the needle is empty) + match self.haystack.len().checked_sub(self.pos) { + None => (0, Some(0)), + Some(haystack_len) => match self.finder.needle().len() { + // Empty needles always succeed and match at every point + // (including the very end) + 0 => ( + haystack_len.saturating_add(1), + haystack_len.checked_add(1), + ), + needle_len => (0, Some(haystack_len / needle_len)), + }, + } + } +} + +/// An iterator over non-overlapping substring matches in reverse. +/// +/// Matches are reported by the byte offset at which they begin. +/// +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. +#[derive(Clone, Debug)] +pub struct FindRevIter<'h, 'n> { + haystack: &'h [u8], + finder: FinderRev<'n>, + /// When searching with an empty needle, this gets set to `None` after + /// we've yielded the last element at `0`. + pos: Option<usize>, +} + +impl<'h, 'n> FindRevIter<'h, 'n> { + #[inline(always)] + pub(crate) fn new( + haystack: &'h [u8], + finder: FinderRev<'n>, + ) -> FindRevIter<'h, 'n> { + let pos = Some(haystack.len()); + FindRevIter { haystack, finder, pos } + } + + /// Convert this iterator into its owned variant, such that it no longer + /// borrows the finder and needle. + /// + /// If this is already an owned iterator, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "alloc")] + #[inline] + pub fn into_owned(self) -> FindRevIter<'h, 'static> { + FindRevIter { + haystack: self.haystack, + finder: self.finder.into_owned(), + pos: self.pos, + } + } +} + +impl<'h, 'n> Iterator for FindRevIter<'h, 'n> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + let pos = match self.pos { + None => return None, + Some(pos) => pos, + }; + let result = self.finder.rfind(&self.haystack[..pos]); + match result { + None => None, + Some(i) => { + if pos == i { + self.pos = pos.checked_sub(1); + } else { + self.pos = Some(i); + } + Some(i) + } + } + } +} + +/// A single substring searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, using +/// [`find`] is good enough, but `Finder` is useful when you can meaningfully +/// observe searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `Finder` that is not connected to +/// the lifetime of its needle. +#[derive(Clone, Debug)] +pub struct Finder<'n> { + needle: CowBytes<'n>, + searcher: Searcher, +} + +impl<'n> Finder<'n> { + /// Create a new finder for the given needle. + #[inline] + pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> { + FinderBuilder::new().build_forward(needle) + } + + /// Returns the index of the first occurrence of this needle in the given + /// haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::Finder; + /// + /// let haystack = b"foo bar baz"; + /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); + /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); + /// assert_eq!(None, Finder::new("quux").find(haystack)); + /// ``` + #[inline] + pub fn find(&self, haystack: &[u8]) -> Option<usize> { + let mut prestate = PrefilterState::new(); + let needle = self.needle.as_slice(); + self.searcher.find(&mut prestate, haystack, needle) + } + + /// Returns an iterator over all occurrences of a substring in a haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::Finder; + /// + /// let haystack = b"foo bar foo baz foo"; + /// let finder = Finder::new(b"foo"); + /// let mut it = finder.find_iter(haystack); + /// assert_eq!(Some(0), it.next()); + /// assert_eq!(Some(8), it.next()); + /// assert_eq!(Some(16), it.next()); + /// assert_eq!(None, it.next()); + /// ``` + #[inline] + pub fn find_iter<'a, 'h>( + &'a self, + haystack: &'h [u8], + ) -> FindIter<'h, 'a> { + FindIter::new(haystack, self.as_ref()) + } + + /// Convert this finder into its owned variant, such that it no longer + /// borrows the needle. + /// + /// If this is already an owned finder, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `alloc` feature is enabled. + #[cfg(feature = "alloc")] + #[inline] + pub fn into_owned(self) -> Finder<'static> { + Finder { + needle: self.needle.into_owned(), + searcher: self.searcher.clone(), + } + } + + /// Convert this finder into its borrowed variant. + /// + /// This is primarily useful if your finder is owned and you'd like to + /// store its borrowed variant in some intermediate data structure. + /// + /// Note that the lifetime parameter of the returned finder is tied to the + /// lifetime of `self`, and may be shorter than the `'n` lifetime of the + /// needle itself. Namely, a finder's needle can be either borrowed or + /// owned, so the lifetime of the needle returned must necessarily be the + /// shorter of the two. + #[inline] + pub fn as_ref(&self) -> Finder<'_> { + Finder { + needle: CowBytes::new(self.needle()), + searcher: self.searcher.clone(), + } + } + + /// Returns the needle that this finder searches for. + /// + /// Note that the lifetime of the needle returned is tied to the lifetime + /// of the finder, and may be shorter than the `'n` lifetime. Namely, a + /// finder's needle can be either borrowed or owned, so the lifetime of the + /// needle returned must necessarily be the shorter of the two. + #[inline] + pub fn needle(&self) -> &[u8] { + self.needle.as_slice() + } +} + +/// A single substring reverse searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, +/// using [`rfind`] is good enough, but `FinderRev` is useful when you can +/// meaningfully observe searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `FinderRev` that is not connected to +/// the lifetime of its needle. +#[derive(Clone, Debug)] +pub struct FinderRev<'n> { + needle: CowBytes<'n>, + searcher: SearcherRev, +} + +impl<'n> FinderRev<'n> { + /// Create a new reverse finder for the given needle. + #[inline] + pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> FinderRev<'n> { + FinderBuilder::new().build_reverse(needle) + } + + /// Returns the index of the last occurrence of this needle in the given + /// haystack. + /// + /// The haystack may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::FinderRev; + /// + /// let haystack = b"foo bar baz"; + /// assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack)); + /// assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack)); + /// assert_eq!(None, FinderRev::new("quux").rfind(haystack)); + /// ``` + pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { + self.searcher.rfind(haystack.as_ref(), self.needle.as_slice()) + } + + /// Returns a reverse iterator over all occurrences of a substring in a + /// haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::FinderRev; + /// + /// let haystack = b"foo bar foo baz foo"; + /// let finder = FinderRev::new(b"foo"); + /// let mut it = finder.rfind_iter(haystack); + /// assert_eq!(Some(16), it.next()); + /// assert_eq!(Some(8), it.next()); + /// assert_eq!(Some(0), it.next()); + /// assert_eq!(None, it.next()); + /// ``` + #[inline] + pub fn rfind_iter<'a, 'h>( + &'a self, + haystack: &'h [u8], + ) -> FindRevIter<'h, 'a> { + FindRevIter::new(haystack, self.as_ref()) + } + + /// Convert this finder into its owned variant, such that it no longer + /// borrows the needle. + /// + /// If this is already an owned finder, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "alloc")] + #[inline] + pub fn into_owned(self) -> FinderRev<'static> { + FinderRev { + needle: self.needle.into_owned(), + searcher: self.searcher.clone(), + } + } + + /// Convert this finder into its borrowed variant. + /// + /// This is primarily useful if your finder is owned and you'd like to + /// store its borrowed variant in some intermediate data structure. + /// + /// Note that the lifetime parameter of the returned finder is tied to the + /// lifetime of `self`, and may be shorter than the `'n` lifetime of the + /// needle itself. Namely, a finder's needle can be either borrowed or + /// owned, so the lifetime of the needle returned must necessarily be the + /// shorter of the two. + #[inline] + pub fn as_ref(&self) -> FinderRev<'_> { + FinderRev { + needle: CowBytes::new(self.needle()), + searcher: self.searcher.clone(), + } + } + + /// Returns the needle that this finder searches for. + /// + /// Note that the lifetime of the needle returned is tied to the lifetime + /// of the finder, and may be shorter than the `'n` lifetime. Namely, a + /// finder's needle can be either borrowed or owned, so the lifetime of the + /// needle returned must necessarily be the shorter of the two. + #[inline] + pub fn needle(&self) -> &[u8] { + self.needle.as_slice() + } +} + +/// A builder for constructing non-default forward or reverse memmem finders. +/// +/// A builder is primarily useful for configuring a substring searcher. +/// Currently, the only configuration exposed is the ability to disable +/// heuristic prefilters used to speed up certain searches. +#[derive(Clone, Debug, Default)] +pub struct FinderBuilder { + prefilter: Prefilter, +} + +impl FinderBuilder { + /// Create a new finder builder with default settings. + pub fn new() -> FinderBuilder { + FinderBuilder::default() + } + + /// Build a forward finder using the given needle from the current + /// settings. + pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>( + &self, + needle: &'n B, + ) -> Finder<'n> { + self.build_forward_with_ranker(DefaultFrequencyRank, needle) + } + + /// Build a forward finder using the given needle and a custom heuristic for + /// determining the frequency of a given byte in the dataset. + /// See [`HeuristicFrequencyRank`] for more details. + pub fn build_forward_with_ranker< + 'n, + R: HeuristicFrequencyRank, + B: ?Sized + AsRef<[u8]>, + >( + &self, + ranker: R, + needle: &'n B, + ) -> Finder<'n> { + let needle = needle.as_ref(); + Finder { + needle: CowBytes::new(needle), + searcher: Searcher::new(self.prefilter, ranker, needle), + } + } + + /// Build a reverse finder using the given needle from the current + /// settings. + pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>( + &self, + needle: &'n B, + ) -> FinderRev<'n> { + let needle = needle.as_ref(); + FinderRev { + needle: CowBytes::new(needle), + searcher: SearcherRev::new(needle), + } + } + + /// Configure the prefilter setting for the finder. + /// + /// See the documentation for [`Prefilter`] for more discussion on why + /// you might want to configure this. + pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder { + self.prefilter = prefilter; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + define_substring_forward_quickcheck!(|h, n| Some(Finder::new(n).find(h))); + define_substring_reverse_quickcheck!(|h, n| Some( + FinderRev::new(n).rfind(h) + )); + + #[test] + fn forward() { + crate::tests::substring::Runner::new() + .fwd(|h, n| Some(Finder::new(n).find(h))) + .run(); + } + + #[test] + fn reverse() { + crate::tests::substring::Runner::new() + .rev(|h, n| Some(FinderRev::new(n).rfind(h))) + .run(); + } +} |