diff options
Diffstat (limited to 'vendor/strsim')
-rw-r--r-- | vendor/strsim/.cargo-checksum.json | 1 | ||||
-rw-r--r-- | vendor/strsim/CHANGELOG.md | 226 | ||||
-rw-r--r-- | vendor/strsim/Cargo.toml | 39 | ||||
-rw-r--r-- | vendor/strsim/LICENSE | 23 | ||||
-rw-r--r-- | vendor/strsim/README.md | 102 | ||||
-rw-r--r-- | vendor/strsim/benches/benches.rs | 95 | ||||
-rw-r--r-- | vendor/strsim/src/lib.rs | 1307 | ||||
-rw-r--r-- | vendor/strsim/tests/lib.rs | 71 |
8 files changed, 0 insertions, 1864 deletions
diff --git a/vendor/strsim/.cargo-checksum.json b/vendor/strsim/.cargo-checksum.json deleted file mode 100644 index 6e7d5ab..0000000 --- a/vendor/strsim/.cargo-checksum.json +++ /dev/null @@ -1 +0,0 @@ -{"files":{"CHANGELOG.md":"fef521b697f2b44b73bc64603538a9849cb7d707c5227063546d4bb5ca836f03","Cargo.toml":"7face49fe821809026b3a537695cdcb12a82a5ebbed20b73cfab69b06fa309fb","LICENSE":"1e697ce8d21401fbf1bddd9b5c3fd4c4c79ae1e3bdf51f81761c85e11d5a89cd","README.md":"70806b27bbb20acf1e47aabd68530062aab834b7efeced77fcb3126179b043d0","benches/benches.rs":"2f7fae162a517378b42af04b4b077ffd563171f7341cba55b4efca3b4c30426a","src/lib.rs":"2e340450050784ea8e6e19ea36cbedcf9084ecb7829fc206cacdca4f6f069784","tests/lib.rs":"4c8207a5728b82836795e2f87d7d7834db7276082f5ded640f34822feb750cb4"},"package":"ccbca6f34534eb78dbee83f6b2c9442fea7113f43d9e80ea320f0972ae5dc08d"}
\ No newline at end of file diff --git a/vendor/strsim/CHANGELOG.md b/vendor/strsim/CHANGELOG.md deleted file mode 100644 index 558cbf9..0000000 --- a/vendor/strsim/CHANGELOG.md +++ /dev/null @@ -1,226 +0,0 @@ -# Change Log - -This project attempts to adhere to [Semantic Versioning](http://semver.org). - -## [Unreleased] - -## [0.10.1] - (2024-01-07) - -### Changed - -- improve OSA implementation - - reduce runtime - - reduce binary size by more than `25%` - -- reduce binary size of Levenshtein distance - -- improve Damerau-Levenshtein implementation - - reduce memory usage from `O(N*M)` to `O(N+M)` - - reduce runtime in our own benchmark by more than `70%` - - reduce binary size by more than `25%` - -- only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7 - -### Fixed - -- Fix transposition counting in Jaro and Jaro-Winkler. -- Limit common prefix in Jaro-Winkler to 4 characters - -## [0.10.0] - (2020-01-31) - -### Added - -- Sørensen-Dice implementation (thanks [@robjtede](https://github.com/robjtede)) - -## [0.9.3] - (2019-12-12) - -### Fixed - -- Fix Jaro and Jaro-Winkler when the arguments have lengths of 1 and are equal. - Previously, the functions would erroneously return 0 instead of 1. Thanks to - [@vvrably](https://github.com/vvrably) for pointing out the issue. - -## [0.9.2] - (2019-05-09) - -### Changed - -- Revert back to the standard library hashmap because it will use hashbrown very - soon -- Remove ndarray in favor of using a single vector to represent the 2d grid in - Damerau-Levenshtein - -## [0.9.1] - (2019-04-08) - -### Changed - -- Faster Damerau-Levenshtein implementation (thanks [@lovasoa](https://github.com/lovasoa)) - -## [0.9.0] - (2019-04-06) - -### Added - -- Generic distance functions (thanks [@lovasoa](https://github.com/lovasoa)) - -## [0.8.0] - (2018-08-19) - -### Added - -- Normalized versions of Levenshtein and Damerau-Levenshtein (thanks [@gentoid](https://github.com/gentoid)) - -## [0.7.0] - (2018-01-17) - -### Changed - -- Faster Levenshtein implementation (thanks [@wdv4758h](https://github.com/wdv4758h)) - -### Removed - -- Remove the "against_vec" functions. They are one-liners now, so they don't - seem to add enough value to justify making the API larger. I didn't find - anybody using them when I skimmed through a GitHub search. If you do use them, - you can change the calls to something like: -```rust -let distances = strings.iter().map(|a| jaro(target, a)).collect(); -``` - -## [0.6.0] - (2016-12-26) - -### Added - -- Add optimal string alignment distance - -### Fixed - -- Fix Damerau-Levenshtein implementation (previous implementation was actually - optimal string alignment; see this [Damerau-Levenshtein explanation]) - -## [0.5.2] - (2016-11-21) - -### Changed - -- Remove Cargo generated documentation in favor of a [docs.rs] link - -## [0.5.1] - (2016-08-23) - -### Added - -- Add Cargo generated documentation - -### Fixed - -- Fix panic when Jaro or Jaro-Winkler are given strings both with a length of - one - -## [0.5.0] - (2016-08-11) - -### Changed - -- Make Hamming faster (thanks @IBUzPE9) when the two strings have the same - length but slower when they have different lengths - -## [0.4.1] - (2016-04-18) - -### Added - -- Add Vagrant setup for development -- Add AppVeyor configuration for Windows CI - -### Fixed - -- Fix metrics when given strings with multibyte characters (thanks @WanzenBug) - -## [0.4.0] - (2015-06-10) - -### Added - -- For each metric, add a function that takes a vector of strings and returns a -vector of results (thanks @ovarene) - -## [0.3.0] - (2015-04-30) - -### Changed - -- Remove usage of unstable Rust features - -## [0.2.5] - (2015-04-24) - -### Fixed - -- Remove unnecessary `Float` import from doc tests - -## [0.2.4] - (2015-04-15) - -### Fixed - -- Remove unused `core` feature flag - -## [0.2.3] - (2015-04-01) - -### Fixed - -- Remove now unnecessary `Float` import - -## [0.2.2] - (2015-03-29) - -### Fixed - -- Remove usage of `char_at` (marked as unstable) - -## [0.2.1] - (2015-02-20) - -### Fixed - -- Update bit vector import to match Rust update - -## [0.2.0] - (2015-02-19) - -### Added - -- Implement Damerau-Levenshtein -- Add tests in docs - -## [0.1.1] - (2015-02-10) - -### Added - -- Configure Travis for CI -- Add rustdoc comments - -### Fixed - -- Limit Jaro-Winkler return value to a maximum of 1.0 -- Fix float comparisons in tests - -## [0.1.0] - (2015-02-09) - -### Added - -- Implement Hamming, Jaro, Jaro-Winkler, and Levenshtein - -[Unreleased]: https://github.com/rapidfuzz/strsim-rs/compare/0.10.1...HEAD -[0.10.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.10.0...0.10.1 -[0.10.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.3...0.10.0 -[0.9.3]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.2...0.9.3 -[0.9.2]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.1...0.9.2 -[0.9.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.0...0.9.1 -[0.9.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.8.0...0.9.0 -[0.8.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.7.0...0.8.0 -[0.7.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.6.0...0.7.0 -[0.6.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.5.2...0.6.0 -[0.5.2]: https://github.com/rapidfuzz/strsim-rs/compare/0.5.1...0.5.2 -[0.5.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.5.0...0.5.1 -[0.5.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.4.1...0.5.0 -[0.4.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.4.0...0.4.1 -[0.4.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.3.0...0.4.0 -[0.3.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.5...0.3.0 -[0.2.5]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.4...0.2.5 -[0.2.4]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.3...0.2.4 -[0.2.3]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.2...0.2.3 -[0.2.2]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.1...0.2.2 -[0.2.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.0...0.2.1 -[0.2.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.1.1...0.2.0 -[0.1.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.1.0...0.1.1 -[0.1.0]: https://github.com/rapidfuzz/strsim-rs/compare/fabad4...0.1.0 -[docs.rs]: https://docs.rs/strsim/ -[Damerau-Levenshtein explanation]: -http://scarcitycomputing.blogspot.com/2013/04/damerau-levenshtein-edit-distance.html diff --git a/vendor/strsim/Cargo.toml b/vendor/strsim/Cargo.toml deleted file mode 100644 index 7011ac5..0000000 --- a/vendor/strsim/Cargo.toml +++ /dev/null @@ -1,39 +0,0 @@ -# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO -# -# When uploading crates to the registry Cargo will automatically -# "normalize" Cargo.toml files for maximal compatibility -# with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies. -# -# If you are reading this file be aware that the original Cargo.toml -# will likely look very different (and much more reasonable). -# See Cargo.toml.orig for the original contents. - -[package] -name = "strsim" -version = "0.10.1" -authors = [ - "Danny Guo <danny@dannyguo.com>", - "maxbachmann <oss@maxbachmann.de>", -] -exclude = [ - "/.github", - "/dev", -] -description = """ -Implementations of string similarity metrics. Includes Hamming, Levenshtein, -OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice. -""" -homepage = "https://github.com/rapidfuzz/strsim-rs" -documentation = "https://docs.rs/strsim/" -readme = "README.md" -keywords = [ - "string", - "similarity", - "Hamming", - "Levenshtein", - "Jaro", -] -categories = ["text-processing"] -license = "MIT" -repository = "https://github.com/rapidfuzz/strsim-rs" diff --git a/vendor/strsim/LICENSE b/vendor/strsim/LICENSE deleted file mode 100644 index 8d1fbe1..0000000 --- a/vendor/strsim/LICENSE +++ /dev/null @@ -1,23 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Danny Guo -Copyright (c) 2016 Titus Wormer <tituswormer@gmail.com> -Copyright (c) 2018 Akash Kurdekar - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/vendor/strsim/README.md b/vendor/strsim/README.md deleted file mode 100644 index f3dcd96..0000000 --- a/vendor/strsim/README.md +++ /dev/null @@ -1,102 +0,0 @@ -# strsim-rs - -[![Crates.io](https://img.shields.io/crates/v/strsim.svg)](https://crates.io/crates/strsim) -[![Crates.io](https://img.shields.io/crates/l/strsim.svg?maxAge=2592000)](https://github.com/rapidfuzz/strsim-rs/blob/main/LICENSE) -[![CI status](https://github.com/rapidfuzz/strsim-rs/workflows/CI/badge.svg)](https://github.com/rapidfuzz/strsim-rs/actions?query=branch%3Amain) -[![unsafe forbidden](https://img.shields.io/badge/unsafe-forbidden-success.svg)](https://github.com/rust-secure-code/safety-dance/) - -[Rust](https://www.rust-lang.org) implementations of [string similarity metrics]: - - [Hamming] - - [Levenshtein] - distance & normalized - - [Optimal string alignment] - - [Damerau-Levenshtein] - distance & normalized - - [Jaro and Jaro-Winkler] - - [Sørensen-Dice] - -The normalized versions return values between `0.0` and `1.0`, where `1.0` means -an exact match. - -There are also generic versions of the functions for non-string inputs. - -## Installation - -`strsim` is available on [crates.io](https://crates.io/crates/strsim). Add it to -your `Cargo.toml`: -```toml -[dependencies] -strsim = "0.10.0" -``` - -## Usage - -Go to [Docs.rs](https://docs.rs/strsim/) for the full documentation. You can -also clone the repo, and run `$ cargo doc --open`. - -### Examples - -```rust -extern crate strsim; - -use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, - damerau_levenshtein, normalized_damerau_levenshtein, jaro, - jaro_winkler, sorensen_dice}; - -fn main() { - match hamming("hamming", "hammers") { - Ok(distance) => assert_eq!(3, distance), - Err(why) => panic!("{:?}", why) - } - - assert_eq!(levenshtein("kitten", "sitting"), 3); - - assert!((normalized_levenshtein("kitten", "sitting") - 0.571).abs() < 0.001); - - assert_eq!(osa_distance("ac", "cba"), 3); - - assert_eq!(damerau_levenshtein("ac", "cba"), 2); - - assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.272).abs() < - 0.001); - - assert!((jaro("Friedrich Nietzsche", "Jean-Paul Sartre") - 0.392).abs() < - 0.001); - - assert!((jaro_winkler("cheeseburger", "cheese fries") - 0.911).abs() < - 0.001); - - assert_eq!(sorensen_dice("web applications", "applications of the web"), - 0.7878787878787878); -} -``` - -Using the generic versions of the functions: - -```rust -extern crate strsim; - -use strsim::generic_levenshtein; - -fn main() { - assert_eq!(2, generic_levenshtein(&[1, 2, 3], &[0, 2, 5])); -} -``` - -## Contributing - -If you don't want to install Rust itself, you can run `$ ./dev` for a -development CLI if you have [Docker] installed. - -Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`. - -## License - -[MIT](https://github.com/rapidfuzz/strsim-rs/blob/main/LICENSE) - -[string similarity metrics]:http://en.wikipedia.org/wiki/String_metric -[Damerau-Levenshtein]:http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance -[Jaro and Jaro-Winkler]:http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance -[Levenshtein]:http://en.wikipedia.org/wiki/Levenshtein_distance -[Hamming]:http://en.wikipedia.org/wiki/Hamming_distance -[Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance -[Sørensen-Dice]:http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient -[Docker]:https://docs.docker.com/engine/installation/ diff --git a/vendor/strsim/benches/benches.rs b/vendor/strsim/benches/benches.rs deleted file mode 100644 index 15c7041..0000000 --- a/vendor/strsim/benches/benches.rs +++ /dev/null @@ -1,95 +0,0 @@ -//! Benchmarks for strsim. - -#![feature(test)] - -extern crate strsim; -extern crate test; -use self::test::Bencher; - -#[bench] -fn bench_hamming(bencher: &mut Bencher) { - let a = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGG"; - let b = "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGC"; - bencher.iter(|| { - strsim::hamming(a, b).unwrap(); - }) -} - -#[bench] -fn bench_jaro(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::jaro(a, b); - }) -} - -#[bench] -fn bench_jaro_winkler(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::jaro_winkler(a, b); - }) -} - -#[bench] -fn bench_levenshtein(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::levenshtein(a, b); - }) -} - -#[bench] -fn bench_levenshtein_on_u8(bencher: &mut Bencher) { - bencher.iter(|| { - strsim::generic_levenshtein(&vec![0u8; 30], &vec![7u8; 31]); - }) -} - -#[bench] -fn bench_normalized_levenshtein(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::normalized_levenshtein(a, b); - }) -} - -#[bench] -fn bench_osa_distance(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::osa_distance(a, b); - }) -} - -#[bench] -fn bench_damerau_levenshtein(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::damerau_levenshtein(a, b); - }) -} - -#[bench] -fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::normalized_damerau_levenshtein(a, b); - }) -} - -#[bench] -fn bench_sorensen_dice(bencher: &mut Bencher) { - let a = "Philosopher Friedrich Nietzsche"; - let b = "Philosopher Jean-Paul Sartre"; - bencher.iter(|| { - strsim::sorensen_dice(a, b); - }) -} diff --git a/vendor/strsim/src/lib.rs b/vendor/strsim/src/lib.rs deleted file mode 100644 index 8118277..0000000 --- a/vendor/strsim/src/lib.rs +++ /dev/null @@ -1,1307 +0,0 @@ -//! This library implements string similarity metrics. - -#![forbid(unsafe_code)] -#![allow( - // these casts are sometimes needed. They restrict the length of input iterators - // but there isn't really any way around this except for always working with - // 128 bit types - clippy::cast_possible_wrap, - clippy::cast_sign_loss, - clippy::cast_precision_loss, - // not practical - clippy::needless_pass_by_value, - clippy::similar_names, - // noisy - clippy::missing_errors_doc, - clippy::missing_panics_doc, - clippy::must_use_candidate, - // todo https://github.com/rapidfuzz/strsim-rs/issues/59 - clippy::range_plus_one -)] - -use std::char; -use std::cmp::{max, min}; -use std::collections::HashMap; -use std::convert::TryFrom; -use std::error::Error; -use std::fmt::{self, Display, Formatter}; -use std::hash::Hash; -use std::mem; -use std::str::Chars; - -#[derive(Debug, PartialEq)] -pub enum StrSimError { - DifferentLengthArgs, -} - -impl Display for StrSimError { - fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> { - let text = match self { - StrSimError::DifferentLengthArgs => "Differing length arguments provided", - }; - - write!(fmt, "{text}") - } -} - -impl Error for StrSimError {} - -pub type HammingResult = Result<usize, StrSimError>; - -/// Calculates the number of positions in the two sequences where the elements -/// differ. Returns an error if the sequences have different lengths. -pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult -where - Iter1: IntoIterator<Item = Elem1>, - Iter2: IntoIterator<Item = Elem2>, - Elem1: PartialEq<Elem2>, -{ - let (mut ita, mut itb) = (a.into_iter(), b.into_iter()); - let mut count = 0; - loop { - match (ita.next(), itb.next()) { - (Some(x), Some(y)) => { - if x != y { - count += 1; - } - } - (None, None) => return Ok(count), - _ => return Err(StrSimError::DifferentLengthArgs), - } - } -} - -/// Calculates the number of positions in the two strings where the characters -/// differ. Returns an error if the strings have different lengths. -/// -/// ``` -/// use strsim::{hamming, StrSimError::DifferentLengthArgs}; -/// -/// assert_eq!(Ok(3), hamming("hamming", "hammers")); -/// -/// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham")); -/// ``` -pub fn hamming(a: &str, b: &str) -> HammingResult { - generic_hamming(a.chars(), b.chars()) -} - -/// Calculates the Jaro similarity between two sequences. The returned value -/// is between 0.0 and 1.0 (higher value means more similar). -pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 -where - &'a Iter1: IntoIterator<Item = Elem1>, - &'b Iter2: IntoIterator<Item = Elem2>, - Elem1: PartialEq<Elem2>, -{ - let a_len = a.into_iter().count(); - let b_len = b.into_iter().count(); - - if a_len == 0 && b_len == 0 { - return 1.0; - } else if a_len == 0 || b_len == 0 { - return 0.0; - } - - let mut search_range = max(a_len, b_len) / 2; - search_range = search_range.saturating_sub(1); - - // combine memory allocations to reduce runtime - let mut flags_memory = vec![false; a_len + b_len]; - let (a_flags, b_flags) = flags_memory.split_at_mut(a_len); - - let mut matches = 0_usize; - - for (i, a_elem) in a.into_iter().enumerate() { - // prevent integer wrapping - let min_bound = if i > search_range { - i - search_range - } else { - 0 - }; - - let max_bound = min(b_len, i + search_range + 1); - - for (j, b_elem) in b.into_iter().enumerate().take(max_bound) { - if min_bound <= j && a_elem == b_elem && !b_flags[j] { - a_flags[i] = true; - b_flags[j] = true; - matches += 1; - break; - } - } - } - - let mut transpositions = 0_usize; - if matches != 0 { - let mut b_iter = b_flags.iter().zip(b); - for (a_flag, ch1) in a_flags.iter().zip(a) { - if *a_flag { - loop { - if let Some((b_flag, ch2)) = b_iter.next() { - if !*b_flag { - continue; - } - - if ch1 != ch2 { - transpositions += 1; - } - break; - } - } - } - } - } - transpositions /= 2; - - if matches == 0 { - 0.0 - } else { - ((matches as f64 / a_len as f64) - + (matches as f64 / b_len as f64) - + ((matches - transpositions) as f64 / matches as f64)) - / 3.0 - } -} - -struct StringWrapper<'a>(&'a str); - -impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { - type Item = char; - type IntoIter = Chars<'b>; - - fn into_iter(self) -> Self::IntoIter { - self.0.chars() - } -} - -/// Calculates the Jaro similarity between two strings. The returned value -/// is between 0.0 and 1.0 (higher value means more similar). -/// -/// ``` -/// use strsim::jaro; -/// -/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() < -/// 0.001); -/// ``` -pub fn jaro(a: &str, b: &str) -> f64 { - generic_jaro(&StringWrapper(a), &StringWrapper(b)) -} - -/// Like Jaro but gives a boost to sequences that have a common prefix. -pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 -where - &'a Iter1: IntoIterator<Item = Elem1>, - &'b Iter2: IntoIterator<Item = Elem2>, - Elem1: PartialEq<Elem2>, -{ - let sim = generic_jaro(a, b); - - if sim > 0.7 { - let prefix_length = a - .into_iter() - .take(4) - .zip(b) - .take_while(|(a_elem, b_elem)| a_elem == b_elem) - .count(); - - sim + 0.1 * prefix_length as f64 * (1.0 - sim) - } else { - sim - } -} - -/// Like Jaro but gives a boost to strings that have a common prefix. -/// -/// ``` -/// use strsim::jaro_winkler; -/// -/// assert!((0.866 - jaro_winkler("cheeseburger", "cheese fries")).abs() < -/// 0.001); -/// ``` -pub fn jaro_winkler(a: &str, b: &str) -> f64 { - generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b)) -} - -/// Calculates the minimum number of insertions, deletions, and substitutions -/// required to change one sequence into the other. -/// -/// ``` -/// use strsim::generic_levenshtein; -/// -/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6])); -/// ``` -pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize -where - &'a Iter1: IntoIterator<Item = Elem1>, - &'b Iter2: IntoIterator<Item = Elem2>, - Elem1: PartialEq<Elem2>, -{ - let b_len = b.into_iter().count(); - - let mut cache: Vec<usize> = (1..b_len + 1).collect(); - - let mut result = b_len; - - for (i, a_elem) in a.into_iter().enumerate() { - result = i + 1; - let mut distance_b = i; - - for (j, b_elem) in b.into_iter().enumerate() { - let cost = usize::from(a_elem != b_elem); - let distance_a = distance_b + cost; - distance_b = cache[j]; - result = min(result + 1, min(distance_a, distance_b + 1)); - cache[j] = result; - } - } - - result -} - -/// Calculates the minimum number of insertions, deletions, and substitutions -/// required to change one string into the other. -/// -/// ``` -/// use strsim::levenshtein; -/// -/// assert_eq!(3, levenshtein("kitten", "sitting")); -/// ``` -pub fn levenshtein(a: &str, b: &str) -> usize { - generic_levenshtein(&StringWrapper(a), &StringWrapper(b)) -} - -/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and -/// 1.0 (inclusive), where 1.0 means the strings are the same. -/// -/// ``` -/// use strsim::normalized_levenshtein; -/// -/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001); -/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001); -/// assert!(normalized_levenshtein("", "second").abs() < 0.00001); -/// assert!(normalized_levenshtein("first", "").abs() < 0.00001); -/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); -/// ``` -pub fn normalized_levenshtein(a: &str, b: &str) -> f64 { - if a.is_empty() && b.is_empty() { - return 1.0; - } - 1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) -} - -/// Like Levenshtein but allows for adjacent transpositions. Each substring can -/// only be edited once. -/// -/// ``` -/// use strsim::osa_distance; -/// -/// assert_eq!(3, osa_distance("ab", "bca")); -/// ``` -pub fn osa_distance(a: &str, b: &str) -> usize { - let b_len = b.chars().count(); - // 0..=b_len behaves like 0..b_len.saturating_add(1) which could be a different size - // this leads to significantly worse code gen when swapping the vectors below - let mut prev_two_distances: Vec<usize> = (0..b_len + 1).collect(); - let mut prev_distances: Vec<usize> = (0..b_len + 1).collect(); - let mut curr_distances: Vec<usize> = vec![0; b_len + 1]; - - let mut prev_a_char = char::MAX; - let mut prev_b_char = char::MAX; - - for (i, a_char) in a.chars().enumerate() { - curr_distances[0] = i + 1; - - for (j, b_char) in b.chars().enumerate() { - let cost = usize::from(a_char != b_char); - curr_distances[j + 1] = min( - curr_distances[j] + 1, - min(prev_distances[j + 1] + 1, prev_distances[j] + cost), - ); - if i > 0 && j > 0 && a_char != b_char && a_char == prev_b_char && b_char == prev_a_char - { - curr_distances[j + 1] = min(curr_distances[j + 1], prev_two_distances[j - 1] + 1); - } - - prev_b_char = b_char; - } - - mem::swap(&mut prev_two_distances, &mut prev_distances); - mem::swap(&mut prev_distances, &mut curr_distances); - prev_a_char = a_char; - } - - // access prev_distances instead of curr_distances since we swapped - // them above. In case a is empty this would still contain the correct value - // from initializing the last element to b_len - prev_distances[b_len] -} - -/* Returns the final index for a value in a single vector that represents a fixed -2d grid */ -fn flat_index(i: usize, j: usize, width: usize) -> usize { - j * width + i -} - -/// Like optimal string alignment, but substrings can be edited an unlimited -/// number of times, and the triangle inequality holds. -/// -/// ``` -/// use strsim::generic_damerau_levenshtein; -/// -/// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1])); -/// ``` -pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize -where - Elem: Eq + Hash + Clone, -{ - let a_len = a_elems.len(); - let b_len = b_elems.len(); - - if a_len == 0 { - return b_len; - } - if b_len == 0 { - return a_len; - } - - let width = a_len + 2; - let mut distances = vec![0; (a_len + 2) * (b_len + 2)]; - let max_distance = a_len + b_len; - distances[0] = max_distance; - - for i in 0..(a_len + 1) { - distances[flat_index(i + 1, 0, width)] = max_distance; - distances[flat_index(i + 1, 1, width)] = i; - } - - for j in 0..(b_len + 1) { - distances[flat_index(0, j + 1, width)] = max_distance; - distances[flat_index(1, j + 1, width)] = j; - } - - let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64); - - for i in 1..(a_len + 1) { - let mut db = 0; - - for j in 1..(b_len + 1) { - let k = match elems.get(&b_elems[j - 1]) { - Some(&value) => value, - None => 0, - }; - - let insertion_cost = distances[flat_index(i, j + 1, width)] + 1; - let deletion_cost = distances[flat_index(i + 1, j, width)] + 1; - let transposition_cost = - distances[flat_index(k, db, width)] + (i - k - 1) + 1 + (j - db - 1); - - let mut substitution_cost = distances[flat_index(i, j, width)] + 1; - if a_elems[i - 1] == b_elems[j - 1] { - db = j; - substitution_cost -= 1; - } - - distances[flat_index(i + 1, j + 1, width)] = min( - substitution_cost, - min(insertion_cost, min(deletion_cost, transposition_cost)), - ); - } - - elems.insert(a_elems[i - 1].clone(), i); - } - - distances[flat_index(a_len + 1, b_len + 1, width)] -} - -#[derive(Clone, Copy, PartialEq, Eq)] -struct RowId { - val: isize, -} - -impl Default for RowId { - fn default() -> Self { - Self { val: -1 } - } -} - -#[derive(Default, Clone)] -struct GrowingHashmapMapElemChar<ValueType> { - key: u32, - value: ValueType, -} - -/// specialized hashmap to store user provided types -/// this implementation relies on a couple of base assumptions in order to simplify the implementation -/// - the hashmap does not have an upper limit of included items -/// - the default value for the `ValueType` can be used as a dummy value to indicate an empty cell -/// - elements can't be removed -/// - only allocates memory on first write access. -/// This improves performance for hashmaps that are never written to -struct GrowingHashmapChar<ValueType> { - used: i32, - fill: i32, - mask: i32, - map: Option<Vec<GrowingHashmapMapElemChar<ValueType>>>, -} - -impl<ValueType> Default for GrowingHashmapChar<ValueType> -where - ValueType: Default + Clone + Eq, -{ - fn default() -> Self { - Self { - used: 0, - fill: 0, - mask: -1, - map: None, - } - } -} - -impl<ValueType> GrowingHashmapChar<ValueType> -where - ValueType: Default + Clone + Eq + Copy, -{ - fn get(&self, key: u32) -> ValueType { - self.map - .as_ref() - .map_or_else(|| Default::default(), |map| map[self.lookup(key)].value) - } - - fn get_mut(&mut self, key: u32) -> &mut ValueType { - if self.map.is_none() { - self.allocate(); - } - - let mut i = self.lookup(key); - if self - .map - .as_ref() - .expect("map should have been created above")[i] - .value - == Default::default() - { - self.fill += 1; - // resize when 2/3 full - if self.fill * 3 >= (self.mask + 1) * 2 { - self.grow((self.used + 1) * 2); - i = self.lookup(key); - } - - self.used += 1; - } - - let elem = &mut self - .map - .as_mut() - .expect("map should have been created above")[i]; - elem.key = key; - &mut elem.value - } - - fn allocate(&mut self) { - self.mask = 8 - 1; - self.map = Some(vec![GrowingHashmapMapElemChar::default(); 8]); - } - - /// lookup key inside the hashmap using a similar collision resolution - /// strategy to `CPython` and `Ruby` - fn lookup(&self, key: u32) -> usize { - let hash = key; - let mut i = hash as usize & self.mask as usize; - - let map = self - .map - .as_ref() - .expect("callers have to ensure map is allocated"); - - if map[i].value == Default::default() || map[i].key == key { - return i; - } - - let mut perturb = key; - loop { - i = (i * 5 + perturb as usize + 1) & self.mask as usize; - - if map[i].value == Default::default() || map[i].key == key { - return i; - } - - perturb >>= 5; - } - } - - fn grow(&mut self, min_used: i32) { - let mut new_size = self.mask + 1; - while new_size <= min_used { - new_size <<= 1; - } - - self.fill = self.used; - self.mask = new_size - 1; - - let old_map = std::mem::replace( - self.map - .as_mut() - .expect("callers have to ensure map is allocated"), - vec![GrowingHashmapMapElemChar::<ValueType>::default(); new_size as usize], - ); - - for elem in old_map { - if elem.value != Default::default() { - let j = self.lookup(elem.key); - let new_elem = &mut self.map.as_mut().expect("map created above")[j]; - new_elem.key = elem.key; - new_elem.value = elem.value; - self.used -= 1; - if self.used == 0 { - break; - } - } - } - - self.used = self.fill; - } -} - -struct HybridGrowingHashmapChar<ValueType> { - map: GrowingHashmapChar<ValueType>, - extended_ascii: [ValueType; 256], -} - -impl<ValueType> HybridGrowingHashmapChar<ValueType> -where - ValueType: Default + Clone + Copy + Eq, -{ - fn get(&self, key: char) -> ValueType { - let value = key as u32; - if value <= 255 { - let val_u8 = u8::try_from(value).expect("we check the bounds above"); - self.extended_ascii[usize::from(val_u8)] - } else { - self.map.get(value) - } - } - - fn get_mut(&mut self, key: char) -> &mut ValueType { - let value = key as u32; - if value <= 255 { - let val_u8 = u8::try_from(value).expect("we check the bounds above"); - &mut self.extended_ascii[usize::from(val_u8)] - } else { - self.map.get_mut(value) - } - } -} - -impl<ValueType> Default for HybridGrowingHashmapChar<ValueType> -where - ValueType: Default + Clone + Copy + Eq, -{ - fn default() -> Self { - HybridGrowingHashmapChar { - map: GrowingHashmapChar::default(), - extended_ascii: [Default::default(); 256], - } - } -} - -fn damerau_levenshtein_impl<Iter1, Iter2>(s1: Iter1, len1: usize, s2: Iter2, len2: usize) -> usize -where - Iter1: Iterator<Item = char> + Clone, - Iter2: Iterator<Item = char> + Clone, -{ - // The implementations is based on the paper - // `Linear space string correction algorithm using the Damerau-Levenshtein distance` - // from Chunchun Zhao and Sartaj Sahni - // - // It has a runtime complexity of `O(N*M)` and a memory usage of `O(N+M)`. - let max_val = max(len1, len2) as isize + 1; - - let mut last_row_id = HybridGrowingHashmapChar::<RowId>::default(); - - let size = len2 + 2; - let mut fr = vec![max_val; size]; - let mut r1 = vec![max_val; size]; - let mut r: Vec<isize> = (max_val..max_val + 1) - .chain(0..(size - 1) as isize) - .collect(); - - for (i, ch1) in s1.enumerate().map(|(i, ch1)| (i + 1, ch1)) { - mem::swap(&mut r, &mut r1); - let mut last_col_id: isize = -1; - let mut last_i2l1 = r[1]; - r[1] = i as isize; - let mut t = max_val; - - for (j, ch2) in s2.clone().enumerate().map(|(j, ch2)| (j + 1, ch2)) { - let diag = r1[j] + isize::from(ch1 != ch2); - let left = r[j] + 1; - let up = r1[j + 1] + 1; - let mut temp = min(diag, min(left, up)); - - if ch1 == ch2 { - last_col_id = j as isize; // last occurence of s1_i - fr[j + 1] = r1[j - 1]; // save H_k-1,j-2 - t = last_i2l1; // save H_i-2,l-1 - } else { - let k = last_row_id.get(ch2).val; - let l = last_col_id; - - if j as isize - l == 1 { - let transpose = fr[j + 1] + (i as isize - k); - temp = min(temp, transpose); - } else if i as isize - k == 1 { - let transpose = t + (j as isize - l); - temp = min(temp, transpose); - } - } - - last_i2l1 = r[j + 1]; - r[j + 1] = temp; - } - last_row_id.get_mut(ch1).val = i as isize; - } - - r[len2 + 1] as usize -} - -/// Like optimal string alignment, but substrings can be edited an unlimited -/// number of times, and the triangle inequality holds. -/// -/// ``` -/// use strsim::damerau_levenshtein; -/// -/// assert_eq!(2, damerau_levenshtein("ab", "bca")); -/// ``` -pub fn damerau_levenshtein(a: &str, b: &str) -> usize { - damerau_levenshtein_impl(a.chars(), a.chars().count(), b.chars(), b.chars().count()) -} - -/// Calculates a normalized score of the Damerau–Levenshtein algorithm between -/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same. -/// -/// ``` -/// use strsim::normalized_damerau_levenshtein; -/// -/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001); -/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001); -/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001); -/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001); -/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001); -/// ``` -pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 { - if a.is_empty() && b.is_empty() { - return 1.0; - } - - let len1 = a.chars().count(); - let len2 = b.chars().count(); - let dist = damerau_levenshtein_impl(a.chars(), len1, b.chars(), len2); - 1.0 - (dist as f64) / (max(len1, len2) as f64) -} - -/// Returns an Iterator of char tuples. -fn bigrams(s: &str) -> impl Iterator<Item = (char, char)> + '_ { - s.chars().zip(s.chars().skip(1)) -} - -/// Calculates a Sørensen-Dice similarity distance using bigrams. -/// See <https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient>. -/// -/// ``` -/// use strsim::sorensen_dice; -/// -/// assert_eq!(1.0, sorensen_dice("", "")); -/// assert_eq!(0.0, sorensen_dice("", "a")); -/// assert_eq!(0.0, sorensen_dice("french", "quebec")); -/// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); -/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris")); -/// ``` -pub fn sorensen_dice(a: &str, b: &str) -> f64 { - // implementation guided by - // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6 - - let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect(); - let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect(); - - if a == b { - return 1.0; - } - - if a.len() < 2 || b.len() < 2 { - return 0.0; - } - - let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new(); - - for bigram in bigrams(&a) { - *a_bigrams.entry(bigram).or_insert(0) += 1; - } - - let mut intersection_size = 0_usize; - - for bigram in bigrams(&b) { - a_bigrams.entry(bigram).and_modify(|bi| { - if *bi > 0 { - *bi -= 1; - intersection_size += 1; - } - }); - } - - (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64 -} - -#[cfg(test)] -mod tests { - use super::*; - - macro_rules! assert_delta { - ($x:expr, $y:expr) => { - assert_delta!($x, $y, 1e-5); - }; - ($x:expr, $y:expr, $d:expr) => { - if ($x - $y).abs() > $d { - panic!( - "assertion failed: actual: `{}`, expected: `{}`: \ - actual not within < {} of expected", - $x, $y, $d - ); - } - }; - } - - #[test] - fn bigrams_iterator() { - let mut bi = bigrams("abcde"); - - assert_eq!(Some(('a', 'b')), bi.next()); - assert_eq!(Some(('b', 'c')), bi.next()); - assert_eq!(Some(('c', 'd')), bi.next()); - assert_eq!(Some(('d', 'e')), bi.next()); - assert_eq!(None, bi.next()); - } - - fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) { - assert_eq!(Ok(dist), hamming(str1, str2)); - } - - #[test] - fn hamming_empty() { - assert_hamming_dist(0, "", "") - } - - #[test] - fn hamming_same() { - assert_hamming_dist(0, "hamming", "hamming") - } - - #[test] - fn hamming_numbers() { - assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3])); - } - - #[test] - fn hamming_diff() { - assert_hamming_dist(3, "hamming", "hammers") - } - - #[test] - fn hamming_diff_multibyte() { - assert_hamming_dist(2, "hamming", "h香mmüng"); - } - - #[test] - fn hamming_unequal_length() { - assert_eq!( - Err(StrSimError::DifferentLengthArgs), - generic_hamming("ham".chars(), "hamming".chars()) - ); - } - - #[test] - fn hamming_names() { - assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre") - } - - #[test] - fn jaro_both_empty() { - assert_eq!(1.0, jaro("", "")); - } - - #[test] - fn jaro_first_empty() { - assert_eq!(0.0, jaro("", "jaro")); - } - - #[test] - fn jaro_second_empty() { - assert_eq!(0.0, jaro("distance", "")); - } - - #[test] - fn jaro_same() { - assert_eq!(1.0, jaro("jaro", "jaro")); - } - - #[test] - fn jaro_multibyte() { - assert_delta!(0.818, jaro("testabctest", "testöঙ香test"), 0.001); - assert_delta!(0.818, jaro("testöঙ香test", "testabctest"), 0.001); - } - - #[test] - fn jaro_diff_short() { - assert_delta!(0.767, jaro("dixon", "dicksonx"), 0.001); - } - - #[test] - fn jaro_diff_one_character() { - assert_eq!(0.0, jaro("a", "b")); - } - - #[test] - fn jaro_same_one_character() { - assert_eq!(1.0, jaro("a", "a")); - } - - #[test] - fn generic_jaro_diff() { - assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4])); - } - - #[test] - fn jaro_diff_one_and_two() { - assert_delta!(0.83, jaro("a", "ab"), 0.01); - } - - #[test] - fn jaro_diff_two_and_one() { - assert_delta!(0.83, jaro("ab", "a"), 0.01); - } - - #[test] - fn jaro_diff_no_transposition() { - assert_delta!(0.822, jaro("dwayne", "duane"), 0.001); - } - - #[test] - fn jaro_diff_with_transposition() { - assert_delta!(0.944, jaro("martha", "marhta"), 0.001); - assert_delta!(0.6, jaro("a jke", "jane a k"), 0.001); - } - - #[test] - fn jaro_names() { - assert_delta!( - 0.392, - jaro("Friedrich Nietzsche", "Jean-Paul Sartre"), - 0.001 - ); - } - - #[test] - fn jaro_winkler_both_empty() { - assert_eq!(1.0, jaro_winkler("", "")); - } - - #[test] - fn jaro_winkler_first_empty() { - assert_eq!(0.0, jaro_winkler("", "jaro-winkler")); - } - - #[test] - fn jaro_winkler_second_empty() { - assert_eq!(0.0, jaro_winkler("distance", "")); - } - - #[test] - fn jaro_winkler_same() { - assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler")); - } - - #[test] - fn jaro_winkler_multibyte() { - assert_delta!(0.89, jaro_winkler("testabctest", "testöঙ香test"), 0.001); - assert_delta!(0.89, jaro_winkler("testöঙ香test", "testabctest"), 0.001); - } - - #[test] - fn jaro_winkler_diff_short() { - assert_delta!(0.813, jaro_winkler("dixon", "dicksonx"), 0.001); - assert_delta!(0.813, jaro_winkler("dicksonx", "dixon"), 0.001); - } - - #[test] - fn jaro_winkler_diff_one_character() { - assert_eq!(0.0, jaro_winkler("a", "b")); - } - - #[test] - fn jaro_winkler_same_one_character() { - assert_eq!(1.0, jaro_winkler("a", "a")); - } - - #[test] - fn jaro_winkler_diff_no_transposition() { - assert_delta!(0.84, jaro_winkler("dwayne", "duane"), 0.001); - } - - #[test] - fn jaro_winkler_diff_with_transposition() { - assert_delta!(0.961, jaro_winkler("martha", "marhta"), 0.001); - assert_delta!(0.6, jaro_winkler("a jke", "jane a k"), 0.001); - } - - #[test] - fn jaro_winkler_names() { - assert_delta!( - 0.452, - jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"), - 0.001 - ); - } - - #[test] - fn jaro_winkler_long_prefix() { - assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001); - } - - #[test] - fn jaro_winkler_more_names() { - assert_delta!(0.868, jaro_winkler("Thorkel", "Thorgier"), 0.001); - } - - #[test] - fn jaro_winkler_length_of_one() { - assert_delta!(0.738, jaro_winkler("Dinsdale", "D"), 0.001); - } - - #[test] - fn jaro_winkler_very_long_prefix() { - assert_delta!( - 0.98519, - jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery") - ); - } - - #[test] - fn levenshtein_empty() { - assert_eq!(0, levenshtein("", "")); - } - - #[test] - fn levenshtein_same() { - assert_eq!(0, levenshtein("levenshtein", "levenshtein")); - } - - #[test] - fn levenshtein_diff_short() { - assert_eq!(3, levenshtein("kitten", "sitting")); - } - - #[test] - fn levenshtein_diff_with_space() { - assert_eq!(5, levenshtein("hello, world", "bye, world")); - } - - #[test] - fn levenshtein_diff_multibyte() { - assert_eq!(3, levenshtein("öঙ香", "abc")); - assert_eq!(3, levenshtein("abc", "öঙ香")); - } - - #[test] - fn levenshtein_diff_longer() { - let a = "The quick brown fox jumped over the angry dog."; - let b = "Lorem ipsum dolor sit amet, dicta latine an eam."; - assert_eq!(37, levenshtein(a, b)); - } - - #[test] - fn levenshtein_first_empty() { - assert_eq!(7, levenshtein("", "sitting")); - } - - #[test] - fn levenshtein_second_empty() { - assert_eq!(6, levenshtein("kitten", "")); - } - - #[test] - fn normalized_levenshtein_diff_short() { - assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting")); - } - - #[test] - fn normalized_levenshtein_for_empty_strings() { - assert_delta!(1.0, normalized_levenshtein("", "")); - } - - #[test] - fn normalized_levenshtein_first_empty() { - assert_delta!(0.0, normalized_levenshtein("", "second")); - } - - #[test] - fn normalized_levenshtein_second_empty() { - assert_delta!(0.0, normalized_levenshtein("first", "")); - } - - #[test] - fn normalized_levenshtein_identical_strings() { - assert_delta!(1.0, normalized_levenshtein("identical", "identical")); - } - - #[test] - fn osa_distance_empty() { - assert_eq!(0, osa_distance("", "")); - } - - #[test] - fn osa_distance_same() { - assert_eq!(0, osa_distance("damerau", "damerau")); - } - - #[test] - fn osa_distance_first_empty() { - assert_eq!(7, osa_distance("", "damerau")); - } - - #[test] - fn osa_distance_second_empty() { - assert_eq!(7, osa_distance("damerau", "")); - } - - #[test] - fn osa_distance_diff() { - assert_eq!(3, osa_distance("ca", "abc")); - } - - #[test] - fn osa_distance_diff_short() { - assert_eq!(3, osa_distance("damerau", "aderua")); - } - - #[test] - fn osa_distance_diff_reversed() { - assert_eq!(3, osa_distance("aderua", "damerau")); - } - - #[test] - fn osa_distance_diff_multibyte() { - assert_eq!(3, osa_distance("öঙ香", "abc")); - assert_eq!(3, osa_distance("abc", "öঙ香")); - } - - #[test] - fn osa_distance_diff_unequal_length() { - assert_eq!(6, osa_distance("damerau", "aderuaxyz")); - } - - #[test] - fn osa_distance_diff_unequal_length_reversed() { - assert_eq!(6, osa_distance("aderuaxyz", "damerau")); - } - - #[test] - fn osa_distance_diff_comedians() { - assert_eq!(5, osa_distance("Stewart", "Colbert")); - } - - #[test] - fn osa_distance_many_transpositions() { - assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk")); - } - - #[test] - fn osa_distance_diff_longer() { - let a = "The quick brown fox jumped over the angry dog."; - let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; - assert_eq!(36, osa_distance(a, b)); - } - - #[test] - fn osa_distance_beginning_transposition() { - assert_eq!(1, osa_distance("foobar", "ofobar")); - } - - #[test] - fn osa_distance_end_transposition() { - assert_eq!(1, osa_distance("specter", "spectre")); - } - - #[test] - fn osa_distance_restricted_edit() { - assert_eq!(4, osa_distance("a cat", "an abct")); - } - - #[test] - fn damerau_levenshtein_empty() { - assert_eq!(0, damerau_levenshtein("", "")); - } - - #[test] - fn damerau_levenshtein_same() { - assert_eq!(0, damerau_levenshtein("damerau", "damerau")); - } - - #[test] - fn damerau_levenshtein_first_empty() { - assert_eq!(7, damerau_levenshtein("", "damerau")); - } - - #[test] - fn damerau_levenshtein_second_empty() { - assert_eq!(7, damerau_levenshtein("damerau", "")); - } - - #[test] - fn damerau_levenshtein_diff() { - assert_eq!(2, damerau_levenshtein("ca", "abc")); - } - - #[test] - fn damerau_levenshtein_diff_short() { - assert_eq!(3, damerau_levenshtein("damerau", "aderua")); - } - - #[test] - fn damerau_levenshtein_diff_reversed() { - assert_eq!(3, damerau_levenshtein("aderua", "damerau")); - } - - #[test] - fn damerau_levenshtein_diff_multibyte() { - assert_eq!(3, damerau_levenshtein("öঙ香", "abc")); - assert_eq!(3, damerau_levenshtein("abc", "öঙ香")); - } - - #[test] - fn damerau_levenshtein_diff_unequal_length() { - assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz")); - } - - #[test] - fn damerau_levenshtein_diff_unequal_length_reversed() { - assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau")); - } - - #[test] - fn damerau_levenshtein_diff_comedians() { - assert_eq!(5, damerau_levenshtein("Stewart", "Colbert")); - } - - #[test] - fn damerau_levenshtein_many_transpositions() { - assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk")); - } - - #[test] - fn damerau_levenshtein_diff_longer() { - let a = "The quick brown fox jumped over the angry dog."; - let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; - assert_eq!(36, damerau_levenshtein(a, b)); - } - - #[test] - fn damerau_levenshtein_beginning_transposition() { - assert_eq!(1, damerau_levenshtein("foobar", "ofobar")); - } - - #[test] - fn damerau_levenshtein_end_transposition() { - assert_eq!(1, damerau_levenshtein("specter", "spectre")); - } - - #[test] - fn damerau_levenshtein_unrestricted_edit() { - assert_eq!(3, damerau_levenshtein("a cat", "an abct")); - } - - #[test] - fn normalized_damerau_levenshtein_diff_short() { - assert_delta!( - 0.27272, - normalized_damerau_levenshtein("levenshtein", "löwenbräu") - ); - } - - #[test] - fn normalized_damerau_levenshtein_for_empty_strings() { - assert_delta!(1.0, normalized_damerau_levenshtein("", "")); - } - - #[test] - fn normalized_damerau_levenshtein_first_empty() { - assert_delta!(0.0, normalized_damerau_levenshtein("", "flower")); - } - - #[test] - fn normalized_damerau_levenshtein_second_empty() { - assert_delta!(0.0, normalized_damerau_levenshtein("tree", "")); - } - - #[test] - fn normalized_damerau_levenshtein_identical_strings() { - assert_delta!( - 1.0, - normalized_damerau_levenshtein("sunglasses", "sunglasses") - ); - } - - #[test] - fn sorensen_dice_all() { - // test cases taken from - // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11 - - assert_delta!(1.0, sorensen_dice("a", "a")); - assert_delta!(0.0, sorensen_dice("a", "b")); - assert_delta!(1.0, sorensen_dice("", "")); - assert_delta!(0.0, sorensen_dice("a", "")); - assert_delta!(0.0, sorensen_dice("", "a")); - assert_delta!(1.0, sorensen_dice("apple event", "apple event")); - assert_delta!(0.90909, sorensen_dice("iphone", "iphone x")); - assert_delta!(0.0, sorensen_dice("french", "quebec")); - assert_delta!(1.0, sorensen_dice("france", "france")); - assert_delta!(0.2, sorensen_dice("fRaNce", "france")); - assert_delta!(0.8, sorensen_dice("healed", "sealed")); - assert_delta!( - 0.78788, - sorensen_dice("web applications", "applications of the web") - ); - assert_delta!( - 0.92, - sorensen_dice( - "this will have a typo somewhere", - "this will huve a typo somewhere" - ) - ); - assert_delta!( - 0.60606, - sorensen_dice( - "Olive-green table for sale, in extremely good condition.", - "For sale: table in very good condition, olive green in colour." - ) - ); - assert_delta!( - 0.25581, - sorensen_dice( - "Olive-green table for sale, in extremely good condition.", - "For sale: green Subaru Impreza, 210,000 miles" - ) - ); - assert_delta!( - 0.14118, - sorensen_dice( - "Olive-green table for sale, in extremely good condition.", - "Wanted: mountain bike with at least 21 gears." - ) - ); - assert_delta!( - 0.77419, - sorensen_dice("this has one extra word", "this has one word") - ); - } -} diff --git a/vendor/strsim/tests/lib.rs b/vendor/strsim/tests/lib.rs deleted file mode 100644 index 991fc6f..0000000 --- a/vendor/strsim/tests/lib.rs +++ /dev/null @@ -1,71 +0,0 @@ -extern crate strsim; - -use strsim::{ - damerau_levenshtein, hamming, jaro, jaro_winkler, levenshtein, normalized_damerau_levenshtein, - normalized_levenshtein, osa_distance, -}; - -macro_rules! assert_delta { - ($x:expr, $y:expr) => { - assert_delta!($x, $y, 1e-5); - }; - ($x:expr, $y:expr, $d:expr) => { - if ($x - $y).abs() > $d { - panic!( - "assertion failed: actual: `{}`, expected: `{}`: \ - actual not within < {} of expected", - $x, $y, $d - ); - } - }; -} - -#[test] -fn hamming_works() { - match hamming("hamming", "hammers") { - Ok(distance) => assert_eq!(3, distance), - Err(why) => panic!("{:?}", why), - } -} - -#[test] -fn levenshtein_works() { - assert_eq!(3, levenshtein("kitten", "sitting")); -} - -#[test] -fn normalized_levenshtein_works() { - assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting")); -} - -#[test] -fn osa_distance_works() { - assert_eq!(3, osa_distance("ac", "cba")); -} - -#[test] -fn damerau_levenshtein_works() { - assert_eq!(2, damerau_levenshtein("ac", "cba")); -} - -#[test] -fn normalized_damerau_levenshtein_works() { - assert_delta!( - 0.27272, - normalized_damerau_levenshtein("levenshtein", "löwenbräu") - ); -} - -#[test] -fn jaro_works() { - assert_delta!( - 0.392, - jaro("Friedrich Nietzsche", "Jean-Paul Sartre"), - 0.001 - ); -} - -#[test] -fn jaro_winkler_works() { - assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001); -} |