Решение на Spell Checker от Илиян Драгнев

Обратно към всички решения

Към профила на Илиян Драгнев

Резултати

  • 17 точки от тестове
  • 0 бонус точки
  • 17 точки общо
  • 13 успешни тест(а)
  • 2 неуспешни тест(а)

Код

use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
pub const ALPHABET_EN: &'static str = "abcdefghijklmnopqrstuvwxyz";
pub const ALPHABET_BG: &'static str = "абвгдежзийклмнопрстуфхцчшщъьюя";
pub fn clean_line(input: &str) -> String {
input
.trim()
.chars()
.filter(|&a| is_valid_symbol(a))
.collect()
}
fn is_valid_symbol(c: char) -> bool {
c == '-' ||
c == '\'' ||
c.is_alphabetic() ||
c.is_whitespace()
}
pub struct WordCounter {
words_map: HashMap<String, u32>,
}
impl WordCounter {
pub fn new() -> Self {
WordCounter {
words_map: HashMap::new(),
}
}
pub fn from_str(input: &str) -> Self {
let mut counter = Self::new();
for word in input.lines()
.map(|line| crate::clean_line(line))
.flat_map(|line| to_words(&line))
{
counter.add(&word);
}
counter
}
pub fn add(&mut self, item: &str) {
let word = item.trim().to_lowercase();
let count = self.words_map.entry(word).or_insert(0);
*count += 1;
}
pub fn words(&self) -> Vec<&String> {
let mut words = self.words_map.keys().collect::<Vec<&String>>();
words.sort_unstable_by(|&a, &b| a.cmp(b));
words
}
pub fn get(&self, word: &str) -> u32 {
*self.words_map.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.words_map.values().sum()
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
let _ = writeln!(f, "WordCounter, total count: {}", self.total_count())?;
let mut pairs = self.words_map.iter().collect::<Vec<(&String, &u32)>>();
pairs.sort_unstable_by(|(_, x), (_, y)| y.cmp(&x));
for (word, count) in &pairs {
let _ = writeln!(f, "{}: {}", word, count)?;
}
Ok(())
}
}
fn to_words(line: &str) -> Vec<String> {
line
.split_whitespace()
.map(|word| word.to_owned())
.collect()
}
pub struct SpellChecker {
corpus: WordCounter,
alphabet: String,
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
SpellChecker {
corpus: WordCounter::from_str(corpus),
alphabet: alphabet.to_owned(),
}
}
pub fn correction(&self, word: &str) -> String {
self.candidates(word)
.into_iter()
.max_by(|a, b| self.probability(a).partial_cmp(&self.probability(b)).unwrap())
.expect("candidates returned empty range")
}
pub fn probability(&self, word: &str) -> f64 {
if self.corpus.total_count() > 0 {
self.corpus.get(word) as f64 / self.corpus.total_count() as f64
}
else {
0.0
}
}
pub fn candidates(&self, word: &str) -> Vec<String> {
let known_words = |edits| -> Option<Vec<String>> {
let words = self.known(&edits);
if !words.is_empty() {
Some(words.iter().map(|&s| s.to_owned()).collect())
}
else { None }
};
let edits = [word].iter().map(|s| s.to_string()).collect();
known_words(edits)
.or_else(|| known_words(self.edits1(word)))
.or_else(|| known_words(self.edits2(word)))
.unwrap_or_else(|| vec![word.to_owned()])
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
words
.iter()
.filter(|word| self.corpus.get(word) > 0)
.collect()
}
pub fn edits1(&self, word: &str) -> HashSet<String> {
use std::iter::FromIterator;
let splits = word
.char_indices()
.map(|(i, _)| (&word[..i], &word[i..]))
.chain([(word, "")].iter().copied())
.collect::<Vec<(&str, &str)>>();
let deletes = Self::single_deletes(&splits);
let inserts = self.single_inserts(&splits);
let replaces = self.single_replaces(&splits);
let transposes = Self::adjacent_transposes(&splits);
HashSet::from_iter(
deletes
.into_iter()
.chain(inserts.into_iter())
.chain(replaces.into_iter())
.chain(transposes.into_iter())
)
}
fn single_deletes(splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| !right.is_empty())
.map(|(left, right)| {
format!("{}{}", left, drop_leading_chars(1, right))
})
.collect()
}
fn adjacent_transposes(splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| right.chars().count() > 1)
.map(|(left, right)| {
let r = drop_leading_chars(2, right);
let right_nth = |i| right.chars().nth(i).unwrap();
format!("{}{}{}{}", left, right_nth(1), right_nth(0), r)
})
.collect()
}
fn single_replaces(&self, splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| !right.is_empty())
.flat_map(|(left, right)| {
self.alphabet.chars().map(move |c| {
format!("{}{}{}", left, c, drop_leading_chars(1, right))
})
})
.collect()
}
fn single_inserts(&self, splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.flat_map(|(left, right)| {
self.alphabet.chars().map(move |c| {
format!("{}{}{}", left, c, right)
})
})
.collect()
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
self.edits1(word)
.into_iter()
.flat_map(|e1| self.edits1(&e1))
.collect()
}
}
fn drop_leading_chars(n: usize, s: &str) -> &str {
s
.char_indices()
.skip(n)
.next()
.map(|(i, _)| &s[i..])
.unwrap_or("")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn clean_line_with_already_cleaned_line() {
let line = "i'm a clean-mf-line";
assert_eq!(line, clean_line(line));
}
#[test]
fn clean_line_removes_leading_and_trailing_spaces() {
let line = " abc \n";
assert_eq!(clean_line(line), "abc");
}
#[test]
fn clean_line_with_characters_to_remove() {
let line = "abc-1 @#";
assert_eq!(clean_line(line), "abc- ");
}
#[test]
fn default_counter_has_no_words() {
let counter = WordCounter::new();
assert!(counter.words().is_empty());
assert!(counter.total_count() == 0);
assert!(counter.get("random") == 0);
}
#[test]
fn counter_from_string() {
let text = "first line\nSecond LiNe\n THIRD LINE\n";
let expected_words = vec!["first", "line", "second", "third"];
let counter = WordCounter::from_str(text);
assert_eq!(counter.words(), expected_words);
assert_eq!(counter.total_count(), 6);
assert_eq!(counter.get("line"), 3);
assert_eq!(counter.get("first"), 1);
assert_eq!(counter.get("second"), 1);
assert_eq!(counter.get("third"), 1);
assert_eq!(counter.get("not-contained"), 0);
}
#[test]
fn add() {
let mut counter = WordCounter::new();
for word in ["word", "Word", " word ", " WORD "].iter() {
counter.add(word);
}
assert_eq!(counter.get("word"), 4);
}
#[test]
fn edits1_with_empty_alphabet() {
let checker = SpellChecker::new("", "");
let word = "ab";
let expected_words = as_set(&["a", "b", "ba"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits1_with_nonempty_en_alphabet() {
let checker = SpellChecker::new("", "c");
let word = "ab";
let expected_words = as_set(&["cb", "b", "acb", "abc", "ba", "a", "cab", "ac"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits1_with_nonempty_bg_alphabet() {
let checker = SpellChecker::new("", "з");
let word = "ей";
let expected_words = as_set(&["ез", "езй", "й", "зй", "ейз", "зей", "е", "йе"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits2_with_empty_alphabet() {
let checker = SpellChecker::new("", "");
let word = "ab";
let expected_words = as_set(&["", "ab", "a", "b"]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn edits2_with_nonempty_en_alphabet() {
let checker = SpellChecker::new("", "c");
let word = "ab";
let expected_words = as_set(&[
"", "a", "cb", "cac", "bc", "cba", "acbc", "cab", "ac",
"acc", "abcc", "ab", "c", "accb", "cbc", "ca", "cc", "cacb",
"ccb", "acb", "abc", "cabc", "bca", "ccab", "b", "bac",
]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn edits2_with_nonempty_bg_alphabet() {
let checker = SpellChecker::new("", "з");
let word = "ей";
let expected_words = as_set(&[
"", "зез", "езйз", "ейз", "з", "зей", "зз", "зйе", "ез",
"езй", "йез", "зейз", "ейзз", "еззй", "ззй", "зй", "зе",
"йзе", "зезй", "е", "ззей", "ей", "йз", "езз", "й", "зйз",
]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn known_words_with_empty_corpus() {
let checker = SpellChecker::new("", ALPHABET_EN);
let words = as_set(&["a", "b"]);
let known_words = checker.known(&words);
assert!(known_words.is_empty());
}
#[test]
fn known_words_with_nonempty_corpus_and_words_which_are_not_in_the_corpus() {
let checker = SpellChecker::new("one two three изненада", ALPHABET_EN);
let words = as_set(&["a", "й"]);
let known_words = checker.known(&words);
assert!(known_words.is_empty());
}
#[test]
fn known_words_with_nonempty_corpus_and_words_which_are_in_the_corpus() {
let checker = SpellChecker::new("one two three изненада", ALPHABET_EN);
let words = as_set(&["a", "b", "изненада"]);
let expected_word = "изненада".to_owned();
let known_words = checker.known(&words);
assert_eq!(known_words.len(), 1);
assert!(known_words.contains(&&expected_word));
}
fn as_set(words: &[&str]) -> HashSet<String> {
words.iter().map(|&s| s.to_owned()).collect()
}
#[test]
fn candidates_with_very_different_word_from_ones_on_corpus() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "hamlet".to_owned();
let candidates = checker.candidates(&word);
assert_eq!(candidates.len(), 1);
assert!(candidates.contains(&word));
}
#[test]
fn candidates_with_one_letter_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "ide".to_owned();
let expected = "ice".to_owned();
let candidates = checker.candidates(&word);
assert_eq!(candidates.len(), 1);
assert!(candidates.contains(&expected));
}
#[test]
fn candidates_with_two_letters_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "idde".to_owned();
let candidates = checker.candidates(&word);
assert_eq!(candidates.len(), 3);
assert!(candidates.contains(&"dice".to_owned()));
assert!(candidates.contains(&"ice".to_owned()));
assert!(candidates.contains(&"isle".to_owned()));
}
#[test]
fn correction_with_very_different_word_from_ones_on_corpus() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "hamlet".to_owned();
let correction = checker.correction(&word);
assert_eq!(correction, word);
}
#[test]
fn correction_with_one_letter_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "ide".to_owned();
let expected = "ice".to_owned();
let correction = checker.correction(&word);
assert_eq!(correction, expected);
}
#[test]
fn correction_with_two_letters_difference() {
let checker = SpellChecker::new("ice ice isle spie crie mic", ALPHABET_EN);
let word = "idde";
let expected = "ice";
let correction = checker.correction(&word);
assert_eq!(correction, expected);
}
}

Лог от изпълнението

Compiling solution v0.1.0 (/tmp/d20200114-2173579-1kn5h0n/solution)
    Finished test [unoptimized + debuginfo] target(s) in 8.26s
     Running target/debug/deps/solution-a73e64ec87929bd0

running 21 tests
test tests::add ... ok
test tests::candidates_with_one_letter_difference ... ok
test tests::candidates_with_two_letters_difference ... ok
test tests::candidates_with_very_different_word_from_ones_on_corpus ... ok
test tests::clean_line_removes_leading_and_trailing_spaces ... ok
test tests::clean_line_with_already_cleaned_line ... ok
test tests::clean_line_with_characters_to_remove ... ok
test tests::correction_with_one_letter_difference ... ok
test tests::correction_with_two_letters_difference ... ok
test tests::correction_with_very_different_word_from_ones_on_corpus ... ok
test tests::counter_from_string ... ok
test tests::default_counter_has_no_words ... ok
test tests::edits1_with_empty_alphabet ... ok
test tests::edits1_with_nonempty_bg_alphabet ... ok
test tests::edits1_with_nonempty_en_alphabet ... ok
test tests::edits2_with_empty_alphabet ... ok
test tests::edits2_with_nonempty_bg_alphabet ... ok
test tests::edits2_with_nonempty_en_alphabet ... ok
test tests::known_words_with_empty_corpus ... ok
test tests::known_words_with_nonempty_corpus_and_words_which_are_in_the_corpus ... ok
test tests::known_words_with_nonempty_corpus_and_words_which_are_not_in_the_corpus ... ok

test result: ok. 21 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out

     Running target/debug/deps/solution_test-38971695424b36d5

running 15 tests
test solution_test::test_best_word_is_returned ... ok
test solution_test::test_clean_line_removes_punctuation ... ok
test solution_test::test_clean_line_trims_the_input ... ok
test solution_test::test_correction ... ok
test solution_test::test_correction_fails_to_produce_new_result ... FAILED
test solution_test::test_correction_normalizes_case ... FAILED
test solution_test::test_counting ... ok
test solution_test::test_display ... ok
test solution_test::test_edits1 ... ok
test solution_test::test_edits2 ... ok
test solution_test::test_empty_counter ... ok
test solution_test::test_from_empty_str ... ok
test solution_test::test_from_str ... ok
test solution_test::test_known_words ... ok
test solution_test::test_probability ... ok

failures:

---- solution_test::test_correction_fails_to_produce_new_result stdout ----
thread 'main' panicked at 'assertion failed: `(left == right)`
  left: `"Либоф"`,
 right: `"либоф"`', tests/solution_test.rs:198:5
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace.

---- solution_test::test_correction_normalizes_case stdout ----
thread 'main' panicked at 'assertion failed: `(left == right)`
  left: `"Либоф"`,
 right: `"любов"`', tests/solution_test.rs:188:5


failures:
    solution_test::test_correction_fails_to_produce_new_result
    solution_test::test_correction_normalizes_case

test result: FAILED. 13 passed; 2 failed; 0 ignored; 0 measured; 0 filtered out

error: test failed, to rerun pass '--test solution_test'

История (3 версии и 2 коментара)

Илиян качи първо решение на 13.01.2020 23:04 (преди над 5 години)

Илиян качи решение на 14.01.2020 09:48 (преди над 5 години)

use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
pub const ALPHABET_EN: &'static str = "abcdefghijklmnopqrstuvwxyz";
pub const ALPHABET_BG: &'static str = "абвгдежзийклмнопрстуфхцчшщъьюя";
pub fn clean_line(input: &str) -> String {
input
.trim()
.chars()
.filter(|&a| is_valid_symbol(a))
.collect()
}
fn is_valid_symbol(c: char) -> bool {
c == '-' ||
c == '\'' ||
c.is_alphabetic() ||
c.is_whitespace()
}
pub struct WordCounter {
words_map: HashMap<String, u32>,
}
impl WordCounter {
pub fn new() -> Self {
WordCounter {
words_map: HashMap::new(),
}
}
pub fn from_str(input: &str) -> Self {
let mut counter = Self::new();
for word in input.lines()
.map(|line| crate::clean_line(line))
.flat_map(|line| to_words(&line))
{
counter.add(&word);
}
counter
}
pub fn add(&mut self, item: &str) {
let word = item.trim().to_lowercase();
let count = self.words_map.entry(word).or_insert(0);
*count += 1;
}
pub fn words(&self) -> Vec<&String> {
let mut words = self.words_map.keys().collect::<Vec<&String>>();
words.sort_unstable_by(|&a, &b| a.cmp(b));
words
}
pub fn get(&self, word: &str) -> u32 {
*self.words_map.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.words_map.values().sum()
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
let _ = writeln!(f, "WordCounter, total count: {}", self.total_count())?;
let mut pairs = self.words_map.iter().collect::<Vec<(&String, &u32)>>();
pairs.sort_unstable_by(|(_, x), (_, y)| y.cmp(&x));
for (word, count) in &pairs {
let _ = writeln!(f, "{}: {}", word, count)?;
}
Ok(())
}
}
fn to_words(line: &str) -> Vec<String> {
line
.split_whitespace()
.map(|word| word.to_owned())
.collect()
}
pub struct SpellChecker {
corpus: WordCounter,
alphabet: String,
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
SpellChecker {
corpus: WordCounter::from_str(corpus),
alphabet: alphabet.to_owned(),
}
}
pub fn correction(&self, word: &str) -> String {
self.candidates(word)
.into_iter()
.max_by(|a, b| self.probability(a).partial_cmp(&self.probability(b)).unwrap())
.expect("candidates returned empty range")
}
pub fn probability(&self, word: &str) -> f64 {
if self.corpus.total_count() > 0 {
self.corpus.get(word) as f64 / self.corpus.total_count() as f64
}
else {
0.0
}
}
pub fn candidates(&self, word: &str) -> Vec<String> {
let known_words = |edits| {
let words = self.known(&edits);
if !words.is_empty() {
let mut vec = words.iter()
.map(|&s| s.to_owned())
.collect::<Vec<String>>();
vec.sort_unstable_by(|a, b| a.cmp(b));
Some(vec)
}
else { None }
};
let edits = [word].iter().map(|s| s.to_string()).collect();
known_words(edits)
.or_else(|| known_words(self.edits1(word)))
.or_else(|| known_words(self.edits2(word)))
.unwrap_or_else(|| vec![word.to_owned()])
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
words
.iter()
.filter(|word| self.corpus.get(word) > 0)
.collect()
}
pub fn edits1(&self, word: &str) -> HashSet<String> {
use std::iter::FromIterator;
let splits = word
.char_indices()
.map(|(i, _)| (&word[..i], &word[i..]))
.chain([(word, "")].iter().copied())
.collect::<Vec<(&str, &str)>>();
let deletes = Self::single_deletes(&splits);
let inserts = self.single_inserts(&splits);
let replaces = self.single_replaces(&splits);
let transposes = Self::adjacent_transposes(&splits);
HashSet::from_iter(
deletes
.into_iter()
.chain(inserts.into_iter())
.chain(replaces.into_iter())
.chain(transposes.into_iter())
)
}
fn single_deletes(splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| !right.is_empty())
.map(|(left, right)| {
format!("{}{}", left, drop_leading_chars(1, right))
})
.collect()
}
fn adjacent_transposes(splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| right.chars().count() > 1)
.map(|(left, right)| {
let r = drop_leading_chars(2, right);
let right_nth = |i| right.chars().nth(i).unwrap();
format!("{}{}{}{}", left, right_nth(1), right_nth(0), r)
})
.collect()
}
fn single_replaces(&self, splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| !right.is_empty())
.flat_map(|(left, right)| {
self.alphabet.chars().map(move |c| {
format!("{}{}{}", left, c, drop_leading_chars(1, right))
})
})
.collect()
}
fn single_inserts(&self, splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.flat_map(|(left, right)| {
self.alphabet.chars().map(move |c| {
format!("{}{}{}", left, c, right)
})
})
.collect()
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
self.edits1(word)
.into_iter()
.flat_map(|e1| self.edits1(&e1))
.collect()
}
}
fn drop_leading_chars(n: usize, s: &str) -> &str {
s
.char_indices()
.skip(n)
.next()
.map(|(i, _)| &s[i..])
.unwrap_or("")
}
-fn main() {
-}
-
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn clean_line_with_already_cleaned_line() {
let line = "i'm a clean-mf-line";
assert_eq!(line, clean_line(line));
}
#[test]
fn clean_line_removes_leading_and_trailing_spaces() {
let line = " abc \n";
assert_eq!(clean_line(line), "abc");
}
#[test]
fn clean_line_with_characters_to_remove() {
let line = "abc-1 @#";
assert_eq!(clean_line(line), "abc- ");
}
#[test]
fn default_counter_has_no_words() {
let counter = WordCounter::new();
assert!(counter.words().is_empty());
assert!(counter.total_count() == 0);
assert!(counter.get("random") == 0);
}
#[test]
fn counter_from_string() {
let text = "first line\nSecond LiNe\n THIRD LINE\n";
let expected_words = vec!["first", "line", "second", "third"];
let counter = WordCounter::from_str(text);
assert_eq!(counter.words(), expected_words);
assert_eq!(counter.total_count(), 6);
assert_eq!(counter.get("line"), 3);
assert_eq!(counter.get("first"), 1);
assert_eq!(counter.get("second"), 1);
assert_eq!(counter.get("third"), 1);
assert_eq!(counter.get("not-contained"), 0);
}
#[test]
fn add() {
let mut counter = WordCounter::new();
for word in ["word", "Word", " word ", " WORD "].iter() {
counter.add(word);
}
assert_eq!(counter.get("word"), 4);
}
#[test]
fn edits1_with_empty_alphabet() {
let checker = SpellChecker::new("", "");
let word = "ab";
let expected_words = as_set(&["a", "b", "ba"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits1_with_nonempty_en_alphabet() {
let checker = SpellChecker::new("", "c");
let word = "ab";
let expected_words = as_set(&["cb", "b", "acb", "abc", "ba", "a", "cab", "ac"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits1_with_nonempty_bg_alphabet() {
let checker = SpellChecker::new("", "з");
let word = "ей";
let expected_words = as_set(&["ез", "езй", "й", "зй", "ейз", "зей", "е", "йе"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits2_with_empty_alphabet() {
let checker = SpellChecker::new("", "");
let word = "ab";
let expected_words = as_set(&["", "ab", "a", "b"]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn edits2_with_nonempty_en_alphabet() {
let checker = SpellChecker::new("", "c");
let word = "ab";
let expected_words = as_set(&[
"", "a", "cb", "cac", "bc", "cba", "acbc", "cab", "ac",
"acc", "abcc", "ab", "c", "accb", "cbc", "ca", "cc", "cacb",
"ccb", "acb", "abc", "cabc", "bca", "ccab", "b", "bac",
]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn edits2_with_nonempty_bg_alphabet() {
let checker = SpellChecker::new("", "з");
let word = "ей";
let expected_words = as_set(&[
"", "зез", "езйз", "ейз", "з", "зей", "зз", "зйе", "ез",
"езй", "йез", "зейз", "ейзз", "еззй", "ззй", "зй", "зе",
"йзе", "зезй", "е", "ззей", "ей", "йз", "езз", "й", "зйз",
]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn known_words_with_empty_corpus() {
let checker = SpellChecker::new("", ALPHABET_EN);
let words = as_set(&["a", "b"]);
let known_words = checker.known(&words);
assert!(known_words.is_empty());
}
#[test]
fn known_words_with_nonempty_corpus_and_words_which_are_not_in_the_corpus() {
let checker = SpellChecker::new("one two three изненада", ALPHABET_EN);
let words = as_set(&["a", "й"]);
let known_words = checker.known(&words);
assert!(known_words.is_empty());
}
#[test]
fn known_words_with_nonempty_corpus_and_words_which_are_in_the_corpus() {
let checker = SpellChecker::new("one two three изненада", ALPHABET_EN);
let words = as_set(&["a", "b", "изненада"]);
let expected_word = "изненада".to_owned();
let known_words = checker.known(&words);
assert_eq!(known_words.len(), 1);
assert!(known_words.contains(&&expected_word));
}
fn as_set(words: &[&str]) -> HashSet<String> {
words.iter().map(|&s| s.to_owned()).collect()
}
#[test]
fn candidates_with_very_different_word_from_ones_on_corpus() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "hamlet".to_owned();
let candidates = checker.candidates(&word);
assert_eq!(candidates.len(), 1);
assert!(candidates.contains(&word));
}
#[test]
fn candidates_with_one_letter_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "ide".to_owned();
let expected = "ice".to_owned();
let candidates = checker.candidates(&word);
assert_eq!(candidates.len(), 1);
assert!(candidates.contains(&expected));
}
#[test]
fn candidates_with_two_letters_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "idde".to_owned();
let expected = ["dice", "ice", "isle"];
let candidates = checker.candidates(&word);
assert_eq!(candidates, expected);
}
#[test]
fn correction_with_very_different_word_from_ones_on_corpus() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "hamlet".to_owned();
let correction = checker.correction(&word);
assert_eq!(correction, word);
}
#[test]
fn correction_with_one_letter_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "ide".to_owned();
let expected = "ice".to_owned();
let correction = checker.correction(&word);
assert_eq!(correction, expected);
}
#[test]
fn correction_with_two_letters_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "idde";
let expected = "isle";
let correction = checker.correction(&word);
assert_eq!(correction, expected);
}
}

Илиян качи решение на 14.01.2020 10:09 (преди над 5 години)

+
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
pub const ALPHABET_EN: &'static str = "abcdefghijklmnopqrstuvwxyz";
pub const ALPHABET_BG: &'static str = "абвгдежзийклмнопрстуфхцчшщъьюя";
pub fn clean_line(input: &str) -> String {
input
.trim()
.chars()
.filter(|&a| is_valid_symbol(a))
.collect()
}
fn is_valid_symbol(c: char) -> bool {
c == '-' ||
c == '\'' ||
c.is_alphabetic() ||
c.is_whitespace()
}
pub struct WordCounter {
words_map: HashMap<String, u32>,
}
impl WordCounter {
pub fn new() -> Self {
WordCounter {
words_map: HashMap::new(),
}
}
pub fn from_str(input: &str) -> Self {
let mut counter = Self::new();
for word in input.lines()
.map(|line| crate::clean_line(line))
.flat_map(|line| to_words(&line))
{
counter.add(&word);
}
counter
}
pub fn add(&mut self, item: &str) {
let word = item.trim().to_lowercase();
let count = self.words_map.entry(word).or_insert(0);
*count += 1;
}
pub fn words(&self) -> Vec<&String> {
let mut words = self.words_map.keys().collect::<Vec<&String>>();
words.sort_unstable_by(|&a, &b| a.cmp(b));
words
}
pub fn get(&self, word: &str) -> u32 {
*self.words_map.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.words_map.values().sum()
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
let _ = writeln!(f, "WordCounter, total count: {}", self.total_count())?;
let mut pairs = self.words_map.iter().collect::<Vec<(&String, &u32)>>();
pairs.sort_unstable_by(|(_, x), (_, y)| y.cmp(&x));
for (word, count) in &pairs {
let _ = writeln!(f, "{}: {}", word, count)?;
}
Ok(())
}
}
fn to_words(line: &str) -> Vec<String> {
line
.split_whitespace()
.map(|word| word.to_owned())
.collect()
}
pub struct SpellChecker {
corpus: WordCounter,
alphabet: String,
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
SpellChecker {
corpus: WordCounter::from_str(corpus),
alphabet: alphabet.to_owned(),
}
}
pub fn correction(&self, word: &str) -> String {
self.candidates(word)
.into_iter()
.max_by(|a, b| self.probability(a).partial_cmp(&self.probability(b)).unwrap())
.expect("candidates returned empty range")
}
pub fn probability(&self, word: &str) -> f64 {
if self.corpus.total_count() > 0 {
self.corpus.get(word) as f64 / self.corpus.total_count() as f64
}
else {
0.0
}
}
pub fn candidates(&self, word: &str) -> Vec<String> {
- let known_words = |edits| {
+ let known_words = |edits| -> Option<Vec<String>> {
let words = self.known(&edits);
if !words.is_empty() {
- let mut vec = words.iter()
- .map(|&s| s.to_owned())
- .collect::<Vec<String>>();
- vec.sort_unstable_by(|a, b| a.cmp(b));
- Some(vec)
+ Some(words.iter().map(|&s| s.to_owned()).collect())
}
else { None }
};
let edits = [word].iter().map(|s| s.to_string()).collect();
known_words(edits)
.or_else(|| known_words(self.edits1(word)))
.or_else(|| known_words(self.edits2(word)))
.unwrap_or_else(|| vec![word.to_owned()])
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
words
.iter()
.filter(|word| self.corpus.get(word) > 0)
.collect()
}
pub fn edits1(&self, word: &str) -> HashSet<String> {
use std::iter::FromIterator;
let splits = word
.char_indices()
.map(|(i, _)| (&word[..i], &word[i..]))
.chain([(word, "")].iter().copied())
.collect::<Vec<(&str, &str)>>();
let deletes = Self::single_deletes(&splits);
let inserts = self.single_inserts(&splits);
let replaces = self.single_replaces(&splits);
let transposes = Self::adjacent_transposes(&splits);
HashSet::from_iter(
deletes
.into_iter()
.chain(inserts.into_iter())
.chain(replaces.into_iter())
.chain(transposes.into_iter())
)
}
fn single_deletes(splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| !right.is_empty())
.map(|(left, right)| {
format!("{}{}", left, drop_leading_chars(1, right))
})
.collect()
}
fn adjacent_transposes(splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| right.chars().count() > 1)
.map(|(left, right)| {
let r = drop_leading_chars(2, right);
let right_nth = |i| right.chars().nth(i).unwrap();
format!("{}{}{}{}", left, right_nth(1), right_nth(0), r)
})
.collect()
}
fn single_replaces(&self, splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.filter(|(_, right)| !right.is_empty())
.flat_map(|(left, right)| {
self.alphabet.chars().map(move |c| {
format!("{}{}{}", left, c, drop_leading_chars(1, right))
})
})
.collect()
}
fn single_inserts(&self, splits: &[(&str, &str)]) -> Vec<String> {
splits
.iter()
.flat_map(|(left, right)| {
self.alphabet.chars().map(move |c| {
format!("{}{}{}", left, c, right)
})
})
.collect()
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
self.edits1(word)
.into_iter()
.flat_map(|e1| self.edits1(&e1))
.collect()
}
}
fn drop_leading_chars(n: usize, s: &str) -> &str {
s
.char_indices()
.skip(n)
.next()
.map(|(i, _)| &s[i..])
.unwrap_or("")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn clean_line_with_already_cleaned_line() {
let line = "i'm a clean-mf-line";
assert_eq!(line, clean_line(line));
}
#[test]
fn clean_line_removes_leading_and_trailing_spaces() {
let line = " abc \n";
assert_eq!(clean_line(line), "abc");
}
#[test]
fn clean_line_with_characters_to_remove() {
let line = "abc-1 @#";
assert_eq!(clean_line(line), "abc- ");
}
#[test]
fn default_counter_has_no_words() {
let counter = WordCounter::new();
assert!(counter.words().is_empty());
assert!(counter.total_count() == 0);
assert!(counter.get("random") == 0);
}
#[test]
fn counter_from_string() {
let text = "first line\nSecond LiNe\n THIRD LINE\n";
let expected_words = vec!["first", "line", "second", "third"];
let counter = WordCounter::from_str(text);
assert_eq!(counter.words(), expected_words);
assert_eq!(counter.total_count(), 6);
assert_eq!(counter.get("line"), 3);
assert_eq!(counter.get("first"), 1);
assert_eq!(counter.get("second"), 1);
assert_eq!(counter.get("third"), 1);
assert_eq!(counter.get("not-contained"), 0);
}
#[test]
fn add() {
let mut counter = WordCounter::new();
for word in ["word", "Word", " word ", " WORD "].iter() {
counter.add(word);
}
assert_eq!(counter.get("word"), 4);
}
#[test]
fn edits1_with_empty_alphabet() {
let checker = SpellChecker::new("", "");
let word = "ab";
let expected_words = as_set(&["a", "b", "ba"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits1_with_nonempty_en_alphabet() {
let checker = SpellChecker::new("", "c");
let word = "ab";
let expected_words = as_set(&["cb", "b", "acb", "abc", "ba", "a", "cab", "ac"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits1_with_nonempty_bg_alphabet() {
let checker = SpellChecker::new("", "з");
let word = "ей";
let expected_words = as_set(&["ез", "езй", "й", "зй", "ейз", "зей", "е", "йе"]);
assert_eq!(checker.edits1(word), expected_words);
}
#[test]
fn edits2_with_empty_alphabet() {
let checker = SpellChecker::new("", "");
let word = "ab";
let expected_words = as_set(&["", "ab", "a", "b"]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn edits2_with_nonempty_en_alphabet() {
let checker = SpellChecker::new("", "c");
let word = "ab";
let expected_words = as_set(&[
"", "a", "cb", "cac", "bc", "cba", "acbc", "cab", "ac",
"acc", "abcc", "ab", "c", "accb", "cbc", "ca", "cc", "cacb",
"ccb", "acb", "abc", "cabc", "bca", "ccab", "b", "bac",
]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn edits2_with_nonempty_bg_alphabet() {
let checker = SpellChecker::new("", "з");
let word = "ей";
let expected_words = as_set(&[
"", "зез", "езйз", "ейз", "з", "зей", "зз", "зйе", "ез",
"езй", "йез", "зейз", "ейзз", "еззй", "ззй", "зй", "зе",
"йзе", "зезй", "е", "ззей", "ей", "йз", "езз", "й", "зйз",
]);
assert_eq!(checker.edits2(word), expected_words);
}
#[test]
fn known_words_with_empty_corpus() {
let checker = SpellChecker::new("", ALPHABET_EN);
let words = as_set(&["a", "b"]);
let known_words = checker.known(&words);
assert!(known_words.is_empty());
}
#[test]
fn known_words_with_nonempty_corpus_and_words_which_are_not_in_the_corpus() {
let checker = SpellChecker::new("one two three изненада", ALPHABET_EN);
let words = as_set(&["a", "й"]);
let known_words = checker.known(&words);
assert!(known_words.is_empty());
}
#[test]
fn known_words_with_nonempty_corpus_and_words_which_are_in_the_corpus() {
let checker = SpellChecker::new("one two three изненада", ALPHABET_EN);
let words = as_set(&["a", "b", "изненада"]);
let expected_word = "изненада".to_owned();
let known_words = checker.known(&words);
assert_eq!(known_words.len(), 1);
assert!(known_words.contains(&&expected_word));
}
fn as_set(words: &[&str]) -> HashSet<String> {
words.iter().map(|&s| s.to_owned()).collect()
}
#[test]
fn candidates_with_very_different_word_from_ones_on_corpus() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "hamlet".to_owned();
let candidates = checker.candidates(&word);
assert_eq!(candidates.len(), 1);
assert!(candidates.contains(&word));
}
#[test]
fn candidates_with_one_letter_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "ide".to_owned();
let expected = "ice".to_owned();
let candidates = checker.candidates(&word);
assert_eq!(candidates.len(), 1);
assert!(candidates.contains(&expected));
}
#[test]
fn candidates_with_two_letters_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "idde".to_owned();
- let expected = ["dice", "ice", "isle"];
let candidates = checker.candidates(&word);
- assert_eq!(candidates, expected);
+ assert_eq!(candidates.len(), 3);
+ assert!(candidates.contains(&"dice".to_owned()));
+ assert!(candidates.contains(&"ice".to_owned()));
+ assert!(candidates.contains(&"isle".to_owned()));
}
#[test]
fn correction_with_very_different_word_from_ones_on_corpus() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "hamlet".to_owned();
let correction = checker.correction(&word);
assert_eq!(correction, word);
}
#[test]
fn correction_with_one_letter_difference() {
let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
let word = "ide".to_owned();
let expected = "ice".to_owned();
let correction = checker.correction(&word);
assert_eq!(correction, expected);
}
#[test]
fn correction_with_two_letters_difference() {
- let checker = SpellChecker::new("ice isle spie crie dice mice mic", ALPHABET_EN);
+ let checker = SpellChecker::new("ice ice isle spie crie mic", ALPHABET_EN);
let word = "idde";
- let expected = "isle";
+ let expected = "ice";
let correction = checker.correction(&word);
assert_eq!(correction, expected);
}
}