Решение на Spell Checker от Стоян Ефтимов
Резултати
- 20 точки от тестове
- 0 бонус точки
- 20 точки общо
- 15 успешни тест(а)
- 0 неуспешни тест(а)
Код
use std::collections::{HashMap, HashSet};
use std::fmt::Write;
use std::fmt;
use std::iter::FromIterator;
pub fn clean_line(input: &str) -> String {
input.trim().chars().filter(|&letter|
char::is_alphabetic(letter) ||
char::is_whitespace(letter) || letter == '\'' || letter == '-'
).collect()
}
pub struct WordCounter {
count: u32,
words: HashMap<String, u32>,
}
impl WordCounter {
/// Конструира нов `WordCounter` без никакви данни.
///
pub fn new() -> Self {
Self{
count: 0,
words: HashMap::new(),
}
}
/// Прочита входния низ, ред по ред, обработва всеки ред с `clean_line`, разбива го на думи и
/// конструира нов WordCounter, който ще брои думите на този низ.
///
/// Нормализира думите по същия начин както `add` по-долу.
///
pub fn from_str(input: &str) -> Self {
let mut counter = Self::new();
input.lines().
for_each(|line| {
let line = clean_line(line);
line.split_whitespace().for_each(|word| counter.add(word))
});
counter
}
/// Връща (references към) всички съхранени думи във вектор, сортиран по азбучен ред.
///
pub fn words(&self) -> Vec<&String> {
let mut words = self.words.keys().collect::<Vec<_>>();
words.sort();
words
}
pub fn add(&mut self, item: &str) {
let count = self.words.entry(item.trim().to_lowercase().to_string()).or_insert(0);
*count += 1;
self.count += 1;
}
/// Връща колко пъти е бил преброена дадената дума.
///
pub fn get(&self, word: &str) -> u32 {
*self.words.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.count
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
let mut words = self.words.iter().collect::<Vec<_>>();
words.sort_by(|(_, c1), (_, c2)| c2.cmp(c1));
write!(f, "WordCounter, total count: {}\n", self.count)?;
for (word, count) in words {
write!(f, "{}: {}\n", word, count)?;
}
Ok(())
}
}
pub const ALPHABET_EN: &'static str = "abcdefghijklmnopqrstuvwxyz";
pub const ALPHABET_BG: &'static str = "абвгдежзийклмнопрстуфхцчшщъьюя";
pub struct SpellChecker {
counter: WordCounter,
alphabet: String,
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
Self {
counter: WordCounter::from_str(corpus),
alphabet: String::from(alphabet),
}
}
pub fn correction(&self, word: &str) -> String {
let word = word.trim().to_lowercase();
let candidates = self.candidates(&word);
candidates.iter().
max_by(|&c1, &c2| self.probability(c1).partial_cmp(&self.probability(c2)).unwrap()).
unwrap().clone()
}
pub fn probability(&self, word: &str) -> f64 {
let total = self.counter.total_count() as f64;
if total == 0.0 {
return 0.0;
}
self.counter.get(word) as f64 / total
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
words.iter().filter(|&word| self.counter.get(word) > 0).collect::<Vec<_>>()
}
pub fn candidates(&self, word: &str) -> Vec<String> {
if self.counter.get(word) > 0 {
return vec![String::from(word)]
}
let knowns1 = self.known(&self.edits1(word)).iter().map(|&s| s.clone()).collect::<Vec<_>>();
if knowns1.len() > 0 {
return knowns1
}
let knowns2 = self.known(&self.edits2(word)).iter().map(|&s| s.clone()).collect::<Vec<_>>();
if knowns2.len() > 0 {
return knowns2;
}
vec![String::from(word)]
}
pub fn edits1(&self, word: &str) -> HashSet<String> {
let mut split_idxs = word.char_indices().map(|(i, _)|i).collect::<Vec<_>>();
split_idxs.push(word.len());
let splits = split_idxs.iter().map(|&i| word.split_at(i)).collect::<Vec<_>>();
let deletes = splits.iter().
filter(|(_, last)| last.len() > 0).
map(|(first, last)| {
let last_idxs = last.char_indices().map(|(i, _)| i).collect::<Vec<_>>();
let rest = match last_idxs.len() {
1 => "",
_ => last.split_at(last_idxs[1]).1,
};
format!("{}{}", first, rest)
});
let transposes = splits.iter().
filter(|(_, last)|last.chars().count() > 1).
map(|(first, last)| {
let last_chars = last.char_indices().collect::<Vec<_>>();
let rest = match last_chars.len() {
2 => "",
_ => &last[last_chars[2].0..],
};
format!("{}{}{}{}", first, last_chars[1].1, last_chars[0].1, rest)
});
let replaces = self.alphabet.chars().flat_map(|letter| {
splits.iter().
filter(|(_, last)| last.len() > 0).
map(move |(first, last)| {
let last_idxs = last.char_indices().map(|(i, _)| i).collect::<Vec<_>>();
let rest = match last_idxs.len() {
1 => "",
_ => last.split_at(last_idxs[1]).1,
};
format!("{}{}{}", first, letter, rest)
})
});
let inserts = self.alphabet.chars().flat_map(|letter| {
splits.iter().
map(move |(first, last)| format!("{}{}{}", first, letter, last))
});
let edits = deletes.
chain(transposes).chain(replaces).chain(inserts);
HashSet::from_iter(edits)
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
let edits1 = self.edits1(word);
let edits2 = edits1.iter().flat_map(|word| self.edits1(word));
edits2.collect::<HashSet<String>>()
}
}
#[test]
fn test_basic() {
let line = String::from(" Foo Bar ");
assert_eq!("Foo Bar", clean_line(&line));
let mut counter = WordCounter::new();
assert_eq!(counter.words(), WordCounter::from_str("").words());
assert_eq!(counter.get("word"), 0);
assert_eq!(counter.total_count(), 0);
assert_eq!(format!("{}", counter), "WordCounter, total count: 0\n");
counter.add("word");
let spell_checker = SpellChecker::new("foo bar", ALPHABET_EN);
assert_eq!(spell_checker.correction("foo"), "foo");
assert!(spell_checker.probability("foo") > 0.0);
assert!(spell_checker.known(&HashSet::new()).len() == 0);
assert!(spell_checker.candidates("foo").len() > 0);
assert!(spell_checker.edits1("foo").len() > 0);
assert!(spell_checker.edits2("foo").len() > 0);
}
#[test]
fn test_clean_line() {
assert_eq!(clean_line("едно две три"), "едно две три");
assert_eq!(clean_line("едно 2 три"), "едно три");
assert_eq!(clean_line("what's the time? - It's 3 o'clock!"), "what's the time - It's o'clock");
}
#[test]
fn test_counter() {
let counter = WordCounter::from_str(r"
What's the time?
- The time is 3 o'clock!
");
assert_eq!(counter.get("time"), 2);
assert_eq!(counter.get("the"), 2);
assert_eq!(counter.get("is"), 1);
assert_eq!(counter.get("hope"), 0);
assert_eq!(counter.words(), vec!["-", "is", "o'clock", "the", "time", "what's"]);
assert_eq!(counter.total_count(), 8);
}
#[test]
fn test_counter_fmt() {
let mut counter = WordCounter::new();
counter.add("баба");
counter.add("дядо");
counter.add("uncle");
counter.add("баба");
counter.add("дядо");
counter.add("баба");
assert_eq!(format!("{}", counter),
r"WordCounter, total count: 6
баба: 3
дядо: 2
uncle: 1
")
}
#[test]
fn test_correction_en() {
let checker = SpellChecker::new("one two three", ALPHABET_EN);
assert_eq!(checker.correction("tree"), "three");
assert_eq!(checker.correction("one"), "one");
assert_eq!(checker.correction("n"), "one");
assert_eq!(checker.correction(" twso "), "two");
assert_eq!(checker.correction("fiv"), "fiv");
assert_eq!(checker.probability("one"), 1.0/3.0 as f64);
}
#[test]
fn test_correction_bg() {
let checker = SpellChecker::new("едно две три едно", ALPHABET_BG);
assert_eq!(checker.correction("три"), "три");
assert_eq!(checker.correction("идно"), "едно");
assert_eq!(checker.correction("ено"), "едно");
assert_eq!(checker.correction(" две "), "две");
assert_eq!(checker.correction("fiv"), "fiv");
assert_eq!(checker.probability("one"), 0.0);
assert_eq!(checker.probability("едно"), 0.5);
}
#[test]
fn test_edit1_en() {
let checker = SpellChecker::new("", "x");
assert_set_eq( checker.edits1("a"), vec!["x", "xa", "ax", ""]);
assert_set_eq(checker.edits1("ab"), vec!["a", "b", "ba", "ax", "xb", "xab", "axb", "abx"]);
}
fn assert_set_eq(set: HashSet<String>, expected: Vec<&str>) {
assert_eq!(set.len(), expected.len());
assert!(expected.iter().all(|&s| set.contains(s)));
}
#[test]
fn test_edit1_bg() {
let checker = SpellChecker::new("", "я");
let deletes = checker.edits1("а");
assert_set_eq(checker.edits1("а"), vec!["я", "яа", "ая", ""]);
assert_set_eq(checker.edits1("аб"), vec!["а", "б", "ба", "ая", "яб", "яаб", "аяб", "абя"]);
}
#[test]
fn test_edit1_mix() {
let checker = SpellChecker::new("", "w");
assert_set_eq(checker.edits1("юあ"), vec!["ю", "あ", "あю", "юw", "wあ", "wюあ", "юwあ", "юあw"]);
}
#[test]
fn test_edit2_en() {
let checker = SpellChecker::new("", "z");
assert_set_eq(checker.edits2("j"), vec!["z", "zz", "", "j", "jz", "zzj", "zjz", "zj", "jzz"]);
}
#[test]
fn test_edit2_bg() {
let checker = SpellChecker::new("", "ь");
assert_set_eq(checker.edits2("щ"), vec!["ь", "ьь", "", "щ", "щь", "ььщ", "ьщь", "ьщ", "щьь"]);
}
#[test]
fn test_edit2_japan() {
let checker = SpellChecker::new("", "あ");
assert_set_eq(checker.edits2("き"), vec!["あ", "ああ", "", "き", "きあ", "ああき", "あきあ", "あき", "きああ"]);
}
Лог от изпълнението
Compiling solution v0.1.0 (/tmp/d20200114-2173579-1hiu0b3/solution) warning: unused import: `std::fmt::Write` --> src/lib.rs:2:5 | 2 | use std::fmt::Write; | ^^^^^^^^^^^^^^^ | = note: `#[warn(unused_imports)]` on by default warning: function is never used: `assert_set_eq` --> src/lib.rs:283:1 | 283 | fn assert_set_eq(set: HashSet<String>, expected: Vec<&str>) { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = note: `#[warn(dead_code)]` on by default warning: unused import: `std::fmt::Write` --> src/lib.rs:2:5 | 2 | use std::fmt::Write; | ^^^^^^^^^^^^^^^ | = note: `#[warn(unused_imports)]` on by default warning: unused variable: `deletes` --> src/lib.rs:291:9 | 291 | let deletes = checker.edits1("а"); | ^^^^^^^ help: consider prefixing with an underscore: `_deletes` | = note: `#[warn(unused_variables)]` on by default Finished test [unoptimized + debuginfo] target(s) in 7.98s Running target/debug/deps/solution-a73e64ec87929bd0 running 12 tests test test_basic ... ok test test_clean_line ... ok test test_correction_bg ... ok test test_correction_en ... ok test test_counter ... ok test test_counter_fmt ... ok test test_edit1_bg ... ok test test_edit1_en ... ok test test_edit1_mix ... ok test test_edit2_bg ... ok test test_edit2_en ... ok test test_edit2_japan ... ok test result: ok. 12 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out Running target/debug/deps/solution_test-38971695424b36d5 running 15 tests test solution_test::test_best_word_is_returned ... ok test solution_test::test_clean_line_removes_punctuation ... ok test solution_test::test_clean_line_trims_the_input ... ok test solution_test::test_correction ... ok test solution_test::test_correction_fails_to_produce_new_result ... ok test solution_test::test_correction_normalizes_case ... ok test solution_test::test_counting ... ok test solution_test::test_display ... ok test solution_test::test_edits1 ... ok test solution_test::test_edits2 ... ok test solution_test::test_empty_counter ... ok test solution_test::test_from_empty_str ... ok test solution_test::test_from_str ... ok test solution_test::test_known_words ... ok test solution_test::test_probability ... ok test result: ok. 15 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out Doc-tests solution running 0 tests test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out
История (2 версии и 0 коментара)
Стоян качи решение на 12.01.2020 19:53 (преди над 5 години)
use std::collections::{HashMap, HashSet};
use std::fmt::Write;
use std::fmt;
use std::iter::FromIterator;
-use std::path::Prefix::Verbatim;
pub fn clean_line(input: &str) -> String {
input.trim().chars().filter(|&letter|
char::is_alphabetic(letter) ||
char::is_whitespace(letter) || letter == '\'' || letter == '-'
).collect()
}
pub struct WordCounter {
count: u32,
words: HashMap<String, u32>,
}
impl WordCounter {
/// Конструира нов `WordCounter` без никакви данни.
///
pub fn new() -> Self {
Self{
count: 0,
words: HashMap::new(),
}
}
/// Прочита входния низ, ред по ред, обработва всеки ред с `clean_line`, разбива го на думи и
/// конструира нов WordCounter, който ще брои думите на този низ.
///
/// Нормализира думите по същия начин както `add` по-долу.
///
pub fn from_str(input: &str) -> Self {
let mut counter = Self::new();
input.lines().
for_each(|line| {
let line = clean_line(line);
line.split_whitespace().for_each(|word| counter.add(word))
});
counter
}
/// Връща (references към) всички съхранени думи във вектор, сортиран по азбучен ред.
///
pub fn words(&self) -> Vec<&String> {
let mut words = self.words.keys().collect::<Vec<_>>();
words.sort();
words
}
pub fn add(&mut self, item: &str) {
let count = self.words.entry(item.trim().to_lowercase().to_string()).or_insert(0);
*count += 1;
self.count += 1;
}
/// Връща колко пъти е бил преброена дадената дума.
///
pub fn get(&self, word: &str) -> u32 {
*self.words.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.count
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
let mut words = self.words.iter().collect::<Vec<_>>();
words.sort_by(|(_, c1), (_, c2)| c2.cmp(c1));
write!(f, "WordCounter, total count: {}\n", self.count)?;
for (word, count) in words {
write!(f, "{}: {}\n", word, count)?;
}
Ok(())
}
}
pub const ALPHABET_EN: &'static str = "abcdefghijklmnopqrstuvwxyz";
pub const ALPHABET_BG: &'static str = "абвгдежзийклмнопрстуфхцчшщъьюя";
pub struct SpellChecker {
counter: WordCounter,
alphabet: String,
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
Self {
counter: WordCounter::from_str(corpus),
alphabet: String::from(alphabet),
}
}
pub fn correction(&self, word: &str) -> String {
let word = word.trim().to_lowercase();
let candidates = self.candidates(&word);
candidates.iter().
max_by(|&c1, &c2| self.probability(c1).partial_cmp(&self.probability(c2)).unwrap()).
unwrap().clone()
}
pub fn probability(&self, word: &str) -> f64 {
let total = self.counter.total_count() as f64;
if total == 0.0 {
return 0.0;
}
self.counter.get(word) as f64 / total
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
words.iter().filter(|&word| self.counter.get(word) > 0).collect::<Vec<_>>()
}
pub fn candidates(&self, word: &str) -> Vec<String> {
if self.counter.get(word) > 0 {
return vec![String::from(word)]
}
let knowns1 = self.known(&self.edits1(word)).iter().map(|&s| s.clone()).collect::<Vec<_>>();
if knowns1.len() > 0 {
return knowns1
}
let knowns2 = self.known(&self.edits2(word)).iter().map(|&s| s.clone()).collect::<Vec<_>>();
if knowns2.len() > 0 {
return knowns2;
}
vec![String::from(word)]
}
pub fn edits1(&self, word: &str) -> HashSet<String> {
let mut split_idxs = word.char_indices().map(|(i, _)|i).collect::<Vec<_>>();
split_idxs.push(word.len());
let splits = split_idxs.iter().map(|&i| word.split_at(i)).collect::<Vec<_>>();
let deletes = splits.iter().
filter(|(_, last)| last.len() > 0).
map(|(first, last)| {
let last_idxs = last.char_indices().map(|(i, _)| i).collect::<Vec<_>>();
let rest = match last_idxs.len() {
1 => "",
_ => last.split_at(last_idxs[1]).1,
};
format!("{}{}", first, rest)
});
let transposes = splits.iter().
filter(|(_, last)|last.chars().count() > 1).
map(|(first, last)| {
let last_chars = last.char_indices().collect::<Vec<_>>();
let rest = match last_chars.len() {
2 => "",
_ => &last[last_chars[2].0..],
};
format!("{}{}{}{}", first, last_chars[1].1, last_chars[0].1, rest)
});
let replaces = self.alphabet.chars().flat_map(|letter| {
splits.iter().
filter(|(_, last)| last.len() > 0).
map(move |(first, last)| {
let last_idxs = last.char_indices().map(|(i, _)| i).collect::<Vec<_>>();
let rest = match last_idxs.len() {
1 => "",
_ => last.split_at(last_idxs[1]).1,
};
format!("{}{}{}", first, letter, rest)
})
});
let inserts = self.alphabet.chars().flat_map(|letter| {
splits.iter().
map(move |(first, last)| format!("{}{}{}", first, letter, last))
});
let edits = deletes.
chain(transposes).chain(replaces).chain(inserts);
HashSet::from_iter(edits)
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
let edits1 = self.edits1(word);
let edits2 = edits1.iter().flat_map(|word| self.edits1(word));
edits2.collect::<HashSet<String>>()
}
}
#[test]
fn test_basic() {
let line = String::from(" Foo Bar ");
assert_eq!("Foo Bar", clean_line(&line));
let mut counter = WordCounter::new();
assert_eq!(counter.words(), WordCounter::from_str("").words());
assert_eq!(counter.get("word"), 0);
assert_eq!(counter.total_count(), 0);
assert_eq!(format!("{}", counter), "WordCounter, total count: 0\n");
counter.add("word");
let spell_checker = SpellChecker::new("foo bar", ALPHABET_EN);
assert_eq!(spell_checker.correction("foo"), "foo");
assert!(spell_checker.probability("foo") > 0.0);
assert!(spell_checker.known(&HashSet::new()).len() == 0);
assert!(spell_checker.candidates("foo").len() > 0);
assert!(spell_checker.edits1("foo").len() > 0);
assert!(spell_checker.edits2("foo").len() > 0);
}
#[test]
fn test_clean_line() {
assert_eq!(clean_line("едно две три"), "едно две три");
assert_eq!(clean_line("едно 2 три"), "едно три");
assert_eq!(clean_line("what's the time? - It's 3 o'clock!"), "what's the time - It's o'clock");
}
#[test]
fn test_counter() {
let counter = WordCounter::from_str(r"
What's the time?
- The time is 3 o'clock!
");
assert_eq!(counter.get("time"), 2);
assert_eq!(counter.get("the"), 2);
assert_eq!(counter.get("is"), 1);
assert_eq!(counter.get("hope"), 0);
assert_eq!(counter.words(), vec!["-", "is", "o'clock", "the", "time", "what's"]);
assert_eq!(counter.total_count(), 8);
}
#[test]
fn test_counter_fmt() {
let mut counter = WordCounter::new();
counter.add("баба");
counter.add("дядо");
counter.add("uncle");
counter.add("баба");
counter.add("дядо");
counter.add("баба");
assert_eq!(format!("{}", counter),
r"WordCounter, total count: 6
баба: 3
дядо: 2
uncle: 1
")
}
#[test]
fn test_correction_en() {
let checker = SpellChecker::new("one two three", ALPHABET_EN);
assert_eq!(checker.correction("tree"), "three");
assert_eq!(checker.correction("one"), "one");
assert_eq!(checker.correction("n"), "one");
assert_eq!(checker.correction(" twso "), "two");
assert_eq!(checker.correction("fiv"), "fiv");
assert_eq!(checker.probability("one"), 1.0/3.0 as f64);
}
#[test]
fn test_correction_bg() {
let checker = SpellChecker::new("едно две три едно", ALPHABET_BG);
assert_eq!(checker.correction("три"), "три");
assert_eq!(checker.correction("идно"), "едно");
assert_eq!(checker.correction("ено"), "едно");
assert_eq!(checker.correction(" две "), "две");
assert_eq!(checker.correction("fiv"), "fiv");
assert_eq!(checker.probability("one"), 0.0);
assert_eq!(checker.probability("едно"), 0.5);
+}
+
+#[test]
+fn test_edit1_en() {
+ let checker = SpellChecker::new("", "x");
+ assert_set_eq( checker.edits1("a"), vec!["x", "xa", "ax", ""]);
+ assert_set_eq(checker.edits1("ab"), vec!["a", "b", "ba", "ax", "xb", "xab", "axb", "abx"]);
+}
+
+fn assert_set_eq(set: HashSet<String>, expected: Vec<&str>) {
+ assert_eq!(set.len(), expected.len());
+ assert!(expected.iter().all(|&s| set.contains(s)));
+}
+
+#[test]
+fn test_edit1_bg() {
+ let checker = SpellChecker::new("", "я");
+ let deletes = checker.edits1("а");
+ assert_set_eq(checker.edits1("а"), vec!["я", "яа", "ая", ""]);
+ assert_set_eq(checker.edits1("аб"), vec!["а", "б", "ба", "ая", "яб", "яаб", "аяб", "абя"]);
+}
+
+#[test]
+fn test_edit1_mix() {
+ let checker = SpellChecker::new("", "w");
+ assert_set_eq(checker.edits1("юあ"), vec!["ю", "あ", "あю", "юw", "wあ", "wюあ", "юwあ", "юあw"]);
+
+}
+
+#[test]
+fn test_edit2_en() {
+ let checker = SpellChecker::new("", "z");
+ assert_set_eq(checker.edits2("j"), vec!["z", "zz", "", "j", "jz", "zzj", "zjz", "zj", "jzz"]);
+}
+
+#[test]
+fn test_edit2_bg() {
+ let checker = SpellChecker::new("", "ь");
+ assert_set_eq(checker.edits2("щ"), vec!["ь", "ьь", "", "щ", "щь", "ььщ", "ьщь", "ьщ", "щьь"]);
+}
+
+#[test]
+fn test_edit2_japan() {
+ let checker = SpellChecker::new("", "あ");
+ assert_set_eq(checker.edits2("き"), vec!["あ", "ああ", "", "き", "きあ", "ああき", "あきあ", "あき", "きああ"]);
}