Решение на Spell Checker от Ангел Беширов
Резултати
- 20 точки от тестове
- 0 бонус точки
- 20 точки общо
- 15 успешни тест(а)
- 0 неуспешни тест(а)
Код
use std::collections::HashSet;
use std::collections::HashMap;
pub const ALPHABET_EN: &'static str = "abcdefghijklmnopqrstuvwxyz";
pub const ALPHABET_BG: &'static str = "абвгдежзийклмнопрстуфхцчшщъьюя";
pub fn clean_line(input: &str) -> String {
let mut cleaned = String::new();
for c in input.trim().chars() {
if c.is_alphabetic() || c.is_whitespace() || c == '\'' || c == '-' {
cleaned.push(c);
}
}
cleaned
}
pub struct WordCounter {
counter: HashMap<String, u32>,
}
impl WordCounter {
pub fn new() -> Self {
WordCounter {
counter: HashMap::new(),
}
}
pub fn from_str(input: &str) -> Self {
let mut word_counter: WordCounter = WordCounter::new();
for line in input.lines() {
let line = clean_line(line);
for word in line.split_whitespace() {
word_counter.add(word);
}
};
word_counter
}
pub fn words(&self) -> Vec<&String> {
let mut words: Vec<&String> = Vec::new();
for key in self.counter.keys() {
words.push(key);
}
words.sort();
words
}
pub fn add(&mut self, item: &str) {
let cleaned: String = item.to_string().trim().to_lowercase();
if !cleaned.is_empty() {
let current: u32 = {
match self.counter.get(&cleaned) {
Some(x) => *x,
None => 0,
}
};
self.counter.insert(cleaned, current + 1);
}
}
pub fn get(&self, word: &str) -> u32 {
match self.counter.get(word) {
Some(x) => *x,
None => 0,
}
}
pub fn total_count(&self) -> u32 {
self.counter.values().sum()
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
writeln!(f, "WordCounter, total count: {}", self.total_count())?;
let mut sorted: Vec<(&String, &u32)> = self.counter.iter().collect();
sorted.sort_by(|a, b| b.1.cmp(a.1));
for entry in sorted {
writeln!(f, "{}: {}", entry.0, entry.1)?;
}
Ok(())
}
}
pub struct SpellChecker {
word_counter: WordCounter,
alphabet: String,
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
SpellChecker {
word_counter: WordCounter::from_str(corpus),
alphabet: alphabet.to_string(),
}
}
pub fn correction(&self, word: &str) -> String {
let corrected_word: String = word.to_string().trim().to_lowercase();
let mut max_probability: f64 = -1.0;
let mut most_viable_correction: String = corrected_word.clone();
for candidate in self.candidates(&corrected_word) {
let probability: f64 = self.probability(&candidate);
if probability > max_probability {
max_probability = probability;
most_viable_correction = candidate;
}
}
most_viable_correction
}
pub fn probability(&self, word: &str) -> f64 {
let total_count: f64 = self.word_counter.total_count() as f64;
if total_count == 0.0 {
// if there are no words in the corpus -> 0 probability
return 0.0;
}
self.word_counter.get(word) as f64 / total_count
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
let mut known_words: Vec<&String> = Vec::new();
for word in words {
if self.word_counter.get(word) > 0 {
known_words.push(word);
}
}
known_words
}
pub fn candidates(&self, word: &str) -> Vec<String> {
let mut word_candidates: Vec<String> = Vec::new();
let mut candidates_check: HashSet<String> = HashSet::new();
candidates_check.insert(word.to_string());
for candidate in self.known(&candidates_check) {
word_candidates.push(candidate.clone());
}
if word_candidates.len() > 0 {
return word_candidates;
}
let edits1 = self.edits1(word);
for candidate in self.known(&edits1) {
word_candidates.push(candidate.clone());
}
if word_candidates.len() > 0 {
return word_candidates;
}
let edits2 = self.edits2(word);
for candidate in self.known(&edits2) {
word_candidates.push(candidate.clone());
}
if word_candidates.len() == 0 {
word_candidates.push(word.to_string());
}
word_candidates
}
pub fn edits1(&self, word: &str) -> HashSet<String> {
let mut edits: HashSet<String> = HashSet::new();
edits.extend(self.remove_one_char(word));
edits.extend(self.swap_consecutive_chars(word));
edits.extend(self.replace_from_alphabet(word));
edits.extend(self.insert_from_alphabet(word));
edits
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
let mut edits: HashSet<String> = HashSet::new();
let edits1: HashSet<String> = self.edits1(word);
for edits1_word in edits1 {
edits.extend(self.edits1(&edits1_word));
}
edits
}
pub fn remove_one_char(&self, word: &str) -> HashSet<String> {
let mut edits: HashSet<String> = HashSet::new();
let len: usize = word.chars().count();
let word = word.to_string();
for i in 0..len {
let mut chars: Vec<char> = word.chars().collect();
chars.remove(i);
let edited: String = chars.iter().collect();
edits.insert(edited);
}
edits
}
pub fn swap_consecutive_chars(&self, word: &str) -> HashSet<String> {
let mut edits: HashSet<String> = HashSet::new();
let len: usize = word.chars().count();
let word = word.to_string();
for i in 0..(len - 1) {
let mut chars: Vec<char> = word.chars().collect();
chars.swap(i, i + 1);
let edited: String = chars.iter().collect();
edits.insert(edited);
}
edits
}
pub fn replace_from_alphabet(&self, word: &str) -> HashSet<String> {
let mut edits: HashSet<String> = HashSet::new();
let len: usize = word.chars().count();
let word = word.to_string();
for i in 0..len {
let mut chars: Vec<char> = word.chars().collect();
chars.remove(i);
for c in self.alphabet.chars() {
chars.insert(i, c);
let edited: String = chars.iter().collect();
edits.insert(edited);
chars.remove(i);
}
}
edits
}
pub fn insert_from_alphabet(&self, word: &str) -> HashSet<String> {
let mut edits: HashSet<String> = HashSet::new();
let len: usize = word.chars().count();
let word = word.to_string();
for i in 0..(len + 1) {
let mut chars: Vec<char> = word.chars().collect();
for c in self.alphabet.chars() {
chars.insert(i, c);
let edited: String = chars.iter().collect();
edits.insert(edited);
chars.remove(i);
}
}
edits
}
}
fn from_vec_to_hash_set(data: Vec<&str>) -> HashSet<String> {
let mut set: HashSet<String> = HashSet::new();
for word in data {
set.insert(word.to_string());
}
set
}
#[test]
fn test_basic() {
let line = String::from(" Foo Bar ");
assert_eq!("Foo Bar", clean_line(&line));
let mut counter = WordCounter::new();
assert_eq!(counter.words(), WordCounter::from_str("").words());
assert_eq!(counter.get("word"), 0);
assert_eq!(counter.total_count(), 0);
assert_eq!(format!("{}", counter), "WordCounter, total count: 0\n");
counter.add("word");
let spell_checker = SpellChecker::new("foo bar", ALPHABET_EN);
assert_eq!(spell_checker.correction("foo"), "foo");
assert!(spell_checker.probability("foo") > 0.0);
assert!(spell_checker.known(&HashSet::new()).len() == 0);
assert!(spell_checker.candidates("foo").len() > 0);
assert!(spell_checker.edits1("foo").len() > 0);
assert!(spell_checker.edits2("foo").len() > 0);
}
#[test]
fn test_cleaning_line() {
let to_be_cleaned = String::from("This sentece %-%% has to be 'cleared'!!!$");
assert_eq!(clean_line(&to_be_cleaned), "This sentece - has to be 'cleared'");
}
#[test]
fn test_cleaning_line_trimming() {
let to_be_cleaned = String::from(" This sh0o()uld be trimmed??? ");
assert_eq!(clean_line(&to_be_cleaned), "This should be trimmed");
}
#[test]
fn test_empty_counter() {
let word_counter: WordCounter = WordCounter::new();
assert_eq!(word_counter.words(), Vec::<&String>::new());
assert_eq!(word_counter.total_count(), 0);
assert_eq!(word_counter.get(&String::from("test")), 0);
}
#[test]
fn test_creating_counter_unique_words() {
let text: String = String::from("Lorem Ipsum is simply dummy text of the printing and typesetting industry.\n");
let word_counter: WordCounter = WordCounter::from_str(&text);
assert_eq!(word_counter.words(), vec!["and", "dummy", "industry", "ipsum", "is", "lorem", "of", "printing", "simply", "text", "the", "typesetting"]);
assert_eq!(word_counter.total_count(), 12);
assert_eq!(word_counter.get(&String::from("ipsum")), 1);
}
#[test]
fn test_creating_counter_repeated_word_cyrilic() {
let text: String = String::from("Тест на текст, написан на кирилица а не на текст на латиница :(");
let word_counter: WordCounter = WordCounter::from_str(&text);
assert_eq!(word_counter.words(), vec!["а", "кирилица", "латиница", "на", "написан", "не", "текст", "тест"]);
assert_eq!(word_counter.total_count(), 12);
assert_eq!(word_counter.get(&String::from("текст")), 2);
}
#[test]
fn test_add_over_repeating() {
let text: String = String::from("Тест на текст, написан на кирилица а не на текст на латиница :(");
let mut word_counter: WordCounter = WordCounter::from_str(&text);
assert_eq!(word_counter.words(), vec!["а", "кирилица", "латиница", "на", "написан", "не", "текст", "тест"]);
assert_eq!(word_counter.total_count(), 12);
assert_eq!(word_counter.get(&String::from("текст")), 2);
word_counter.add("текст");
assert_eq!(word_counter.total_count(), 13);
assert_eq!(word_counter.get(&String::from(&String::from("текст"))), 3);
}
#[test]
fn test_new_spellchecker() {
let text: String = String::from("Кратък текст, който ще се ползва за корпус");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
println!("{}", spell_checker.correction(&String::from("краТк")));
assert_eq!(spell_checker.correction(&String::from("краТк")), String::from("кратък"));
}
#[test]
fn test_removing_one_char() {
let text: String = String::from("Кратък текст, който ще се ползва за корпус");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
let expected: HashSet<String> = from_vec_to_hash_set(vec!["ексст", "тксст", "тесст", "текст", "тексс"]);
assert_eq!(spell_checker.remove_one_char(&String::from("тексст")), expected);
}
#[test]
fn test_swap_consecutive_chars() {
let text: String = String::from("Кратък текст, който ще се ползва за корпус");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
let expected: HashSet<String> = from_vec_to_hash_set(vec!["етксст", "ткесст", "тескст", "тексст", "текстс"]);
assert_eq!(spell_checker.swap_consecutive_chars(&String::from("тексст")), expected);
}
#[test]
fn test_replace_from_alphabet() {
let text: String = String::from("Кратък текст, който ще се ползва за корпус");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
let expected: HashSet<String> = from_vec_to_hash_set(vec!["стф", "вмф", "ътф", "втю", "вдф", "вьф", "лтф", "веф", "впф", "вшф", "вта",
"итф", "хтф", "вту", "втф", "ктф", "вуф", "втг", "фтф", "ттф", "утф", "внф", "вхф", "втч", "чтф", "втъ", "отф", "вть", "втя", "бтф", "вйф",
"вяф", "втд", "ютф", "вчф", "втй", "вфф", "ртф", "вцф", "втх", "вто", "ьтф", "етф", "вюф", "штф", "гтф", "втл", "дтф", "йтф", "ввф", "вщф",
"втк", "втн", "влф", "втм", "вте", "втз", "втш", "воф", "втж", "втц", "зтф", "виф", "втщ", "ятф", "втв", "втт", "нтф", "птф", "вкф", "всф",
"жтф", "ваф", "втп", "втр", "вжф", "атф", "мтф", "щтф", "взф", "втс", "врф", "вгф", "вти", "вбф", "въф", "втб", "цтф"]);
let actual = spell_checker.replace_from_alphabet(&String::from("втф"));
assert_eq!(actual.len(), 3 * 30 - 2); // 3 chars = 3 repetitions only 1 of them is added
assert_eq!(actual, expected);
}
#[test]
fn test_insert_from_alphabet() {
let text: String = String::from("Кратък текст, който ще се ползва за корпус");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
let expected: HashSet<String> = from_vec_to_hash_set(vec!["втфф", "чвтф", "втуф", "втфю", "втоф", "влтф", "вбтф", "витф", "рвтф", "ювтф", "втфр", "втфх",
"вътф", "втшф", "ьвтф", "втчф", "вютф", "втфи", "вутф", "втнф", "втфо", "вктф", "втиф", "втгф", "втцф", "вцтф", "втфщ", "жвтф", "втфм", "вхтф", "ввтф", "вптф",
"евтф", "втьф", "вжтф", "втфк", "лвтф", "пвтф", "втфд", "звтф", "втъф", "втфз", "йвтф", "вмтф", "встф", "втзф", "втфц", "ъвтф", "вттф", "вотф", "хвтф", "швтф",
"внтф", "ивтф", "втфб", "взтф", "вфтф", "двтф", "втмф", "втсф", "щвтф", "втфн", "втщф", "втфг", "втпф", "вртф", "втеф", "втхф", "овтф", "втйф", "автф", "втрф",
"бвтф", "втфв", "вгтф", "вйтф", "втюф", "втбф", "свтф", "втфж", "цвтф", "втфй", "втфш", "вчтф", "нвтф", "втфл", "втвф", "втаф", "мвтф", "вштф", "вткф", "квтф",
"вдтф", "втфс", "втфт", "втфч", "втфе", "втфп", "втфа", "втфъ", "ватф", "втфу", "втяф", "вщтф", "втжф", "втлф", "втфь", "гвтф", "фвтф", "явтф", "втдф", "увтф",
"твтф", "вьтф", "втфя", "ветф", "вятф"]);
let actual = spell_checker.insert_from_alphabet(&String::from("втф"));
assert_eq!(actual.len(), 4 * 30 - 3); // ввтф вттф втфф are repeated
assert_eq!(actual, expected);
}
#[test]
fn test_probability_more_than_once() {
let text: String = String::from("Кратък текст с повтаряща се дума, който ще се ползва за корпус.\n Корпус123412 КоРпУс");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
assert_eq!(spell_checker.probability(&String::from("корпус")), 3.0 / 14.0);
}
#[test]
fn test_probability_once() {
let text: String = String::from("Кратък текст с повтаряща се дума, който ще се ползва за корпус.\n Корпус123412 КоРпУс");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
assert_eq!(spell_checker.probability(&String::from("кратък")), 1.0 / 14.0);
}
#[test]
fn test_known_words() {
let text: String = String::from("Кратък текст с повтаряща се дума, който ще се ползва за корпус.\n Корпус123412 КоРпУс");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
let words_to_test = from_vec_to_hash_set(vec!["кратък", "ползва", "баба"]);
let actual: Vec<&String> = spell_checker.known(&words_to_test);
assert!(actual.contains(&&String::from("кратък")));
assert!(actual.contains(&&String::from("ползва")));
}
#[test]
fn test_unknown_words() {
let text: String = String::from("Кратък текст с повтаряща се дума, който ще се ползва за корпус.\n Корпус123412 КоРпУс");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_BG);
let words_to_test = from_vec_to_hash_set(vec!["адсасдасдсда", "бабабаба", "баба"]);
assert_eq!(spell_checker.known(&words_to_test), Vec::<&String>::new());
}
#[test]
fn test_edits1() {
let text: String = String::from("Кратък текст с повтаряща се дума, който ще се ползва за корпус.\n Корпус123412 КоРпУс");
let test_alphabet: String = String::from("абц");
let spell_checker: SpellChecker = SpellChecker::new(&text, &test_alphabet);
let expected: HashSet<String> = from_vec_to_hash_set(vec!["ума", "дма", "дуа", "дум", "удма", "дмуа", "дуам", "адума", "даума", "дуама", "думаа",
"бдума", "дбума", "дубма", "думба", "думаб", "цдума", "дцума", "дуцма", "думца", "думац", "аума", "дама", "дуаа", "дума", "бума", "дбма", "дуба",
"думб", "цума", "дцма", "дуца", "думц"]);
let actual: HashSet<String> = spell_checker.edits1("дума");
assert_eq!(actual, expected);
}
#[test]
fn test_edits2() {
let text: String = String::from("Кратък текст с повтаряща се дума, който ще се ползва за корпус.\n Корпус123412 КоРпУс");
let test_alphabet: String = String::from("аб");
let spell_checker: SpellChecker = SpellChecker::new(&text, &test_alphabet);
let expected: HashSet<String> = from_vec_to_hash_set(vec!["фпб", "аф", "бб", "апаф", "бп", "фб", "пфба", "афа", "бфб", "апфб", "пфаа", "б", "бфп",
"апфа", "бпфа", "пфбб", "пф", "бпф", "ап", "фа", "бпб", "пааф", "а", "апф", "бпаф", "пбф", "пфаб", "фбп", "бпфб", "апа", "бпбф", "ф", "пб", "пбаф",
"апб", "пфб", "пбфб", "аапф", "афб", "пбб", "аа", "пфа", "пабф", "паф", "паа", "ба", "пбфа", "апбф", "афп", "паб", "пафб", "", "пббф", "бапф", "п",
"фпа", "пафа", "ббф", "абпф", "абф", "ааф", "ббпф", "бфа", "пба", "бпа", "бф", "па", "баф", "аб", "фап"]);
let actual: HashSet<String> = spell_checker.edits2("пф");
assert_eq!(actual, expected);
}
#[test]
fn test_candidates_direct() {
let text: String = String::from("Lorem Ipsum is simply dummy text of the printing and typesetting industry.\n");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_EN);
let expected: Vec<String> = vec![String::from("ipsum")];
let actual: Vec<String> = spell_checker.candidates("ipsum");
assert_eq!(actual, expected);
}
#[test]
fn test_candidates_edits1() {
let text: String = String::from("Lorem Ipsum is simply dummy text of the printing and typesetting industry.\n");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_EN);
let expected: Vec<String> = vec![String::from("ipsum")];
let actual: Vec<String> = spell_checker.candidates("isum");
assert_eq!(actual, expected);
}
#[test]
fn test_candidates_edits2() {
let text: String = String::from("Lorem Ipsum is simply dummy text of the printing and typesetting industry.\n");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_EN);
let expected: Vec<String> = vec![String::from("simply")];
let actual: Vec<String> = spell_checker.candidates("simy");
assert_eq!(actual, expected);
}
#[test]
fn test_candidates_unknown() {
let text: String = String::from("Lorem Ipsum is simply dummy text of the printing and typesetting industry.\n");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_EN);
let expected: Vec<String> = vec![String::from("unknown")];
let actual: Vec<String> = spell_checker.candidates("unknown");
assert_eq!(actual, expected);
}
#[test]
fn test_correction_same_probability() {
let text: String = String::from("Lorem Ipsum is simply dummy text of the printing and typesetting industry.\n");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_EN);
let expected: String = String::from("dummy");
let actual: String = spell_checker.correction("dumy");
assert_eq!(actual, expected);
}
#[test]
fn test_correction_greater_probability() {
let text: String = String::from("Lorem Ipsum is simply simply simpl text of the printing and typesetting industry.\n");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_EN);
let expected: String = String::from("simply");
let actual: String = spell_checker.correction("simplu");
assert_eq!(actual, expected);
}
#[test]
fn test_correction_unknown() {
let text: String = String::from("Lorem Ipsum is simply simply simpl text of the printing and typesetting industry.\n");
let spell_checker: SpellChecker = SpellChecker::new(&text, &ALPHABET_EN);
let expected: String = String::from("unknown");
let actual: String = spell_checker.correction("unknown");
assert_eq!(actual, expected);
}
Лог от изпълнението
Compiling solution v0.1.0 (/tmp/d20200114-2173579-af9frh/solution) warning: function is never used: `from_vec_to_hash_set` --> src/lib.rs:276:1 | 276 | fn from_vec_to_hash_set(data: Vec<&str>) -> HashSet<String> { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = note: `#[warn(dead_code)]` on by default Finished test [unoptimized + debuginfo] target(s) in 6.41s Running target/debug/deps/solution-a73e64ec87929bd0 running 25 tests test test_add_over_repeating ... ok test test_basic ... ok test test_candidates_direct ... ok test test_candidates_edits1 ... ok test test_candidates_edits2 ... ok test test_candidates_unknown ... ok test test_cleaning_line ... ok test test_cleaning_line_trimming ... ok test test_correction_greater_probability ... ok test test_correction_same_probability ... ok test test_correction_unknown ... ok test test_creating_counter_repeated_word_cyrilic ... ok test test_creating_counter_unique_words ... ok test test_edits1 ... ok test test_edits2 ... ok test test_empty_counter ... ok test test_insert_from_alphabet ... ok test test_known_words ... ok test test_new_spellchecker ... ok test test_probability_more_than_once ... ok test test_probability_once ... ok test test_removing_one_char ... ok test test_replace_from_alphabet ... ok test test_swap_consecutive_chars ... ok test test_unknown_words ... ok test result: ok. 25 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out Running target/debug/deps/solution_test-38971695424b36d5 running 15 tests test solution_test::test_best_word_is_returned ... ok test solution_test::test_clean_line_removes_punctuation ... ok test solution_test::test_clean_line_trims_the_input ... ok test solution_test::test_correction ... ok test solution_test::test_correction_fails_to_produce_new_result ... ok test solution_test::test_correction_normalizes_case ... ok test solution_test::test_counting ... ok test solution_test::test_display ... ok test solution_test::test_edits1 ... ok test solution_test::test_edits2 ... ok test solution_test::test_empty_counter ... ok test solution_test::test_from_empty_str ... ok test solution_test::test_from_str ... ok test solution_test::test_known_words ... ok test solution_test::test_probability ... ok test result: ok. 15 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out Doc-tests solution running 0 tests test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out