Решение на Spell Checker от Антонио Миндов

Обратно към всички решения

Към профила на Антонио Миндов

Резултати

  • 20 точки от тестове
  • 0 бонус точки
  • 20 точки общо
  • 15 успешни тест(а)
  • 0 неуспешни тест(а)

Код

use std::collections::HashMap;
use std::collections::HashSet;
use std::iter::FromIterator;
use std::ops::{RangeFrom, Range};
/// Strips surrounding whitespace from `input` and drops every character that
/// is not alphabetic, whitespace, an apostrophe or a hyphen.
///
/// Interior whitespace is kept as-is so the caller can still split the line
/// into words afterwards.
pub fn clean_line(input: &str) -> String {
    let mut cleaned = String::new();
    for c in input.trim().chars() {
        let keep = c.is_alphabetic() || c.is_whitespace() || c == '\'' || c == '-';
        if keep {
            cleaned.push(c);
        }
    }
    cleaned
}
/// Normalizes a single word: surrounding whitespace is removed and the rest is
/// lowercased.
fn clean_word(input: &str) -> String {
    let trimmed = input.trim();
    str::to_lowercase(trimmed)
}
/// Character-based (not byte-based) sub-string extraction.
///
/// Positions count `char`s, so multi-byte UTF-8 text can be sliced without
/// worrying about byte boundaries. Out-of-range positions never panic; they
/// simply yield fewer (possibly zero) characters.
trait Utf8Index {
    /// Returns the characters at positions `r.start..r.end`.
    ///
    /// An empty or reversed range yields an empty string.
    fn utf8_index(&self, r: Range<usize>) -> String;

    /// Returns every character from position `r.start` to the end.
    fn utf8_index_from(&self, r: RangeFrom<usize>) -> String;
}

impl Utf8Index for &str {
    fn utf8_index(&self, r: Range<usize>) -> String {
        // `saturating_sub` keeps a reversed range (end < start) empty instead
        // of panicking on underflow in debug builds or wrapping around to a
        // huge `take` count in release builds.
        let len = r.end.saturating_sub(r.start);
        self.chars().skip(r.start).take(len).collect()
    }

    fn utf8_index_from(&self, r: RangeFrom<usize>) -> String {
        self.chars().skip(r.start).collect()
    }
}
pub struct WordCounter {
word_frequencies: HashMap<String, u32>
}
impl WordCounter {
pub fn new() -> Self {
WordCounter { word_frequencies: HashMap::new() }
}
pub fn from_str(input: &str) -> Self {
let cleaned: Vec<String> = input.lines().map(clean_line).collect();
let words: Vec<String> = cleaned.iter()
.flat_map(|l| l.split_whitespace())
.map(String::from).collect();
let mut counter = WordCounter { word_frequencies: HashMap::new() };
words.iter().for_each(|w|counter.add(w));
counter
}
pub fn words(&self) -> Vec<&String> {
let mut words: Vec<&String> = self.word_frequencies.keys().collect();
words.sort();
words
}
pub fn add(&mut self, item: &str) {
let word = clean_word(item);
*self.word_frequencies.entry(word).or_insert(0) += 1;
}
pub fn get(&self, word: &str) -> u32 {
*self.word_frequencies.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.word_frequencies.values().sum()
}
}
/// Prints a header line with the total count, followed by one `word: count`
/// line per distinct word, most frequent first.
impl std::fmt::Display for WordCounter {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "WordCounter, total count: {}", self.total_count())?;

        let mut entries: Vec<(&String, &u32)> = self.word_frequencies.iter().collect();
        // Descending by frequency; ties are broken alphabetically so the
        // output does not depend on the hash map's iteration order.
        entries.sort_by(|(word_a, freq_a), (word_b, freq_b)| {
            freq_b.cmp(freq_a).then_with(|| word_a.cmp(word_b))
        });

        for (word, freq) in entries {
            writeln!(f, "{}: {}", word, freq)?;
        }
        Ok(())
    }
}
/// A frequency-based spelling corrector: suggestions are words from a corpus
/// that are at most two single-character edits away from the input.
pub struct SpellChecker {
    /// Word frequencies gathered from the corpus.
    counter: WordCounter,
    /// Characters used when generating replacement and insertion edits.
    alphabet: String,
}

impl SpellChecker {
    /// Builds a checker whose vocabulary is the words of `corpus` and whose
    /// edits draw characters from `alphabet`.
    pub fn new(corpus: &str, alphabet: &str) -> Self {
        SpellChecker {
            counter: WordCounter::from_str(corpus),
            alphabet: String::from(alphabet),
        }
    }

    /// Returns the most frequent candidate for `word` (normalized with
    /// `clean_word` first). If no known candidate exists, the normalized
    /// word itself is returned.
    pub fn correction(&self, word: &str) -> String {
        // Comparing raw counts orders candidates exactly like comparing
        // probabilities (they share the same denominator) while avoiding
        // float comparison and its `partial_cmp` unwrap.
        self.candidates(&clean_word(word))
            .into_iter()
            .max_by_key(|candidate| self.counter.get(candidate))
            .expect("candidates() always returns at least one word")
    }

    /// Returns the relative frequency of `word` in the corpus.
    ///
    /// Yields `0.0` for an empty corpus instead of the NaN that `0 / 0`
    /// would produce.
    pub fn probability(&self, word: &str) -> f64 {
        let total = self.counter.total_count();
        if total == 0 {
            return 0.0;
        }
        f64::from(self.counter.get(word)) / f64::from(total)
    }

    /// Returns the subset of `words` that occur in the corpus.
    pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
        // A word is in the vocabulary exactly when its count is non-zero, so
        // one hash lookup per word replaces building, sorting and linearly
        // scanning the whole vocabulary on every call.
        words
            .iter()
            .filter(|word| self.counter.get(word.as_str()) > 0)
            .collect()
    }

    /// Returns the possible corrections for `word`, trying in order: the word
    /// itself, known words one edit away, known words two edits away. Falls
    /// back to the word itself when nothing is known.
    pub fn candidates(&self, word: &str) -> Vec<String> {
        let mut exact = HashSet::new();
        exact.insert(String::from(word));

        self.try_words(&exact)
            .or_else(|| self.try_words(&self.edits1(word)))
            .or_else(|| self.try_words(&self.edits2(word)))
            .unwrap_or_else(|| vec![String::from(word)])
    }

    /// Returns the known words among `words`, or `None` if there are none.
    fn try_words(&self, words: &HashSet<String>) -> Option<Vec<String>> {
        let known = self.known(words);
        if known.is_empty() {
            None
        } else {
            Some(known.into_iter().cloned().collect())
        }
    }

    /// Returns every string obtained from `word` by one deletion,
    /// transposition of adjacent characters, replacement, or insertion.
    ///
    /// Splits are taken at every character boundary, including the end of
    /// the word; the edits at that final split leave the word unchanged, so
    /// `word` itself is always part of the result.
    pub fn edits1(&self, word: &str) -> HashSet<String> {
        let splits: Vec<(&str, &str)> = (0..=word.len())
            .filter(|&i| word.is_char_boundary(i))
            .map(|i| word.split_at(i))
            .collect();

        let deletes = splits
            .iter()
            .map(|(l, r)| format!("{}{}", l, r.utf8_index_from(1..)));
        let transposes = splits.iter().map(|(l, r)| {
            format!(
                "{}{}{}{}",
                l,
                r.utf8_index(1..2),
                r.utf8_index(0..1),
                r.utf8_index_from(2..)
            )
        });
        let replaces = splits.iter().flat_map(|(l, r)| {
            self.alphabet
                .chars()
                .map(move |c| format!("{}{}{}", l, c, r.utf8_index_from(1..)))
        });
        let inserts = splits.iter().flat_map(|(l, r)| {
            self.alphabet
                .chars()
                .map(move |c| format!("{}{}{}", l, c, r))
        });

        HashSet::from_iter(deletes.chain(transposes).chain(replaces).chain(inserts))
    }

    /// Returns every string reachable from `word` by applying `edits1` twice.
    pub fn edits2(&self, word: &str) -> HashSet<String> {
        self.edits1(word)
            .iter()
            .flat_map(|edit| self.edits1(edit))
            .collect()
    }
}

Лог от изпълнението

Compiling solution v0.1.0 (/tmp/d20200114-2173579-14nkjsg/solution)
    Finished test [unoptimized + debuginfo] target(s) in 4.88s
     Running target/debug/deps/solution-a73e64ec87929bd0

running 0 tests

test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out

     Running target/debug/deps/solution_test-38971695424b36d5

running 15 tests
test solution_test::test_best_word_is_returned ... ok
test solution_test::test_clean_line_removes_punctuation ... ok
test solution_test::test_clean_line_trims_the_input ... ok
test solution_test::test_correction ... ok
test solution_test::test_correction_fails_to_produce_new_result ... ok
test solution_test::test_correction_normalizes_case ... ok
test solution_test::test_counting ... ok
test solution_test::test_display ... ok
test solution_test::test_edits1 ... ok
test solution_test::test_edits2 ... ok
test solution_test::test_empty_counter ... ok
test solution_test::test_from_empty_str ... ok
test solution_test::test_from_str ... ok
test solution_test::test_known_words ... ok
test solution_test::test_probability ... ok

test result: ok. 15 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out

   Doc-tests solution

running 0 tests

test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out

История (3 версии и 1 коментар)

Антонио качи първо решение на 10.01.2020 16:03 (преди над 5 години)

Антонио качи решение на 11.01.2020 17:56 (преди над 5 години)

use std::collections::HashMap;
use std::collections::HashSet;
use std::iter::FromIterator;
+use std::ops::{RangeFrom, Range};
pub fn clean_line(input: &str) -> String {
- input.chars().filter(|&c| c.is_alphabetic() || c.is_whitespace() || "'-".contains(c)).collect()
+ input.trim().chars().filter(|&c| c.is_alphabetic() || c.is_whitespace() || "'-".contains(c)).collect()
}
fn clean_word(input: &str) -> String {
input.trim().to_lowercase()
}
+trait Utf8Index {
+ fn utf8_index(&self, r: Range<usize>) -> String;
+ fn utf8_index_from(&self, r: RangeFrom<usize>) -> String;
+}
+
+impl Utf8Index for &str {
+ fn utf8_index(&self, r: Range<usize>) -> String {
+ self.chars().skip(r.start).take(r.end - r.start).collect()
+ }
+
+ fn utf8_index_from(&self, r: RangeFrom<usize>) -> String {
+ self.utf8_index(r.start..self.len())
+ }
+}
+
pub struct WordCounter {
word_frequencies: HashMap<String, u32>
}
impl WordCounter {
pub fn new() -> Self {
WordCounter { word_frequencies: HashMap::new() }
}
pub fn from_str(input: &str) -> Self {
let cleaned: Vec<String> = input.lines().map(clean_line).collect();
let words: Vec<String> = cleaned.iter()
.flat_map(|l| l.split_whitespace())
.map(String::from).collect();
let mut counter = WordCounter { word_frequencies: HashMap::new() };
words.iter().for_each(|w|counter.add(w));
counter
}
pub fn words(&self) -> Vec<&String> {
let mut words: Vec<&String> = self.word_frequencies.keys().collect();
words.sort();
words
}
pub fn add(&mut self, item: &str) {
let word = clean_word(item);
*self.word_frequencies.entry(word).or_insert(0) += 1;
}
pub fn get(&self, word: &str) -> u32 {
- let word = clean_word(word);
- *self.word_frequencies.get(&word).unwrap_or(&0)
+ *self.word_frequencies.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.word_frequencies.values().sum()
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut core::fmt::Formatter) -> std::fmt::Result {
writeln!(f, "WordCounter, total count: {}", self.total_count())?;
let mut entries: Vec<(&String, &u32)> = self.word_frequencies.iter().collect();
entries.sort_by(|(_, a), (_, b)| b.cmp(a));
for (word, freq) in entries {
writeln!(f, "{}: {}", word, freq)?;
}
Ok(())
}
}
pub struct SpellChecker {
counter: WordCounter,
alphabet: String
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
SpellChecker { counter: WordCounter::from_str(corpus), alphabet: String::from(alphabet) }
}
pub fn correction(&self, word: &str) -> String {
self.candidates(&clean_word(word)).iter()
.max_by(|a, b| self.probability(a).partial_cmp(&self.probability(b)).unwrap())
.unwrap().clone()
}
pub fn probability(&self, word: &str) -> f64 {
self.counter.get(word) as f64 / self.counter.total_count() as f64
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
let known = self.counter.words();
words.iter().filter(|w| known.contains(w)).collect()
}
pub fn candidates(&self, word: &str) -> Vec<String> {
let mut word_set = HashSet::new();
word_set.insert(String::from(word));
self.try_words(&word_set)
.or_else(|| self.try_words(&self.edits1(word)))
.or_else(|| self.try_words(&self.edits2(word)))
.unwrap_or(vec![String::from(word)])
}
fn try_words(&self, words: &HashSet<String>) -> Option<Vec<String>> {
let result = self.known(words);
Some(result.iter().map(|&s| s.clone()).collect())
.filter(|_| !result.is_empty()) // Little hack so there isn't an if else with None on the else branch
}
pub fn edits1(&self, word: &str) -> HashSet<String> { // Who said Rust code needs to be longer than Python
- let splits: Vec<(&str, &str)> = (0..word.len()+1).map(|i| (&word[..i], &word[i..])).collect();
- let deletes = splits.iter().filter(|(_, r)| !r.is_empty()).map(|(l, r)| format!("{}{}",l, &r[1..]));
- let transposes = splits.iter().filter(|(_, r)| r.len() > 1).map(|(l, r)| format!("{}{}{}{}", l, &r[1..2], &r[0..1], &r[2..]));
- let replaces = splits.iter() .filter(|(_, r)| !r.is_empty()).flat_map(|(l, r)| self.alphabet.chars().map(move |c| format!("{}{}{}", l, c, &r[1..])));
+ let splits: Vec<(&str, &str)> = (0..word.len()+1).filter(|&i| word.is_char_boundary(i)).map(|i| (word.split_at(i))).collect();
+ let deletes = splits.iter().filter(|(_, r)| !r.is_empty()).map(|(l, r)| format!("{}{}",l, r.utf8_index_from(1..)));
+ let transposes = splits.iter().filter(|(_, r)| r.len() > 1).map(|(l, r)| format!("{}{}{}{}", l, r.utf8_index(1..2), r.utf8_index(0..1), r.utf8_index_from(2..)));
+ let replaces = splits.iter() .filter(|(_, r)| !r.is_empty()).flat_map(|(l, r)| self.alphabet.chars().map(move |c| format!("{}{}{}", l, c, r.utf8_index_from(1..))));
let inserts = splits.iter().flat_map(|(l, r)| self.alphabet.chars().map(move |c| format!("{}{}{}", l, c, r)));
HashSet::from_iter(deletes.chain(transposes).chain(replaces).chain(inserts))
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
self.edits1(word).iter().flat_map(|ew| self.edits1(ew)).collect()
}
-}
+}

Антонио качи решение на 11.01.2020 18:04 (преди над 5 години)

use std::collections::HashMap;
use std::collections::HashSet;
use std::iter::FromIterator;
use std::ops::{RangeFrom, Range};
pub fn clean_line(input: &str) -> String {
input.trim().chars().filter(|&c| c.is_alphabetic() || c.is_whitespace() || "'-".contains(c)).collect()
}
fn clean_word(input: &str) -> String {
input.trim().to_lowercase()
}
trait Utf8Index {
fn utf8_index(&self, r: Range<usize>) -> String;
fn utf8_index_from(&self, r: RangeFrom<usize>) -> String;
}
impl Utf8Index for &str {
fn utf8_index(&self, r: Range<usize>) -> String {
self.chars().skip(r.start).take(r.end - r.start).collect()
}
fn utf8_index_from(&self, r: RangeFrom<usize>) -> String {
- self.utf8_index(r.start..self.len())
+ self.chars().skip(r.start).collect()
}
}
pub struct WordCounter {
word_frequencies: HashMap<String, u32>
}
impl WordCounter {
pub fn new() -> Self {
WordCounter { word_frequencies: HashMap::new() }
}
pub fn from_str(input: &str) -> Self {
let cleaned: Vec<String> = input.lines().map(clean_line).collect();
let words: Vec<String> = cleaned.iter()
.flat_map(|l| l.split_whitespace())
.map(String::from).collect();
let mut counter = WordCounter { word_frequencies: HashMap::new() };
words.iter().for_each(|w|counter.add(w));
counter
}
pub fn words(&self) -> Vec<&String> {
let mut words: Vec<&String> = self.word_frequencies.keys().collect();
words.sort();
words
}
pub fn add(&mut self, item: &str) {
let word = clean_word(item);
*self.word_frequencies.entry(word).or_insert(0) += 1;
}
pub fn get(&self, word: &str) -> u32 {
*self.word_frequencies.get(word).unwrap_or(&0)
}
pub fn total_count(&self) -> u32 {
self.word_frequencies.values().sum()
}
}
impl std::fmt::Display for WordCounter {
fn fmt(&self, f: &mut core::fmt::Formatter) -> std::fmt::Result {
writeln!(f, "WordCounter, total count: {}", self.total_count())?;
let mut entries: Vec<(&String, &u32)> = self.word_frequencies.iter().collect();
entries.sort_by(|(_, a), (_, b)| b.cmp(a));
for (word, freq) in entries {
writeln!(f, "{}: {}", word, freq)?;
}
Ok(())
}
}
pub struct SpellChecker {
counter: WordCounter,
alphabet: String
}
impl SpellChecker {
pub fn new(corpus: &str, alphabet: &str) -> Self {
SpellChecker { counter: WordCounter::from_str(corpus), alphabet: String::from(alphabet) }
}
pub fn correction(&self, word: &str) -> String {
self.candidates(&clean_word(word)).iter()
.max_by(|a, b| self.probability(a).partial_cmp(&self.probability(b)).unwrap())
.unwrap().clone()
}
pub fn probability(&self, word: &str) -> f64 {
self.counter.get(word) as f64 / self.counter.total_count() as f64
}
pub fn known<'a>(&self, words: &'a HashSet<String>) -> Vec<&'a String> {
let known = self.counter.words();
words.iter().filter(|w| known.contains(w)).collect()
}
pub fn candidates(&self, word: &str) -> Vec<String> {
let mut word_set = HashSet::new();
word_set.insert(String::from(word));
self.try_words(&word_set)
.or_else(|| self.try_words(&self.edits1(word)))
.or_else(|| self.try_words(&self.edits2(word)))
.unwrap_or(vec![String::from(word)])
}
fn try_words(&self, words: &HashSet<String>) -> Option<Vec<String>> {
let result = self.known(words);
Some(result.iter().map(|&s| s.clone()).collect())
.filter(|_| !result.is_empty()) // Little hack so there isn't an if else with None on the else branch
}
pub fn edits1(&self, word: &str) -> HashSet<String> { // Who said Rust code needs to be longer than Python
let splits: Vec<(&str, &str)> = (0..word.len()+1).filter(|&i| word.is_char_boundary(i)).map(|i| (word.split_at(i))).collect();
- let deletes = splits.iter().filter(|(_, r)| !r.is_empty()).map(|(l, r)| format!("{}{}",l, r.utf8_index_from(1..)));
- let transposes = splits.iter().filter(|(_, r)| r.len() > 1).map(|(l, r)| format!("{}{}{}{}", l, r.utf8_index(1..2), r.utf8_index(0..1), r.utf8_index_from(2..)));
- let replaces = splits.iter() .filter(|(_, r)| !r.is_empty()).flat_map(|(l, r)| self.alphabet.chars().map(move |c| format!("{}{}{}", l, c, r.utf8_index_from(1..))));
+ let deletes = splits.iter().map(|(l, r)| format!("{}{}",l, r.utf8_index_from(1..)));
+ let transposes = splits.iter().map(|(l, r)| format!("{}{}{}{}", l, r.utf8_index(1..2), r.utf8_index(0..1), r.utf8_index_from(2..)));
+ let replaces = splits.iter().flat_map(|(l, r)| self.alphabet.chars().map(move |c| format!("{}{}{}", l, c, r.utf8_index_from(1..))));
let inserts = splits.iter().flat_map(|(l, r)| self.alphabet.chars().map(move |c| format!("{}{}{}", l, c, r)));
HashSet::from_iter(deletes.chain(transposes).chain(replaces).chain(inserts))
}
pub fn edits2(&self, word: &str) -> HashSet<String> {
self.edits1(word).iter().flat_map(|ew| self.edits1(ew)).collect()
}
}