diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index f94ec632f..45691d470 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -39,6 +39,16 @@ impl StopWordFilter { StopWordFilter { words: set } } + + /** Builds a `StopWordFilter` from a hard-coded list of 33 common English words. NOTE(review): declared without `pub` -- confirm that outside callers are meant to reach it through `Default`. */ fn english() -> StopWordFilter { + let words: [&'static str; 33] = [ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", + "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", "with", + ]; + + /* Each static word is turned into an owned `String` before the collection is handed to `StopWordFilter::remove`. */ StopWordFilter::remove(words.iter().map(|s| s.to_string()).collect()) + } } pub struct StopWordFilterStream @@ -98,3 +108,9 @@ where false } } + +/** `Default` implementation: the default filter is the one produced by `StopWordFilter::english()`. */ impl Default for StopWordFilter { + /** Returns the result of `StopWordFilter::english()`. */ fn default() -> StopWordFilter { + StopWordFilter::english() + } +} diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af3b..410e7f30b 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -8,6 +8,7 @@ use tokenizer::RawTokenizer; use tokenizer::RemoveLongFilter; use tokenizer::SimpleTokenizer; use tokenizer::Stemmer; +use tokenizer::StopWordFilter; use tokenizer::Tokenizer; /// The tokenizer manager serves as a store for