Add default EN stopwords to the default analyzer (#381)

* Add a default list of en stopwords

* Add the default en stopword filter to the standard tokenizers

* code review feedback
This commit is contained in:
Dru Sellers
2018-08-21 20:49:39 -05:00
committed by Paul Masurel
parent 3d73c0c240
commit af593b1116
2 changed files with 17 additions and 0 deletions

View File

@@ -39,6 +39,16 @@ impl StopWordFilter {
StopWordFilter { words: set }
}
/// Builds a `StopWordFilter` from a fixed table of 33 common English words.
///
/// Every entry of the table is copied into an owned `String`, and the
/// collected words are handed to `StopWordFilter::remove`.
fn english() -> StopWordFilter {
    // The explicit length keeps the table and its declared size in sync at compile time.
    const ENGLISH_STOP_WORDS: &[&str; 33] = &[
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
        "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
        "there", "these", "they", "this", "to", "was", "will", "with",
    ];
    let owned_words = ENGLISH_STOP_WORDS.iter().map(|&word| String::from(word));
    StopWordFilter::remove(owned_words.collect())
}
}
pub struct StopWordFilterStream<TailTokenStream>
@@ -98,3 +108,9 @@ where
false
}
}
impl Default for StopWordFilter {
fn default() -> StopWordFilter {
StopWordFilter::english()
}
}

View File

@@ -8,6 +8,7 @@ use tokenizer::RawTokenizer;
use tokenizer::RemoveLongFilter;
use tokenizer::SimpleTokenizer;
use tokenizer::Stemmer;
use tokenizer::StopWordFilter;
use tokenizer::Tokenizer;
/// The tokenizer manager serves as a store for