mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-29 14:40:40 +00:00
Add default EN stopwords to the default analyzer (#381)
* Add a default list of en stopwords
* Add the default en stopword filter to the standard tokenizers
* code review feedback
This commit is contained in:
committed by
Paul Masurel
parent
3d73c0c240
commit
af593b1116
@@ -39,6 +39,16 @@ impl StopWordFilter {
|
||||
|
||||
StopWordFilter { words: set }
|
||||
}
|
||||
|
||||
fn english() -> StopWordFilter {
|
||||
let words: [&'static str; 33] = [
|
||||
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
|
||||
"is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
|
||||
"there", "these", "they", "this", "to", "was", "will", "with",
|
||||
];
|
||||
|
||||
StopWordFilter::remove(words.iter().map(|s| s.to_string()).collect())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StopWordFilterStream<TailTokenStream>
|
||||
@@ -98,3 +108,9 @@ where
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for StopWordFilter {
|
||||
fn default() -> StopWordFilter {
|
||||
StopWordFilter::english()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ use tokenizer::RawTokenizer;
|
||||
use tokenizer::RemoveLongFilter;
|
||||
use tokenizer::SimpleTokenizer;
|
||||
use tokenizer::Stemmer;
|
||||
use tokenizer::StopWordFilter;
|
||||
use tokenizer::Tokenizer;
|
||||
|
||||
/// The tokenizer manager serves as a store for
|
||||
|
||||
Reference in New Issue
Block a user