Make the built-in stop word lists selectable via the Language enum already used by the Stemmer filter. (#1671)

2026-01-08 10:02:55 +00:00 · 2022-11-15 09:40:25 +01:00
parent eda6e5a10a
commit ca6231170e
1 changed files with 33 additions and 91 deletions
--- a/src/tokenizer/stop_word_filter/mod.rs
+++ b/src/tokenizer/stop_word_filter/mod.rs
@@ -18,8 +18,7 @@ use std::sync::Arc;

 use rustc_hash::FxHashSet;

-use super::{Token, TokenFilter, TokenStream};
-use crate::tokenizer::BoxTokenStream;
+use super::{BoxTokenStream, Language, Token, TokenFilter, TokenStream};

 /// `TokenFilter` that removes stop words from a token stream
 #[derive(Clone)]
@@ -28,95 +27,44 @@ pub struct StopWordFilter {
 }

 impl StopWordFilter {
+    /// Creates a new [`StopWordFilter`] for the given [`Language`].
+    ///
+    /// Returns `Some` if a built-in stop word list exists for `language` and `None` otherwise.
+    #[cfg(feature = "stopwords")]
+    pub fn new(language: Language) -> Option<Self> {
+        let words = match language {
+            Language::Danish => stopwords::DANISH,
+            Language::Dutch => stopwords::DUTCH,
+            Language::English => {
+                // This is the same list of words used by the Apache-licensed Lucene project,
+                // see https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
+                &[
+                    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
+                    "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
+                    "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
+                ]
+            }
+            Language::Finnish => stopwords::FINNISH,
+            Language::French => stopwords::FRENCH,
+            Language::German => stopwords::GERMAN,
+            Language::Italian => stopwords::ITALIAN,
+            Language::Norwegian => stopwords::NORWEGIAN,
+            Language::Portuguese => stopwords::PORTUGUESE,
+            Language::Russian => stopwords::RUSSIAN,
+            Language::Spanish => stopwords::SPANISH,
+            Language::Swedish => stopwords::SWEDISH,
+            _ => return None,
+        };
+
+        Some(Self::remove(words.iter().map(|&word| word.to_owned())))
+    }
+
    /// Creates a `StopWordFilter` given a list of words to remove
    pub fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
        StopWordFilter {
            words: Arc::new(words.into_iter().collect()),
        }
    }
-
-    fn from_word_list(words: &[&str]) -> Self {
-        Self::remove(words.iter().map(|&word| word.to_owned()))
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Danish language
-    pub fn danish() -> Self {
-        Self::from_word_list(stopwords::DANISH)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Dutch language
-    pub fn dutch() -> Self {
-        Self::from_word_list(stopwords::DUTCH)
-    }
-
-    /// Create a `StopWorldFilter` for the English language
-    pub fn english() -> Self {
-        // This is the same list of words used by the Apache-licensed Lucene project,
-        // c.f. https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
-        const WORDS: &[&str] = &[
-            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
-            "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
-            "there", "these", "they", "this", "to", "was", "will", "with",
-        ];
-
-        Self::from_word_list(WORDS)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Finnish language
-    pub fn finnish() -> Self {
-        Self::from_word_list(stopwords::FINNISH)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the French language
-    pub fn french() -> Self {
-        Self::from_word_list(stopwords::FRENCH)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the German language
-    pub fn german() -> Self {
-        Self::from_word_list(stopwords::GERMAN)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Italian language
-    pub fn italian() -> Self {
-        Self::from_word_list(stopwords::ITALIAN)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Norwegian language
-    pub fn norwegian() -> Self {
-        Self::from_word_list(stopwords::NORWEGIAN)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Portuguese language
-    pub fn portuguese() -> Self {
-        Self::from_word_list(stopwords::PORTUGUESE)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Russian language
-    pub fn russian() -> Self {
-        Self::from_word_list(stopwords::RUSSIAN)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Spanish language
-    pub fn spanish() -> Self {
-        Self::from_word_list(stopwords::SPANISH)
-    }
-
-    #[cfg(feature = "stopwords")]
-    /// Create a `StopWorldFilter` for the Swedish language
-    pub fn swedish() -> Self {
-        Self::from_word_list(stopwords::SWEDISH)
-    }
 }

 pub struct StopWordFilterStream<'a> {
@@ -158,12 +106,6 @@ impl<'a> TokenStream for StopWordFilterStream<'a> {
    }
 }

-impl Default for StopWordFilter {
-    fn default() -> StopWordFilter {
-        StopWordFilter::english()
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use crate::tokenizer::tests::assert_token;