mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
Include stop word lists from Lucene and the Snowball project (#1666)
This commit is contained in:
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -48,7 +48,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
features: [
|
features: [
|
||||||
{ label: "all", flags: "mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
|
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
|
||||||
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
|
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -91,8 +91,9 @@ debug-assertions = true
|
|||||||
overflow-checks = true
|
overflow-checks = true
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["mmap", "lz4-compression" ]
|
default = ["mmap", "stopwords", "lz4-compression"]
|
||||||
mmap = ["fs2", "tempfile", "memmap2"]
|
mmap = ["fs2", "tempfile", "memmap2"]
|
||||||
|
stopwords = []
|
||||||
|
|
||||||
brotli-compression = ["brotli"]
|
brotli-compression = ["brotli"]
|
||||||
lz4-compression = ["lz4_flex"]
|
lz4-compression = ["lz4_flex"]
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ use crate::DocId;
|
|||||||
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
||||||
/// of each document for each field with field norms.
|
/// of each document for each field with field norms.
|
||||||
///
|
///
|
||||||
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
|
/// `FieldNormsWriter` stores a `Vec<u8>` for each tracked field, using a
|
||||||
/// byte per document per field.
|
/// byte per document per field.
|
||||||
pub struct FieldNormsWriter {
|
pub struct FieldNormsWriter {
|
||||||
fieldnorms_buffers: Vec<Option<Vec<u8>>>,
|
fieldnorms_buffers: Vec<Option<Vec<u8>>>,
|
||||||
|
|||||||
42
src/tokenizer/stop_word_filter/gen_stopwords.py
Normal file
42
src/tokenizer/stop_word_filter/gen_stopwords.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
"""Generate `stopwords.rs` from the Snowball project's stop word lists.

Run from the directory that should receive the output: the script writes
`stopwords.rs` into the current working directory. It downloads the Snowball
`COPYING` file, which is embedded as a leading block comment, and then one
`stop.txt` per entry in `LANGUAGES`, each of which becomes a
`pub const <LANGUAGE>: &[&str]` array.
"""

import requests

# Snowball language names; each one is used both in the download URL and,
# upper-cased, as the name of the generated Rust constant.
LANGUAGES = [
    "danish",
    "dutch",
    "finnish",
    "french",
    "german",
    "italian",
    "norwegian",
    "portuguese",
    "russian",
    "spanish",
    "swedish",
]

# The output must be written as UTF-8 explicitly: several lists contain
# non-ASCII words and Rust source files have to be UTF-8, whereas `open()`
# without `encoding` falls back to the platform's locale encoding.
with requests.Session() as sess, open("stopwords.rs", "w", encoding="utf-8") as mod:
    # Header comment: attribution followed by the upstream license text.
    mod.write("/*\n")
    mod.write(
        "These stop word lists are from the Snowball project (https://snowballstem.org/)\nwhich carries the following copyright and license:\n\n"
    )

    resp = sess.get(
        "https://raw.githubusercontent.com/snowballstem/snowball/master/COPYING"
    )
    resp.raise_for_status()
    mod.write(resp.text)
    mod.write("*/\n\n")

    for lang in LANGUAGES:
        resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
        resp.raise_for_status()

        mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")

        for line in resp.text.splitlines():
            # Everything from the first `|` onwards is a comment in the
            # upstream list; keep only the part before it.
            line, _, _ = line.partition("|")

            # A line may carry several whitespace-separated words (or none).
            for word in line.split():
                mod.write(f' "{word}",\n')

        mod.write("];\n\n")
|
||||||
@@ -10,6 +10,10 @@
|
|||||||
//! assert_eq!(stream.next().unwrap().text, "crafty");
|
//! assert_eq!(stream.next().unwrap().text, "crafty");
|
||||||
//! assert!(stream.next().is_none());
|
//! assert!(stream.next().is_none());
|
||||||
//! ```
|
//! ```
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
#[rustfmt::skip]
|
||||||
|
mod stopwords;
|
||||||
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use rustc_hash::FxHashSet;
|
use rustc_hash::FxHashSet;
|
||||||
@@ -31,14 +35,87 @@ impl StopWordFilter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn english() -> StopWordFilter {
|
fn from_word_list(words: &[&str]) -> Self {
|
||||||
let words: [&'static str; 33] = [
|
Self::remove(words.iter().map(|&word| word.to_owned()))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Danish language
|
||||||
|
pub fn danish() -> Self {
|
||||||
|
Self::from_word_list(stopwords::DANISH)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Dutch language
|
||||||
|
pub fn dutch() -> Self {
|
||||||
|
Self::from_word_list(stopwords::DUTCH)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a `StopWordFilter` for the English language
|
||||||
|
pub fn english() -> Self {
|
||||||
|
// This is the same list of words used by the Apache-licensed Lucene project,
|
||||||
|
// c.f. https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
|
||||||
|
const WORDS: &[&str] = &[
|
||||||
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
|
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
|
||||||
"is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
|
"is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
|
||||||
"there", "these", "they", "this", "to", "was", "will", "with",
|
"there", "these", "they", "this", "to", "was", "will", "with",
|
||||||
];
|
];
|
||||||
|
|
||||||
StopWordFilter::remove(words.iter().map(|&s| s.to_string()))
|
Self::from_word_list(WORDS)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Finnish language
|
||||||
|
pub fn finnish() -> Self {
|
||||||
|
Self::from_word_list(stopwords::FINNISH)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the French language
|
||||||
|
pub fn french() -> Self {
|
||||||
|
Self::from_word_list(stopwords::FRENCH)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the German language
|
||||||
|
pub fn german() -> Self {
|
||||||
|
Self::from_word_list(stopwords::GERMAN)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Italian language
|
||||||
|
pub fn italian() -> Self {
|
||||||
|
Self::from_word_list(stopwords::ITALIAN)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Norwegian language
|
||||||
|
pub fn norwegian() -> Self {
|
||||||
|
Self::from_word_list(stopwords::NORWEGIAN)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Portuguese language
|
||||||
|
pub fn portuguese() -> Self {
|
||||||
|
Self::from_word_list(stopwords::PORTUGUESE)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Russian language
|
||||||
|
pub fn russian() -> Self {
|
||||||
|
Self::from_word_list(stopwords::RUSSIAN)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Spanish language
|
||||||
|
pub fn spanish() -> Self {
|
||||||
|
Self::from_word_list(stopwords::SPANISH)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "stopwords")]
|
||||||
|
/// Create a `StopWordFilter` for the Swedish language
|
||||||
|
pub fn swedish() -> Self {
|
||||||
|
Self::from_word_list(stopwords::SWEDISH)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
2117
src/tokenizer/stop_word_filter/stopwords.rs
Normal file
2117
src/tokenizer/stop_word_filter/stopwords.rs
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user