Merge pull request #1649 from adamreichold/split-compound-words
RFC: Add dictionary-based SplitCompoundWords token filter.
Cargo.toml

@@ -20,6 +20,7 @@ byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
+aho-corasick = "0.7"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.5.3", optional = true }
 lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
src/tokenizer/mod.rs

@@ -126,6 +126,7 @@ mod ngram_tokenizer;
 mod raw_tokenizer;
 mod remove_long;
 mod simple_tokenizer;
+mod split_compound_words;
 mod stemmer;
 mod stop_word_filter;
 mod tokenized_string;
@@ -141,6 +142,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
 pub use self::raw_tokenizer::RawTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::simple_tokenizer::SimpleTokenizer;
+pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
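With `SplitCompoundWords` exported from `tantivy::tokenizer`, a downstream crate can wire the filter into an index by registering it under a custom analyzer name. The sketch below is not part of this commit; the analyzer name `de_compound`, the field name `text`, and the three-entry dictionary are placeholders chosen for illustration.

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
use tantivy::Index;

fn main() {
    // Declare a text field whose indexing refers to the analyzer by name.
    let mut schema_builder = Schema::builder();
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("de_compound")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let options = TextOptions::default().set_indexing_options(indexing);
    schema_builder.add_text_field("text", options);
    let schema = schema_builder.build();

    // Build the analyzer and register it under that name before indexing documents.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(SplitCompoundWords::from_dictionary(["dampf", "schiff", "fahrt"]));
    let index = Index::create_in_ram(schema);
    index.tokenizers().register("de_compound", analyzer);
}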
src/tokenizer/split_compound_words.rs (new file, 252 lines)

@@ -0,0 +1,252 @@
use std::sync::Arc;

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};

use super::{BoxTokenStream, Token, TokenFilter, TokenStream};

/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
///
/// Words will only be split if they can be fully decomposed into
/// consecutive matches against the given dictionary.
///
/// This is mostly useful to split [compound nouns][compound] common to many
/// Germanic languages into their constituents.
///
/// # Example
///
/// The quality of the dictionary determines the quality of the splits,
/// e.g. the missing stem "back" of "backen" means that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
///     TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
///         "dampf", "schiff", "fahrt", "brot", "backen", "automat",
///     ]));
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
/// assert_eq!(stream.next().unwrap().text, "schiff");
/// assert_eq!(stream.next().unwrap().text, "fahrt");
/// assert_eq!(stream.next(), None);
///
/// let mut stream = tokenizer.token_stream("brotbackautomat");
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
/// assert_eq!(stream.next(), None);
/// ```
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords<S: StateID> {
    dict: Arc<AhoCorasick<S>>,
}

impl SplitCompoundWords<usize> {
    /// Create a filter from a given dictionary.
    ///
    /// The dictionary will be used to construct an [`AhoCorasick`] automaton
    /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
    /// more control over its construction is required.
    pub fn from_dictionary<I, P>(dict: I) -> Self
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let dict = AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostLongest)
            .build(dict);

        Self::from_automaton(dict)
    }
}

impl<S: StateID> SplitCompoundWords<S> {
    /// Create a filter from a given automaton.
    ///
    /// The automaton should use one of the leftmost match kinds
    /// and it should not be anchored.
    pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
        Self {
            dict: Arc::new(dict),
        }
    }
}

impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
    fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(SplitCompoundWordsTokenStream {
            dict: self.dict.clone(),
            tail: stream,
            cuts: Vec::new(),
            parts: Vec::new(),
        })
    }
}

struct SplitCompoundWordsTokenStream<'a, S: StateID> {
    dict: Arc<AhoCorasick<S>>,
    tail: BoxTokenStream<'a>,
    cuts: Vec<usize>,
    parts: Vec<Token>,
}

impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
    // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
    // can fully be split into consecutive matches against `self.dict`.
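    // For example, with the dictionary {"foo", "bar"}, the token "foobar"
    // yields `cuts == [0, 3]` and `parts == ["bar", "foo"]` (pushed in reverse,
    // so `pop()` returns "foo" first), whereas "foobaz" leaves `parts` empty
    // because the trailing "baz" is not in the dictionary.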
    fn split(&mut self) {
        let token = self.tail.token();
        let mut text = token.text.as_str();

        self.cuts.clear();
        let mut pos = 0;

        for match_ in self.dict.find_iter(text) {
            if pos != match_.start() {
                break;
            }

            self.cuts.push(pos);
            pos = match_.end();
        }

        if pos == token.text.len() {
            // Fill `self.parts` in reverse order,
            // so that `self.parts.pop()` yields
            // the tokens in their original order.
            for pos in self.cuts.iter().rev() {
                let (head, tail) = text.split_at(*pos);

                text = head;
                self.parts.push(Token {
                    text: tail.to_owned(),
                    ..*token
                });
            }
        }
    }
}

impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
    fn advance(&mut self) -> bool {
        self.parts.pop();

        if !self.parts.is_empty() {
            return true;
        }

        if !self.tail.advance() {
            return false;
        }

        // Will yield either `self.parts.last()` or
        // `self.tail.token()` if it could not be split.
        self.split();
        true
    }

    fn token(&self) -> &Token {
        self.parts.last().unwrap_or_else(|| self.tail.token())
    }

    fn token_mut(&mut self) -> &mut Token {
        self.parts
            .last_mut()
            .unwrap_or_else(|| self.tail.token_mut())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};

    #[test]
    fn splitting_compound_words_works() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));

        {
            let mut stream = tokenizer.token_stream("");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foo bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobarbaz");
            assert_eq!(stream.next().unwrap().text, "foobarbaz");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("baz foobar qux");
            assert_eq!(stream.next().unwrap().text, "baz");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "qux");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar foo bar foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
            assert_eq!(stream.next().unwrap().text, "foobazbar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("foobar qux foobar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "qux");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next(), None);
        }

        {
            let mut stream = tokenizer.token_stream("barfoo");
            assert_eq!(stream.next().unwrap().text, "bar");
            assert_eq!(stream.next().unwrap().text, "foo");
            assert_eq!(stream.next(), None);
        }
    }
}
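If the defaults chosen by `from_dictionary` are not sufficient, the automaton can be built by hand and passed to `from_automaton`. The sketch below is not part of this commit and assumes aho-corasick 0.7 as pinned in Cargo.toml; the `ascii_case_insensitive` setting and the dictionary entries are illustrative only.

use aho_corasick::{AhoCorasickBuilder, MatchKind};
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};

fn main() {
    // Build the automaton explicitly, e.g. to match dictionary entries
    // ASCII case-insensitively, and hand it to the filter.
    let dict = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostLongest)
        .ascii_case_insensitive(true)
        .build(["dampf", "schiff", "fahrt"]);

    let tokenizer = TextAnalyzer::from(SimpleTokenizer)
        .filter(SplitCompoundWords::from_automaton(dict));

    // The capitalized compound is still decomposed; the parts keep their original casing.
    let mut stream = tokenizer.token_stream("Dampfschifffahrt");
    assert_eq!(stream.next().unwrap().text, "Dampf");
    assert_eq!(stream.next().unwrap().text, "schiff");
    assert_eq!(stream.next().unwrap().text, "fahrt");
    assert_eq!(stream.next(), None);
}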