Files
tantivy/src/tokenizer/split_compound_words.rs
2022-10-27 08:30:33 +02:00

253 lines
8.2 KiB
Rust

use std::sync::Arc;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
///
/// Words only will be split if they can be fully decomposed into
/// consecutive matches into the given dictionary.
///
/// This is mostly useful to split [compound nouns][compound] common to many
/// Germanic languages into their constituents.
///
/// # Example
///
/// The quality of the dictionary determines the quality of the splits,
/// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
/// TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
/// "dampf", "schiff", "fahrt", "brot", "backen", "automat",
/// ]));
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
/// assert_eq!(stream.next().unwrap().text, "schiff");
/// assert_eq!(stream.next().unwrap().text, "fahrt");
/// assert_eq!(stream.next(), None);
///
/// let mut stream = tokenizer.token_stream("brotbackautomat");
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
/// assert_eq!(stream.next(), None);
/// ```
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords<S: StateID> {
dict: Arc<AhoCorasick<S>>,
}
impl SplitCompoundWords<usize> {
/// Create a filter from a given dictionary.
///
/// The dictionary will be used to construct an [`AhoCorasick`] automaton
/// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
/// more control over its construction is required.
pub fn from_dictionary<I, P>(dict: I) -> Self
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
let dict = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build(dict);
Self::from_automaton(dict)
}
}
impl<S: StateID> SplitCompoundWords<S> {
/// Create a filter from a given automaton.
///
/// The automaton should use one of the leftmost-first match kinds
/// and it should not be anchored.
pub fn from_automaton(dict: AhoCorasick<S>) -> Self {
Self {
dict: Arc::new(dict),
}
}
}
impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: stream,
cuts: Vec::new(),
parts: Vec::new(),
})
}
}
struct SplitCompoundWordsTokenStream<'a, S: StateID> {
dict: Arc<AhoCorasick<S>>,
tail: BoxTokenStream<'a>,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
let token = self.tail.token();
let mut text = token.text.as_str();
self.cuts.clear();
let mut pos = 0;
for match_ in self.dict.find_iter(text) {
if pos != match_.start() {
break;
}
self.cuts.push(pos);
pos = match_.end();
}
if pos == token.text.len() {
// Fill `self.parts` in reverse order,
// so that `self.parts.pop()` yields
// the tokens in their original order.
for pos in self.cuts.iter().rev() {
let (head, tail) = text.split_at(*pos);
text = head;
self.parts.push(Token {
text: tail.to_owned(),
..*token
});
}
}
}
}
impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
fn advance(&mut self) -> bool {
self.parts.pop();
if !self.parts.is_empty() {
return true;
}
if !self.tail.advance() {
return false;
}
// Will yield either `self.parts.last()` or
// `self.tail.token()` if it could not be split.
self.split();
true
}
fn token(&self) -> &Token {
self.parts.last().unwrap_or_else(|| self.tail.token())
}
fn token_mut(&mut self) -> &mut Token {
self.parts
.last_mut()
.unwrap_or_else(|| self.tail.token_mut())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};
#[test]
fn splitting_compound_words_works() {
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));
{
let mut stream = tokenizer.token_stream("");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foo bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobarbaz");
assert_eq!(stream.next().unwrap().text, "foobarbaz");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("baz foobar qux");
assert_eq!(stream.next().unwrap().text, "baz");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foobazbar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar qux foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("barfoo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next(), None);
}
}
}