From 064518156f570ee2aa03cf63be6d5605a96d6285 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Thu, 9 Mar 2023 09:39:37 +0100 Subject: [PATCH] refactor tokenization pipeline to use GATs (#1924) * refactor tokenization pipeline to use GATs * fix doctests * fix clippy lints * remove commented code --- examples/pre_tokenized_text.rs | 2 +- examples/stop_words.rs | 5 +- src/indexer/segment_writer.rs | 2 +- src/query/more_like_this/more_like_this.rs | 4 +- src/query/query_parser/query_parser.rs | 5 +- src/tokenizer/alphanum_only.rs | 43 ++++++++--- src/tokenizer/ascii_folding_filter.rs | 39 +++++++--- src/tokenizer/empty_tokenizer.rs | 11 +-- src/tokenizer/facet_tokenizer.rs | 8 +- src/tokenizer/lower_caser.rs | 44 +++++++---- src/tokenizer/mod.rs | 19 ++--- src/tokenizer/ngram_tokenizer.rs | 12 +-- src/tokenizer/raw_tokenizer.rs | 5 +- src/tokenizer/regex_tokenizer.rs | 9 ++- src/tokenizer/remove_long.rs | 51 +++++++++---- src/tokenizer/simple_tokenizer.rs | 9 ++- src/tokenizer/split_compound_words.rs | 56 ++++++++++---- src/tokenizer/stemmer.rs | 42 ++++++++--- src/tokenizer/stop_word_filter/mod.rs | 54 +++++++++---- src/tokenizer/tokenizer.rs | 88 ++++++++++------------ src/tokenizer/tokenizer_manager.rs | 10 ++- src/tokenizer/whitespace_tokenizer.rs | 9 ++- tokenizer-api/src/lib.rs | 65 ++++++---------- 23 files changed, 353 insertions(+), 239 deletions(-) diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs index e8bfff7dc..c6595cef9 100644 --- a/examples/pre_tokenized_text.rs +++ b/examples/pre_tokenized_text.rs @@ -12,7 +12,7 @@ use tantivy::collector::{Count, TopDocs}; use tantivy::query::TermQuery; use tantivy::schema::*; -use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer}; +use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer}; use tantivy::{doc, Index, ReloadPolicy}; use tempfile::TempDir; diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 3d4079332..3a4a0651e 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> { // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus - let tokenizer = TextAnalyzer::from(SimpleTokenizer) + let tokenizer = TextAnalyzer::builder(SimpleTokenizer) .filter(LowerCaser) .filter(StopWordFilter::remove(vec![ "the".to_string(), "and".to_string(), - ])); + ])) + .build(); index.tokenizers().register("stoppy", tokenizer); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index d60c640d9..d7cfb4935 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -188,7 +188,7 @@ impl SegmentWriter { let mut indexing_position = IndexingPosition::default(); postings_writer.index_text( doc_id, - &mut *facet_tokenizer, + &mut facet_tokenizer, term_buffer, ctx, &mut indexing_position, diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 995a141c8..cb8884e06 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -4,7 +4,9 @@ use std::collections::{BinaryHeap, HashMap}; use crate::query::bm25::idf; use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery}; use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value}; -use crate::tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer}; +use crate::tokenizer::{ + BoxTokenStream, 
FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer, +}; use crate::{DocAddress, Result, Searcher, TantivyError}; #[derive(Debug, PartialEq)] diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 88417c0de..57939fdad 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -913,9 +913,10 @@ mod test { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "en_with_stop_words", - TextAnalyzer::from(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer) .filter(LowerCaser) - .filter(StopWordFilter::remove(vec!["the".to_string()])), + .filter(StopWordFilter::remove(vec!["the".to_string()])) + .build(), ); QueryParser::new(schema, default_fields, tokenizer_manager) } diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index 6508cc87d..c0175e736 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -2,16 +2,18 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::from(RawTokenizer) -//! .filter(AlphaNumOnlyFilter); +//! let tokenizer = TextAnalyzer::builder(RawTokenizer) +//! .filter(AlphaNumOnlyFilter) +//! .build(); //! //! let mut stream = tokenizer.token_stream("hello there"); //! // is none because the raw filter emits one token that //! // contains a space //! assert!(stream.next().is_none()); //! -//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) -//! .filter(AlphaNumOnlyFilter); +//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! .filter(AlphaNumOnlyFilter) +//! .build(); //! //! let mut stream = tokenizer.token_stream("hello there 💣"); //! assert!(stream.next().is_some()); @@ -19,30 +21,45 @@ //! // the "emoji" is dropped because its not an alphanum //! assert!(stream.next().is_none()); //! ``` -use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; +use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// `TokenFilter` that removes all tokens that contain non /// ascii alphanumeric characters. 
#[derive(Clone)] pub struct AlphaNumOnlyFilter; -pub struct AlphaNumOnlyFilterStream<'a> { - tail: BoxTokenStream<'a>, +pub struct AlphaNumOnlyFilterStream { + tail: T, } -impl<'a> AlphaNumOnlyFilterStream<'a> { +impl AlphaNumOnlyFilterStream { fn predicate(&self, token: &Token) -> bool { token.text.chars().all(|c| c.is_ascii_alphanumeric()) } } impl TokenFilter for AlphaNumOnlyFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream }) + type Tokenizer = AlphaNumOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper { + AlphaNumOnlyFilterWrapper(tokenizer) } } -impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> { +#[derive(Clone)] +pub struct AlphaNumOnlyFilterWrapper(T); + +impl Tokenizer for AlphaNumOnlyFilterWrapper { + type TokenStream<'a> = AlphaNumOnlyFilterStream>; + + fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + AlphaNumOnlyFilterStream { + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for AlphaNumOnlyFilterStream { fn advance(&mut self) -> bool { while self.tail.advance() { if self.predicate(self.tail.token()) { @@ -79,7 +96,9 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter); + let a = TextAnalyzer::builder(SimpleTokenizer) + .filter(AlphaNumOnlyFilter) + .build(); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index f5527a690..6a9de4875 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -1,6 +1,6 @@ use std::mem; -use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; +use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// This class converts alphabetic, numeric, and symbolic Unicode characters /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode @@ -9,20 +9,33 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; pub struct AsciiFoldingFilter; impl TokenFilter for AsciiFoldingFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - From::from(AsciiFoldingFilterTokenStream { - tail: token_stream, - buffer: String::with_capacity(100), - }) + type Tokenizer = AsciiFoldingFilterWrapper; + + fn transform(self, tokenizer: T) -> AsciiFoldingFilterWrapper { + AsciiFoldingFilterWrapper(tokenizer) } } -pub struct AsciiFoldingFilterTokenStream<'a> { - buffer: String, - tail: BoxTokenStream<'a>, +#[derive(Clone)] +pub struct AsciiFoldingFilterWrapper(T); + +impl Tokenizer for AsciiFoldingFilterWrapper { + type TokenStream<'a> = AsciiFoldingFilterTokenStream>; + + fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + AsciiFoldingFilterTokenStream { + buffer: String::with_capacity(100), + tail: self.0.token_stream(text), + } + } } -impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> { +pub struct AsciiFoldingFilterTokenStream { + buffer: String, + tail: T, +} + +impl TokenStream for AsciiFoldingFilterTokenStream { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; @@ -1560,8 +1573,9 @@ mod tests { fn folding_helper(text: &str) -> Vec { let mut tokens = Vec::new(); - TextAnalyzer::from(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer) .filter(AsciiFoldingFilter) + .build() .token_stream(text) 
.process(&mut |token| { tokens.push(token.text.clone()); @@ -1570,8 +1584,9 @@ mod tests { } fn folding_using_raw_tokenizer_helper(text: &str) -> String { - let mut token_stream = TextAnalyzer::from(RawTokenizer) + let mut token_stream = TextAnalyzer::builder(RawTokenizer) .filter(AsciiFoldingFilter) + .build() .token_stream(text); token_stream.advance(); token_stream.token().text.clone() diff --git a/src/tokenizer/empty_tokenizer.rs b/src/tokenizer/empty_tokenizer.rs index 1dca0006d..4f4822206 100644 --- a/src/tokenizer/empty_tokenizer.rs +++ b/src/tokenizer/empty_tokenizer.rs @@ -1,16 +1,17 @@ -use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer}; +use crate::tokenizer::{Token, TokenStream, Tokenizer}; #[derive(Clone)] pub(crate) struct EmptyTokenizer; impl Tokenizer for EmptyTokenizer { - fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> { - EmptyTokenStream::default().into() + type TokenStream<'a> = EmptyTokenStream; + fn token_stream(&self, _text: &str) -> EmptyTokenStream { + EmptyTokenStream::default() } } #[derive(Default)] -struct EmptyTokenStream { +pub struct EmptyTokenStream { token: Token, } @@ -30,7 +31,7 @@ impl TokenStream for EmptyTokenStream { #[cfg(test)] mod tests { - use crate::tokenizer::Tokenizer; + use crate::tokenizer::{TokenStream, Tokenizer}; #[test] fn test_empty_tokenizer() { diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index e5586fb98..3f2f1df2f 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -1,4 +1,4 @@ -use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; +use super::{Token, TokenStream, Tokenizer}; use crate::schema::FACET_SEP_BYTE; /// The `FacetTokenizer` process a `Facet` binary representation @@ -26,7 +26,8 @@ pub struct FacetTokenStream<'a> { } impl Tokenizer for FacetTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + type TokenStream<'a> = FacetTokenStream<'a>; + fn token_stream<'a>(&self, text: &'a str) -> FacetTokenStream<'a> { let token = Token { position: 0, ..Default::default() @@ -36,7 +37,6 @@ impl Tokenizer for FacetTokenizer { state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. token, } - .into() } } @@ -87,7 +87,7 @@ mod tests { use super::FacetTokenizer; use crate::schema::Facet; - use crate::tokenizer::{Token, Tokenizer}; + use crate::tokenizer::{Token, TokenStream, Tokenizer}; #[test] fn test_facet_tokenizer() { diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index c6bc7b225..dc10d3e27 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,29 +1,42 @@ use std::mem; -use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; - -impl TokenFilter for LowerCaser { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(LowerCaserTokenStream { - tail: token_stream, - buffer: String::with_capacity(100), - }) - } -} +use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// Token filter that lowercase terms. 
#[derive(Clone)] pub struct LowerCaser; -pub struct LowerCaserTokenStream<'a> { +impl TokenFilter for LowerCaser { + type Tokenizer = LowerCaserFilter; + + fn transform(self, tokenizer: T) -> Self::Tokenizer { + LowerCaserFilter(tokenizer) + } +} + +#[derive(Clone)] +pub struct LowerCaserFilter(T); + +impl Tokenizer for LowerCaserFilter { + type TokenStream<'a> = LowerCaserTokenStream>; + + fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + LowerCaserTokenStream { + tail: self.0.token_stream(text), + buffer: String::new(), + } + } +} + +pub struct LowerCaserTokenStream { buffer: String, - tail: BoxTokenStream<'a>, + tail: T, } // writes a lowercased version of text into output. fn to_lowercase_unicode(text: &str, output: &mut String) { output.clear(); + output.reserve(50); for c in text.chars() { // Contrary to the std, we do not take care of sigma special case. // This will have an normalizationo effect, which is ok for search. @@ -31,7 +44,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) { } } -impl<'a> TokenStream for LowerCaserTokenStream<'a> { +impl TokenStream for LowerCaserTokenStream { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; @@ -73,8 +86,9 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let mut token_stream = TextAnalyzer::from(SimpleTokenizer) + let mut token_stream = TextAnalyzer::builder(SimpleTokenizer) .filter(LowerCaser) + .build() .token_stream(text); let mut tokens = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index e283382e1..7b9ef75bb 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -66,10 +66,11 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let en_stem = TextAnalyzer::from(SimpleTokenizer) +//! let en_stem = TextAnalyzer::builder(SimpleTokenizer) //! .filter(RemoveLongFilter::limit(40)) //! .filter(LowerCaser) -//! .filter(Stemmer::new(Language::English)); +//! .filter(Stemmer::new(Language::English)) +//! .build(); //! ``` //! //! Once your tokenizer is defined, you need to @@ -112,9 +113,10 @@ //! let index = Index::create_in_ram(schema); //! //! // We need to register our tokenizer : -//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer) +//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer) //! .filter(RemoveLongFilter::limit(40)) -//! .filter(LowerCaser); +//! .filter(LowerCaser) +//! .build(); //! index //! .tokenizers() //! 
.register("custom_en", custom_en_tokenizer); @@ -137,9 +139,7 @@ mod tokenizer; mod tokenizer_manager; mod whitespace_tokenizer; -pub use tokenizer_api::{ - BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer, -}; +pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer}; pub use self::alphanum_only::AlphaNumOnlyFilter; pub use self::ascii_folding_filter::AsciiFoldingFilter; @@ -237,10 +237,11 @@ pub mod tests { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "el_stem", - TextAnalyzer::from(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) - .filter(Stemmer::new(Language::Greek)), + .filter(Stemmer::new(Language::Greek)) + .build(), ); let en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); let mut tokens: Vec = vec![]; diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index 05ddefa4a..b3af1dd03 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -1,5 +1,4 @@ use super::{Token, TokenStream, Tokenizer}; -use crate::tokenizer::BoxTokenStream; /// Tokenize the text by splitting words into n-grams of the given size(s) /// @@ -132,8 +131,9 @@ pub struct NgramTokenStream<'a> { } impl Tokenizer for NgramTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - From::from(NgramTokenStream { + type TokenStream<'a> = NgramTokenStream<'a>; + fn token_stream<'a>(&self, text: &'a str) -> NgramTokenStream<'a> { + NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( CodepointFrontiers::for_str(text), self.min_gram, @@ -142,7 +142,7 @@ impl Tokenizer for NgramTokenizer { prefix_only: self.prefix_only, text, token: Token::default(), - }) + } } } @@ -303,9 +303,9 @@ mod tests { use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator}; use crate::tokenizer::tests::assert_token; - use crate::tokenizer::{BoxTokenStream, Token, Tokenizer}; + use crate::tokenizer::{Token, TokenStream, Tokenizer}; - fn test_helper(mut tokenizer: BoxTokenStream) -> Vec { + fn test_helper(mut tokenizer: T) -> Vec { let mut tokens: Vec = vec![]; tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); tokens diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index c51d83340..901994915 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,5 +1,4 @@ use super::{Token, TokenStream, Tokenizer}; -use crate::tokenizer::BoxTokenStream; /// For each value of the field, emit a single unprocessed token. #[derive(Clone)] @@ -11,7 +10,8 @@ pub struct RawTokenStream { } impl Tokenizer for RawTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + type TokenStream<'a> = RawTokenStream; + fn token_stream(&self, text: &str) -> RawTokenStream { let token = Token { offset_from: 0, offset_to: text.len(), @@ -23,7 +23,6 @@ impl Tokenizer for RawTokenizer { token, has_token: true, } - .into() } } diff --git a/src/tokenizer/regex_tokenizer.rs b/src/tokenizer/regex_tokenizer.rs index 54bf3e228..9030fb04e 100644 --- a/src/tokenizer/regex_tokenizer.rs +++ b/src/tokenizer/regex_tokenizer.rs @@ -1,6 +1,6 @@ use regex::Regex; -use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; +use super::{Token, TokenStream, Tokenizer}; use crate::TantivyError; /// Tokenize the text by using a regex pattern to split. 
@@ -60,13 +60,14 @@ impl RegexTokenizer { } impl Tokenizer for RegexTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - BoxTokenStream::from(RegexTokenStream { + type TokenStream<'a> = RegexTokenStream<'a>; + fn token_stream<'a>(&self, text: &'a str) -> RegexTokenStream<'a> { + RegexTokenStream { regex: self.regex.clone(), text, token: Token::default(), cursor: 0, - }) + } } } diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index 173291904..933e98adb 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -2,8 +2,9 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) -//! .filter(RemoveLongFilter::limit(5)); +//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! .filter(RemoveLongFilter::limit(5)) +//! .build(); //! //! let mut stream = tokenizer.token_stream("toolong nice"); //! // because `toolong` is more than 5 characters, it is filtered @@ -11,8 +12,7 @@ //! assert_eq!(stream.next().unwrap().text, "nice"); //! assert!(stream.next().is_none()); //! ``` -use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; +use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// `RemoveLongFilter` removes tokens that are longer /// than a given number of bytes (in UTF-8 representation). @@ -31,27 +31,46 @@ impl RemoveLongFilter { } } -impl<'a> RemoveLongFilterStream<'a> { +impl RemoveLongFilterStream { fn predicate(&self, token: &Token) -> bool { token.text.len() < self.token_length_limit } } impl TokenFilter for RemoveLongFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(RemoveLongFilterStream { - token_length_limit: self.length_limit, - tail: token_stream, - }) + type Tokenizer = RemoveLongFilterWrapper; + + fn transform(self, tokenizer: T) -> RemoveLongFilterWrapper { + RemoveLongFilterWrapper { + length_limit: self.length_limit, + inner: tokenizer, + } } } -pub struct RemoveLongFilterStream<'a> { - token_length_limit: usize, - tail: BoxTokenStream<'a>, +#[derive(Clone)] +pub struct RemoveLongFilterWrapper { + length_limit: usize, + inner: T, } -impl<'a> TokenStream for RemoveLongFilterStream<'a> { +impl Tokenizer for RemoveLongFilterWrapper { + type TokenStream<'a> = RemoveLongFilterStream>; + + fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + RemoveLongFilterStream { + token_length_limit: self.length_limit, + tail: self.inner.token_stream(text), + } + } +} + +pub struct RemoveLongFilterStream { + token_length_limit: usize, + tail: T, +} + +impl TokenStream for RemoveLongFilterStream { fn advance(&mut self) -> bool { while self.tail.advance() { if self.predicate(self.tail.token()) { @@ -84,7 +103,9 @@ mod tests { } fn token_stream_helper(text: &str) -> Vec { - let a = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(6)); + let a = TextAnalyzer::builder(SimpleTokenizer) + .filter(RemoveLongFilter::limit(6)) + .build(); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index 9cffcd15b..2b9163b23 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -1,6 +1,6 @@ use std::str::CharIndices; -use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; +use super::{Token, TokenStream, Tokenizer}; /// Tokenize the text by splitting on whitespaces 
and punctuation. #[derive(Clone)] @@ -13,12 +13,13 @@ pub struct SimpleTokenStream<'a> { } impl Tokenizer for SimpleTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - BoxTokenStream::from(SimpleTokenStream { + type TokenStream<'a> = SimpleTokenStream<'a>; + fn token_stream<'a>(&self, text: &'a str) -> SimpleTokenStream<'a> { + SimpleTokenStream { text, chars: text.char_indices(), token: Token::default(), - }) + } } } diff --git a/src/tokenizer/split_compound_words.rs b/src/tokenizer/split_compound_words.rs index b6063e83d..e79c48bac 100644 --- a/src/tokenizer/split_compound_words.rs +++ b/src/tokenizer/split_compound_words.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID}; -use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; +use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// A [`TokenFilter`] which splits compound words into their parts /// based on a given dictionary. @@ -23,9 +23,11 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; /// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer}; /// /// let tokenizer = -/// TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([ +/// TextAnalyzer::builder(SimpleTokenizer) +/// .filter(SplitCompoundWords::from_dictionary([ /// "dampf", "schiff", "fahrt", "brot", "backen", "automat", -/// ])); +/// ])) +/// .build(); /// /// let mut stream = tokenizer.token_stream("dampfschifffahrt"); /// assert_eq!(stream.next().unwrap().text, "dampf"); @@ -76,24 +78,45 @@ impl SplitCompoundWords { } impl TokenFilter for SplitCompoundWords { - fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(SplitCompoundWordsTokenStream { - dict: self.dict.clone(), - tail: stream, - cuts: Vec::new(), - parts: Vec::new(), - }) + type Tokenizer = SplitCompoundWordsFilter; + + fn transform(self, tokenizer: T) -> SplitCompoundWordsFilter { + SplitCompoundWordsFilter { + dict: self.dict, + inner: tokenizer, + } } } -struct SplitCompoundWordsTokenStream<'a, S: StateID> { +#[derive(Clone)] +pub struct SplitCompoundWordsFilter { dict: Arc>, - tail: BoxTokenStream<'a>, + inner: T, +} + +impl Tokenizer + for SplitCompoundWordsFilter +{ + type TokenStream<'a> = SplitCompoundWordsTokenStream, S>; + + fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + SplitCompoundWordsTokenStream { + dict: self.dict.clone(), + tail: self.inner.token_stream(text), + cuts: Vec::new(), + parts: Vec::new(), + } + } +} + +pub struct SplitCompoundWordsTokenStream { + dict: Arc>, + tail: T, cuts: Vec, parts: Vec, } -impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> { +impl SplitCompoundWordsTokenStream { // Will use `self.cuts` to fill `self.parts` if `self.tail.token()` // can fully be split into consecutive matches against `self.dict`. 
fn split(&mut self) { @@ -129,7 +152,7 @@ impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> { } } -impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> { +impl TokenStream for SplitCompoundWordsTokenStream { fn advance(&mut self) -> bool { self.parts.pop(); @@ -165,8 +188,9 @@ mod tests { #[test] fn splitting_compound_words_works() { - let tokenizer = TextAnalyzer::from(SimpleTokenizer) - .filter(SplitCompoundWords::from_dictionary(["foo", "bar"])); + let tokenizer = TextAnalyzer::builder(SimpleTokenizer) + .filter(SplitCompoundWords::from_dictionary(["foo", "bar"])) + .build(); { let mut stream = tokenizer.token_stream(""); diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index b76361ec3..3f8a3eead 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -4,8 +4,7 @@ use std::mem; use rust_stemmers::{self, Algorithm}; use serde::{Deserialize, Serialize}; -use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; +use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// Available stemmer languages. #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)] @@ -82,23 +81,42 @@ impl Default for Stemmer { } impl TokenFilter for Stemmer { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); - BoxTokenStream::from(StemmerTokenStream { - tail: token_stream, - stemmer: inner_stemmer, - buffer: String::new(), - }) + type Tokenizer = StemmerFilter; + + fn transform(self, tokenizer: T) -> StemmerFilter { + StemmerFilter { + stemmer_algorithm: self.stemmer_algorithm, + inner: tokenizer, + } } } -pub struct StemmerTokenStream<'a> { - tail: BoxTokenStream<'a>, +#[derive(Clone)] +pub struct StemmerFilter { + stemmer_algorithm: Algorithm, + inner: T, +} + +impl Tokenizer for StemmerFilter { + type TokenStream<'a> = StemmerTokenStream>; + + fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); + StemmerTokenStream { + tail: self.inner.token_stream(text), + stemmer, + buffer: String::new(), + } + } +} + +pub struct StemmerTokenStream { + tail: T, stemmer: rust_stemmers::Stemmer, buffer: String, } -impl<'a> TokenStream for StemmerTokenStream<'a> { +impl TokenStream for StemmerTokenStream { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; diff --git a/src/tokenizer/stop_word_filter/mod.rs b/src/tokenizer/stop_word_filter/mod.rs index daee693c5..adfbf17d4 100644 --- a/src/tokenizer/stop_word_filter/mod.rs +++ b/src/tokenizer/stop_word_filter/mod.rs @@ -2,8 +2,9 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) -//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])); +//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer) +//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])) +//! .build(); //! //! let mut stream = tokenizer.token_stream("the fox is crafty"); //! 
assert_eq!(stream.next().unwrap().text, "fox"); @@ -20,7 +21,7 @@ use rustc_hash::FxHashSet; #[cfg(feature = "stopwords")] use super::Language; -use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; +use super::{Token, TokenFilter, TokenStream, Tokenizer}; /// `TokenFilter` that removes stop words from a token stream #[derive(Clone)] @@ -69,27 +70,46 @@ impl StopWordFilter { } } -pub struct StopWordFilterStream<'a> { - words: Arc>, - tail: BoxTokenStream<'a>, -} - impl TokenFilter for StopWordFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(StopWordFilterStream { - words: self.words.clone(), - tail: token_stream, - }) + type Tokenizer = StopWordFilterWrapper; + + fn transform(self, tokenizer: T) -> StopWordFilterWrapper { + StopWordFilterWrapper { + words: self.words, + inner: tokenizer, + } } } -impl<'a> StopWordFilterStream<'a> { +#[derive(Clone)] +pub struct StopWordFilterWrapper { + words: Arc>, + inner: T, +} + +impl Tokenizer for StopWordFilterWrapper { + type TokenStream<'a> = StopWordFilterStream>; + + fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> { + StopWordFilterStream { + words: self.words.clone(), + tail: self.inner.token_stream(text), + } + } +} + +pub struct StopWordFilterStream { + words: Arc>, + tail: T, +} + +impl StopWordFilterStream { fn predicate(&self, token: &Token) -> bool { !self.words.contains(&token.text) } } -impl<'a> TokenStream for StopWordFilterStream<'a> { +impl TokenStream for StopWordFilterStream { fn advance(&mut self) -> bool { while self.tail.advance() { if self.predicate(self.tail.token()) { @@ -131,7 +151,9 @@ mod tests { "am".to_string(), "i".to_string(), ]; - let a = TextAnalyzer::from(SimpleTokenizer).filter(StopWordFilter::remove(stops)); + let a = TextAnalyzer::builder(SimpleTokenizer) + .filter(StopWordFilter::remove(stops)) + .build(); let mut token_stream = a.token_stream(text); let mut tokens: Vec = vec![]; let mut add_token = |token: &Token| { diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 5fa37685b..7e1394076 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -1,15 +1,12 @@ /// The tokenizer module contains all of the tools used to process /// text in `tantivy`. -use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer}; +use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer}; use crate::tokenizer::empty_tokenizer::EmptyTokenizer; /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`. -/// -/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially. pub struct TextAnalyzer { - tokenizer: Box, - token_filters: Vec, + tokenizer: Box, } impl Default for TextAnalyzer { @@ -18,52 +15,21 @@ impl Default for TextAnalyzer { } } -impl From for TextAnalyzer { +impl From for TextAnalyzer { fn from(tokenizer: T) -> Self { - TextAnalyzer::new(tokenizer, Vec::new()) + TextAnalyzer::builder(tokenizer).build() } } impl TextAnalyzer { - /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`. - /// - /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using - /// `TextAnalyzer::from(tokenizer)`. - pub fn new(tokenizer: T, token_filters: Vec) -> TextAnalyzer { - TextAnalyzer { - tokenizer: Box::new(tokenizer), - token_filters, - } - } - - /// Appends a token filter to the current tokenizer. 
- /// - /// The method consumes the current `TokenStream` and returns a - /// new one. - /// - /// # Example - /// - /// ```rust - /// use tantivy::tokenizer::*; - /// - /// let en_stem = TextAnalyzer::from(SimpleTokenizer) - /// .filter(RemoveLongFilter::limit(40)) - /// .filter(LowerCaser) - /// .filter(Stemmer::default()); - /// ``` - #[must_use] - pub fn filter>(mut self, token_filter: F) -> Self { - self.token_filters.push(token_filter.into()); - self + /// Create a new TextAnalyzerBuilder + pub fn builder(tokenizer: T) -> TextAnalyzerBuilder { + TextAnalyzerBuilder { tokenizer } } /// Creates a token stream for a given `str`. pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - let mut token_stream = self.tokenizer.token_stream(text); - for token_filter in &self.token_filters { - token_stream = token_filter.transform(token_stream); - } - token_stream + self.tokenizer.box_token_stream(text) } } @@ -71,11 +37,39 @@ impl Clone for TextAnalyzer { fn clone(&self) -> Self { TextAnalyzer { tokenizer: self.tokenizer.box_clone(), - token_filters: self - .token_filters - .iter() - .map(|token_filter| token_filter.box_clone()) - .collect(), + } + } +} + +/// Builder helper for [`TextAnalyzer`] +pub struct TextAnalyzerBuilder { + tokenizer: T, +} + +impl TextAnalyzerBuilder { + /// Appends a token filter to the current builder. + /// + /// # Example + /// + /// ```rust + /// use tantivy::tokenizer::*; + /// + /// let en_stem = TextAnalyzer::builder(SimpleTokenizer) + /// .filter(RemoveLongFilter::limit(40)) + /// .filter(LowerCaser) + /// .filter(Stemmer::default()) + /// .build(); + /// ``` + pub fn filter(self, token_filter: F) -> TextAnalyzerBuilder> { + TextAnalyzerBuilder { + tokenizer: token_filter.transform(self.tokenizer), + } + } + + /// Finalize building the TextAnalyzer + pub fn build(self) -> TextAnalyzer { + TextAnalyzer { + tokenizer: Box::new(self.tokenizer), } } } diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index 73dcba21a..e849471bc 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -61,16 +61,18 @@ impl Default for TokenizerManager { manager.register("raw", RawTokenizer); manager.register( "default", - TextAnalyzer::from(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer) .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser), + .filter(LowerCaser) + .build(), ); manager.register( "en_stem", - TextAnalyzer::from(SimpleTokenizer) + TextAnalyzer::builder(SimpleTokenizer) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) - .filter(Stemmer::new(Language::English)), + .filter(Stemmer::new(Language::English)) + .build(), ); manager.register("whitespace", WhitespaceTokenizer); manager diff --git a/src/tokenizer/whitespace_tokenizer.rs b/src/tokenizer/whitespace_tokenizer.rs index 87122c0aa..6de19ddd7 100644 --- a/src/tokenizer/whitespace_tokenizer.rs +++ b/src/tokenizer/whitespace_tokenizer.rs @@ -1,6 +1,6 @@ use std::str::CharIndices; -use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; +use super::{Token, TokenStream, Tokenizer}; /// Tokenize the text by splitting on whitespaces. 
 #[derive(Clone)]
 pub struct WhitespaceTokenizer;
 
@@ -13,12 +13,13 @@ pub struct WhitespaceTokenStream<'a> {
 }
 
 impl Tokenizer for WhitespaceTokenizer {
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(WhitespaceTokenStream {
+    type TokenStream<'a> = WhitespaceTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> WhitespaceTokenStream<'a> {
+        WhitespaceTokenStream {
             text,
             chars: text.char_indices(),
             token: Token::default(),
-        })
+        }
     }
 }
 
diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs
index 5fbd8629a..d1497a946 100644
--- a/tokenizer-api/src/lib.rs
+++ b/tokenizer-api/src/lib.rs
@@ -42,28 +42,31 @@ impl Default for Token {
 /// `Tokenizer` are in charge of splitting text into a stream of token
 /// before indexing.
-///
-/// # Warning
-///
-/// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+pub trait Tokenizer: 'static + Clone + Send + Sync {
+    /// The token stream returned by this Tokenizer.
+    type TokenStream<'a>: TokenStream;
     /// Creates a token stream for a given `str`.
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
 }
 
-pub trait TokenizerClone {
-    fn box_clone(&self) -> Box<dyn Tokenizer>;
+/// A boxable `Tokenizer`, with its `TokenStream` type erased.
+pub trait BoxableTokenizer: 'static + Send + Sync {
+    /// Creates a boxed token stream for a given `str`.
+    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+    /// Clone this tokenizer.
+    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
 }
 
-impl<T: Tokenizer + Clone> TokenizerClone for T {
-    fn box_clone(&self) -> Box<dyn Tokenizer> {
+impl<T: Tokenizer> BoxableTokenizer for T {
+    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        self.token_stream(text).into()
+    }
+    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
         Box::new(self.clone())
     }
 }
 
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
-///
-/// See [`TokenStream`] for more information.
 pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
 
 impl<'a, T> From<T> for BoxTokenStream<'a>
 where
@@ -139,39 +142,13 @@ pub trait TokenStream {
     }
 }
 
-/// Simple wrapper of `Box<dyn TokenFilter>`.
-///
-/// See [`TokenFilter`] for more information.
-pub struct BoxTokenFilter(Box<dyn TokenFilter>);
-
-impl Deref for BoxTokenFilter {
-    type Target = dyn TokenFilter;
-
-    fn deref(&self) -> &dyn TokenFilter {
-        &*self.0
-    }
-}
-
-impl<T: TokenFilter> From<T> for BoxTokenFilter {
-    fn from(tokenizer: T) -> BoxTokenFilter {
-        BoxTokenFilter(Box::new(tokenizer))
-    }
-}
-
-pub trait TokenFilterClone {
-    fn box_clone(&self) -> BoxTokenFilter;
-}
-
 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
-    /// Wraps a token stream and returns the modified one.
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
-}
-
-impl<T: TokenFilter + Clone> TokenFilterClone for T {
-    fn box_clone(&self) -> BoxTokenFilter {
-        BoxTokenFilter::from(self.clone())
-    }
+pub trait TokenFilter: 'static + Send + Sync {
+    /// The Tokenizer type returned by this filter, typically parametrized by the underlying
+    /// Tokenizer.
+    type Tokenizer<T: Tokenizer>: Tokenizer;
+    /// Wraps a Tokenizer and returns a new one.
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
 }
 
 #[cfg(test)]
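
Every filter ported in this patch follows the same three-piece pattern: a `TokenFilter` that consumes itself and wraps the inner `Tokenizer`, a wrapper type implementing `Tokenizer` whose `TokenStream<'a>` associated type is built from the inner tokenizer's stream, and a stream type that delegates to its `tail`. The sketch below is a minimal illustration of that pattern for downstream implementers, assuming the traits and `tantivy::tokenizer` re-exports introduced above; `ShortWordFilter` and its wrapper/stream types are hypothetical names, not part of this patch.

```rust
use tantivy::tokenizer::{
    SimpleTokenizer, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};

/// Hypothetical filter: drops tokens shorter than 3 bytes.
#[derive(Clone)]
pub struct ShortWordFilter;

impl TokenFilter for ShortWordFilter {
    type Tokenizer<T: Tokenizer> = ShortWordFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> ShortWordFilterWrapper<T> {
        ShortWordFilterWrapper(tokenizer)
    }
}

/// The `Tokenizer` produced by `ShortWordFilter::transform`.
#[derive(Clone)]
pub struct ShortWordFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for ShortWordFilterWrapper<T> {
    type TokenStream<'a> = ShortWordFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
        ShortWordFilterStream {
            tail: self.0.token_stream(text),
        }
    }
}

/// Stream that skips short tokens while delegating to the wrapped stream.
pub struct ShortWordFilterStream<T> {
    tail: T,
}

impl<T: TokenStream> TokenStream for ShortWordFilterStream<T> {
    fn advance(&mut self) -> bool {
        // Keep pulling from the inner stream until a long-enough token is found.
        while self.tail.advance() {
            if self.tail.token().text.len() >= 3 {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

fn main() {
    // Composition is resolved at compile time; boxing only happens in `build()`.
    let analyzer = TextAnalyzer::builder(SimpleTokenizer)
        .filter(ShortWordFilter)
        .build();

    let mut stream = analyzer.token_stream("an example of a custom filter");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}
```

Compared to the old `BoxTokenFilter` chain, each `.filter()` call now nests the wrapper types statically (here `ShortWordFilterWrapper<SimpleTokenizer>`), and dynamic dispatch is confined to the single `Box<dyn BoxableTokenizer>` stored inside `TextAnalyzer`.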