From ca6fd5effcaf6e6cf292ec00e2c651b4dd5a2d43 Mon Sep 17 00:00:00 2001 From: dcraven Date: Wed, 30 Dec 2020 13:28:27 +0100 Subject: [PATCH] Fix bug. Cleanup some rough spots. Renamed functions. Fixed tests and docs. --- examples/custom_tokenizer.rs | 9 +- examples/stop_words.rs | 7 +- src/core/index.rs | 4 +- src/indexer/segment_writer.rs | 4 +- src/postings/mod.rs | 6 +- src/query/query_parser/query_parser.rs | 12 +- src/snippet/mod.rs | 62 +++++-- src/tokenizer/alphanum_only.rs | 14 +- src/tokenizer/ascii_folding_filter.rs | 10 +- src/tokenizer/facet_tokenizer.rs | 12 +- src/tokenizer/lower_caser.rs | 16 +- src/tokenizer/mod.rs | 20 ++- src/tokenizer/ngram_tokenizer.rs | 2 +- src/tokenizer/raw_tokenizer.rs | 15 +- src/tokenizer/remove_long.rs | 6 +- src/tokenizer/simple_tokenizer.rs | 73 ++++---- src/tokenizer/stop_word_filter.rs | 8 +- src/tokenizer/token_stream_chain.rs | 125 +++++++------- src/tokenizer/tokenized_string.rs | 16 +- src/tokenizer/tokenizer.rs | 228 ++++++++++++------------- src/tokenizer/tokenizer_manager.rs | 24 +-- 21 files changed, 360 insertions(+), 313 deletions(-) diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs index e73842632..4db6d10cb 100644 --- a/examples/custom_tokenizer.rs +++ b/examples/custom_tokenizer.rs @@ -5,7 +5,7 @@ use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::*; -use tantivy::tokenizer::{NgramTokenizer, TextAnalyzer}; +use tantivy::tokenizer::NgramTokenizer; use tantivy::{doc, Index}; fn main() -> tantivy::Result<()> { @@ -52,10 +52,9 @@ fn main() -> tantivy::Result<()> { // here we are registering our custome tokenizer // this will store tokens of 3 characters each - index.tokenizers().register( - "ngram3", - TextAnalyzer::new(NgramTokenizer::new(3, 3, false)), - ); + index + .tokenizers() + .register("ngram3", NgramTokenizer::new(3, 3, false)); // To insert document we need an index writer. // There must be only one writer at a time. 
diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 408ecd8bd..4a7929cdf 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> { // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus - let tokenizer = TextAnalyzer::new(SimpleTokenizer) + let tokenizer = analyzer_builder(SimpleTokenizer) .filter(LowerCaser::new()) - .filter(StopWordFilter::new(vec![ + .filter(StopWordFilter::remove(vec![ "the".to_string(), "and".to_string(), - ])); + ])) + .build(); index.tokenizers().register("stoppy", tokenizer); diff --git a/src/core/index.rs b/src/core/index.rs index 2eb82392f..565efd686 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -20,8 +20,8 @@ use crate::reader::IndexReaderBuilder; use crate::schema::Field; use crate::schema::FieldType; use crate::schema::Schema; -use crate::tokenizer::TextAnalyzerT; -use crate::tokenizer::{TextAnalyzer, TokenizerManager}; +use crate::tokenizer::Tokenizer; +use crate::tokenizer::{TextAnalyzer, TextAnalyzerT, TokenizerManager}; use crate::IndexWriter; use std::collections::HashSet; use std::fmt; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index b34e9bb0c..42684598b 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -13,8 +13,8 @@ use crate::schema::Value; use crate::schema::{Field, FieldEntry}; use crate::tokenizer::PreTokenizedStream; use crate::tokenizer::TokenStream; -use crate::tokenizer::{DynTokenStreamChain, TextAnalyzerT, TokenStreamChain, Tokenizer}; -use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; +use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain, Tokenizer}; +use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT}; use crate::Opstamp; use crate::{DocId, SegmentComponent}; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 65e286c6d..81d56aa6e 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -50,7 +50,9 @@ pub mod tests { use crate::schema::{Field, TextOptions}; use crate::schema::{IndexRecordOption, TextFieldIndexing}; use crate::schema::{Schema, Term, INDEXED, TEXT}; - use crate::tokenizer::{SimpleTokenizer, TextAnalyzer, MAX_TOKEN_LEN}; + use crate::tokenizer::{ + analyzer_builder, SimpleTokenizer, TextAnalyzer, TextAnalyzerT, MAX_TOKEN_LEN, + }; use crate::DocId; use crate::HasLen; use crate::Score; @@ -167,7 +169,7 @@ pub mod tests { let index = Index::create_in_ram(schema.clone()); index .tokenizers() - .register("simple_no_truncation", TextAnalyzer::new(SimpleTokenizer)); + .register("simple_no_truncation", SimpleTokenizer); let reader = index.reader().unwrap(); let mut index_writer = index.writer_for_tests().unwrap(); index_writer.set_merge_policy(Box::new(NoMergePolicy)); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 1544f3263..1675633a4 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -573,14 +573,13 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box { #[cfg(test)] mod test { use super::super::logical_ast::*; - use super::QueryParser; - use super::QueryParserError; + use super::*; use crate::query::Query; use crate::schema::Field; use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT}; use crate::tokenizer::{ - LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, 
TokenizerManager, + analyzer_builder, LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, }; use crate::Index; use matches::assert_matches; @@ -619,9 +618,10 @@ mod test { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "en_with_stop_words", - TextAnalyzer::new(SimpleTokenizer) + analyzer_builder(SimpleTokenizer) .filter(LowerCaser::new()) - .filter(StopWordFilter::new(vec!["the".to_string()])), + .filter(StopWordFilter::remove(vec!["the".to_string()])) + .build(), ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -978,7 +978,7 @@ mod test { let index = Index::create_in_ram(schema); index .tokenizers() - .register("customtokenizer", TextAnalyzer::new(SimpleTokenizer)); + .register("customtokenizer", SimpleTokenizer); let query_parser = QueryParser::for_index(&index, vec![title]); assert_eq!( query_parser.parse_query("title:\"happy tax\"").unwrap_err(), diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index d8da9a026..11f87f5ad 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,7 +1,7 @@ use crate::query::Query; use crate::schema::Field; use crate::schema::Value; -use crate::tokenizer::{TextAnalyzerT, Token}; +use crate::tokenizer::{TextAnalyzerT, Token, Tokenizer}; use crate::Searcher; use crate::{Document, Score}; use htmlescape::encode_minimal; @@ -350,8 +350,13 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") => 1.0, String::from("language") => 0.9 }; - let fragments = - search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 100); + + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + TEST_TEXT, + &terms, + 100, + ); assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -378,8 +383,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>1.0, String::from("language") => 0.9 }; - let fragments = - search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + TEST_TEXT, + &terms, + 20, + ); { let first = &fragments[0]; assert_eq!(first.score, 1.0); @@ -393,8 +402,12 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>0.9, String::from("language") => 1.0 }; - let fragments = - search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + TEST_TEXT, + &terms, + 20, + ); //assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -413,7 +426,12 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("c"), 1.0); - let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + &text, + &terms, + 3, + ); assert_eq!(fragments.len(), 1); { @@ -435,7 +453,12 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); - let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + &text, + &terms, + 3, + ); assert_eq!(fragments.len(), 2); { @@ -458,7 +481,12 @@ Survey in 2016, 2017, and 2018."#; terms.insert(String::from("f"), 1.0); terms.insert(String::from("a"), 0.9); - let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 7); + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + &text, + &terms, + 7, + ); assert_eq!(fragments.len(), 2); 
{ @@ -480,7 +508,12 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("z"), 1.0); - let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + &text, + &terms, + 3, + ); assert_eq!(fragments.len(), 0); @@ -494,7 +527,12 @@ Survey in 2016, 2017, and 2018."#; let text = "a b c d"; let terms = BTreeMap::new(); - let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments( + &Into::>::into(SimpleTokenizer), + &text, + &terms, + 3, + ); assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(&fragments[..], &text); diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index 7c35eb842..58a6df05d 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -2,16 +2,16 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::from(RawTokenizer) -//! .filter(AlphaNumOnlyFilter); +//! let tokenizer = analyzer_builder(RawTokenizer) +//! .filter(AlphaNumOnlyFilter).build(); //! //! let mut stream = tokenizer.token_stream("hello there"); //! // is none because the raw filter emits one token that //! // contains a space //! assert!(stream.next().is_none()); //! -//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) -//! .filter(AlphaNumOnlyFilter); +//! let tokenizer = analyzer_builder(SimpleTokenizer) +//! .filter(AlphaNumOnlyFilter).build(); //! //! let mut stream = tokenizer.token_stream("hello there πŸ’£"); //! assert!(stream.next().is_some()); @@ -23,14 +23,14 @@ use super::{Token, TokenFilter, TokenStream}; /// `TokenFilter` that removes all tokens that contain non /// ascii alphanumeric characters. -#[derive(Clone)] +#[derive(Clone, Debug, Default)] pub struct AlphaNumOnlyFilter; impl TokenFilter for AlphaNumOnlyFilter { fn transform(&mut self, token: Token) -> Option { if token.text.chars().all(|c| c.is_ascii_alphanumeric()) { - return None; + return Some(token); } - Some(token) + None } } diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index a92ea9f3d..bbfb4d4d9 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -1,10 +1,10 @@ -use super::{Token, TokenFilter, TokenStream}; +use super::{analyzer_builder, Token, TokenFilter, TokenStream}; use std::mem; /// This class converts alphabetic, numeric, and symbolic Unicode characters /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode /// block) into their ASCII equivalents, if one exists. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct AsciiFolding { buffer: String, } @@ -1543,8 +1543,9 @@ mod tests { } fn folding_helper(text: &str) -> Vec { - let tokens = TextAnalyzer::new(SimpleTokenizer) + let tokens = analyzer_builder(SimpleTokenizer) .filter(AsciiFolding::new()) + .build() .token_stream(text) .map(|token| token.text.clone()) .collect(); @@ -1552,8 +1553,9 @@ mod tests { } fn folding_using_raw_tokenizer_helper(text: &str) -> String { - let mut token_stream = TextAnalyzer::new(RawTokenizer) + let mut token_stream = analyzer_builder(RawTokenizer) .filter(AsciiFolding::new()) + .build() .token_stream(text); let Token { text, .. 
} = token_stream.next().unwrap(); text diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 8b433bc27..1916d837f 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -9,7 +9,7 @@ use crate::schema::FACET_SEP_BYTE; /// - `/america/north_america/canada` /// - `/america/north_america` /// - `/america` -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct FacetTokenizer; #[derive(Clone, Debug)] @@ -40,13 +40,13 @@ impl Tokenizer for FacetTokenizer { impl Iterator for FacetTokenStream { type Item = Token; fn next(&mut self) -> Option { - match self.state { + self.state = match self.state { State::RootFacetNotEmitted => { - self.state = if self.text.is_empty() { + if self.text.is_empty() { State::Terminated } else { State::UpToPosition(0) - }; + } } State::UpToPosition(cursor) => { if let Some(next_sep_pos) = self.text.as_bytes()[cursor + 1..] @@ -56,11 +56,11 @@ impl Iterator for FacetTokenStream { { let facet_part = &self.text[cursor..next_sep_pos]; self.token.text.push_str(facet_part); - self.state = State::UpToPosition(next_sep_pos); + State::UpToPosition(next_sep_pos) } else { let facet_part = &self.text[cursor..]; self.token.text.push_str(facet_part); - self.state = State::Terminated; + State::Terminated } } State::Terminated => return None, diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index 7211ec2cd..4beb78d4c 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,4 +1,4 @@ -use super::{Token, TokenFilter}; +use super::{analyzer_builder, TextAnalyzerT, Token, TokenFilter}; use std::mem; impl TokenFilter for LowerCaser { @@ -15,7 +15,7 @@ impl TokenFilter for LowerCaser { } /// Token filter that lowercase terms. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct LowerCaser { buffer: String, } @@ -46,15 +46,13 @@ mod tests { #[test] fn test_to_lower_case() { - assert_eq!( - lowercase_helper("Русский тСкст"), - vec!["русский".to_string(), "тСкст".to_string()] - ); + assert_eq!(lowercase_helper("Русский тСкст"), vec!["русский", "тСкст"]); } fn lowercase_helper(text: &str) -> Vec { - TextAnalyzer::new(SimpleTokenizer) + analyzer_builder(SimpleTokenizer) .filter(LowerCaser::new()) + .build() .token_stream(text) .map(|token| { let Token { text, .. } = token; @@ -65,7 +63,7 @@ mod tests { #[test] fn test_lowercaser() { - assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); - assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]); + assert_eq!(lowercase_helper("Tree"), vec!["tree"]); + assert_eq!(lowercase_helper("Русский"), vec!["русский"]); } } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index c8895f546..4235e6f9d 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -64,10 +64,10 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let en_stem = TextAnalyzer::from(SimpleTokenizer) +//! let en_stem = analyzer_builder(SimpleTokenizer) //! .filter(RemoveLongFilter::limit(40)) -//! .filter(LowerCaser) -//! .filter(Stemmer::new(Language::English)); +//! .filter(LowerCaser::new()) +//! .filter(Stemmer::new(Language::English)).build(); //! ``` //! //! Once your tokenizer is defined, you need to @@ -109,9 +109,9 @@ //! let index = Index::create_in_ram(schema); //! //! // We need to register our tokenizer : -//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer) +//! let custom_en_tokenizer = analyzer_builder(SimpleTokenizer) //! .filter(RemoveLongFilter::limit(40)) -//! 
.filter(LowerCaser); +//! .filter(LowerCaser::new()).build(); //! index //! .tokenizers() //! .register("custom_en", custom_en_tokenizer); @@ -146,7 +146,8 @@ pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain} pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenizer::{ - TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream, Tokenizer, + analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream, + Tokenizer, }; pub use self::tokenizer_manager::TokenizerManager; @@ -215,10 +216,11 @@ pub mod tests { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "el_stem", - TextAnalyzer::new(SimpleTokenizer) - .filter(RemoveLongFilter::new(40)) + analyzer_builder(SimpleTokenizer) + .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser::new()) - .filter(Stemmer::new(Language::Greek)), + .filter(Stemmer::new(Language::Greek)) + .build(), ); let en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); let tokens: Vec = en_tokenizer diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index 344ff46c6..036f8cd0b 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -78,7 +78,7 @@ use super::{Token, TokenStream, Tokenizer}; /// } /// assert!(stream.next().is_none()); /// ``` -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct NgramTokenizer { /// min size of the n-gram min_gram: usize, diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 9b1eae85a..719173c27 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,13 +1,12 @@ use super::{Token, TokenStream, Tokenizer}; /// For each value of the field, emit a single unprocessed token. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct RawTokenizer; #[derive(Clone, Debug)] pub struct RawTokenStream { - token: Token, - has_token: bool, + token: Option, } impl Tokenizer for RawTokenizer { @@ -21,8 +20,7 @@ impl Tokenizer for RawTokenizer { position_length: 1, }; RawTokenStream { - token, - has_token: true, + token: Some(token), } } } @@ -30,12 +28,7 @@ impl Tokenizer for RawTokenizer { impl Iterator for RawTokenStream { type Item = Token; fn next(&mut self) -> Option { - if self.has_token { - self.has_token = false; - Some(self.token.clone()) - } else { - None - } + self.token.take() } } diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index b65233c1a..d566055d6 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -2,8 +2,8 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) -//! .filter(RemoveLongFilter::limit(5)); +//! let tokenizer = analyzer_builder(SimpleTokenizer) +//! .filter(RemoveLongFilter::limit(5)).build(); //! //! let mut stream = tokenizer.token_stream("toolong nice"); //! // because `toolong` is more than 5 characters, it is filtered @@ -26,7 +26,7 @@ pub struct RemoveLongFilter { impl RemoveLongFilter { /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation. 
- pub fn new(limit: usize) -> RemoveLongFilter { + pub fn limit(limit: usize) -> RemoveLongFilter { RemoveLongFilter { limit } } } diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index fba91a543..fc69b0efd 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -1,37 +1,36 @@ use super::{Token, TokenStream, Tokenizer}; use std::str::CharIndices; +impl TokenStream for SimpleTokenizerStream {} + /// Tokenize the text by splitting on whitespaces and punctuation. #[derive(Clone, Debug)] pub struct SimpleTokenizer; +impl Tokenizer for SimpleTokenizer { + type Iter = SimpleTokenizerStream; + fn token_stream(&self, text: &str) -> Self::Iter { + let vec: Vec<_> = text.char_indices().collect(); + SimpleTokenizerStream { + text: text.to_string(), + chars: vec.into_iter(), + position: usize::max_value(), + } + } +} #[derive(Clone, Debug)] pub struct SimpleTokenizerStream { text: String, - idx: usize, - chars: Vec<(usize, char)>, - token: Token, -} - -impl Tokenizer for SimpleTokenizer { - type Iter = SimpleTokenizerStream; - fn token_stream(&self, text: &str) -> Self::Iter { - SimpleTokenizerStream { - text: text.to_string(), - chars: text.char_indices().collect(), - idx: 0, - token: Token::default(), - } - } + chars: std::vec::IntoIter<(usize, char)>, + position: usize, } impl SimpleTokenizerStream { // search for the end of the current token. fn search_token_end(&mut self) -> usize { (&mut self.chars) - .iter() - .filter(|&&(_, ref c)| !c.is_alphanumeric()) - .map(|(offset, _)| *offset) + .filter(|&(_, c)| !c.is_alphanumeric()) + .map(|(offset, _)| offset) .next() .unwrap_or_else(|| self.text.len()) } @@ -40,37 +39,39 @@ impl SimpleTokenizerStream { impl Iterator for SimpleTokenizerStream { type Item = Token; fn next(&mut self) -> Option { - if self.idx >= self.chars.len() { - return None; - } - self.token.text.clear(); - self.token.position = self.token.position.wrapping_add(1); - while self.idx < self.chars.len() { - let (offset_from, c) = self.chars[self.idx]; + self.position = self.position.wrapping_add(1); + while let Some((offset_from, c)) = self.chars.next() { if c.is_alphanumeric() { let offset_to = self.search_token_end(); - self.token.offset_from = offset_from; - self.token.offset_to = offset_to; - self.token.text.push_str(&self.text[offset_from..offset_to]); - return Some(self.token.clone()); + let token = Token { + text: self.text[offset_from..offset_to].into(), + offset_from, + offset_to, + position: self.position, + ..Default::default() + }; + return Some(token); } - self.idx += 1; } None } } -impl TokenStream for SimpleTokenizerStream {} - #[cfg(test)] mod tests { use super::*; + #[test] + fn test_empty() { + let mut empty = SimpleTokenizer.token_stream(""); + assert_eq!(empty.next(), None); + } + #[test] fn simple_tokenizer() { - let mut stream = SimpleTokenizer.token_stream("tokenizer hello world"); - dbg!(stream.next()); - dbg!(stream.next()); - dbg!(stream.next()); + let mut simple = SimpleTokenizer.token_stream("tokenizer hello world"); + assert_eq!(simple.next().unwrap().text, "tokenizer"); + assert_eq!(simple.next().unwrap().text, "hello"); + assert_eq!(simple.next().unwrap().text, "world"); } } diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index eb3acc531..2a4818f59 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -2,8 +2,8 @@ //! ```rust //! use tantivy::tokenizer::*; //! -//! 
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-//!     .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
+//! let tokenizer = analyzer_builder(SimpleTokenizer)
+//!     .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])).build();
 //!
 //! let mut stream = tokenizer.token_stream("the fox is crafty");
 //! assert_eq!(stream.next().unwrap().text, "fox");
@@ -27,7 +27,7 @@ pub struct StopWordFilter {
 impl StopWordFilter {
     /// Creates a `StopWordFilter` given a list of words to remove
-    pub fn new(words: Vec<String>) -> StopWordFilter {
+    pub fn remove(words: Vec<String>) -> StopWordFilter {
         let mut set = StopWordHashSet::default();

         for word in words {
@@ -44,7 +44,7 @@ impl StopWordFilter {
             "there", "these", "they", "this", "to", "was", "will", "with",
         ];

-        StopWordFilter::new(words.iter().map(|&s| s.to_string()).collect())
+        StopWordFilter::remove(words.iter().map(|&s| s.to_string()).collect())
     }
 }
diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs
index 3fe893269..c724c8546 100644
--- a/src/tokenizer/token_stream_chain.rs
+++ b/src/tokenizer/token_stream_chain.rs
@@ -2,31 +2,64 @@
 use crate::tokenizer::{Token, TokenStream, Tokenizer};

 const POSITION_GAP: usize = 2;

-pub(crate) struct TokenStreamChain<I> {
-    streams_with_offsets: I,
-    token: Token,
+pub(crate) struct TokenStreamChain<Inner, Outer> {
+    streams_with_offsets: Outer,
+    current: Option<(Inner, usize)>,
+    position: usize,
     position_shift: usize,
 }

-impl<'a, Out> TokenStreamChain<Out> {
-    pub fn new<In>(streams_with_offsets: Out) -> TokenStreamChain<Out>
-    where
-        In: Iterator<Item = Token>,
-        Out: Iterator<Item = (In, usize)>,
-    {
+impl<'a, Inner, Outer> TokenStreamChain<Inner, Outer>
+where
+    Inner: Iterator<Item = Token>,
+    Outer: Iterator<Item = (Inner, usize)>,
+{
+    pub fn new(mut streams_with_offsets: Outer) -> TokenStreamChain<Inner, Outer> {
+        let current = streams_with_offsets.next();
         TokenStreamChain {
-            streams_with_offsets,
-            token: Token::default(),
+            streams_with_offsets: streams_with_offsets,
+            current,
+            position: usize::max_value(),
             position_shift: 0,
         }
     }
 }
+
+impl<'a, Inner, Outer: Iterator<Item = (Inner, usize)>> TokenStream
+    for TokenStreamChain<Inner, Outer>
+where
+    Inner: Iterator<Item = Token>,
+{
+}
+
+impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
+where
+    Inner: Iterator<Item = Token>,
+    Outer: Iterator<Item = (Inner, usize)>,
+{
+    type Item = Token;
+    fn next(&mut self) -> Option<Token> {
+        while let Some((ref mut token_stream, offset_offset)) = self.current {
+            if let Some(mut token) = token_stream.next() {
+                token.offset_from += offset_offset;
+                token.offset_to += offset_offset;
+                token.position += self.position_shift;
+                self.position = token.position;
+                return Some(token);
+            }
+            self.position_shift = self.position.wrapping_add(POSITION_GAP);
+            self.current = self.streams_with_offsets.next();
+        }
+        None
+    }
+}
+
 impl DynTokenStreamChain {
     pub fn from_vec(streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>) -> impl TokenStream {
         DynTokenStreamChain {
             streams_with_offsets,
             idx: 0,
-            token: Token::default(),
+            position: usize::max_value(),
             position_shift: 0,
         }
     }
@@ -35,7 +68,7 @@ impl DynTokenStreamChain {
 pub(crate) struct DynTokenStreamChain {
     streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>,
     idx: usize,
-    token: Token,
+    position: usize,
     position_shift: usize,
 }

@@ -44,48 +77,17 @@ impl<'a> TokenStream for DynTokenStreamChain {}
 impl Iterator for DynTokenStreamChain {
     type Item = Token;
     fn next(&mut self) -> Option<Token> {
-        if self.idx >= self.streams_with_offsets.len() {
-            return None;
-        };
-        while self.idx < self.streams_with_offsets.len() {
-            let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.idx];
-            if let Some(token) = token_stream.next() {
-                self.token = token;
-                self.token.offset_from += offset_offset;
-                self.token.offset_to += offset_offset;
-                self.token.position += self.position_shift;
-                return Some(self.token.clone());
-            } else {
-                self.idx += 1;
-                self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
-            }
-        }
-        None
-    }
-}
-
-impl<'a, In, Out: Iterator<Item = (In, usize)>> TokenStream for TokenStreamChain<Out> where
-    In: Iterator<Item = Token>
-{
-}
-
-impl<'a, In, Out> Iterator for TokenStreamChain<Out>
-where
-    In: Iterator<Item = Token>,
-    Out: Iterator<Item = (In, usize)>,
-{
-    type Item = Token;
-    fn next(&mut self) -> Option<Token> {
-        while let Some((ref mut token_stream, offset_offset)) = self.streams_with_offsets.next() {
-            if let Some(token) = token_stream.next() {
-                self.token = token;
-                self.token.offset_from += offset_offset;
-                self.token.offset_to += offset_offset;
-                self.token.position += self.position_shift;
-                return Some(self.token.clone());
-            } else {
-                self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
+        while let Some((token_stream, offset_offset)) = self.streams_with_offsets.get_mut(self.idx)
+        {
+            if let Some(mut token) = token_stream.next() {
+                token.offset_from += *offset_offset;
+                token.offset_to += *offset_offset;
+                token.position += self.position_shift;
+                self.position = token.position;
+                return Some(token);
             }
+            self.idx += 1;
+            self.position_shift = self.position.wrapping_add(POSITION_GAP);
         }
         None
     }
 }
@@ -103,11 +105,16 @@ mod tests {
             (SimpleTokenizer.token_stream("hello world"), 0),
         ];
         let mut token_chain = TokenStreamChain::new(token_streams.into_iter());
-        let token = token_chain.next().unwrap();
-        assert_eq!(token.text, "hello");
-        assert_eq!(token.offset_from, 0);
-        assert_eq!(token.offset_to, 5);
-        assert_eq!(token.position, POSITION_GAP - 1);
+        let token = token_chain.next();
+
+        let expect = Token {
+            offset_from: 0,
+            offset_to: 5,
+            position: POSITION_GAP - 1,
+            text: "hello".into(),
+            ..Token::default()
+        };
+        assert_eq!(token.unwrap(), expect);
         let token = token_chain.next().unwrap();
         assert_eq!(token.text, "world");
diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs
index 411011cb8..016263160 100644
--- a/src/tokenizer/tokenized_string.rs
+++ b/src/tokenizer/tokenized_string.rs
@@ -97,8 +97,12 @@ mod tests {
             ],
         };

-        let token_stream: Vec<_> = PreTokenizedStream::from(tok_text.clone()).collect();
-        assert_eq!(token_stream, tok_text.tokens);
+        let mut token_stream = PreTokenizedStream::from(tok_text.clone());
+
+        for expected_token in tok_text.tokens {
+            assert_eq!(token_stream.next().unwrap(), expected_token);
+        }
+        assert!(token_stream.next().is_none());
     }

     #[test]
@@ -125,7 +129,7 @@ mod tests {
         let chain_parts = vec![&tok_text, &tok_text];

-        let token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
+        let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

         let expected_tokens = vec![
             Token {
@@ -157,6 +161,10 @@ mod tests {
                 position_length: 1,
             },
         ];
-        assert_eq!(token_stream.collect::<Vec<Token>>(), expected_tokens);
+
+        for expected_token in expected_tokens {
+            assert_eq!(token_stream.next().unwrap(), expected_token);
+        }
+        assert!(token_stream.next().is_none());
     }
 }
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index db00006ba..5efeb8b19 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -36,16 +36,101 @@ impl Default for Token {
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
-pub struct TextAnalyzer<T> {
-    tokenizer: T,
-    filters: Vec<Box<dyn TokenFilter>>,
+#[derive(Clone, Debug, Default)]
+pub struct TextAnalyzer<T>(T);
+
+/// Identity `TokenFilter`
+#[derive(Clone, Debug, Default)]
+pub struct Identity;
+
+impl TokenFilter for Identity {
+    fn transform(&mut self, token: Token) -> Option<Token> {
+        Some(token)
+    }
 }

-/// Top-level trait for hiding the types contained in it.
-pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
-    /// Top-level method that calls the corresponding `token_stream` on the
-    /// contained type.
-    fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
+#[derive(Clone, Debug, Default)]
+pub struct AnalyzerBuilder<T, F> {
+    tokenizer: T,
+    f: F,
+}
+
+/// Construct an `AnalyzerBuilder` on which to apply `TokenFilter`.
+pub fn analyzer_builder<T: Tokenizer>(tokenizer: T) -> AnalyzerBuilder<T, Identity> {
+    AnalyzerBuilder {
+        tokenizer,
+        f: Identity,
+    }
+}
+
+impl<T, F> AnalyzerBuilder<T, F>
+where
+    T: Tokenizer,
+    F: TokenFilter,
+{
+    /// Appends a token filter to the current tokenizer.
+    ///
+    /// The method consumes the current `TokenStream` and returns a
+    /// new one.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let en_stem = analyzer_builder(SimpleTokenizer)
+    ///     .filter(RemoveLongFilter::limit(40))
+    ///     .filter(LowerCaser::new())
+    ///     .filter(Stemmer::default()).build();
+    /// ```
+    ///
+    pub fn filter<G: TokenFilter>(self, f: G) -> AnalyzerBuilder<AnalyzerBuilder<T, F>, G> {
+        AnalyzerBuilder { tokenizer: self, f }
+    }
+    /// Finalize the build process.
+    pub fn build(self) -> TextAnalyzer<AnalyzerBuilder<T, F>> {
+        TextAnalyzer(self)
+    }
+}
+
+impl<T: Tokenizer, F: TokenFilter> Tokenizer for AnalyzerBuilder<T, F> {
+    type Iter = Filter<T::Iter, F>;
+    fn token_stream(&self, text: &str) -> Self::Iter {
+        Filter {
+            iter: self.tokenizer.token_stream(text),
+            f: self.f.clone(),
+        }
+    }
+}
+
+/// `Filter` is a wrapper around a `TokenStream` and a `TokenFilter` which modifies the `TokenStream`.
+#[derive(Clone, Default, Debug)]
+pub struct Filter<I, F> {
+    iter: I,
+    f: F,
+}
+
+impl<I, F> Iterator for Filter<I, F>
+where
+    I: TokenStream,
+    F: TokenFilter,
+{
+    type Item = Token;
+    fn next(&mut self) -> Option<Token> {
+        while let Some(token) = self.iter.next() {
+            if let Some(tok) = self.f.transform(token) {
+                return Some(tok);
+            }
+        }
+        None
+    }
+}
+
+impl<I, F> TokenStream for Filter<I, F>
+where
+    I: TokenStream,
+    F: TokenFilter,
+{
 }

 pub trait TextAnalyzerClone {
@@ -58,112 +143,25 @@ impl Clone for Box<dyn TextAnalyzerT> {
     fn clone(&self) -> Self {
         (**self).box_clone()
     }
 }

-impl Clone for Box<dyn TokenFilter> {
-    fn clone(&self) -> Self {
-        (**self).box_clone()
-    }
-}
-
-impl TextAnalyzerClone for TextAnalyzer {
+impl TextAnalyzerClone for TextAnalyzer {
     fn box_clone(&self) -> Box<dyn TextAnalyzerT> {
-        Box::new(TextAnalyzer {
-            tokenizer: self.tokenizer.clone(),
-            filters: self.filters.clone(),
-        })
+        Box::new(TextAnalyzer(self.0.clone()))
     }
 }

 impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
     fn token_stream(&self, text: &str) -> Box<dyn TokenStream> {
-        let tokens = self.tokenizer.token_stream(text);
-        Box::new(TextIter {
-            tokens,
-            // TODO: remove clone
-            filters: self.filters.clone(),
-        })
+        Box::new(self.0.token_stream(text))
     }
 }

-impl<T> TextAnalyzer<T>
-where
-    T: Tokenizer,
-{
-    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
-    ///
-    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
-    /// `TextAnalyzer::from(tokenizer)`.
-    pub fn new(tokenizer: T) -> TextAnalyzer<T> {
-        TextAnalyzer {
-            tokenizer,
-            filters: vec![],
-        }
-    }
-
-    /// Appends a token filter to the current tokenizer.
-    ///
-    /// The method consumes the current `TokenStream` and returns a
-    /// new one.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use tantivy::tokenizer::*;
-    ///
-    /// let en_stem = TextAnalyzer::from(SimpleTokenizer)
-    ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser)
-    ///     .filter(Stemmer::default());
-    /// ```
-    ///
-    pub fn filter<F: TokenFilter>(mut self, token_filter: F) -> Self {
-        self.filters.push(Box::new(token_filter));
-        self
-    }
-
-    /// Tokenize an array`&str`
-    ///
-    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
-    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
-    /// to prevent accidental `PhraseQuery` to match accross two terms.
-
-    /// Creates a token stream for a given `str`.
-    pub fn token_stream(&self, text: &str) -> TextIter<T::Iter> {
-        let tokens = self.tokenizer.token_stream(text);
-        TextIter {
-            tokens,
-            // TODO: remove clone
-            filters: self.filters.clone(),
-        }
-    }
-}
-
-pub struct TextIter<I> {
-    tokens: I,
-    filters: Vec<Box<dyn TokenFilter>>,
-}
-
-impl<I> Iterator for TextIter<I>
-where
-    I: Iterator<Item = Token>,
-{
-    type Item = I::Item;
-    fn next(&mut self) -> Option<I::Item> {
-        'outer: while let Some(mut token) = self.tokens.next() {
-            for filter in self.filters.iter_mut() {
-                if let Some(tok) = filter.transform(token) {
-                    token = tok;
-                    continue;
-                };
-                continue 'outer;
-            }
-            return Some(token);
-        }
-        None
-    }
-}
-
-impl<I: Iterator<Item = Token>> TokenStream for TextIter<I> {}
-
+/// 'Top-level' trait hiding concrete types, below which static dispatch occurs.
+pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
+    /// 'Top-level' dynamic dispatch function hiding concrete types of the statically
+    /// dispatched `token_stream` from the `Tokenizer` trait.
+    fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
+}
+
 /// `Tokenizer` are in charge of splitting text into a stream of token
 /// before indexing.
 ///
@@ -193,22 +191,12 @@ pub trait Tokenizer: 'static + Send + Sync + Clone {
 }

 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
+pub trait TokenFilter: 'static + Send + Sync + Clone {
     /// Take a `Token` and transform it or return `None` if it's to be removed
     /// from the output stream.
     fn transform(&mut self, token: Token) -> Option<Token>;
 }

-pub trait TokenFilterClone {
-    fn box_clone(&self) -> Box<dyn TokenFilter>;
-}
-
-impl<T: TokenFilter + Clone> TokenFilterClone for T {
-    fn box_clone(&self) -> Box<dyn TokenFilter> {
-        Box::new(self.clone())
-    }
-}
-
 /// `TokenStream` is the result of the tokenization.
 ///
 /// It consists consumable stream of `Token`s.
@@ -218,9 +206,9 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
 /// ```
 /// use tantivy::tokenizer::*;
 ///
-/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+/// let tokenizer = analyzer_builder(SimpleTokenizer)
 ///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser);
+///     .filter(LowerCaser::new()).build();
 /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
 /// {
 ///     let token = token_stream.next().unwrap();
@@ -239,6 +227,12 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
 /// ```
 pub trait TokenStream: Iterator<Item = Token> {}

+impl<T: Tokenizer> From<T> for TextAnalyzer<T> {
+    fn from(src: T) -> TextAnalyzer<T> {
+        TextAnalyzer(src)
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -263,7 +257,7 @@ mod test {

     #[test]
     fn text_analyzer() {
-        let mut stream = TextAnalyzer::new(SimpleTokenizer).token_stream("tokenizer hello world");
+        let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
         dbg!(stream.next());
         dbg!(stream.next());
         dbg!(stream.next());
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index ecb6b19ed..8d73e8892 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -1,5 +1,5 @@
 use crate::tokenizer::stemmer::Language;
-use crate::tokenizer::tokenizer::{TextAnalyzer, TextAnalyzerT, Tokenizer};
+use crate::tokenizer::tokenizer::{analyzer_builder, TextAnalyzer, TextAnalyzerT, Tokenizer};
 use crate::tokenizer::LowerCaser;
 use crate::tokenizer::RawTokenizer;
 use crate::tokenizer::RemoveLongFilter;
@@ -27,14 +27,14 @@ pub struct TokenizerManager {
 impl TokenizerManager {
     /// Registers a new tokenizer associated with a given name.
-    pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
+    pub fn register<A: Tokenizer, T>(&self, tokenizer_name: &str, tokenizer: T)
     where
-        T: TextAnalyzerT,
+        T: Into<TextAnalyzer<A>>,
     {
         self.tokenizers
             .write()
             .expect("Acquiring the lock should never fail")
-            .insert(tokenizer_name.to_string(), Box::new(tokenizer));
+            .insert(tokenizer_name.to_string(), Box::new(tokenizer.into()));
     }

     /// Accessing a tokenizer given its name.
@@ -57,19 +57,21 @@ impl Default for TokenizerManager {
         let manager = TokenizerManager {
             tokenizers: Arc::new(RwLock::new(HashMap::new())),
         };
-        manager.register("raw", TextAnalyzer::new(RawTokenizer));
+        manager.register("raw", RawTokenizer);
         manager.register(
             "default",
-            TextAnalyzer::new(SimpleTokenizer)
-                .filter(RemoveLongFilter::new(40))
-                .filter(LowerCaser::new()),
+            analyzer_builder(SimpleTokenizer)
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser::new())
+                .build(),
         );
         manager.register(
             "en_stem",
-            TextAnalyzer::new(SimpleTokenizer)
-                .filter(RemoveLongFilter::new(40))
+            analyzer_builder(SimpleTokenizer)
+                .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser::new())
-                .filter(Stemmer::new(Language::English)),
+                .filter(Stemmer::new(Language::English))
+                .build(),
         );
         manager
     }
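
Usage sketch (not part of the patch): the snippet below shows how the pieces introduced above fit together. `analyzer_builder(...)` with `.filter(...)` and `.build()` replaces the old `TextAnalyzer::new(...)` chain, `RemoveLongFilter::limit` and `StopWordFilter::remove` are the renamed constructors, and `register` accepts either a bare `Tokenizer` or a built `TextAnalyzer`. It assumes this branch of tantivy (the upstream crate's tokenizer API differs); the schema field, index, and tokenizer names are illustrative only.

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::tokenizer::{
    analyzer_builder, LowerCaser, RemoveLongFilter, SimpleTokenizer, StopWordFilter,
};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // Filters are applied in order; `build()` finalizes the statically
    // dispatched chain into a `TextAnalyzer`.
    let analyzer = analyzer_builder(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser::new())
        .filter(StopWordFilter::remove(vec![
            "the".to_string(),
            "and".to_string(),
        ]))
        .build();
    index.tokenizers().register("custom_en", analyzer);

    // A bare tokenizer can be registered directly; it is converted
    // into a `TextAnalyzer` on the way in.
    index.tokenizers().register("simple", SimpleTokenizer);

    // Token streams are now plain `Iterator<Item = Token>`s.
    let analyzer = index.tokenizers().get("custom_en").unwrap();
    for token in analyzer.token_stream("The quick and crafty fox") {
        println!("{:?} @ {}..{}", token.text, token.offset_from, token.offset_to);
    }
    Ok(())
}
```

The design keeps filtering statically dispatched inside the builder chain and only switches to dynamic dispatch at the `TokenizerManager` boundary, which is what the new `TextAnalyzerT` trait in the patch is for.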