From 4e6b3414224922ad259da706a1da3d00150ace28 Mon Sep 17 00:00:00 2001 From: dcraven Date: Tue, 29 Dec 2020 16:09:07 +0100 Subject: [PATCH] Tests compile. --- Cargo.toml | 3 + examples/custom_tokenizer.rs | 9 +- examples/pre_tokenized_text.rs | 7 +- examples/snippet.rs | 2 +- examples/stop_words.rs | 6 +- src/core/index.rs | 20 +-- src/indexer/segment_writer.rs | 32 ++--- src/lib.rs | 1 + src/postings/mod.rs | 4 +- src/postings/postings_writer.rs | 2 +- src/query/query_parser/query_parser.rs | 29 ++--- src/snippet/mod.rs | 53 ++++---- src/tokenizer/alphanum_only.rs | 35 +----- src/tokenizer/ascii_folding_filter.rs | 46 +++---- src/tokenizer/facet_tokenizer.rs | 52 ++++---- src/tokenizer/lower_caser.rs | 17 +-- src/tokenizer/mod.rs | 78 ++++-------- src/tokenizer/ngram_tokenizer.rs | 38 +++--- src/tokenizer/raw_tokenizer.rs | 4 +- src/tokenizer/remove_long.rs | 2 +- src/tokenizer/simple_tokenizer.rs | 49 +++++--- src/tokenizer/stop_word_filter.rs | 42 +------ src/tokenizer/token_stream_chain.rs | 65 ++++++++-- src/tokenizer/tokenized_string.rs | 18 +-- src/tokenizer/tokenizer.rs | 161 +++++++++++++++---------- src/tokenizer/tokenizer_manager.rs | 12 +- 26 files changed, 400 insertions(+), 387 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6648e0a83..db3414c7c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,6 +87,9 @@ members = ["query-grammar"] [badges] travis-ci = { repository = "tantivy-search/tantivy" } +[patch.crates-io] +rust-stemmers = {path = "src/vendor/rust-stemmers"} + # Following the "fail" crate best practises, we isolate # tests that define specific behavior in fail check points # in a different binary. diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs index 4db6d10cb..e73842632 100644 --- a/examples/custom_tokenizer.rs +++ b/examples/custom_tokenizer.rs @@ -5,7 +5,7 @@ use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::*; -use tantivy::tokenizer::NgramTokenizer; +use tantivy::tokenizer::{NgramTokenizer, TextAnalyzer}; use tantivy::{doc, Index}; fn main() -> tantivy::Result<()> { @@ -52,9 +52,10 @@ fn main() -> tantivy::Result<()> { // here we are registering our custome tokenizer // this will store tokens of 3 characters each - index - .tokenizers() - .register("ngram3", NgramTokenizer::new(3, 3, false)); + index.tokenizers().register( + "ngram3", + TextAnalyzer::new(NgramTokenizer::new(3, 3, false)), + ); // To insert document we need an index writer. // There must be only one writer at a time. diff --git a/examples/pre_tokenized_text.rs b/examples/pre_tokenized_text.rs index 15c26dfcb..ac787625b 100644 --- a/examples/pre_tokenized_text.rs +++ b/examples/pre_tokenized_text.rs @@ -17,12 +17,7 @@ use tantivy::{doc, Index, ReloadPolicy}; use tempfile::TempDir; fn pre_tokenize_text(text: &str) -> Vec { - let mut token_stream = SimpleTokenizer.token_stream(text); - let mut tokens = vec![]; - while token_stream.advance() { - tokens.push(token_stream.token().clone()); - } - tokens + SimpleTokenizer.token_stream(text).collect() } fn main() -> tantivy::Result<()> { diff --git a/examples/snippet.rs b/examples/snippet.rs index eab64056a..9320b085c 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> { let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; - let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; + let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; for (score, doc_address) in top_docs { let doc = searcher.doc(doc_address)?; diff --git a/examples/stop_words.rs b/examples/stop_words.rs index ac7694122..408ecd8bd 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -50,9 +50,9 @@ fn main() -> tantivy::Result<()> { // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus - let tokenizer = TextAnalyzer::from(SimpleTokenizer) - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec![ + let tokenizer = TextAnalyzer::new(SimpleTokenizer) + .filter(LowerCaser::new()) + .filter(StopWordFilter::new(vec![ "the".to_string(), "and".to_string(), ])); diff --git a/src/core/index.rs b/src/core/index.rs index e2cb135c6..2eb82392f 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -37,12 +37,14 @@ fn load_metas( ) -> crate::Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; let meta_string = String::from_utf8_lossy(&meta_data); - IndexMeta::deserialize(&meta_string, &inventory).map_err(|e| { - DataCorruption::new( - META_FILEPATH.to_path_buf(), - format!("Meta file cannot be deserialized. {:?}.", e), - ) - })? + IndexMeta::deserialize(&meta_string, &inventory) + .map_err(|e| { + DataCorruption::new( + META_FILEPATH.to_path_buf(), + format!("Meta file cannot be deserialized. {:?}.", e), + ) + }) + .map_err(From::from) } /// Search Index @@ -179,11 +181,11 @@ impl Index { } /// Helper to access the tokenizer associated to a specific field. - pub fn tokenizer_for_field(&'a self, field: Field) -> crate::Result>> { + pub fn tokenizer_for_field(&self, field: Field) -> crate::Result> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); - let tokenizer_manager: &TokenizerManager<'a> = self.tokenizers(); - let tokenizer_name_opt: Option>> = match field_type { + let tokenizer_manager: &TokenizerManager = self.tokenizers(); + let tokenizer_name_opt: Option> = match field_type { FieldType::Str(text_options) => text_options .get_indexing_options() .map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 1f199cf08..b34e9bb0c 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -13,8 +13,8 @@ use crate::schema::Value; use crate::schema::{Field, FieldEntry}; use crate::tokenizer::PreTokenizedStream; use crate::tokenizer::TokenStream; +use crate::tokenizer::{DynTokenStreamChain, TextAnalyzerT, TokenStreamChain, Tokenizer}; use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; -use crate::tokenizer::{TextAnalyzerT, TokenStreamChain, Tokenizer}; use crate::Opstamp; use crate::{DocId, SegmentComponent}; @@ -24,7 +24,7 @@ use crate::{DocId, SegmentComponent}; fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result { let table_memory_upper_bound = per_thread_memory_budget / 3; if let Some(limit) = (10..) - .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound) + .take_while(|&num_bits| compute_table_size(num_bits) < table_memory_upper_bound) .last() { Ok(limit.min(19)) // we cap it at 2^19 = 512K. @@ -46,8 +46,8 @@ pub struct SegmentWriter { fast_field_writers: FastFieldsWriter, fieldnorms_writer: FieldNormsWriter, doc_opstamps: Vec, - // TODO: redo ugly trait - tokenizers: Vec>>>, + // TODO: change type + tokenizers: Vec>>, term_buffer: Term, } @@ -72,17 +72,17 @@ impl SegmentWriter { let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits); let tokenizers = schema .fields() - .map( - |(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() { - FieldType::Str(ref text_options) => text_options + .map(|(_, field_entry)| match field_entry.field_type() { + FieldType::Str(text_options) => { + text_options .get_indexing_options() .and_then(|text_index_option| { let tokenizer_name = &text_index_option.tokenizer(); tokenizer_manager.get(tokenizer_name) - }), - _ => None, - }, - ) + }) + } + _ => None, + }) .collect(); Ok(SegmentWriter { max_doc: 0, @@ -159,12 +159,13 @@ impl SegmentWriter { let mut unordered_term_id_opt = None; FacetTokenizer .token_stream(facet_str) - .process(&mut |token| { + .map(|token| { term_buffer.set_text(&token.text); let unordered_term_id = multifield_postings.subscribe(doc_id, &term_buffer); unordered_term_id_opt = Some(unordered_term_id); - }); + }) + .count(); if let Some(unordered_term_id) = unordered_term_id_opt { self.fast_field_writers .get_multivalue_writer(field) @@ -189,7 +190,7 @@ impl SegmentWriter { total_offset += last_token.offset_to; } } - Value::Str(ref text) => { + Value::Str(text) => { if let Some(ref mut tokenizer) = self.tokenizers[field.field_id() as usize] { @@ -205,7 +206,7 @@ impl SegmentWriter { let num_tokens = if streams_with_offsets.is_empty() { 0 } else { - let mut token_stream = TokenStreamChain::new(streams_with_offsets); + let mut token_stream = DynTokenStreamChain::from_vec(streams_with_offsets); multifield_postings.index_text( doc_id, field, @@ -271,6 +272,7 @@ impl SegmentWriter { self.multifield_postings.subscribe(doc_id, &term_buffer); } } + _ => {} } } doc.filter_fields(|field| schema.get_field_entry(field).is_stored()); diff --git a/src/lib.rs b/src/lib.rs index 33baf80d7..83b9e2c5a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ #![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] #![warn(missing_docs)] +#![allow(unused_imports)] //! # `tantivy` //! diff --git a/src/postings/mod.rs b/src/postings/mod.rs index ca736ed18..65e286c6d 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -50,7 +50,7 @@ pub mod tests { use crate::schema::{Field, TextOptions}; use crate::schema::{IndexRecordOption, TextFieldIndexing}; use crate::schema::{Schema, Term, INDEXED, TEXT}; - use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN}; + use crate::tokenizer::{SimpleTokenizer, TextAnalyzer, MAX_TOKEN_LEN}; use crate::DocId; use crate::HasLen; use crate::Score; @@ -167,7 +167,7 @@ pub mod tests { let index = Index::create_in_ram(schema.clone()); index .tokenizers() - .register("simple_no_truncation", SimpleTokenizer); + .register("simple_no_truncation", TextAnalyzer::new(SimpleTokenizer)); let reader = index.reader().unwrap(); let mut index_writer = index.writer_for_tests().unwrap(); index_writer.set_merge_policy(Box::new(NoMergePolicy)); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 6e4d62815..21f963a21 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -240,7 +240,7 @@ pub trait PostingsWriter { ); } }; - token_stream.process(&mut sink) + token_stream.map(|tok| sink(&tok)).count() as u32 } fn total_num_tokens(&self) -> u64; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index ed3f0e9c0..1544f3263 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -289,7 +289,7 @@ impl QueryParser { let field_name = field_entry.name().to_string(); return Err(QueryParserError::FieldNotIndexed(field_name)); } - match *field_type { + match field_type { FieldType::I64(_) => { let val: i64 = i64::from_str(phrase)?; let term = Term::from_field_i64(field, val); @@ -312,7 +312,7 @@ impl QueryParser { let term = Term::from_field_u64(field, val); Ok(vec![(0, term)]) } - FieldType::Str(ref str_options) => { + FieldType::Str(str_options) => { if let Some(option) = str_options.get_indexing_options() { let tokenizer = self.tokenizer_manager @@ -323,12 +323,13 @@ impl QueryParser { option.tokenizer().to_string(), ) })?; - let mut terms: Vec<(usize, Term)> = Vec::new(); - let mut token_stream = tokenizer.token_stream(phrase); - token_stream.process(&mut |token| { - let term = Term::from_field_text(field, &token.text); - terms.push((token.position, term)); - }); + let token_stream = tokenizer.token_stream(phrase); + let terms: Vec<_> = token_stream + .map(|token| { + let term = Term::from_field_text(field, &token.text); + (token.position, term) + }) + .collect(); if terms.len() <= 1 { Ok(terms) } else { @@ -412,7 +413,7 @@ impl QueryParser { &self, given_field: &Option, ) -> Result, QueryParserError> { - match *given_field { + match given_field { None => { if self.default_fields.is_empty() { Err(QueryParserError::NoDefaultFieldDeclared) @@ -420,7 +421,7 @@ impl QueryParser { Ok(Cow::from(&self.default_fields[..])) } } - Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])), + Some(field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])), } } @@ -618,9 +619,9 @@ mod test { let tokenizer_manager = TokenizerManager::default(); tokenizer_manager.register( "en_with_stop_words", - TextAnalyzer::from(SimpleTokenizer) - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec!["the".to_string()])), + TextAnalyzer::new(SimpleTokenizer) + .filter(LowerCaser::new()) + .filter(StopWordFilter::new(vec!["the".to_string()])), ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -977,7 +978,7 @@ mod test { let index = Index::create_in_ram(schema); index .tokenizers() - .register("customtokenizer", SimpleTokenizer); + .register("customtokenizer", TextAnalyzer::new(SimpleTokenizer)); let query_parser = QueryParser::for_index(&index, vec![title]); assert_eq!( query_parser.parse_query("title:\"happy tax\"").unwrap_err(), diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index c44b46e0e..d8da9a026 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -139,13 +139,13 @@ impl Snippet { /// /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. -fn search_fragments<'a>( - tokenizer: &(dyn TextAnalyzerT<'a> + 'a), - text: String, +fn search_fragments( + tokenizer: &dyn TextAnalyzerT, + text: &str, terms: &BTreeMap, max_num_chars: usize, ) -> Vec { - let mut token_stream = tokenizer.token_stream(text.as_ref()); + let mut token_stream = tokenizer.token_stream(text); let mut fragment = FragmentCandidate::new(0); let mut fragments: Vec = vec![]; while let Some(next) = token_stream.next() { @@ -249,7 +249,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) /// ``` pub struct SnippetGenerator { terms_text: BTreeMap, - tokenizer: Box>, + tokenizer: Box, field: Field, max_num_chars: usize, } @@ -297,33 +297,37 @@ impl SnippetGenerator { /// /// This method extract the text associated to the `SnippetGenerator`'s field /// and computes a snippet. - pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { + pub fn snippet_from_doc(&mut self, doc: &Document) -> Snippet { let text: String = doc .get_all(self.field) .flat_map(Value::text) .collect::>() .join(" "); - self.snippet(text) + self.snippet(text.as_ref()) } /// Generates a snippet for the given text. - pub fn snippet(&self, text: String) -> Snippet { - let fragment_candidates = - search_fragments(&mut *self.tokenizer, text, &self.terms_text, self.max_num_chars); - select_best_fragment_combination(&fragment_candidates[..], &text) + pub fn snippet(&mut self, text: &str) -> Snippet { + let fragment_candidates = search_fragments( + &mut *self.tokenizer, + text, + &self.terms_text, + self.max_num_chars, + ); + select_best_fragment_combination(&fragment_candidates[..], text) } } #[cfg(test)] mod tests { - use super::{search_fragments, select_best_fragment_combination}; + use super::*; use crate::query::QueryParser; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; use crate::tokenizer::SimpleTokenizer; + use crate::tokenizer::TextAnalyzer; use crate::Index; use crate::SnippetGenerator; use maplit::btreemap; - use std::collections::BTreeMap; use std::iter::Iterator; const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by @@ -346,7 +350,8 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") => 1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100); + let fragments = + search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 100); assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -373,7 +378,8 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>1.0, String::from("language") => 0.9 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = + search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20); { let first = &fragments[0]; assert_eq!(first.score, 1.0); @@ -387,7 +393,8 @@ Survey in 2016, 2017, and 2018."#; String::from("rust") =>0.9, String::from("language") => 1.0 }; - let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); + let fragments = + search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20); //assert_eq!(fragments.len(), 7); { let first = &fragments[0]; @@ -406,7 +413,7 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("c"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 1); { @@ -428,7 +435,7 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 2); { @@ -451,7 +458,7 @@ Survey in 2016, 2017, and 2018."#; terms.insert(String::from("f"), 1.0); terms.insert(String::from("a"), 0.9); - let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7); + let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 7); assert_eq!(fragments.len(), 2); { @@ -473,7 +480,7 @@ Survey in 2016, 2017, and 2018."#; let mut terms = BTreeMap::new(); terms.insert(String::from("z"), 1.0); - let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 0); @@ -487,7 +494,7 @@ Survey in 2016, 2017, and 2018."#; let text = "a b c d"; let terms = BTreeMap::new(); - let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); + let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3); assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(&fragments[..], &text); @@ -572,12 +579,12 @@ Survey in 2016, 2017, and 2018."#; let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field).unwrap(); { - let snippet = snippet_generator.snippet(TEST_TEXT); + let snippet = snippet_generator.snippet(TEST_TEXT.into()); assert_eq!(snippet.to_html(), "imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to provide better memory safety"); } { snippet_generator.set_max_num_chars(90); - let snippet = snippet_generator.snippet(TEST_TEXT); + let snippet = snippet_generator.snippet(TEST_TEXT.into()); assert_eq!(snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to"); } } diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index 2cfee26ba..7c35eb842 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -26,38 +26,11 @@ use super::{Token, TokenFilter, TokenStream}; #[derive(Clone)] pub struct AlphaNumOnlyFilter; -pub struct AlphaNumOnlyFilterStream<'a> { - tail: Box, -} - -impl<'a> AlphaNumOnlyFilterStream<'a> { - fn predicate(&self, token: &Token) -> bool { - token.text.chars().all(|c| c.is_ascii_alphanumeric()) - } -} - impl TokenFilter for AlphaNumOnlyFilter { - fn transform<'a>(&self, token_stream: Box) -> Box { - Box::new(AlphaNumOnlyFilterStream { tail: token_stream }) - } -} - -impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> { - fn advance(&mut self) -> bool { - while self.tail.advance() { - if self.predicate(self.tail.token()) { - return true; - } + fn transform(&mut self, token: Token) -> Option { + if token.text.chars().all(|c| c.is_ascii_alphanumeric()) { + return None; } - - false - } - - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() + Some(token) } } diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index 1100387b6..a92ea9f3d 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -1,4 +1,4 @@ -use super::{Token, TokenStream, TokenFilter}; +use super::{Token, TokenFilter, TokenStream}; use std::mem; /// This class converts alphabetic, numeric, and symbolic Unicode characters @@ -9,14 +9,23 @@ pub struct AsciiFolding { buffer: String, } +impl AsciiFolding { + /// Construct a new `AsciiFolding` filter. + pub fn new() -> Self { + Self { + buffer: String::with_capacity(100), + } + } +} + impl TokenFilter for AsciiFolding { fn transform(&mut self, mut token: Token) -> Option { - let token = &mut token; if !token.text.is_ascii() { // ignore its already ascii - to_ascii(&mut token.text, &mut self.buffer); - mem::swap(&mut token, &mut self.buffer); + to_ascii(&token.text, &mut self.buffer); + mem::swap(&mut token.text, &mut self.buffer); } + Some(token) } } @@ -1517,11 +1526,8 @@ fn to_ascii(text: &String, output: &mut String) { #[cfg(test)] mod tests { - use super::to_ascii; - use crate::tokenizer::AsciiFoldingFilter; - use crate::tokenizer::RawTokenizer; - use crate::tokenizer::SimpleTokenizer; - use crate::tokenizer::TextAnalyzer; + use super::super::*; + use super::*; use std::iter; #[test] @@ -1537,22 +1543,20 @@ mod tests { } fn folding_helper(text: &str) -> Vec { - let mut tokens = Vec::new(); - TextAnalyzer::from(SimpleTokenizer) - .filter(AsciiFoldingFilter) + let tokens = TextAnalyzer::new(SimpleTokenizer) + .filter(AsciiFolding::new()) .token_stream(text) - .process(&mut |token| { - tokens.push(token.text.clone()); - }); + .map(|token| token.text.clone()) + .collect(); tokens } fn folding_using_raw_tokenizer_helper(text: &str) -> String { - let mut token_stream = TextAnalyzer::from(RawTokenizer) - .filter(AsciiFoldingFilter) + let mut token_stream = TextAnalyzer::new(RawTokenizer) + .filter(AsciiFolding::new()) .token_stream(text); - token_stream.advance(); - token_stream.token().text.clone() + let Token { text, .. } = token_stream.next().unwrap(); + text } #[test] @@ -1603,9 +1607,9 @@ mod tests { #[test] fn test_to_ascii() { - let mut input = "Rámon".to_string(); + let input = "Rámon".to_string(); let mut buffer = String::new(); - to_ascii(&mut input, &mut buffer); + to_ascii(&input, &mut buffer); assert_eq!("Ramon", buffer); } diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 2e79bda00..8b433bc27 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -20,24 +20,24 @@ enum State { } #[derive(Clone, Debug)] -pub struct FacetTokenStream<'a> { - text: &'a str, +pub struct FacetTokenStream { + text: String, state: State, token: Token, } -impl<'a> Tokenizer<'a> for FacetTokenizer { - type Iter = FacetTokenStream<'a>; - fn token_stream(&self, text: &'a str) -> Self::Iter { +impl Tokenizer for FacetTokenizer { + type Iter = FacetTokenStream; + fn token_stream(&self, text: &str) -> Self::Iter { FacetTokenStream { - text, + text: text.to_string(), state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. token: Token::default(), } } } -impl<'a> Iterator for FacetTokenStream<'a> { +impl Iterator for FacetTokenStream { type Item = Token; fn next(&mut self) -> Option { match self.state { @@ -69,7 +69,7 @@ impl<'a> Iterator for FacetTokenStream<'a> { } } -impl<'a> TokenStream for FacetTokenStream<'a> {} +impl TokenStream for FacetTokenStream {} #[cfg(test)] mod tests { @@ -81,16 +81,14 @@ mod tests { #[test] fn test_facet_tokenizer() { let facet = Facet::from_path(vec!["top", "a", "b"]); - let mut tokens = vec![]; - { - let mut add_token = |token: &Token| { - let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); - tokens.push(format!("{}", facet)); - }; - FacetTokenizer - .token_stream(facet.encoded_str()) - .process(&mut add_token); - } + let tokens: Vec<_> = FacetTokenizer + .token_stream(facet.encoded_str()) + .map(|token| { + Facet::from_encoded(token.text.as_bytes().to_owned()) + .unwrap() + .to_string() + }) + .collect(); assert_eq!(tokens.len(), 4); assert_eq!(tokens[0], "/"); assert_eq!(tokens[1], "/top"); @@ -101,16 +99,14 @@ mod tests { #[test] fn test_facet_tokenizer_root_facets() { let facet = Facet::root(); - let mut tokens = vec![]; - { - let mut add_token = |token: &Token| { - let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test - tokens.push(format!("{}", facet)); - }; - FacetTokenizer - .token_stream(facet.encoded_str()) // ok test - .process(&mut add_token); - } + let tokens: Vec<_> = FacetTokenizer + .token_stream(facet.encoded_str()) + .map(|token| { + Facet::from_encoded(token.text.as_bytes().to_owned()) + .unwrap() + .to_string() + }) + .collect(); assert_eq!(tokens.len(), 1); assert_eq!(tokens[0], "/"); } diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index 6a1081f98..7211ec2cd 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -21,6 +21,7 @@ pub struct LowerCaser { } impl LowerCaser { + /// Initialize the `LowerCaser` pub fn new() -> Self { LowerCaser { buffer: String::with_capacity(100), @@ -40,6 +41,7 @@ fn to_lowercase_unicode(text: &String, output: &mut String) { #[cfg(test)] mod tests { + use super::*; use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer}; #[test] @@ -51,13 +53,14 @@ mod tests { } fn lowercase_helper(text: &str) -> Vec { - let mut tokens = vec![]; - let mut token_stream = TextAnalyzer::new(SimpleTokenizer, text).filter(LowerCaser::new()); - while let Some(token) = token_stream.next() { - let token_text = token.text.clone(); - tokens.push(token_text); - } - tokens + TextAnalyzer::new(SimpleTokenizer) + .filter(LowerCaser::new()) + .token_stream(text) + .map(|token| { + let Token { text, .. } = token; + text + }) + .collect() } #[test] diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index a8deff838..c8895f546 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -117,8 +117,8 @@ //! .register("custom_en", custom_en_tokenizer); //! ``` //! -// mod alphanum_only; -// mod ascii_folding_filter; +mod alphanum_only; +mod ascii_folding_filter; mod facet_tokenizer; mod lower_caser; mod ngram_tokenizer; @@ -126,14 +126,14 @@ mod raw_tokenizer; mod remove_long; mod simple_tokenizer; mod stemmer; -// mod stop_word_filter; +mod stop_word_filter; mod token_stream_chain; mod tokenized_string; mod tokenizer; mod tokenizer_manager; -// pub use self::alphanum_only::AlphaNumOnlyFilter; -// pub use self::ascii_folding_filter::AsciiFolding; +pub use self::alphanum_only::AlphaNumOnlyFilter; +pub use self::ascii_folding_filter::AsciiFolding; pub use self::facet_tokenizer::FacetTokenizer; pub use self::lower_caser::LowerCaser; pub use self::ngram_tokenizer::NgramTokenizer; @@ -141,8 +141,8 @@ pub use self::raw_tokenizer::RawTokenizer; pub use self::remove_long::RemoveLongFilter; pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::{Language, Stemmer}; -// pub use self::stop_word_filter::StopWordFilter; -pub(crate) use self::token_stream_chain::TokenStreamChain; +pub use self::stop_word_filter::StopWordFilter; +pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain}; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenizer::{ @@ -187,15 +187,9 @@ pub mod tests { fn test_raw_tokenizer() { let tokenizer_manager = TokenizerManager::default(); let en_tokenizer = tokenizer_manager.get("raw").unwrap(); - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer - .token_stream("Hello, happy tax payer!") - .process(&mut add_token); - } + let tokens: Vec = en_tokenizer + .token_stream("Hello, happy tax payer!") + .collect(); assert_eq!(tokens.len(), 1); assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23); } @@ -205,15 +199,9 @@ pub mod tests { let tokenizer_manager = TokenizerManager::default(); assert!(tokenizer_manager.get("en_doesnotexist").is_none()); let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer - .token_stream("Hello, happy tax payer!") - .process(&mut add_token); - } + let tokens: Vec = en_tokenizer + .token_stream("Hello, happy tax payer!") + .collect(); assert_eq!(tokens.len(), 4); assert_token(&tokens[0], 0, "hello", 0, 5); @@ -228,20 +216,14 @@ pub mod tests { tokenizer_manager.register( "el_stem", TextAnalyzer::new(SimpleTokenizer) - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) + .filter(RemoveLongFilter::new(40)) + .filter(LowerCaser::new()) .filter(Stemmer::new(Language::Greek)), ); let en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer - .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!") - .process(&mut add_token); - } + let tokens: Vec = en_tokenizer + .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!") + .collect(); assert_eq!(tokens.len(), 3); assert_token(&tokens[0], 0, "καλημερ", 0, 16); @@ -253,25 +235,9 @@ pub mod tests { fn test_tokenizer_empty() { let tokenizer_manager = TokenizerManager::default(); let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); - { - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer.token_stream(" ").process(&mut add_token); - } - assert!(tokens.is_empty()); - } - { - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer.token_stream(" ").process(&mut add_token); - } - assert!(tokens.is_empty()); - } + let tokens: Vec = en_tokenizer.token_stream(" ").collect(); + assert!(tokens.is_empty()); + let tokens: Vec = en_tokenizer.token_stream(" ").collect(); + assert!(tokens.is_empty()); } } diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index 9844107e5..344ff46c6 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -118,20 +118,20 @@ impl NgramTokenizer { } /// TokenStream associate to the `NgramTokenizer` -pub struct NgramTokenStream<'a> { +pub struct NgramTokenStream { /// parameters - ngram_charidx_iterator: StutteringIterator>, + ngram_charidx_iterator: StutteringIterator, /// true if the NgramTokenStream is in prefix mode. prefix_only: bool, /// input - text: &'a str, + text: String, /// output token: Token, } -impl<'a> Tokenizer<'a> for NgramTokenizer { - type Iter = NgramTokenStream<'a>; - fn token_stream(&self, text: &'a str) -> Self::Iter { +impl Tokenizer for NgramTokenizer { + type Iter = NgramTokenStream; + fn token_stream(&self, text: &str) -> Self::Iter { NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( CodepointFrontiers::for_str(text), @@ -139,15 +139,15 @@ impl<'a> Tokenizer<'a> for NgramTokenizer { self.max_gram, ), prefix_only: self.prefix_only, - text, + text: text.to_string(), token: Token::default(), } } } -impl<'a> TokenStream for NgramTokenStream<'a> {} +impl TokenStream for NgramTokenStream {} -impl<'a> Iterator for NgramTokenStream<'a> { +impl Iterator for NgramTokenStream { type Item = Token; fn next(&mut self) -> Option { if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() { @@ -252,21 +252,21 @@ where /// or a codepoint ends. /// /// By convention, we emit [0] for the empty string. -struct CodepointFrontiers<'a> { - s: &'a str, +struct CodepointFrontiers { + s: String, next_el: Option, } -impl<'a> CodepointFrontiers<'a> { - fn for_str(s: &'a str) -> Self { +impl CodepointFrontiers { + fn for_str(s: &str) -> Self { CodepointFrontiers { - s, + s: s.to_string(), next_el: Some(0), } } } -impl<'a> Iterator for CodepointFrontiers<'a> { +impl<'a> Iterator for CodepointFrontiers { type Item = usize; fn next(&mut self) -> Option { @@ -275,7 +275,7 @@ impl<'a> Iterator for CodepointFrontiers<'a> { self.next_el = None; } else { let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]); - self.s = &self.s[first_codepoint_width..]; + self.s = (&self.s[first_codepoint_width..]).to_string(); self.next_el = Some(offset + first_codepoint_width); } offset @@ -305,10 +305,8 @@ mod tests { use crate::tokenizer::tokenizer::Tokenizer; use crate::tokenizer::{Token, TokenStream}; - fn test_helper(mut tokenizer: Box) -> Vec { - let mut tokens: Vec = vec![]; - tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); - tokens + fn test_helper(tokens: T) -> Vec { + tokens.collect() } #[test] diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 9631e730e..9b1eae85a 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -10,9 +10,9 @@ pub struct RawTokenStream { has_token: bool, } -impl<'a> Tokenizer<'a> for RawTokenizer { +impl Tokenizer for RawTokenizer { type Iter = RawTokenStream; - fn token_stream(&self, text: &'a str) -> Self::Iter { + fn token_stream(&self, text: &str) -> Self::Iter { let token = Token { offset_from: 0, offset_to: text.len(), diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index 01cc43822..b65233c1a 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -32,7 +32,7 @@ impl RemoveLongFilter { } impl TokenFilter for RemoveLongFilter { - fn transform(&mut self, mut token: Token) -> Option { + fn transform(&mut self, token: Token) -> Option { if token.text.len() >= self.limit { return None; } diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index b109fd9e5..fba91a543 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -6,40 +6,47 @@ use std::str::CharIndices; pub struct SimpleTokenizer; #[derive(Clone, Debug)] -pub struct SimpleTokenizerStream<'a> { - text: &'a str, - chars: CharIndices<'a>, +pub struct SimpleTokenizerStream { + text: String, + idx: usize, + chars: Vec<(usize, char)>, token: Token, } -impl<'a> Tokenizer<'a> for SimpleTokenizer { - type Iter = SimpleTokenizerStream<'a>; - fn token_stream(&self, text: &'a str) -> Self::Iter { +impl Tokenizer for SimpleTokenizer { + type Iter = SimpleTokenizerStream; + fn token_stream(&self, text: &str) -> Self::Iter { SimpleTokenizerStream { - text, - chars: text.char_indices(), + text: text.to_string(), + chars: text.char_indices().collect(), + idx: 0, token: Token::default(), } } } -impl<'a> SimpleTokenizerStream<'a> { +impl SimpleTokenizerStream { // search for the end of the current token. fn search_token_end(&mut self) -> usize { (&mut self.chars) - .filter(|&(_, ref c)| !c.is_alphanumeric()) - .map(|(offset, _)| offset) + .iter() + .filter(|&&(_, ref c)| !c.is_alphanumeric()) + .map(|(offset, _)| *offset) .next() .unwrap_or_else(|| self.text.len()) } } -impl<'a> Iterator for SimpleTokenizerStream<'a> { +impl Iterator for SimpleTokenizerStream { type Item = Token; fn next(&mut self) -> Option { + if self.idx >= self.chars.len() { + return None; + } self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); - while let Some((offset_from, c)) = self.chars.next() { + while self.idx < self.chars.len() { + let (offset_from, c) = self.chars[self.idx]; if c.is_alphanumeric() { let offset_to = self.search_token_end(); self.token.offset_from = offset_from; @@ -47,9 +54,23 @@ impl<'a> Iterator for SimpleTokenizerStream<'a> { self.token.text.push_str(&self.text[offset_from..offset_to]); return Some(self.token.clone()); } + self.idx += 1; } None } } -impl<'a> TokenStream for SimpleTokenizerStream<'a> {} +impl TokenStream for SimpleTokenizerStream {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_tokenizer() { + let mut stream = SimpleTokenizer.token_stream("tokenizer hello world"); + dbg!(stream.next()); + dbg!(stream.next()); + dbg!(stream.next()); + } +} diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index 12c2b12ae..eb3acc531 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -27,7 +27,7 @@ pub struct StopWordFilter { impl StopWordFilter { /// Creates a `StopWordFilter` given a list of words to remove - pub fn remove(words: Vec) -> StopWordFilter { + pub fn new(words: Vec) -> StopWordFilter { let mut set = StopWordHashSet::default(); for word in words { @@ -44,46 +44,16 @@ impl StopWordFilter { "there", "these", "they", "this", "to", "was", "will", "with", ]; - StopWordFilter::remove(words.iter().map(|&s| s.to_string()).collect()) + StopWordFilter::new(words.iter().map(|&s| s.to_string()).collect()) } } -pub struct StopWordFilterStream<'a> { - words: StopWordHashSet, - tail: Box, -} - impl TokenFilter for StopWordFilter { - fn transform<'a>(&self, token_stream: Box) -> Box { - Box::new(StopWordFilterStream { - words: self.words.clone(), - tail: token_stream, - }) - } -} - -impl<'a> StopWordFilterStream<'a> { - fn predicate(&self, token: &Token) -> bool { - !self.words.contains(&token.text) - } -} - -impl<'a> TokenStream for StopWordFilterStream<'a> { - fn advance(&mut self) -> bool { - while self.tail.advance() { - if self.predicate(self.tail.token()) { - return true; - } + fn transform(&mut self, token: Token) -> Option { + if self.words.contains(&token.text) { + return None; } - false - } - - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() + Some(token) } } diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index ec7e048aa..3fe893269 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -21,6 +21,48 @@ impl<'a, Out> TokenStreamChain { } } } +impl DynTokenStreamChain { + pub fn from_vec(streams_with_offsets: Vec<(Box, usize)>) -> impl TokenStream { + DynTokenStreamChain { + streams_with_offsets, + idx: 0, + token: Token::default(), + position_shift: 0, + } + } +} + +pub(crate) struct DynTokenStreamChain { + streams_with_offsets: Vec<(Box, usize)>, + idx: usize, + token: Token, + position_shift: usize, +} + +impl<'a> TokenStream for DynTokenStreamChain {} + +impl Iterator for DynTokenStreamChain { + type Item = Token; + fn next(&mut self) -> Option { + if self.idx >= self.streams_with_offsets.len() { + return None; + }; + while self.idx < self.streams_with_offsets.len() { + let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.idx]; + if let Some(token) = token_stream.next() { + self.token = token; + self.token.offset_from += offset_offset; + self.token.offset_to += offset_offset; + self.token.position += self.position_shift; + return Some(self.token.clone()); + } else { + self.idx += 1; + self.position_shift = self.token.position.wrapping_add(POSITION_GAP); + } + } + None + } +} impl<'a, In, Out: Iterator> TokenStream for TokenStreamChain where In: Iterator @@ -61,19 +103,18 @@ mod tests { (SimpleTokenizer.token_stream("hello world"), 0), ]; let mut token_chain = TokenStreamChain::new(token_streams.into_iter()); + let token = token_chain.next().unwrap(); + assert_eq!(token.text, "hello"); + assert_eq!(token.offset_from, 0); + assert_eq!(token.offset_to, 5); + assert_eq!(token.position, POSITION_GAP - 1); - assert!(token_chain.advance()); - assert_eq!(token_chain.token().text, "hello"); - assert_eq!(token_chain.token().offset_from, 0); - assert_eq!(token_chain.token().offset_to, 5); - assert_eq!(token_chain.token().position, POSITION_GAP - 1); + let token = token_chain.next().unwrap(); + assert_eq!(token.text, "world"); + assert_eq!(token.offset_from, 6); + assert_eq!(token.offset_to, 11); + assert_eq!(token.position, POSITION_GAP); - assert!(token_chain.advance()); - assert_eq!(token_chain.token().text, "world"); - assert_eq!(token_chain.token().offset_from, 6); - assert_eq!(token_chain.token().offset_to, 11); - assert_eq!(token_chain.token().position, POSITION_GAP); - - assert!(!token_chain.advance()); + assert!(token_chain.next().is_none()); } } diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs index c68c31509..411011cb8 100644 --- a/src/tokenizer/tokenized_string.rs +++ b/src/tokenizer/tokenized_string.rs @@ -97,13 +97,8 @@ mod tests { ], }; - let mut token_stream = PreTokenizedStream::from(tok_text.clone()); - - for expected_token in tok_text.tokens { - assert!(token_stream.advance()); - assert_eq!(token_stream.token(), &expected_token); - } - assert!(!token_stream.advance()); + let token_stream: Vec<_> = PreTokenizedStream::from(tok_text.clone()).collect(); + assert_eq!(token_stream, tok_text.tokens); } #[test] @@ -130,7 +125,7 @@ mod tests { let chain_parts = vec![&tok_text, &tok_text]; - let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]); + let token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]); let expected_tokens = vec![ Token { @@ -162,11 +157,6 @@ mod tests { position_length: 1, }, ]; - - for expected_token in expected_tokens { - assert!(token_stream.advance()); - assert_eq!(token_stream.token(), &expected_token); - } - assert!(!token_stream.advance()); + assert_eq!(token_stream.collect::>(), expected_tokens); } } diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index e0e39bebc..db00006ba 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -1,4 +1,4 @@ -use crate::tokenizer::TokenStreamChain; +use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain}; use serde::{Deserialize, Serialize}; /// The tokenizer module contains all of the tools used to process /// text in `tantivy`. @@ -41,27 +41,31 @@ pub struct TextAnalyzer { filters: Vec>, } -pub trait TextAnalyzerT<'a>: 'static + Send + Sync + TextAnalyzerClone<'a> { - fn token_stream(&self, text: &'a str) -> Box; +/// Top-level trait for hiding the types contained in it. +pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone { + /// Top-level method that calls the corresponding `token_stream` on the + /// contained type. + fn token_stream(&self, text: &str) -> Box; } -pub trait TextAnalyzerClone<'a> { - fn box_clone(&self) -> Box>; +pub trait TextAnalyzerClone { + fn box_clone(&self) -> Box; } -impl<'a> Clone for Box> { +impl Clone for Box { fn clone(&self) -> Self { (**self).box_clone() } } + impl Clone for Box { fn clone(&self) -> Self { (**self).box_clone() } } -impl<'a, T: Clone + Tokenizer<'a>> TextAnalyzerClone<'a> for TextAnalyzer { - fn box_clone(&self) -> Box> { +impl TextAnalyzerClone for TextAnalyzer { + fn box_clone(&self) -> Box { Box::new(TextAnalyzer { tokenizer: self.tokenizer.clone(), filters: self.filters.clone(), @@ -69,8 +73,8 @@ impl<'a, T: Clone + Tokenizer<'a>> TextAnalyzerClone<'a> for TextAnalyzer { } } -impl<'a, T: Tokenizer<'a>> TextAnalyzerT<'a> for TextAnalyzer { - fn token_stream(&self, text: &'a str) -> Box { +impl TextAnalyzerT for TextAnalyzer { + fn token_stream(&self, text: &str) -> Box { let tokens = self.tokenizer.token_stream(text); Box::new(TextIter { tokens, @@ -80,9 +84,9 @@ impl<'a, T: Tokenizer<'a>> TextAnalyzerT<'a> for TextAnalyzer { } } -impl<'a, T> TextAnalyzer +impl TextAnalyzer where - T: Tokenizer<'a>, + T: Tokenizer, { /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box`. /// @@ -123,7 +127,7 @@ where /// to prevent accidental `PhraseQuery` to match accross two terms. /// Creates a token stream for a given `str`. - pub fn token_stream(&self, text: &'a str) -> TextIter { + pub fn token_stream(&self, text: &str) -> TextIter { let tokens = self.tokenizer.token_stream(text); TextIter { tokens, @@ -133,12 +137,12 @@ where } } -struct TextIter { +pub struct TextIter { tokens: I, filters: Vec>, } -impl<'a, I> Iterator for TextIter +impl Iterator for TextIter where I: Iterator, { @@ -152,6 +156,7 @@ where }; continue 'outer; } + return Some(token); } None } @@ -167,27 +172,30 @@ impl> TokenStream for TextIter {} /// # Warning /// /// This API may change to use associated types. -pub trait Tokenizer<'a>: 'static + Send + Sync + Clone { - type Iter: Iterator + 'a; +pub trait Tokenizer: 'static + Send + Sync + Clone { + /// An iteratable type is returned. + type Iter: TokenStream; /// Creates a token stream for a given `str`. - // TODO: make clone unnecessary - fn token_stream(&self, text: &'a str) -> Self::Iter; -} - -fn token_stream_texts<'a, T: Tokenizer<'a>>( - tokenizer: &'a T, - texts: &'a [&str], -) -> impl TokenStream + 'a { - let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| { - let temp = *total_offset; - *total_offset += text.len(); - Some((tokenizer.token_stream(text), temp)) - }); - TokenStreamChain::new(streams_with_offsets) + fn token_stream(&self, text: &str) -> Self::Iter; + /// Tokenize an array`&str` + /// + /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were + /// one concatenated `&str`, with an artificial position gap of `2` between the different fields + /// to prevent accidental `PhraseQuery` to match accross two terms. + fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box { + let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| { + let temp = *total_offset; + *total_offset += text.len(); + Some((self.token_stream(text), temp)) + }); + Box::new(TokenStreamChain::new(streams_with_offsets)) + } } /// Trait for the pluggable components of `Tokenizer`s. pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone { + /// Take a `Token` and transform it or return `None` if it's to be removed + /// from the output stream. fn transform(&mut self, token: Token) -> Option; } @@ -201,35 +209,66 @@ impl TokenFilterClone for T { } } -pub trait TokenStream: Iterator { - fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 { - let mut num_tokens_pushed = 0u32; - while let Some(token) = self.next() { - sink(&token); - num_tokens_pushed += 1u32; - } - num_tokens_pushed +/// `TokenStream` is the result of the tokenization. +/// +/// It consists consumable stream of `Token`s. +/// +/// # Example +/// +/// ``` +/// use tantivy::tokenizer::*; +/// +/// let tokenizer = TextAnalyzer::from(SimpleTokenizer) +/// .filter(RemoveLongFilter::limit(40)) +/// .filter(LowerCaser); +/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer"); +/// { +/// let token = token_stream.next().unwrap(); +/// assert_eq!(&token.text, "hello"); +/// assert_eq!(token.offset_from, 0); +/// assert_eq!(token.offset_to, 5); +/// assert_eq!(token.position, 0); +/// } +/// { +/// let token = token_stream.next().unwrap(); +/// assert_eq!(&token.text, "happy"); +/// assert_eq!(token.offset_from, 7); +/// assert_eq!(token.offset_to, 12); +/// assert_eq!(token.position, 1); +/// } +/// ``` +pub trait TokenStream: Iterator {} + +#[cfg(test)] +mod test { + use super::*; + use crate::tokenizer::SimpleTokenizer; + + #[test] + fn clone() { + let t1 = Token { + position: 1, + offset_from: 2, + offset_to: 3, + text: "abc".to_string(), + position_length: 1, + }; + let t2 = t1.clone(); + + assert_eq!(t1.position, t2.position); + assert_eq!(t1.offset_from, t2.offset_from); + assert_eq!(t1.offset_to, t2.offset_to); + assert_eq!(t1.text, t2.text); + } + + #[test] + fn text_analyzer() { + let mut stream = TextAnalyzer::new(SimpleTokenizer).token_stream("tokenizer hello world"); + dbg!(stream.next()); + dbg!(stream.next()); + dbg!(stream.next()); + dbg!(stream.next()); + dbg!(stream.next()); + dbg!(stream.next()); } } - -// #[cfg(test)] -// mod test { -// use super::Token; - -// #[test] -// fn clone() { -// let t1 = Token { -// position: 1, -// offset_from: 2, -// offset_to: 3, -// text: "abc".to_string(), -// position_length: 1, -// }; -// let t2 = t1.clone(); - -// assert_eq!(t1.position, t2.position); -// assert_eq!(t1.offset_from, t2.offset_from); -// assert_eq!(t1.offset_to, t2.offset_to); -// assert_eq!(t1.text, t2.text); -// } -// } diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index 9f7d808cb..ecb6b19ed 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -21,15 +21,15 @@ use std::sync::{Arc, RwLock}; /// resulting tokens. Stemming can improve the recall of your /// search engine. #[derive(Clone)] -pub struct TokenizerManager<'a> { - tokenizers: Arc>>>>, +pub struct TokenizerManager { + tokenizers: Arc>>>, } -impl<'a> TokenizerManager<'a> { +impl TokenizerManager { /// Registers a new tokenizer associated with a given name. pub fn register(&self, tokenizer_name: &str, tokenizer: T) where - T: TextAnalyzerT<'a>, + T: TextAnalyzerT, { self.tokenizers .write() @@ -38,7 +38,7 @@ impl<'a> TokenizerManager<'a> { } /// Accessing a tokenizer given its name. - pub fn get(&self, tokenizer_name: &str) -> Option>> { + pub fn get(&self, tokenizer_name: &str) -> Option> { self.tokenizers .read() .expect("Acquiring the lock should never fail") @@ -47,7 +47,7 @@ impl<'a> TokenizerManager<'a> { } } -impl<'a> Default for TokenizerManager<'a> { +impl Default for TokenizerManager { /// Creates an `TokenizerManager` prepopulated with /// the default pre-configured tokenizers of `tantivy`. /// - simple