From 0a232013386f4a12cf797a700c5c842b5c12c4e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Massot?=
Date: Fri, 30 Jun 2023 13:49:39 +0200
Subject: [PATCH] Fix stack overflow and add docs.

---
 benches/analyzer.rs                        | 18 ++-----
 src/query/more_like_this/more_like_this.rs |  7 +--
 src/tokenizer/tokenizer.rs                 | 62 +++++++++++++++--------
 3 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/benches/analyzer.rs b/benches/analyzer.rs
index eb6574e86..5d93a0db4 100644
--- a/benches/analyzer.rs
+++ b/benches/analyzer.rs
@@ -1,5 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::{TokenizerManager, TextAnalyzer, RemoveLongFilter, LowerCaser, SimpleTokenizer};
+use tantivy::tokenizer::{
+    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
+};
 
 const ALICE_TXT: &str = include_str!("alice.txt");
 
@@ -16,20 +18,6 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
-    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .build();
-    c.bench_function("static-tokenize-alice", |b| {
-        b.iter(|| {
-            let mut word_count = 0;
-            let mut token_stream = static_analyzer.token_stream(ALICE_TXT);
-            while token_stream.advance() {
-                word_count += 1;
-            }
-            assert_eq!(word_count, 30_731);
-        })
-    });
     let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
         .dynamic()
         .filter_dynamic(RemoveLongFilter::limit(40))
diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs
index c0bfe552f..ca86c70b1 100644
--- a/src/query/more_like_this/more_like_this.rs
+++ b/src/query/more_like_this/more_like_this.rs
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
 use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
-use crate::tokenizer::{
-    FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
-};
+use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
 use crate::{DocAddress, Result, Searcher, TantivyError};
 
 #[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
                 for value in values {
                     match value {
                         Value::PreTokStr(tok_str) => {
-                            let mut token_stream =
-                                PreTokenizedStream::from(tok_str.clone());
+                            let mut token_stream = PreTokenizedStream::from(tok_str.clone());
                             token_stream.process(&mut |token| {
                                 if !self.is_noise_word(token.text.clone()) {
                                     let term = Term::from_field_text(field, &token.text);
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index da3760005..fc22fcefd 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -13,12 +13,17 @@ pub struct TextAnalyzer {
 impl Tokenizer for Box<dyn BoxableTokenizer> {
     type TokenStream<'a> = BoxTokenStream<'a>;
 
+    // Note: we want to call `box_token_stream` on the concrete `Tokenizer`
+    // implementation, not the `BoxableTokenizer` one, as that would cause
+    // a recursive call (and a stack overflow).
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         (**self).box_token_stream(text)
     }
 }
 
 impl Clone for Box<dyn BoxableTokenizer> {
+    // Note: we want to call `box_clone` on the concrete `Tokenizer`
+    // implementation in order to clone the concrete `Tokenizer`.
     fn clone(&self) -> Self {
         (**self).box_clone()
     }
@@ -61,12 +66,12 @@ impl TextAnalyzer {
 
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
+        self.tokenizer.token_stream(text)
     }
 }
 
 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder<T: Tokenizer = Box<dyn BoxableTokenizer>> {
+pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
     tokenizer: T,
 }
 
@@ -90,18 +95,27 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
         }
     }
 
-    /// Boxes the internal tokenizer. This is useful to write generic code.
-    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
-    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
-    /// will be more performant and create less boxes.
+    /// Boxes the internal tokenizer. This is useful for adding dynamic filters.
+    /// Note: this will be less performant than the non-boxed version.
    pub fn dynamic(self) -> TextAnalyzerBuilder {
        let boxed_tokenizer = Box::new(self.tokenizer);
        TextAnalyzerBuilder {
-            tokenizer: boxed_tokenizer,
+            tokenizer: boxed_tokenizer,
        }
    }
 
-    /// Apply a filter and returns a boxed version of the TextAnalyzerBuilder.
+    /// Appends a token filter to the current builder and returns a boxed version of the
+    /// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
+    /// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
+    /// possible, as it will be more performant and create fewer boxes.
+    /// ```
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+    ///     .filter_dynamic(RemoveLongFilter::limit(40))
+    ///     .filter_dynamic(LowerCaser)
+    ///     .build();
+    /// ```
     pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
         self.filter(token_filter).dynamic()
     }
@@ -114,12 +128,11 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     }
 }
 
-
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer,
-                           SimpleTokenizer};
+    use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
 
     #[test]
     fn test_text_analyzer_builder() {
@@ -133,8 +146,6 @@ mod tests {
         assert_eq!(stream.next().unwrap().text, "bullet");
     }
 
-
-
     #[test]
     fn test_text_analyzer_with_filters_boxed() {
         // This test shows how one can build a TextAnalyzer dynamically, by stacking a list
@@ -151,19 +162,20 @@ mod tests {
             SerializableTokenFilterEnum::LowerCaser(LowerCaser),
             SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
         ];
-        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter_dynamic(RemoveLongFilter::limit(40))
-            .filter_dynamic(LowerCaser);
-        // for filter in filters {
-        //     analyzer_builder =
-        //         match filter {
-        //             SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
-        //                 analyzer_builder.filter_dynamic(lower_caser),
-        //             SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
-        //                 analyzer_builder.filter_dynamic(remove_long_filter)
-        //             },
-        //         }
-        // }
+        let mut analyzer_builder: TextAnalyzerBuilder =
+            TextAnalyzer::builder(SimpleTokenizer::default())
+                .filter_dynamic(RemoveLongFilter::limit(40))
+                .filter_dynamic(LowerCaser);
+        for filter in filters {
+            analyzer_builder = match filter {
+                SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
+                    analyzer_builder.filter_dynamic(lower_caser)
+                }
+                SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
+                    analyzer_builder.filter_dynamic(remove_long_filter)
+                }
+            }
+        }
         let mut analyzer = analyzer_builder.build().clone();
         let mut stream = analyzer.token_stream("first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
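
For context, a minimal standalone sketch (hypothetical, not part of the patch) of
the call path this commit fixes: an analyzer built through the boxed/dynamic
builder, followed by the `token_stream` call that previously recursed through
`box_token_stream` on the boxed tokenizer until the stack overflowed, per the
comments added above.

    // Hypothetical usage sketch; not part of this patch.
    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

    fn main() {
        // Box the tokenizer first, then stack filters at runtime.
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .dynamic()
            .filter_dynamic(RemoveLongFilter::limit(40))
            .filter_dynamic(LowerCaser)
            .build();

        // With this fix, `token_stream` dispatches to the concrete tokenizer;
        // before it, this call hit the `BoxableTokenizer` impl on the box
        // itself and recursed.
        let mut stream = analyzer.token_stream("Hello, happy tax payer!");
        while stream.advance() {
            println!("{:?}", stream.token());
        }
    }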