Fix stack overflow and add docs.

François Massot
2023-06-30 13:49:39 +02:00
committed by Paul Masurel
parent 81330aaf89
commit 0a23201338
3 changed files with 35 additions and 45 deletions
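
For context on the first half of the commit message: `token_stream` on `Box<dyn BoxableTokenizer>` must delegate to the concrete tokenizer's `box_token_stream` (see the tokenizer diff below); calling the method through the box itself resolves back to this same impl and recurses until the stack overflows. Below is a minimal, self-contained sketch of that dispatch shape, using deliberately simplified stand-in traits rather than tantivy's real definitions.

    // Simplified stand-ins for the real traits; only the dispatch shape matters here.
    trait Tokenizer: 'static {
        fn token_stream(&mut self, text: &str) -> Vec<String>;
    }

    trait BoxableTokenizer: 'static {
        fn box_token_stream(&mut self, text: &str) -> Vec<String>;
    }

    // Every concrete tokenizer gets a boxable adapter for free.
    impl<T: Tokenizer> BoxableTokenizer for T {
        fn box_token_stream(&mut self, text: &str) -> Vec<String> {
            self.token_stream(text)
        }
    }

    impl Tokenizer for Box<dyn BoxableTokenizer> {
        fn token_stream(&mut self, text: &str) -> Vec<String> {
            // BUG would be: `self.box_token_stream(text)` -- that call resolves on the
            // box (which is itself a `Tokenizer`, hence a `BoxableTokenizer` via the
            // blanket impl) and re-enters this method forever.
            // FIX: deref to `dyn BoxableTokenizer` so dispatch reaches the concrete type.
            (**self).box_token_stream(text)
        }
    }

    struct Whitespace; // trivial concrete tokenizer for the demo

    impl Tokenizer for Whitespace {
        fn token_stream(&mut self, text: &str) -> Vec<String> {
            text.split_whitespace().map(str::to_owned).collect()
        }
    }

    fn main() {
        let mut boxed: Box<dyn BoxableTokenizer> = Box::new(Whitespace);
        // Goes through `Tokenizer for Box<dyn BoxableTokenizer>`, then into `Whitespace`.
        assert_eq!(Tokenizer::token_stream(&mut boxed, "a b c").len(), 3);
    }

The `(**self)` double deref is the whole fix: it reaches the `dyn BoxableTokenizer` object, so dispatch lands on the concrete tokenizer instead of re-entering the boxed impl.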

View File

@@ -1,5 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::{TokenizerManager, TextAnalyzer, RemoveLongFilter, LowerCaser, SimpleTokenizer};
+use tantivy::tokenizer::{
+    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
+};

 const ALICE_TXT: &str = include_str!("alice.txt");
@@ -16,20 +18,6 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
-    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .build();
-    c.bench_function("static-tokenize-alice", |b| {
-        b.iter(|| {
-            let mut word_count = 0;
-            let mut token_stream = static_analyzer.token_stream(ALICE_TXT);
-            while token_stream.advance() {
-                word_count += 1;
-            }
-            assert_eq!(word_count, 30_731);
-        })
-    });
     let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
         .dynamic()
         .filter_dynamic(RemoveLongFilter::limit(40))

View File

@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
 use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
-use crate::tokenizer::{
-    FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
-};
+use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
 use crate::{DocAddress, Result, Searcher, TantivyError};

 #[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
             for value in values {
                 match value {
                     Value::PreTokStr(tok_str) => {
-                        let mut token_stream =
-                            PreTokenizedStream::from(tok_str.clone());
+                        let mut token_stream = PreTokenizedStream::from(tok_str.clone());
                         token_stream.process(&mut |token| {
                             if !self.is_noise_word(token.text.clone()) {
                                 let term = Term::from_field_text(field, &token.text);

View File

@@ -13,12 +13,17 @@ pub struct TextAnalyzer {
 impl Tokenizer for Box<dyn BoxableTokenizer> {
     type TokenStream<'a> = BoxTokenStream<'a>;

+    // Note: we want to call `box_token_stream` on the concrete `Tokenizer`
+    // implementation, not the `BoxableTokenizer` one, as that would cause
+    // a recursive call (and a stack overflow).
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         (**self).box_token_stream(text)
     }
 }

 impl Clone for Box<dyn BoxableTokenizer> {
+    // Note: we want to call `box_clone` on the concrete `Tokenizer`
+    // implementation in order to clone the concrete `Tokenizer`.
     fn clone(&self) -> Self {
         (**self).box_clone()
     }
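
The `box_clone` note above relies on the standard object-safe clone pattern. The `BoxableTokenizer` trait definition itself is not part of this diff, so the sketch below assumes a plausible shape for it; only the `Clone for Box<dyn ...>` impl mirrors the code above.

    // Assumed shape of the object-safe clone hook; not taken verbatim from tantivy.
    trait BoxableTokenizer {
        fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
    }

    #[derive(Clone)]
    struct DummyTokenizer; // stand-in for a concrete, cloneable tokenizer

    impl BoxableTokenizer for DummyTokenizer {
        fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
            // Clone the concrete type, then erase it behind the trait object again.
            Box::new(self.clone())
        }
    }

    impl Clone for Box<dyn BoxableTokenizer> {
        fn clone(&self) -> Self {
            // `**self` is `dyn BoxableTokenizer`, so this dispatches to the concrete
            // tokenizer's `box_clone` instead of looping back into `Clone::clone`.
            (**self).box_clone()
        }
    }

    fn main() {
        let original: Box<dyn BoxableTokenizer> = Box::new(DummyTokenizer);
        let _copy = original.clone(); // works even though `Clone` itself is not object safe
    }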
@@ -61,12 +66,12 @@ impl TextAnalyzer {
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
+        self.tokenizer.token_stream(text)
     }
 }

 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder<T=Box<dyn BoxableTokenizer>> {
+pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
     tokenizer: T,
 }
@@ -90,18 +95,20 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
         }
     }

-    /// Boxes the internal tokenizer. This is useful to write generic code.
-    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
-    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
-    /// will be more performant and create less boxes.
+    /// Boxes the internal tokenizer. This is useful for adding dynamic filters.
+    /// Note: this will be less performant than the non-boxed version.
     pub fn dynamic(self) -> TextAnalyzerBuilder {
         let boxed_tokenizer = Box::new(self.tokenizer);
         TextAnalyzerBuilder {
             tokenizer: boxed_tokenizer,
         }
     }

-    /// Apply a filter and returns a boxed version of the TextAnalyzerBuilder.
+    /// Appends a token filter to the current builder and returns a boxed version of the
+    /// builder. This is useful when you want to build a `TextAnalyzer` dynamically.
+    /// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
+    /// possible, as it will be more performant and create fewer boxes.
     pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
         self.filter(token_filter).dynamic()
     }
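
A usage sketch contrasting the two builder styles the docs above describe, assuming the builder API exactly as it appears in this commit (`filter` for statically known filters, `filter_dynamic` for boxed ones); the `fn main` wrapper is only there to make the snippet stand alone.

    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

    fn main() {
        // Preferred static chain: filter types are known at compile time, no boxing.
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .build();
        assert_eq!(analyzer.token_stream("Bullet").next().unwrap().text, "bullet");

        // Dynamic chain: each `filter_dynamic` boxes the pipeline, which is what lets
        // filters be chosen at runtime (e.g. from configuration), at some cost.
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter_dynamic(RemoveLongFilter::limit(40))
            .filter_dynamic(LowerCaser)
            .build();
        assert_eq!(analyzer.token_stream("Bullet").next().unwrap().text, "bullet");
    }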
@@ -114,12 +121,11 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     }
 }

 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer, SimpleTokenizer};
+    use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};

     #[test]
     fn test_text_analyzer_builder() {
@@ -133,8 +139,6 @@ mod tests {
         assert_eq!(stream.next().unwrap().text, "bullet");
     }

     #[test]
     fn test_text_analyzer_with_filters_boxed() {
         // This test shows how one can build a TextAnalyzer dynamically, by stacking a list
@@ -151,19 +155,20 @@ mod tests {
             SerializableTokenFilterEnum::LowerCaser(LowerCaser),
             SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
         ];
-        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter_dynamic(RemoveLongFilter::limit(40))
-            .filter_dynamic(LowerCaser);
-        // for filter in filters {
-        //     analyzer_builder =
-        //         match filter {
-        //             SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
-        //                 analyzer_builder.filter_dynamic(lower_caser),
-        //             SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
-        //                 analyzer_builder.filter_dynamic(remove_long_filter)
-        //             },
-        //         }
-        // }
+        let mut analyzer_builder: TextAnalyzerBuilder =
+            TextAnalyzer::builder(SimpleTokenizer::default())
+                .filter_dynamic(RemoveLongFilter::limit(40))
+                .filter_dynamic(LowerCaser);
+        for filter in filters {
+            analyzer_builder = match filter {
+                SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
+                    analyzer_builder.filter_dynamic(lower_caser)
+                }
+                SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
+                    analyzer_builder.filter_dynamic(remove_long_filter)
+                }
+            }
+        }
         let mut analyzer = analyzer_builder.build().clone();
         let mut stream = analyzer.token_stream("first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");