Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-03 15:52:55 +00:00
Fix stackoverflow and add docs.
commit 0a23201338 (parent 81330aaf89)
committed by Paul Masurel
@@ -1,5 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::{TokenizerManager, TextAnalyzer, RemoveLongFilter, LowerCaser, SimpleTokenizer};
+use tantivy::tokenizer::{
+    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
+};
 
 const ALICE_TXT: &str = include_str!("alice.txt");
 
@@ -16,20 +18,6 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
-    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .build();
-    c.bench_function("static-tokenize-alice", |b| {
-        b.iter(|| {
-            let mut word_count = 0;
-            let mut token_stream = static_analyzer.token_stream(ALICE_TXT);
-            while token_stream.advance() {
-                word_count += 1;
-            }
-            assert_eq!(word_count, 30_731);
-        })
-    });
     let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
         .dynamic()
         .filter_dynamic(RemoveLongFilter::limit(40))
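The hunk above cuts off right after the second `.filter_dynamic` call. Based on the removed static variant, the dynamic benchmark presumably continues roughly as follows; this is a sketch, and the "dynamic-tokenize-alice" bench name and the trailing lines are assumptions, not part of the diff:

    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .dynamic()
        .filter_dynamic(RemoveLongFilter::limit(40))
        .filter_dynamic(LowerCaser) // assumed next filter, mirroring the removed static chain
        .build();
    c.bench_function("dynamic-tokenize-alice", |b| { // bench name is a guess
        b.iter(|| {
            let mut word_count = 0;
            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
            while token_stream.advance() {
                word_count += 1;
            }
            assert_eq!(word_count, 30_731);
        })
    });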
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
 use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
-use crate::tokenizer::{
-    FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
-};
+use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
 use crate::{DocAddress, Result, Searcher, TantivyError};
 
 #[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
                 for value in values {
                     match value {
                         Value::PreTokStr(tok_str) => {
-                            let mut token_stream =
-                                PreTokenizedStream::from(tok_str.clone());
+                            let mut token_stream = PreTokenizedStream::from(tok_str.clone());
                             token_stream.process(&mut |token| {
                                 if !self.is_noise_word(token.text.clone()) {
                                     let term = Term::from_field_text(field, &token.text);
@@ -13,12 +13,17 @@ pub struct TextAnalyzer {
 impl Tokenizer for Box<dyn BoxableTokenizer> {
     type TokenStream<'a> = BoxTokenStream<'a>;
 
+    // Note: we want to call `box_token_stream` on the concrete `Tokenizer`
+    // implementation, not the `BoxableTokenizer` one as it will cause
+    // a recursive call (and a stack overflow).
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         (**self).box_token_stream(text)
     }
 }
 
 impl Clone for Box<dyn BoxableTokenizer> {
+    // Note: we want to call `box_clone` on the concrete `Tokenizer`
+    // implementation in order to clone the concrete `Tokenizer`.
     fn clone(&self) -> Self {
         (**self).box_clone()
     }
 }
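The new comments warn that resolving `box_token_stream` on the box itself, rather than on the concrete tokenizer behind it, recurses until the stack overflows; the `(**self)` deref is what routes dispatch to the concrete value. Below is a minimal, self-contained sketch of that shape of recursion. The `Speak`, `BoxableSpeak` and `Simple` names are made-up stand-ins, not tantivy's real types; `BoxableSpeak` only plays a role analogous to `BoxableTokenizer`.

    // Simplified stand-ins, not tantivy's actual traits.
    trait Speak {
        fn speak(&self) -> String;
    }

    // Helper trait with a blanket impl, analogous to `BoxableTokenizer`.
    trait BoxableSpeak {
        fn box_speak(&self) -> String;
    }

    impl<T: Speak + ?Sized> BoxableSpeak for T {
        fn box_speak(&self) -> String {
            self.speak()
        }
    }

    struct Simple;
    impl Speak for Simple {
        fn speak(&self) -> String {
            "simple".to_string()
        }
    }

    impl Speak for Box<dyn Speak> {
        fn speak(&self) -> String {
            // Correct: deref first, so dispatch reaches the concrete value
            // behind the box (here, `Simple`).
            (**self).box_speak()
            // Buggy alternative: `self.box_speak()` would resolve on
            // `Box<dyn Speak>` itself, whose blanket impl calls `self.speak()`
            // on the box again -> infinite recursion -> stack overflow.
        }
    }

    fn main() {
        let boxed: Box<dyn Speak> = Box::new(Simple);
        assert_eq!(boxed.speak(), "simple");
    }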
@@ -61,12 +66,12 @@ impl TextAnalyzer {
 
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
+        self.tokenizer.token_stream(text)
     }
 }
 
 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder<T=Box<dyn BoxableTokenizer>> {
+pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
     tokenizer: T,
 }
 
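This is the user-facing half of the fix: `TextAnalyzer::token_stream` now calls plain `token_stream` on its boxed tokenizer, which goes through the `impl Tokenizer for Box<dyn BoxableTokenizer>` above and, thanks to the deref there, ends up on the concrete tokenizer. A small usage sketch of the call path that previously overflowed, assuming the public re-exports already used by the benchmark in this diff; the `count_tokens` helper and the sample sentence are made up for illustration:

    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

    fn count_tokens(analyzer: &mut TextAnalyzer, text: &str) -> usize {
        let mut token_stream = analyzer.token_stream(text);
        let mut count = 0;
        while token_stream.advance() {
            count += 1;
        }
        count
    }

    fn main() {
        // A dynamically built analyzer exercises the Box<dyn BoxableTokenizer> path.
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .dynamic()
            .filter_dynamic(RemoveLongFilter::limit(40))
            .filter_dynamic(LowerCaser)
            .build();
        assert_eq!(count_tokens(&mut analyzer, "Hello, happy tax payer!"), 4);
    }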
@@ -90,18 +95,20 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
         }
     }
 
-    /// Boxes the internal tokenizer. This is useful to write generic code.
-    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
-    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
-    /// will be more performant and create less boxes.
+    /// Boxes the internal tokenizer. This is useful for adding dynamic filters.
+    /// Note: this will be less performant than the non boxed version.
     pub fn dynamic(self) -> TextAnalyzerBuilder {
         let boxed_tokenizer = Box::new(self.tokenizer);
         TextAnalyzerBuilder {
             tokenizer: boxed_tokenizer,
         }
     }
 
-    /// Apply a filter and returns a boxed version of the TextAnalyzerBuilder.
+    /// Appends a token filter to the current builder and returns a boxed version of the
+    /// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
+    /// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
+    /// possible as it will be more performant and create less boxes.
+    /// ```
     pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
         self.filter(token_filter).dynamic()
     }
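The rewritten docs spell out the trade-off: prefer the statically typed `.filter(...)` chain, and reach for `dynamic()` / `filter_dynamic()` only when the filter set is not known at compile time, since every dynamic call boxes. A hedged sketch of the situation the dynamic builder is meant for, where filters come from runtime configuration; the `build_from_names` helper and the name strings are made up, and it assumes `TextAnalyzerBuilder` is re-exported from `tantivy::tokenizer`:

    use tantivy::tokenizer::{
        LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TextAnalyzerBuilder,
    };

    // Hypothetical helper: the filter list is only known at runtime, so the
    // builder must keep a single type across loop iterations. The boxed
    // TextAnalyzerBuilder (default type parameter) makes that possible.
    fn build_from_names(names: &[&str]) -> TextAnalyzer {
        let mut builder: TextAnalyzerBuilder =
            TextAnalyzer::builder(SimpleTokenizer::default()).dynamic();
        for name in names {
            builder = match *name {
                "lowercase" => builder.filter_dynamic(LowerCaser),
                "remove_long" => builder.filter_dynamic(RemoveLongFilter::limit(40)),
                _ => builder,
            };
        }
        builder.build()
    }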
@@ -114,12 +121,11 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     }
 }
 
-
 #[cfg(test)]
 mod tests {
 
     use super::*;
-    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer, SimpleTokenizer};
+    use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
 
     #[test]
     fn test_text_analyzer_builder() {
@@ -133,8 +139,6 @@ mod tests {
         assert_eq!(stream.next().unwrap().text, "bullet");
     }
 
-
-
     #[test]
     fn test_text_analyzer_with_filters_boxed() {
         // This test shows how one can build a TextAnalyzer dynamically, by stacking a list
@@ -151,19 +155,20 @@ mod tests {
             SerializableTokenFilterEnum::LowerCaser(LowerCaser),
             SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
         ];
-        let mut analyzer_builder: TextAnalyzerBuilder = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter_dynamic(RemoveLongFilter::limit(40))
-            .filter_dynamic(LowerCaser);
-        // for filter in filters {
-        //     analyzer_builder =
-        //         match filter {
-        //             SerializableTokenFilterEnum::LowerCaser(lower_caser) =>
-        //                 analyzer_builder.filter_dynamic(lower_caser),
-        //             SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
-        //                 analyzer_builder.filter_dynamic(remove_long_filter)
-        //             },
-        //         }
-        // }
+        let mut analyzer_builder: TextAnalyzerBuilder =
+            TextAnalyzer::builder(SimpleTokenizer::default())
+                .filter_dynamic(RemoveLongFilter::limit(40))
+                .filter_dynamic(LowerCaser);
+        for filter in filters {
+            analyzer_builder = match filter {
+                SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
+                    analyzer_builder.filter_dynamic(lower_caser)
+                }
+                SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
+                    analyzer_builder.filter_dynamic(remove_long_filter)
+                }
+            }
+        }
         let mut analyzer = analyzer_builder.build().clone();
         let mut stream = analyzer.token_stream("first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
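The un-commented loop relies on a `filters` vector and a `SerializableTokenFilterEnum` whose definitions sit outside this hunk. From the two variants used above, the enum presumably looks something like the following; this is a guess, and the real definition, its derives, and any additional variants are not shown in the diff (inside the test module these types come in via `use super::*` / crate paths rather than the external `tantivy::tokenizer` path used here):

    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter};

    // Presumed shape of the enum driving the dynamic filter loop in the test.
    enum SerializableTokenFilterEnum {
        LowerCaser(LowerCaser),
        RemoveLongFilter(RemoveLongFilter),
    }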