mirror of https://github.com/quickwit-oss/tantivy.git

Avoid allocation in filters.
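Summary of the change: each buffering filter (`LowerCaser`, `AsciiFoldingFilter`) previously allocated a fresh scratch `String` for every token stream it wrapped. After this commit the filter owns the buffer and lends it to the stream it returns, which is why `filter` now takes `&'a mut self` and `OutputTokenStream` gains a lifetime parameter. A minimal sketch of the pattern; `Stream`, `UpperCaser`, and `VecStream` are invented for illustration, not tantivy's API:

// Minimal sketch of the buffer-reuse pattern applied by this commit.
trait Stream {
    fn advance(&mut self) -> bool;
    fn text_mut(&mut self) -> &mut String;
}

#[derive(Default)]
struct UpperCaser(String); // owns one scratch buffer for its whole lifetime

struct UpperCaserStream<'a, T> {
    buffer: &'a mut String, // borrowed from the filter, not freshly allocated
    tail: T,
}

impl UpperCaser {
    // `&'a mut self` ties the returned stream to the filter, so every call
    // reuses the same allocation instead of doing `String::new()`.
    fn filter<'a, T: Stream>(&'a mut self, tail: T) -> UpperCaserStream<'a, T> {
        self.0.clear();
        UpperCaserStream { buffer: &mut self.0, tail }
    }
}

impl<'a, T: Stream> Stream for UpperCaserStream<'a, T> {
    fn advance(&mut self) -> bool {
        if !self.tail.advance() {
            return false;
        }
        self.buffer.clear();
        self.buffer
            .extend(self.tail.text_mut().chars().flat_map(char::to_uppercase));
        // Exchange allocations instead of copying, exactly like the diff below.
        std::mem::swap(self.tail.text_mut(), &mut *self.buffer);
        true
    }
    fn text_mut(&mut self) -> &mut String {
        self.tail.text_mut()
    }
}

// A toy source stream over a fixed token list, for demonstration only.
struct VecStream(Vec<String>, usize);

impl Stream for VecStream {
    fn advance(&mut self) -> bool {
        self.1 += 1;
        self.1 <= self.0.len()
    }
    fn text_mut(&mut self) -> &mut String {
        &mut self.0[self.1 - 1]
    }
}

fn main() {
    let mut filter = UpperCaser::default();
    for text in ["first call", "second call reuses the buffer"] {
        let tokens: Vec<String> = text.split(' ').map(str::to_string).collect();
        let mut stream = filter.filter(VecStream(tokens, 0));
        while stream.advance() {
            print!("{} ", stream.text_mut());
        }
        println!();
    }
}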
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
     let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(LowerCaser)
+        .filter(LowerCaser::default())
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),
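The hunk above shows the only user-visible change: buffering filters are now constructed with `::default()` instead of a unit-struct literal. A hedged usage sketch of the post-commit API (imports assumed to be exported from `tantivy::tokenizer`; names as they appear in this diff):

use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer};

fn main() {
    // `LowerCaser` now carries a scratch buffer, hence `default()`.
    let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser::default())
        .filter(StopWordFilter::remove(vec![
            "the".to_string(),
            "and".to_string(),
        ]))
        .build();
    // `token_stream` takes `&mut self`, since filters may mutate their buffers.
    let mut stream = tokenizer.token_stream("The old AND the new");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}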
@@ -1209,7 +1209,7 @@ mod tests {
         ff_tokenizer_manager.register(
             "custom_lowercase",
             TextAnalyzer::builder(RawTokenizer::default())
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .build(),
         );
 
@@ -960,7 +960,8 @@ mod test {
         tokenizer_manager.register(
             "en_with_stop_words",
             TextAnalyzer::builder(SimpleTokenizer::default())
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
+                .filter(LowerCaser::default())
                 .filter(StopWordFilter::remove(vec!["the".to_string()]))
                 .build(),
         );
@@ -39,9 +39,9 @@ impl<T> AlphaNumOnlyFilterStream<T> {
 }
 
 impl TokenFilter for AlphaNumOnlyFilter {
-    type OutputTokenStream<T: TokenStream> = AlphaNumOnlyFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AlphaNumOnlyFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         AlphaNumOnlyFilterStream { tail: token_stream }
     }
 }
@@ -5,34 +5,35 @@ use super::{Token, TokenFilter, TokenStream};
 /// This class converts alphabetic, numeric, and symbolic Unicode characters
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
-#[derive(Clone)]
-pub struct AsciiFoldingFilter;
+#[derive(Clone, Default)]
+pub struct AsciiFoldingFilter(String);
 
 impl TokenFilter for AsciiFoldingFilter {
-    type OutputTokenStream<T: TokenStream> = AsciiFoldingFilterTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AsciiFoldingFilterTokenStream<'a, T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         AsciiFoldingFilterTokenStream {
-            buffer: String::new(),
+            buffer: &mut self.0,
             tail: token_stream,
         }
     }
 }
 
-pub struct AsciiFoldingFilterTokenStream<T> {
-    buffer: String,
+pub struct AsciiFoldingFilterTokenStream<'a, T> {
+    buffer: &'a mut String,
     tail: T,
 }
 
-impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
         }
         if !self.token_mut().text.is_ascii() {
             // ignore its already ascii
-            to_ascii(&self.tail.token().text, &mut self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            to_ascii(&self.tail.token().text, self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
         }
         true
     }
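The two-step in `advance` above is the heart of the allocation saving: the folded text is written into the shared buffer, then the buffer and the token text trade heap allocations, so no bytes are copied and no `String` is created per token. A standalone illustration, with `fold_into` as a hypothetical stand-in for tantivy's `to_ascii`:

use std::mem;

// Stand-in for `to_ascii`; it just uppercases, for brevity.
fn fold_into(text: &str, output: &mut String) {
    output.clear();
    output.extend(text.chars().flat_map(char::to_uppercase));
}

fn main() {
    let mut token_text = String::from("métal");
    let mut buffer = String::new();
    fold_into(&token_text, &mut buffer);
    // Swap the heap allocations instead of copying the folded text back:
    mem::swap(&mut token_text, &mut buffer);
    assert_eq!(token_text, "MÉTAL");
    // The old token text's allocation now serves as scratch space for the
    // next token, which is why the buffer lives in the filter.
    assert_eq!(buffer, "métal");
}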
@@ -1563,7 +1564,7 @@ mod tests {
     fn folding_helper(text: &str) -> Vec<String> {
         let mut tokens = Vec::new();
         TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
             .build()
             .token_stream(text)
             .process(&mut |token| {
@@ -1574,7 +1575,7 @@ mod tests {
 
     fn folding_using_raw_tokenizer_helper(text: &str) -> String {
         let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
            .build();
         let mut token_stream = tokenizer.token_stream(text);
         token_stream.advance();
@@ -3,22 +3,23 @@ use std::mem;
 use super::{Token, TokenFilter, TokenStream};
 
 /// Token filter that lowercase terms.
-#[derive(Clone)]
-pub struct LowerCaser;
+#[derive(Clone, Default)]
+pub struct LowerCaser(String);
 
 impl TokenFilter for LowerCaser {
-    type OutputTokenStream<T: TokenStream> = LowerCaserTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = LowerCaserTokenStream<'a, T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         LowerCaserTokenStream {
             tail: token_stream,
-            buffer: String::new(),
+            buffer: &mut self.0,
         }
     }
 }
 
-pub struct LowerCaserTokenStream<T> {
-    buffer: String,
+pub struct LowerCaserTokenStream<'a, T> {
+    buffer: &'a mut String,
     tail: T,
 }
 
@@ -33,7 +34,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
     }
 }
 
-impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
@@ -42,8 +43,8 @@ impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
             // fast track for ascii.
             self.token_mut().text.make_ascii_lowercase();
         } else {
-            to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            to_lowercase_unicode(&self.tail.token().text, self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
         }
         true
     }
@@ -76,7 +77,7 @@ mod tests {
 
     fn token_stream_helper(text: &str) -> Vec<Token> {
         let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .build();
 
         let mut token_stream = token_stream.token_stream(text);
@@ -68,7 +68,7 @@
 //!
 //! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .filter(Stemmer::new(Language::English))
 //!     .build();
 //! ```
@@ -115,7 +115,7 @@
 //! // We need to register our tokenizer :
 //! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .build();
 //! index
 //!     .tokenizers()
@@ -233,7 +233,7 @@ pub mod tests {
             "el_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(Stemmer::new(Language::Greek))
                 .build(),
         );
@@ -38,9 +38,9 @@ impl<T> RemoveLongFilterStream<T> {
 }
 
 impl TokenFilter for RemoveLongFilter {
-    type OutputTokenStream<T: TokenStream> = RemoveLongFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = RemoveLongFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         RemoveLongFilterStream {
             token_length_limit: self.length_limit,
             tail: token_stream,
@@ -80,9 +80,9 @@ impl SplitCompoundWords {
 }
 
 impl TokenFilter for SplitCompoundWords {
-    type OutputTokenStream<T: TokenStream> = SplitCompoundWordsTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = SplitCompoundWordsTokenStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         SplitCompoundWordsTokenStream {
             dict: self.dict.clone(),
             tail: token_stream,
@@ -81,9 +81,9 @@ impl Default for Stemmer {
 }
 
 impl TokenFilter for Stemmer {
-    type OutputTokenStream<T: TokenStream> = StemmerTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StemmerTokenStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream {
             tail: token_stream,
@@ -72,9 +72,9 @@ impl StopWordFilter {
 }
 
 impl TokenFilter for StopWordFilter {
-    type OutputTokenStream<T: TokenStream> = StopWordFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StopWordFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         StopWordFilterStream {
             words: self.words.clone(),
             tail: token_stream,
@@ -29,11 +29,17 @@ dyn_clone::clone_trait_object!(BoxableTokenizer);
 /// A boxable `TokenFilter`, with its `Tokenizer` type erased.
 trait BoxableTokenFilter: 'static + Send + Sync + DynClone {
     /// Transforms a boxed token stream into a new one.
-    fn box_filter<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a>;
 }
 
 impl<T: TokenFilter> BoxableTokenFilter for T {
-    fn box_filter<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a> {
         Box::new(self.filter(token_stream))
     }
 }
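Type erasure still works after this change because the returned `Box<dyn TokenStream + 'a>` simply captures the `&'a mut self` borrow. A condensed demonstration of that shape; `Stream`, `Borrowing`, and `BufferedFilter` are hypothetical stand-ins:

// Condensed shape of the `box_filter` change above.
trait Stream {
    fn advance(&mut self) -> bool;
}

// A stream that borrows its filter's buffer, like LowerCaserTokenStream.
struct Borrowing<'a> {
    _buffer: &'a mut String,
}

impl<'a> Stream for Borrowing<'a> {
    fn advance(&mut self) -> bool {
        false
    }
}

struct BufferedFilter(String);

impl BufferedFilter {
    // With `&self` this method could not hand out `&mut self.0`; taking
    // `&'a mut self` lets the boxed trait object capture that borrow.
    fn box_filter<'a>(&'a mut self) -> Box<dyn Stream + 'a> {
        Box::new(Borrowing { _buffer: &mut self.0 })
    }
}

fn main() {
    let mut filter = BufferedFilter(String::new());
    let mut stream = filter.box_filter();
    assert!(!stream.advance());
}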
@@ -87,7 +93,7 @@ impl TextAnalyzer {
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
         let mut token_stream = self.tokenizer.box_token_stream(text);
-        for token_filter in &self.token_filters {
+        for token_filter in self.token_filters.iter_mut() {
             token_stream = token_filter.0.box_filter(token_stream);
         }
         token_stream
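Chaining still borrow-checks even though each `box_filter` call now mutably borrows its filter: `iter_mut` yields a distinct `&mut` per vector element, and each returned box holds on to only its own filter's borrow. A reduced model of that loop, with a hypothetical `Stage` type:

// Reduced model of the `token_stream` loop above; `Stage::apply` mirrors
// `box_filter`'s `&'a mut self` shape.
struct Stage(String);

impl Stage {
    fn apply<'a>(
        &'a mut self,
        input: Box<dyn Iterator<Item = char> + 'a>,
    ) -> Box<dyn Iterator<Item = char> + 'a> {
        self.0.clear(); // scratch space per stage, reused across calls
        Box::new(input) // identity here; a real stage would transform tokens
    }
}

fn run<'a>(stages: &'a mut [Stage], text: &'a str) -> Box<dyn Iterator<Item = char> + 'a> {
    let mut stream: Box<dyn Iterator<Item = char> + 'a> = Box::new(text.chars());
    for stage in stages.iter_mut() {
        // Each iteration mutably borrows a different slice element; the box
        // returned by `apply` keeps only that element's borrow.
        stream = stage.apply(stream);
    }
    stream
}

fn main() {
    let mut stages = vec![Stage(String::new()), Stage(String::new())];
    let out: String = run(&mut stages, "abc").collect();
    assert_eq!(out, "abc");
}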
@@ -154,7 +160,7 @@ mod tests {
         let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
             .filter(AlphaNumOnlyFilter)
             .filter(RemoveLongFilter::limit(6))
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .build();
         let mut stream = analyzer.token_stream("- first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
@@ -167,7 +173,7 @@ mod tests {
             WhitespaceTokenizer::default(),
             vec![
                 BoxTokenFilter::from(AlphaNumOnlyFilter),
-                BoxTokenFilter::from(LowerCaser),
+                BoxTokenFilter::from(LowerCaser::default()),
                 BoxTokenFilter::from(RemoveLongFilter::limit(6)),
             ],
         );
@@ -63,14 +63,14 @@ impl Default for TokenizerManager {
             "default",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .build(),
         );
         manager.register(
             "en_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(Stemmer::new(Language::English))
                 .build(),
         );
@@ -115,9 +115,9 @@ pub trait TokenStream {
 pub trait TokenFilter: 'static + Send + Sync + Clone {
     /// The Tokenizer type returned by this filter, typically parametrized by the underlying
     /// Tokenizer.
-    type OutputTokenStream<T: TokenStream>: TokenStream;
+    type OutputTokenStream<'a, T: TokenStream>: TokenStream;
     /// Filter a token stream and returns a new one.
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T>;
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T>;
     /// Wraps a Tokenizer and returns a new one.
     fn transform<T: Tokenizer>(self, tokenizer: T) -> FilteredTokenizer<T, Self> {
         FilteredTokenizer {
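Why the associated type gains `'a`: implementors may now return a stream that borrows from the filter itself, while buffer-free filters (`RemoveLongFilter`, `Stemmer`, `StopWordFilter` above) simply leave the lifetime unused. A freestanding GAT demo of both cases; `MakeIter`, `Borrows`, and `Owns` are invented names, not tantivy types:

trait MakeIter {
    // `where Self: 'a` is needed in this standalone demo; tantivy's trait
    // gets the equivalent guarantee from its `'static` supertrait bound.
    type Iter<'a>: Iterator<Item = u8>
    where
        Self: 'a;

    fn make<'a>(&'a mut self) -> Self::Iter<'a>;
}

// Uses the lifetime: the iterator borrows from `self`, as the buffering
// filters' token streams now do.
struct Borrows(Vec<u8>);

impl MakeIter for Borrows {
    type Iter<'a> = std::iter::Copied<std::slice::Iter<'a, u8>> where Self: 'a;

    fn make<'a>(&'a mut self) -> Self::Iter<'a> {
        self.0.iter().copied()
    }
}

// Ignores the lifetime, as `RemoveLongFilter` does above.
struct Owns;

impl MakeIter for Owns {
    type Iter<'a> = std::ops::Range<u8> where Self: 'a;

    fn make<'a>(&'a mut self) -> Self::Iter<'a> {
        0..3
    }
}

fn main() {
    let mut b = Borrows(vec![1, 2, 3]);
    assert_eq!(b.make().sum::<u8>(), 6);
    assert_eq!(Owns.make().sum::<u8>(), 3);
}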
@@ -134,7 +134,7 @@ pub struct FilteredTokenizer<T: Tokenizer, F: TokenFilter> {
 }
 
 impl<T: Tokenizer, F: TokenFilter> Tokenizer for FilteredTokenizer<T, F> {
-    type TokenStream<'a> = F::OutputTokenStream<T::TokenStream<'a>>;
+    type TokenStream<'a> = F::OutputTokenStream<'a, T::TokenStream<'a>>;
 
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         let token_stream = self.tokenizer.token_stream(text);