Avoid allocation in filters.

François Massot
2023-06-29 14:57:39 +02:00
parent f6a6b4a2ff
commit f777de12ea
14 changed files with 58 additions and 49 deletions
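In a nutshell: instead of allocating a fresh `String` scratch buffer every time a filter builds a token stream, each filter now owns its buffer and lends it to the stream. `TokenFilter::filter` takes `&'a mut self`, and `OutputTokenStream` gains a lifetime parameter so the stream can hold a `&'a mut String`. The sketch below is a minimal, self-contained illustration of the pattern using toy `Token`/`TokenStream` types (not tantivy's real definitions, which carry more state such as offsets and positions):

// Minimal sketch of the buffer-reuse pattern (toy types for illustration).

struct Token {
    text: String,
}

trait TokenStream {
    fn advance(&mut self) -> bool;
    fn token_mut(&mut self) -> &mut Token;
}

// Before this commit, `LowerCaser` was a unit struct and every call to
// `filter` allocated `String::new()` for the stream's scratch buffer.
// Now the buffer lives in the filter itself and is lent to the stream.
#[derive(Clone, Default)]
struct LowerCaser(String);

struct LowerCaserTokenStream<'a, T> {
    buffer: &'a mut String, // borrowed from the filter: no per-stream allocation
    tail: T,
}

impl LowerCaser {
    // `&'a mut self` ties the stream to the filter, keeping the borrow valid.
    fn filter<'a, T: TokenStream>(&'a mut self, tail: T) -> LowerCaserTokenStream<'a, T> {
        self.0.clear(); // keep the capacity, drop the old contents
        LowerCaserTokenStream { buffer: &mut self.0, tail }
    }
}

impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
    fn advance(&mut self) -> bool {
        if !self.tail.advance() {
            return false;
        }
        let text = &mut self.tail.token_mut().text;
        if text.is_ascii() {
            text.make_ascii_lowercase(); // fast path, in place
        } else {
            self.buffer.clear();
            for c in text.chars() {
                self.buffer.extend(c.to_lowercase());
            }
            std::mem::swap(text, self.buffer); // swap, don't copy
        }
        true
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

// A one-token source stream, just enough to drive the demo.
struct SingleToken {
    token: Token,
    exhausted: bool,
}

impl TokenStream for SingleToken {
    fn advance(&mut self) -> bool {
        !std::mem::replace(&mut self.exhausted, true)
    }
    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

fn main() {
    let mut lower_caser = LowerCaser::default();
    // One filter, many streams: the scratch buffer is reused across calls.
    for input in ["Früh", "HELLO"] {
        let mut stream = lower_caser.filter(SingleToken {
            token: Token { text: input.to_string() },
            exhausted: false,
        });
        while stream.advance() {
            println!("{}", stream.token_mut().text); // früh, hello
        }
    }
}

The trade-off is that a filter can only back one live token stream at a time, which is why `filter` (and, below, `TextAnalyzer::token_stream`) now require mutable access.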

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
     let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(LowerCaser)
+        .filter(LowerCaser::default())
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),

View File

@@ -1209,7 +1209,7 @@ mod tests {
         ff_tokenizer_manager.register(
             "custom_lowercase",
             TextAnalyzer::builder(RawTokenizer::default())
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .build(),
         );

View File

@@ -960,7 +960,8 @@ mod test {
         tokenizer_manager.register(
             "en_with_stop_words",
             TextAnalyzer::builder(SimpleTokenizer::default())
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
+                .filter(LowerCaser::default())
                 .filter(StopWordFilter::remove(vec!["the".to_string()]))
                 .build(),
         );

View File

@@ -39,9 +39,9 @@ impl<T> AlphaNumOnlyFilterStream<T> {
 }
 
 impl TokenFilter for AlphaNumOnlyFilter {
-    type OutputTokenStream<T: TokenStream> = AlphaNumOnlyFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AlphaNumOnlyFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         AlphaNumOnlyFilterStream { tail: token_stream }
     }
 }

View File

@@ -5,34 +5,35 @@ use super::{Token, TokenFilter, TokenStream};
 /// This class converts alphabetic, numeric, and symbolic Unicode characters
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
-#[derive(Clone)]
-pub struct AsciiFoldingFilter;
+#[derive(Clone, Default)]
+pub struct AsciiFoldingFilter(String);
 
 impl TokenFilter for AsciiFoldingFilter {
-    type OutputTokenStream<T: TokenStream> = AsciiFoldingFilterTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AsciiFoldingFilterTokenStream<'a, T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         AsciiFoldingFilterTokenStream {
-            buffer: String::new(),
+            buffer: &mut self.0,
             tail: token_stream,
         }
     }
 }
 
-pub struct AsciiFoldingFilterTokenStream<T> {
-    buffer: String,
+pub struct AsciiFoldingFilterTokenStream<'a, T> {
+    buffer: &'a mut String,
     tail: T,
 }
 
-impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
         }
         if !self.token_mut().text.is_ascii() {
             // ignore its already ascii
-            to_ascii(&self.tail.token().text, &mut self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            to_ascii(&self.tail.token().text, self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
         }
         true
     }
@@ -1563,7 +1564,7 @@ mod tests {
     fn folding_helper(text: &str) -> Vec<String> {
         let mut tokens = Vec::new();
         TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
            .build()
            .token_stream(text)
            .process(&mut |token| {
@@ -1574,7 +1575,7 @@ mod tests {
     fn folding_using_raw_tokenizer_helper(text: &str) -> String {
         let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
             .build();
         let mut token_stream = tokenizer.token_stream(text);
         token_stream.advance();

View File

@@ -3,22 +3,23 @@ use std::mem;
 use super::{Token, TokenFilter, TokenStream};
 
 /// Token filter that lowercase terms.
-#[derive(Clone)]
-pub struct LowerCaser;
+#[derive(Clone, Default)]
+pub struct LowerCaser(String);
 
 impl TokenFilter for LowerCaser {
-    type OutputTokenStream<T: TokenStream> = LowerCaserTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = LowerCaserTokenStream<'a, T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         LowerCaserTokenStream {
             tail: token_stream,
-            buffer: String::new(),
+            buffer: &mut self.0,
         }
     }
 }
 
-pub struct LowerCaserTokenStream<T> {
-    buffer: String,
+pub struct LowerCaserTokenStream<'a, T> {
+    buffer: &'a mut String,
     tail: T,
 }
@@ -33,7 +34,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
     }
 }
 
-impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
@@ -42,8 +43,8 @@ impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
             // fast track for ascii.
             self.token_mut().text.make_ascii_lowercase();
         } else {
-            to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            to_lowercase_unicode(&self.tail.token().text, self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
         }
         true
     }
@@ -76,7 +77,7 @@ mod tests {
     fn token_stream_helper(text: &str) -> Vec<Token> {
         let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .build();
         let mut token_stream = token_stream.token_stream(text);

View File

@@ -68,7 +68,7 @@
 //!
 //! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .filter(Stemmer::new(Language::English))
 //!     .build();
 //! ```
@@ -115,7 +115,7 @@
 //! // We need to register our tokenizer :
 //! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .build();
 //! index
 //!     .tokenizers()
@@ -233,7 +233,7 @@ pub mod tests {
             "el_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(Stemmer::new(Language::Greek))
                 .build(),
         );

View File

@@ -38,9 +38,9 @@ impl<T> RemoveLongFilterStream<T> {
 }
 
 impl TokenFilter for RemoveLongFilter {
-    type OutputTokenStream<T: TokenStream> = RemoveLongFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = RemoveLongFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         RemoveLongFilterStream {
             token_length_limit: self.length_limit,
             tail: token_stream,

View File

@@ -80,9 +80,9 @@ impl SplitCompoundWords {
 }
 
 impl TokenFilter for SplitCompoundWords {
-    type OutputTokenStream<T: TokenStream> = SplitCompoundWordsTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = SplitCompoundWordsTokenStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         SplitCompoundWordsTokenStream {
             dict: self.dict.clone(),
             tail: token_stream,

View File

@@ -81,9 +81,9 @@ impl Default for Stemmer {
 }
 
 impl TokenFilter for Stemmer {
-    type OutputTokenStream<T: TokenStream> = StemmerTokenStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StemmerTokenStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream {
             tail: token_stream,

View File

@@ -72,9 +72,9 @@ impl StopWordFilter {
 }
 
 impl TokenFilter for StopWordFilter {
-    type OutputTokenStream<T: TokenStream> = StopWordFilterStream<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StopWordFilterStream<T>;
 
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         StopWordFilterStream {
             words: self.words.clone(),
             tail: token_stream,

View File

@@ -29,11 +29,17 @@ dyn_clone::clone_trait_object!(BoxableTokenizer);
 
 /// A boxable `TokenFilter`, with its `Tokenizer` type erased.
 trait BoxableTokenFilter: 'static + Send + Sync + DynClone {
     /// Transforms a boxed token stream into a new one.
-    fn box_filter<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a>;
 }
 
 impl<T: TokenFilter> BoxableTokenFilter for T {
-    fn box_filter<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a> {
         Box::new(self.filter(token_stream))
     }
 }
@@ -87,7 +93,7 @@ impl TextAnalyzer {
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
         let mut token_stream = self.tokenizer.box_token_stream(text);
-        for token_filter in &self.token_filters {
+        for token_filter in self.token_filters.iter_mut() {
             token_stream = token_filter.0.box_filter(token_stream);
         }
         token_stream
@@ -154,7 +160,7 @@ mod tests {
         let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
             .filter(AlphaNumOnlyFilter)
             .filter(RemoveLongFilter::limit(6))
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .build();
         let mut stream = analyzer.token_stream("- first bullet point");
         assert_eq!(stream.next().unwrap().text, "first");
@@ -167,7 +173,7 @@ mod tests {
             WhitespaceTokenizer::default(),
             vec![
                 BoxTokenFilter::from(AlphaNumOnlyFilter),
-                BoxTokenFilter::from(LowerCaser),
+                BoxTokenFilter::from(LowerCaser::default()),
                 BoxTokenFilter::from(RemoveLongFilter::limit(6)),
             ],
         );

View File

@@ -63,14 +63,14 @@ impl Default for TokenizerManager {
             "default",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .build(),
         );
         manager.register(
             "en_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(Stemmer::new(Language::English))
                 .build(),
         );

View File

@@ -115,9 +115,9 @@ pub trait TokenStream {
 pub trait TokenFilter: 'static + Send + Sync + Clone {
     /// The Tokenizer type returned by this filter, typically parametrized by the underlying
     /// Tokenizer.
-    type OutputTokenStream<T: TokenStream>: TokenStream;
+    type OutputTokenStream<'a, T: TokenStream>: TokenStream;
 
     /// Filter a token stream and returns a new one.
-    fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T>;
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T>;
 
     /// Wraps a Tokenizer and returns a new one.
     fn transform<T: Tokenizer>(self, tokenizer: T) -> FilteredTokenizer<T, Self> {
         FilteredTokenizer {
@@ -134,7 +134,7 @@ pub struct FilteredTokenizer<T: Tokenizer, F: TokenFilter> {
 }
 
 impl<T: Tokenizer, F: TokenFilter> Tokenizer for FilteredTokenizer<T, F> {
-    type TokenStream<'a> = F::OutputTokenStream<T::TokenStream<'a>>;
+    type TokenStream<'a> = F::OutputTokenStream<'a, T::TokenStream<'a>>;
 
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         let token_stream = self.tokenizer.token_stream(text);
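
For callers, the visible change is that filters are now values with state (`LowerCaser::default()` instead of `LowerCaser`) and that `token_stream` borrows the analyzer mutably. A small usage sketch, assuming the `tantivy::tokenizer` API as it stands after this commit:

use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser::default())
        .build();
    // `token_stream` takes `&mut self`: the stream borrows the analyzer
    // (and the scratch buffers inside its filters) for as long as it lives.
    let mut stream = analyzer.token_stream("Hello, Wörld");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
    drop(stream); // release the borrow before reusing `analyzer`
    // The second call reuses the filters' buffers instead of allocating.
    let mut second = analyzer.token_stream("SECOND Run");
    while second.advance() {
        println!("{}", second.token().text);
    }
}

The consequence of `&'a mut self` is that one analyzer can only back a single live token stream at a time; in exchange, the per-filter buffers are allocated once and reused across every stream the analyzer produces.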