Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 17:22:54 +00:00)
refactor tokenization pipeline to use GATs (#1924)
* refactor tokenization pipeline to use GATs
* fix doctests
* fix clippy lints
* remove commented code
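The `TextAnalyzerBuilder::filter` signature in the diff below hinges on a generic associated type (GAT): each `TokenFilter` names, at the type level, the tokenizer it produces when wrapped around an inner tokenizer `T`. A minimal sketch of that trait shape, inferred from the `F::Tokenizer<T>` bound and the `token_filter.transform(self.tokenizer)` call in this change (the exact bounds in `tokenizer_api` may differ):

```rust
use tokenizer_api::Tokenizer;

/// Sketch of a GAT-based token filter, as implied by this diff;
/// not necessarily the exact trait shipped in `tokenizer_api`.
pub trait TokenFilter: 'static + Send + Sync {
    /// The concrete tokenizer type produced by wrapping an inner tokenizer `T`.
    type Tokenizer<T: Tokenizer>: Tokenizer;

    /// Consumes the filter and wraps `tokenizer` in it.
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}
```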
```diff
@@ -1,15 +1,12 @@
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer};
+use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer};
 
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
 pub struct TextAnalyzer {
-    tokenizer: Box<dyn Tokenizer>,
-    token_filters: Vec<BoxTokenFilter>,
+    tokenizer: Box<dyn BoxableTokenizer>,
 }
 
 impl Default for TextAnalyzer {
```
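The single `Box<dyn BoxableTokenizer>` field replaces the old tokenizer-plus-filter-vector pair: the filter chain is now composed statically and erased behind one object-safe trait. A sketch of what that trait plausibly looks like, assuming a blanket impl that matches the `T: Tokenizer + Clone` bound visible in the new `From` impl, and assuming `BoxTokenStream` can be built from any concrete token stream via `Into` (both are assumptions, not confirmed by this diff):

```rust
use tokenizer_api::{BoxTokenStream, Tokenizer};

/// Sketch of an object-safe wrapper trait; `box_token_stream` and
/// `box_clone` are the two calls this diff makes through the trait object.
trait BoxableTokenizer: 'static + Send + Sync {
    /// Boxed counterpart of `Tokenizer::token_stream`.
    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
    /// Clone support behind the trait object, used by `impl Clone for TextAnalyzer`.
    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}

// Assumed blanket impl: any cloneable tokenizer is boxable, which would
// explain the `T: Tokenizer + Clone` bound on the new `From` impl below.
impl<T: Tokenizer + Clone> BoxableTokenizer for T {
    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        // Assumes a `From` impl converting any token stream into a boxed one.
        self.token_stream(text).into()
    }
    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
        Box::new(self.clone())
    }
}
```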
```diff
@@ -18,52 +15,21 @@ impl Default for TextAnalyzer {
     }
 }
 
-impl<T: Tokenizer> From<T> for TextAnalyzer {
+impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
     fn from(tokenizer: T) -> Self {
-        TextAnalyzer::new(tokenizer, Vec::new())
+        TextAnalyzer::builder(tokenizer).build()
     }
 }
 
 impl TextAnalyzer {
-    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
-    ///
-    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
-    /// `TextAnalyzer::from(tokenizer)`.
-    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
-        TextAnalyzer {
-            tokenizer: Box::new(tokenizer),
-            token_filters,
-        }
-    }
-
-    /// Appends a token filter to the current tokenizer.
-    ///
-    /// The method consumes the current `TokenStream` and returns a
-    /// new one.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use tantivy::tokenizer::*;
-    ///
-    /// let en_stem = TextAnalyzer::from(SimpleTokenizer)
-    ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser)
-    ///     .filter(Stemmer::default());
-    /// ```
-    #[must_use]
-    pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
-        self.token_filters.push(token_filter.into());
-        self
+    /// Create a new TextAnalyzerBuilder
+    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
+        TextAnalyzerBuilder { tokenizer }
     }
 
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        let mut token_stream = self.tokenizer.token_stream(text);
-        for token_filter in &self.token_filters {
-            token_stream = token_filter.transform(token_stream);
-        }
-        token_stream
+        self.tokenizer.box_token_stream(text)
     }
 }
```
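With filters folded into the tokenizer at build time, `token_stream` shrinks from a loop over boxed filters to a single dynamic dispatch. A usage sketch of the resulting API, iterating tokens with the standard `advance`/`token` pattern:

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

fn main() {
    // The filter chain is composed once here, not re-applied on every call.
    let analyzer = TextAnalyzer::builder(SimpleTokenizer)
        .filter(LowerCaser)
        .build();

    // One virtual call to `box_token_stream`, then plain iteration.
    let mut stream = analyzer.token_stream("Hello, Happy Tax Payer!");
    while stream.advance() {
        println!("{:?}", stream.token());
    }
}
```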
```diff
@@ -71,11 +37,39 @@ impl Clone for TextAnalyzer {
     fn clone(&self) -> Self {
         TextAnalyzer {
             tokenizer: self.tokenizer.box_clone(),
-            token_filters: self
-                .token_filters
-                .iter()
-                .map(|token_filter| token_filter.box_clone())
-                .collect(),
         }
     }
 }
+
+/// Builder helper for [`TextAnalyzer`]
+pub struct TextAnalyzerBuilder<T> {
+    tokenizer: T,
+}
+
+impl<T: Tokenizer> TextAnalyzerBuilder<T> {
+    /// Appends a token filter to the current builder.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let en_stem = TextAnalyzer::builder(SimpleTokenizer)
+    ///     .filter(RemoveLongFilter::limit(40))
+    ///     .filter(LowerCaser)
+    ///     .filter(Stemmer::default())
+    ///     .build();
+    /// ```
+    pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
+        TextAnalyzerBuilder {
+            tokenizer: token_filter.transform(self.tokenizer),
+        }
+    }
+
+    /// Finalize building the TextAnalyzer
+    pub fn build(self) -> TextAnalyzer {
+        TextAnalyzer {
+            tokenizer: Box::new(self.tokenizer),
+        }
+    }
+}
```
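Because `filter` rewraps the tokenizer at the type level, each call changes the builder's type parameter rather than pushing into a runtime vector, and boxing happens exactly once, in `build`. A hypothetical trace of how the builder's type evolves (the intermediate type is whatever each filter's `Tokenizer<T>` GAT resolves to; this is illustration, not an API guarantee):

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

fn main() {
    let b = TextAnalyzer::builder(SimpleTokenizer);
    // b: TextAnalyzerBuilder<SimpleTokenizer>

    let b = b.filter(LowerCaser);
    // b: TextAnalyzerBuilder<<LowerCaser as TokenFilter>::Tokenizer<SimpleTokenizer>>
    // Still a concrete nested type; no allocation has happened yet.

    let _analyzer = b.build(); // erased once into Box<dyn BoxableTokenizer>
}
```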