mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
* enable tokenizer on json fields enable tokenizer on json fields for type text * Avoid making the tokenizer within the TextAnalyzer pub(crate) * Moving BoxableTokenizer to tantivy. --------- Co-authored-by: Paul Masurel <paul@quickwit.io>
93 lines
2.6 KiB
Rust
93 lines
2.6 KiB
Rust
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
|
|
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
|
|
|
|
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
|
|
|
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
|
|
pub struct TextAnalyzer {
|
|
tokenizer: Box<dyn BoxableTokenizer>,
|
|
}
|
|
|
|
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
|
trait BoxableTokenizer: 'static + Send + Sync {
|
|
/// Creates a boxed token stream for a given `str`.
|
|
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
|
|
/// Clone this tokenizer.
|
|
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
|
|
}
|
|
|
|
impl<T: Tokenizer> BoxableTokenizer for T {
|
|
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
|
self.token_stream(text).into()
|
|
}
|
|
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
|
Box::new(self.clone())
|
|
}
|
|
}
|
|
|
|
impl Clone for TextAnalyzer {
|
|
fn clone(&self) -> Self {
|
|
TextAnalyzer {
|
|
tokenizer: self.tokenizer.box_clone(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Default for TextAnalyzer {
|
|
fn default() -> TextAnalyzer {
|
|
TextAnalyzer::from(EmptyTokenizer)
|
|
}
|
|
}
|
|
|
|
/// Wraps a bare tokenizer into a `TextAnalyzer` without adding any token filter.
impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
    fn from(tokenizer: T) -> Self {
        let builder = Self::builder(tokenizer);
        builder.build()
    }
}
|
|
|
|
impl TextAnalyzer {
|
|
/// Create a new TextAnalyzerBuilder
|
|
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
|
|
TextAnalyzerBuilder { tokenizer }
|
|
}
|
|
|
|
/// Creates a token stream for a given `str`.
|
|
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
|
self.tokenizer.box_token_stream(text)
|
|
}
|
|
}
|
|
|
|
/// Incrementally assembles a [`TextAnalyzer`].
///
/// `T` is the type of the tokenizer accumulated so far; each call to `filter`
/// returns a builder holding a new tokenizer type.
pub struct TextAnalyzerBuilder<T> {
    tokenizer: T,
}
|
|
|
|
impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
|
/// Appends a token filter to the current builder.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```rust
|
|
/// use tantivy::tokenizer::*;
|
|
///
|
|
/// let en_stem = TextAnalyzer::builder(SimpleTokenizer)
|
|
/// .filter(RemoveLongFilter::limit(40))
|
|
/// .filter(LowerCaser)
|
|
/// .filter(Stemmer::default())
|
|
/// .build();
|
|
/// ```
|
|
pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
|
|
TextAnalyzerBuilder {
|
|
tokenizer: token_filter.transform(self.tokenizer),
|
|
}
|
|
}
|
|
|
|
/// Finalize building the TextAnalyzer
|
|
pub fn build(self) -> TextAnalyzer {
|
|
TextAnalyzer {
|
|
tokenizer: Box::new(self.tokenizer),
|
|
}
|
|
}
|
|
}
|