Compare commits


8 Commits

Author           SHA1        Message                                                                              Date
François Massot  f777de12ea  Avoid allocation in filters.                                                         2023-06-29 14:57:39 +02:00
François Massot  f6a6b4a2ff  Clean.                                                                               2023-06-25 14:01:10 +02:00
François Massot  2cab111f99  Refactor token filter.                                                               2023-06-25 13:21:31 +02:00
François Massot  dc783f8328  Remove BoxTokenStream.                                                               2023-06-23 13:33:40 +02:00
François Massot  b82cd08f5d  Fix comment.                                                                         2023-06-22 09:13:21 +02:00
François Massot  54f43135f2  Use dyn_clone.                                                                       2023-06-22 09:13:21 +02:00
François Massot  6c6b97d4ef  Clean code and improve docs.                                                         2023-06-22 09:13:20 +02:00
François Massot  ad9b825067  Add boxed token filter to ease the building of TextAnalyzer with a vec of filters.  2023-06-22 09:12:23 +02:00
18 changed files with 221 additions and 227 deletions
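Taken together, these commits rework the tokenizer pipeline: `TokenFilter` now transforms `TokenStream`s directly through a new `filter` method, static filter chains compose through a new `FilteredTokenizer` wrapper, `BoxTokenStream` is removed in favor of plain `Box<dyn TokenStream>`, and a new `BoxTokenFilter` plus `TextAnalyzer::new` let an analyzer be assembled from a runtime vector of filters. A sketch of the two construction styles after this change, using only types that appear in the diffs below:

    use tantivy::tokenizer::{
        BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer,
    };

    // Static chain: each `filter` call wraps the tokenizer type itself,
    // so the whole pipeline is monomorphized and box-free.
    let mut static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser::default())
        .build();

    // Dynamic chain: filters are boxed, so the list can be assembled at
    // runtime (e.g. from configuration), at the cost of one box per filter.
    let mut dynamic_analyzer = TextAnalyzer::new(
        SimpleTokenizer::default(),
        vec![
            BoxTokenFilter::from(RemoveLongFilter::limit(40)),
            BoxTokenFilter::from(LowerCaser::default()),
        ],
    );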

View File

@@ -19,6 +19,7 @@ oneshot = "0.1.5"
 base64 = "0.21.0"
 byteorder = "1.4.3"
 crc32fast = "1.3.2"
+dyn-clone = "1.0.11"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
 aho-corasick = "1.0"
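The new `dyn-clone` dependency is what lets `TextAnalyzer` keep being `Clone` once its tokenizer and filters are boxed trait objects; it replaces the hand-written `box_clone` method removed by the `Use dyn_clone.` commit. A minimal sketch of the pattern, independent of tantivy (names invented for illustration):

    use dyn_clone::DynClone;

    // Any trait whose boxed objects we want to clone inherits from DynClone...
    trait Boxable: DynClone {
        fn describe(&self) -> String;
    }

    // ...and this macro generates `impl Clone for Box<dyn Boxable>`.
    dyn_clone::clone_trait_object!(Boxable);

    #[derive(Clone)]
    struct Example;

    impl Boxable for Example {
        fn describe(&self) -> String {
            "example".to_string()
        }
    }

    fn main() {
        let a: Box<dyn Boxable> = Box::new(Example);
        let b = a.clone(); // possible thanks to clone_trait_object!
        assert_eq!(a.describe(), b.describe());
    }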

View File

@@ -1,5 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::TokenizerManager;
+use tantivy::tokenizer::{
+    BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
+};

 const ALICE_TXT: &str = include_str!("alice.txt");
@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
+    let token_filters = vec![
+        BoxTokenFilter::from(RemoveLongFilter::limit(40)),
+        BoxTokenFilter::from(LowerCaser::default()),
+    ];
+    let mut dynamic_analyzer = TextAnalyzer::new(SimpleTokenizer::default(), token_filters);
+    c.bench_function("default-dynamic-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
 }

-criterion_group!(benches, criterion_benchmark);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().sample_size(200);
+    targets = criterion_benchmark
+}
 criterion_main!(benches);

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
     let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
-        .filter(LowerCaser)
+        .filter(LowerCaser::default())
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),

View File

@@ -1209,7 +1209,7 @@ mod tests {
        ff_tokenizer_manager.register(
            "custom_lowercase",
            TextAnalyzer::builder(RawTokenizer::default())
-               .filter(LowerCaser)
+               .filter(LowerCaser::default())
                .build(),
        );

View File

@@ -209,7 +209,7 @@ impl SegmentWriter {
         for value in values {
             let mut token_stream = match value {
                 Value::PreTokStr(tok_str) => {
-                    PreTokenizedStream::from(tok_str.clone()).into()
+                    Box::new(PreTokenizedStream::from(tok_str.clone()))
                 }
                 Value::Str(ref text) => {
                     let text_analyzer =
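With `BoxTokenStream` gone, both arms of this match must agree on `Box<dyn TokenStream + '_>`: the pre-tokenized arm boxes its stream explicitly, while the text arm relies on the new boxed return type of `TextAnalyzer::token_stream`. A simplified sketch of the surrounding match (the remaining arms are elided and the fallback arm is an assumption, not shown in this diff):

    let mut token_stream: Box<dyn TokenStream + '_> = match value {
        // Pre-tokenized input: box the stream ourselves.
        Value::PreTokStr(tok_str) => Box::new(PreTokenizedStream::from(tok_str.clone())),
        // Plain text: TextAnalyzer::token_stream already returns Box<dyn TokenStream>.
        Value::Str(ref text) => text_analyzer.token_stream(text),
        // Other value kinds are skipped (assumed here).
        _ => continue,
    };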

View File

@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
 use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
-use crate::tokenizer::{
-    BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
-};
+use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
 use crate::{DocAddress, Result, Searcher, TantivyError};

 #[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
         for value in values {
             match value {
                 Value::PreTokStr(tok_str) => {
-                    let mut token_stream: BoxTokenStream =
-                        PreTokenizedStream::from(tok_str.clone()).into();
+                    let mut token_stream = PreTokenizedStream::from(tok_str.clone());
                     token_stream.process(&mut |token| {
                         if !self.is_noise_word(token.text.clone()) {
                             let term = Term::from_field_text(field, &token.text);

View File

@@ -960,7 +960,8 @@ mod test {
        tokenizer_manager.register(
            "en_with_stop_words",
            TextAnalyzer::builder(SimpleTokenizer::default())
-               .filter(LowerCaser)
+               .filter(LowerCaser::default())
+               .filter(LowerCaser::default())
                .filter(StopWordFilter::remove(vec!["the".to_string()]))
                .build(),
        );

View File

@@ -21,7 +21,7 @@
 //! // the "emoji" is dropped because its not an alphanum
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream, Tokenizer};
+use super::{Token, TokenFilter, TokenStream};

 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
@@ -39,23 +39,10 @@ impl<T> AlphaNumOnlyFilterStream<T> {
 }

 impl TokenFilter for AlphaNumOnlyFilter {
-    type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AlphaNumOnlyFilterStream<T>;

-    fn transform<T: Tokenizer>(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper<T> {
-        AlphaNumOnlyFilterWrapper(tokenizer)
-    }
-}
-
-#[derive(Clone)]
-pub struct AlphaNumOnlyFilterWrapper<T>(T);
-
-impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
-    type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
-        AlphaNumOnlyFilterStream {
-            tail: self.0.token_stream(text),
-        }
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        AlphaNumOnlyFilterStream { tail: token_stream }
     }
 }
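The same mechanical change is applied to every built-in filter below: the per-filter `*Wrapper` tokenizer types disappear, and each filter only declares an `OutputTokenStream` type plus a `filter` method. Under the reworked trait, a stateless custom filter would look like this, a sketch mirroring `AlphaNumOnlyFilter` with invented names:

    use tantivy::tokenizer::{Token, TokenFilter, TokenStream};

    /// Hypothetical filter that drops tokens shorter than two bytes.
    #[derive(Clone)]
    struct ShortWordFilter;

    struct ShortWordFilterStream<T> {
        tail: T,
    }

    impl TokenFilter for ShortWordFilter {
        type OutputTokenStream<'a, T: TokenStream> = ShortWordFilterStream<T>;

        fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
            ShortWordFilterStream { tail: token_stream }
        }
    }

    impl<T: TokenStream> TokenStream for ShortWordFilterStream<T> {
        fn advance(&mut self) -> bool {
            // Pull tokens from the wrapped stream, skipping the short ones.
            while self.tail.advance() {
                if self.tail.token().text.len() >= 2 {
                    return true;
                }
            }
            false
        }

        fn token(&self) -> &Token {
            self.tail.token()
        }

        fn token_mut(&mut self) -> &mut Token {
            self.tail.token_mut()
        }
    }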

View File

@@ -1,38 +1,21 @@
 use std::mem;

-use super::{Token, TokenFilter, TokenStream, Tokenizer};
+use super::{Token, TokenFilter, TokenStream};

 /// This class converts alphabetic, numeric, and symbolic Unicode characters
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
-#[derive(Clone)]
-pub struct AsciiFoldingFilter;
+#[derive(Clone, Default)]
+pub struct AsciiFoldingFilter(String);

 impl TokenFilter for AsciiFoldingFilter {
-    type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
+    type OutputTokenStream<'a, T: TokenStream> = AsciiFoldingFilterTokenStream<'a, T>;

-    fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
-        AsciiFoldingFilterWrapper {
-            tokenizer,
-            buffer: String::new(),
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct AsciiFoldingFilterWrapper<T> {
-    tokenizer: T,
-    buffer: String,
-}
-
-impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
-    type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
-        self.buffer.clear();
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         AsciiFoldingFilterTokenStream {
-            buffer: &mut self.buffer,
-            tail: self.tokenizer.token_stream(text),
+            buffer: &mut self.0,
+            tail: token_stream,
         }
     }
 }
@@ -1581,7 +1564,7 @@ mod tests {
     fn folding_helper(text: &str) -> Vec<String> {
         let mut tokens = Vec::new();
         TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
             .build()
             .token_stream(text)
             .process(&mut |token| {
@@ -1592,7 +1575,7 @@ mod tests {
     fn folding_using_raw_tokenizer_helper(text: &str) -> String {
         let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
             .build();
         let mut token_stream = tokenizer.token_stream(text);
         token_stream.advance();
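Moving the scratch `String` into the filter itself is what the `Avoid allocation in filters.` commit is about: because `filter` takes `&'a mut self`, the buffer allocated for one token stream is cleared and reused by the next one instead of being reallocated. Usage is unchanged apart from the `::default()` constructor; a small sketch:

    use tantivy::tokenizer::{AsciiFoldingFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(AsciiFoldingFilter::default())
        .build();

    // "Mañana" folds to "Manana"; the internal buffer is reused across calls.
    let mut stream = analyzer.token_stream("Mañana café");
    assert_eq!(stream.next().unwrap().text, "Manana");
    assert_eq!(stream.next().unwrap().text, "cafe");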

View File

@@ -1,36 +1,19 @@
 use std::mem;

-use super::{Token, TokenFilter, TokenStream, Tokenizer};
+use super::{Token, TokenFilter, TokenStream};

 /// Token filter that lowercase terms.
-#[derive(Clone)]
-pub struct LowerCaser;
+#[derive(Clone, Default)]
+pub struct LowerCaser(String);

 impl TokenFilter for LowerCaser {
-    type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
+    type OutputTokenStream<'a, T: TokenStream> = LowerCaserTokenStream<'a, T>;

-    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
-        LowerCaserFilter {
-            tokenizer,
-            buffer: String::new(),
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct LowerCaserFilter<T> {
-    tokenizer: T,
-    buffer: String,
-}
-
-impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
-    type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
-        self.buffer.clear();
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
+        self.0.clear();
         LowerCaserTokenStream {
-            tail: self.tokenizer.token_stream(text),
-            buffer: &mut self.buffer,
+            tail: token_stream,
+            buffer: &mut self.0,
         }
     }
 }
@@ -94,7 +77,7 @@ mod tests {
     fn token_stream_helper(text: &str) -> Vec<Token> {
         let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .build();
         let mut token_stream = token_stream.token_stream(text);
View File

@@ -68,7 +68,7 @@
 //!
 //! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .filter(Stemmer::new(Language::English))
 //!     .build();
 //! ```
@@ -115,7 +115,7 @@
 //! // We need to register our tokenizer :
 //! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser)
+//!     .filter(LowerCaser::default())
 //!     .build();
 //! index
 //!     .tokenizers()
@@ -139,7 +139,7 @@ mod tokenizer;
 mod tokenizer_manager;
 mod whitespace_tokenizer;

-pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
+pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};

 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
+pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
 pub use self::tokenizer_manager::TokenizerManager;
 pub use self::whitespace_tokenizer::WhitespaceTokenizer;
@@ -233,7 +233,7 @@ pub mod tests {
         "el_stem",
         TextAnalyzer::builder(SimpleTokenizer::default())
             .filter(RemoveLongFilter::limit(40))
-            .filter(LowerCaser)
+            .filter(LowerCaser::default())
             .filter(Stemmer::new(Language::Greek))
             .build(),
     );

View File

@@ -12,7 +12,7 @@
 //! assert_eq!(stream.next().unwrap().text, "nice");
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream, Tokenizer};
+use super::{Token, TokenFilter, TokenStream};

 /// `RemoveLongFilter` removes tokens that are longer
 /// than a given number of bytes (in UTF-8 representation).
@@ -38,29 +38,12 @@ impl<T> RemoveLongFilterStream<T> {
 }

 impl TokenFilter for RemoveLongFilter {
-    type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
+    type OutputTokenStream<'a, T: TokenStream> = RemoveLongFilterStream<T>;

-    fn transform<T: Tokenizer>(self, tokenizer: T) -> RemoveLongFilterWrapper<T> {
-        RemoveLongFilterWrapper {
-            length_limit: self.length_limit,
-            inner: tokenizer,
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct RemoveLongFilterWrapper<T: Tokenizer> {
-    length_limit: usize,
-    inner: T,
-}
-
-impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
-    type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         RemoveLongFilterStream {
             token_length_limit: self.length_limit,
-            tail: self.inner.token_stream(text),
+            tail: token_stream,
         }
     }
 }

View File

@@ -1,6 +1,6 @@
 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

-use super::{Token, TokenFilter, TokenStream, Tokenizer};
+use super::{Token, TokenFilter, TokenStream};

 /// A [`TokenFilter`] which splits compound words into their parts
 /// based on a given dictionary.
@@ -80,29 +80,12 @@ impl SplitCompoundWords {
 }

 impl TokenFilter for SplitCompoundWords {
-    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
+    type OutputTokenStream<'a, T: TokenStream> = SplitCompoundWordsTokenStream<T>;

-    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
-        SplitCompoundWordsFilter {
-            dict: self.dict,
-            inner: tokenizer,
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct SplitCompoundWordsFilter<T> {
-    dict: AhoCorasick,
-    inner: T,
-}
-
-impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
-    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         SplitCompoundWordsTokenStream {
             dict: self.dict.clone(),
-            tail: self.inner.token_stream(text),
+            tail: token_stream,
             cuts: Vec::new(),
             parts: Vec::new(),
         }
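Note that `filter()` now clones the `AhoCorasick` automaton into every new token stream, since the filter may outlive the stream it produces. A usage sketch, assuming this type's existing `from_dictionary` constructor in tantivy (not shown in this diff):

    use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer, TokenStream};

    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(SplitCompoundWords::from_dictionary(["dampf", "schiff", "fahrt"]).unwrap())
        .build();

    // "dampfschifffahrt" is fully covered by the dictionary and gets split.
    let mut stream = analyzer.token_stream("dampfschifffahrt");
    assert_eq!(stream.next().unwrap().text, "dampf");
    assert_eq!(stream.next().unwrap().text, "schiff");
    assert_eq!(stream.next().unwrap().text, "fahrt");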

View File

@@ -4,7 +4,7 @@ use std::mem;
 use rust_stemmers::{self, Algorithm};
 use serde::{Deserialize, Serialize};

-use super::{Token, TokenFilter, TokenStream, Tokenizer};
+use super::{Token, TokenFilter, TokenStream};

 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -81,29 +81,12 @@ impl Default for Stemmer {
 }

 impl TokenFilter for Stemmer {
-    type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StemmerTokenStream<T>;

-    fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
-        StemmerFilter {
-            stemmer_algorithm: self.stemmer_algorithm,
-            inner: tokenizer,
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct StemmerFilter<T> {
-    stemmer_algorithm: Algorithm,
-    inner: T,
-}
-
-impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
-    type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream {
-            tail: self.inner.token_stream(text),
+            tail: token_stream,
             stemmer,
             buffer: String::new(),
         }

View File

@@ -21,7 +21,7 @@ use rustc_hash::FxHashSet;
 #[cfg(feature = "stopwords")]
 use super::Language;
-use super::{Token, TokenFilter, TokenStream, Tokenizer};
+use super::{Token, TokenFilter, TokenStream};

 /// `TokenFilter` that removes stop words from a token stream
 #[derive(Clone)]
@@ -72,29 +72,12 @@ impl StopWordFilter {
 }

 impl TokenFilter for StopWordFilter {
-    type Tokenizer<T: Tokenizer> = StopWordFilterWrapper<T>;
+    type OutputTokenStream<'a, T: TokenStream> = StopWordFilterStream<T>;

-    fn transform<T: Tokenizer>(self, tokenizer: T) -> StopWordFilterWrapper<T> {
-        StopWordFilterWrapper {
-            words: self.words,
-            inner: tokenizer,
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct StopWordFilterWrapper<T> {
-    words: Arc<FxHashSet<String>>,
-    inner: T,
-}
-
-impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
-    type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
         StopWordFilterStream {
             words: self.words.clone(),
-            tail: self.inner.token_stream(text),
+            tail: token_stream,
         }
     }
 }

View File

@@ -1,37 +1,103 @@
+use dyn_clone::DynClone;
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
+use tokenizer_api::{FilteredTokenizer, TokenFilter, TokenStream, Tokenizer};

 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;

 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
+#[derive(Clone)]
 pub struct TextAnalyzer {
     tokenizer: Box<dyn BoxableTokenizer>,
+    token_filters: Vec<BoxTokenFilter>,
 }

 /// A boxable `Tokenizer`, with its `TokenStream` type erased.
-trait BoxableTokenizer: 'static + Send + Sync {
+trait BoxableTokenizer: 'static + Send + Sync + DynClone {
     /// Creates a boxed token stream for a given `str`.
-    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
-    /// Clone this tokenizer.
-    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
+    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
 }

 impl<T: Tokenizer> BoxableTokenizer for T {
-    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.token_stream(text).into()
-    }
-    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
-        Box::new(self.clone())
+    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
+        Box::new(self.token_stream(text))
     }
 }

-impl Clone for TextAnalyzer {
-    fn clone(&self) -> Self {
+dyn_clone::clone_trait_object!(BoxableTokenizer);
+
+/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
+trait BoxableTokenFilter: 'static + Send + Sync + DynClone {
+    /// Transforms a boxed token stream into a new one.
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a>;
+}
+
+impl<T: TokenFilter> BoxableTokenFilter for T {
+    fn box_filter<'a>(
+        &'a mut self,
+        token_stream: Box<dyn TokenStream + 'a>,
+    ) -> Box<dyn TokenStream + 'a> {
+        Box::new(self.filter(token_stream))
+    }
+}
+
+dyn_clone::clone_trait_object!(BoxableTokenFilter);
+
+/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
+///
+/// See [`TokenFilter`] for more information.
+#[derive(Clone)]
+pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
+
+impl TextAnalyzer {
+    /// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
+    ///
+    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
+    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
+    /// will be more performant and create fewer boxes.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let en_stem = TextAnalyzer::new(
+    ///     SimpleTokenizer::default(),
+    ///     vec![
+    ///         BoxTokenFilter::from(RemoveLongFilter::limit(40)),
+    ///         BoxTokenFilter::from(LowerCaser::default()),
+    ///         BoxTokenFilter::from(Stemmer::default()),
+    ///     ]);
+    /// ```
+    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
         TextAnalyzer {
-            tokenizer: self.tokenizer.box_clone(),
+            tokenizer: Box::new(tokenizer),
+            token_filters,
         }
     }
+
+    /// Create a new TextAnalyzerBuilder.
+    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
+        TextAnalyzerBuilder { tokenizer }
+    }
+
+    /// Creates a token stream for a given `str`.
+    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
+        let mut token_stream = self.tokenizer.box_token_stream(text);
+        for token_filter in self.token_filters.iter_mut() {
+            token_stream = token_filter.0.box_filter(token_stream);
+        }
+        token_stream
+    }
 }

 impl Default for TextAnalyzer {
@@ -46,20 +112,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
     }
 }

-impl TextAnalyzer {
-    /// Create a new TextAnalyzerBuilder
-    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
-        TextAnalyzerBuilder { tokenizer }
-    }
-
-    /// Creates a token stream for a given `str`.
-    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.tokenizer.box_token_stream(text)
-    }
-}
-
 /// Builder helper for [`TextAnalyzer`]
-pub struct TextAnalyzerBuilder<T> {
+pub struct TextAnalyzerBuilder<T: Tokenizer> {
     tokenizer: T,
 }
@@ -77,7 +131,10 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     ///     .filter(Stemmer::default())
     ///     .build();
     /// ```
-    pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
+    pub fn filter<F: TokenFilter>(
+        self,
+        token_filter: F,
+    ) -> TextAnalyzerBuilder<FilteredTokenizer<T, F>> {
         TextAnalyzerBuilder {
             tokenizer: token_filter.transform(self.tokenizer),
         }
@@ -87,6 +144,41 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
     pub fn build(self) -> TextAnalyzer {
         TextAnalyzer {
             tokenizer: Box::new(self.tokenizer),
+            token_filters: Vec::new(),
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
+
+    #[test]
+    fn test_text_analyzer_builder() {
+        let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
+            .filter(AlphaNumOnlyFilter)
+            .filter(RemoveLongFilter::limit(6))
+            .filter(LowerCaser::default())
+            .build();
+        let mut stream = analyzer.token_stream("- first bullet point");
+        assert_eq!(stream.next().unwrap().text, "first");
+        assert_eq!(stream.next().unwrap().text, "point");
+    }
+
+    #[test]
+    fn test_text_analyzer_with_filters_boxed() {
+        let mut analyzer = TextAnalyzer::new(
+            WhitespaceTokenizer::default(),
+            vec![
+                BoxTokenFilter::from(AlphaNumOnlyFilter),
+                BoxTokenFilter::from(LowerCaser::default()),
+                BoxTokenFilter::from(RemoveLongFilter::limit(6)),
+            ],
+        );
+        let mut stream = analyzer.token_stream("- first bullet point");
+        assert_eq!(stream.next().unwrap().text, "first");
+        assert_eq!(stream.next().unwrap().text, "point");
+    }
+}

View File

@@ -63,14 +63,14 @@ impl Default for TokenizerManager {
             "default",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .build(),
         );
         manager.register(
             "en_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser)
+                .filter(LowerCaser::default())
                 .filter(Stemmer::new(Language::English))
                 .build(),
         );

View File

@@ -6,7 +6,6 @@
 //! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.

 use std::borrow::{Borrow, BorrowMut};
-use std::ops::{Deref, DerefMut};

 use serde::{Deserialize, Serialize};
@@ -60,30 +59,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
 }

-/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
-pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
-
-impl<'a, T> From<T> for BoxTokenStream<'a>
-where T: TokenStream + 'a
-{
-    fn from(token_stream: T) -> BoxTokenStream<'a> {
-        BoxTokenStream(Box::new(token_stream))
-    }
-}
-
-impl<'a> Deref for BoxTokenStream<'a> {
-    type Target = dyn TokenStream + 'a;
-
-    fn deref(&self) -> &Self::Target {
-        &*self.0
-    }
-}
-
-impl<'a> DerefMut for BoxTokenStream<'a> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut *self.0
-    }
-}
-
 impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
     fn advance(&mut self) -> bool {
         let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -137,12 +112,34 @@ pub trait TokenStream {
 }

 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync {
-    /// The Tokenizer type returned by this filter, typically parametrized by the underlying
-    /// Tokenizer.
-    type Tokenizer<T: Tokenizer>: Tokenizer;
+pub trait TokenFilter: 'static + Send + Sync + Clone {
+    type OutputTokenStream<'a, T: TokenStream>: TokenStream;
+
+    /// Filters a token stream and returns a new one.
+    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T>;

     /// Wraps a Tokenizer and returns a new one.
-    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> FilteredTokenizer<T, Self> {
+        FilteredTokenizer {
+            tokenizer,
+            token_filter: self,
+        }
+    }
 }

+#[derive(Clone)]
+pub struct FilteredTokenizer<T: Tokenizer, F: TokenFilter> {
+    tokenizer: T,
+    token_filter: F,
+}
+
+impl<T: Tokenizer, F: TokenFilter> Tokenizer for FilteredTokenizer<T, F> {
+    type TokenStream<'a> = F::OutputTokenStream<'a, T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        let token_stream = self.tokenizer.token_stream(text);
+        self.token_filter.filter(token_stream)
+    }
+}
+
 #[cfg(test)]
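With `transform` now carrying a default implementation, a filter author only writes `filter`; chaining is handled once by `FilteredTokenizer`, whose `token_stream` runs the wrapped tokenizer and pipes the result through the filter. A small sketch of what a single builder step expands to (the nested type in the comment is spelled out only for illustration):

    use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TokenFilter, TokenStream, Tokenizer};

    // Equivalent to TextAnalyzer::builder(SimpleTokenizer::default())
    //     .filter(LowerCaser::default()) before build(): the builder then
    // holds a FilteredTokenizer<SimpleTokenizer, LowerCaser>.
    let mut filtered = LowerCaser::default().transform(SimpleTokenizer::default());

    let mut stream = filtered.token_stream("HELLO World");
    assert!(stream.advance());
    assert_eq!(stream.token().text, "hello");
    assert!(stream.advance());
    assert_eq!(stream.token().text, "world");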