mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-25 04:30:40 +00:00
Refactor token filter.
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use tantivy::tokenizer::TokenizerManager;
|
||||
use tantivy::tokenizer::{
|
||||
BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
|
||||
};
|
||||
|
||||
const ALICE_TXT: &str = include_str!("alice.txt");
|
||||
|
||||
@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
|
||||
assert_eq!(word_count, 30_731);
|
||||
})
|
||||
});
|
||||
let token_filters = vec![
|
||||
BoxTokenFilter::from(RemoveLongFilter::limit(40)),
|
||||
BoxTokenFilter::from(LowerCaser),
|
||||
];
|
||||
let mut dynamic_analyzer = TextAnalyzer::new(SimpleTokenizer::default(), token_filters);
|
||||
c.bench_function("default-dynamic-tokenize-alice", |b| {
|
||||
b.iter(|| {
|
||||
let mut word_count = 0;
|
||||
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
|
||||
while token_stream.advance() {
|
||||
word_count += 1;
|
||||
}
|
||||
assert_eq!(word_count, 30_731);
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group!(benches, criterion_benchmark);
|
||||
criterion_group! {
|
||||
name = benches;
|
||||
config = Criterion::default().sample_size(200);
|
||||
targets = criterion_benchmark
|
||||
}
|
||||
criterion_main!(benches);
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
//! // the "emoji" is dropped because it's not alphanumeric
|
||||
//! assert!(stream.next().is_none());
|
||||
//! ```
|
||||
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// `TokenFilter` that removes all tokens that contain non
|
||||
/// ascii alphanumeric characters.
|
||||
@@ -39,23 +39,10 @@ impl<T> AlphaNumOnlyFilterStream<T> {
|
||||
}
|
||||
|
||||
impl TokenFilter for AlphaNumOnlyFilter {
|
||||
type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
|
||||
type OutputTokenStream<T: TokenStream> = AlphaNumOnlyFilterStream<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper<T> {
|
||||
AlphaNumOnlyFilterWrapper(tokenizer)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AlphaNumOnlyFilterWrapper<T>(T);
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
|
||||
type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
AlphaNumOnlyFilterStream {
|
||||
tail: self.0.token_stream(text),
|
||||
}
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
||||
AlphaNumOnlyFilterStream { tail: token_stream }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::mem;
|
||||
|
||||
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// This class converts alphabetic, numeric, and symbolic Unicode characters
|
||||
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
|
||||
@@ -9,48 +9,30 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub struct AsciiFoldingFilter;
|
||||
|
||||
impl TokenFilter for AsciiFoldingFilter {
|
||||
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
|
||||
type OutputTokenStream<T: TokenStream> = AsciiFoldingFilterTokenStream<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
|
||||
AsciiFoldingFilterWrapper {
|
||||
tokenizer,
|
||||
buffer: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AsciiFoldingFilterWrapper<T> {
|
||||
tokenizer: T,
|
||||
buffer: String,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
|
||||
type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.buffer.clear();
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
||||
AsciiFoldingFilterTokenStream {
|
||||
buffer: &mut self.buffer,
|
||||
tail: self.tokenizer.token_stream(text),
|
||||
buffer: String::new(),
|
||||
tail: token_stream,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AsciiFoldingFilterTokenStream<'a, T> {
|
||||
buffer: &'a mut String,
|
||||
pub struct AsciiFoldingFilterTokenStream<T> {
|
||||
buffer: String,
|
||||
tail: T,
|
||||
}
|
||||
|
||||
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
|
||||
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
}
|
||||
if !self.token_mut().text.is_ascii() {
|
||||
// The text contains non-ASCII characters, so fold it (pure ASCII text is left untouched).
|
||||
to_ascii(&self.tail.token().text, self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, self.buffer);
|
||||
to_ascii(&self.tail.token().text, &mut self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
@@ -1,42 +1,24 @@
|
||||
use std::mem;
|
||||
|
||||
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// Token filter that lowercases terms.
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaser;
|
||||
|
||||
impl TokenFilter for LowerCaser {
|
||||
type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
|
||||
type OutputTokenStream<T: TokenStream> = LowerCaserTokenStream<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
|
||||
LowerCaserFilter {
|
||||
tokenizer,
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
||||
LowerCaserTokenStream {
|
||||
tail: token_stream,
|
||||
buffer: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaserFilter<T> {
|
||||
tokenizer: T,
|
||||
pub struct LowerCaserTokenStream<T> {
|
||||
buffer: String,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
|
||||
type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.buffer.clear();
|
||||
LowerCaserTokenStream {
|
||||
tail: self.tokenizer.token_stream(text),
|
||||
buffer: &mut self.buffer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<'a, T> {
|
||||
buffer: &'a mut String,
|
||||
tail: T,
|
||||
}
|
||||
|
||||
@@ -51,7 +33,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
|
||||
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
@@ -60,8 +42,8 @@ impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
|
||||
// fast track for ascii.
|
||||
self.token_mut().text.make_ascii_lowercase();
|
||||
} else {
|
||||
to_lowercase_unicode(&self.tail.token().text, self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, self.buffer);
|
||||
to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
//! assert_eq!(stream.next().unwrap().text, "nice");
|
||||
//! assert!(stream.next().is_none());
|
||||
//! ```
|
||||
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
/// than a given number of bytes (in UTF-8 representation).
|
||||
@@ -38,29 +38,12 @@ impl<T> RemoveLongFilterStream<T> {
|
||||
}
|
||||
|
||||
impl TokenFilter for RemoveLongFilter {
|
||||
type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
|
||||
type OutputTokenStream<T: TokenStream> = RemoveLongFilterStream<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> RemoveLongFilterWrapper<T> {
|
||||
RemoveLongFilterWrapper {
|
||||
length_limit: self.length_limit,
|
||||
inner: tokenizer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RemoveLongFilterWrapper<T: Tokenizer> {
|
||||
length_limit: usize,
|
||||
inner: T,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
|
||||
type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
||||
RemoveLongFilterStream {
|
||||
token_length_limit: self.length_limit,
|
||||
tail: self.inner.token_stream(text),
|
||||
tail: token_stream,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
|
||||
|
||||
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// A [`TokenFilter`] which splits compound words into their parts
|
||||
/// based on a given dictionary.
|
||||
@@ -80,29 +80,12 @@ impl SplitCompoundWords {
|
||||
}
|
||||
|
||||
impl TokenFilter for SplitCompoundWords {
|
||||
type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
|
||||
type OutputTokenStream<T: TokenStream> = SplitCompoundWordsTokenStream<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
|
||||
SplitCompoundWordsFilter {
|
||||
dict: self.dict,
|
||||
inner: tokenizer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SplitCompoundWordsFilter<T> {
|
||||
dict: AhoCorasick,
|
||||
inner: T,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
|
||||
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
||||
SplitCompoundWordsTokenStream {
|
||||
dict: self.dict.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
tail: token_stream,
|
||||
cuts: Vec::new(),
|
||||
parts: Vec::new(),
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::mem;
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// Available stemmer languages.
|
||||
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
|
||||
@@ -81,29 +81,12 @@ impl Default for Stemmer {
|
||||
}
|
||||
|
||||
impl TokenFilter for Stemmer {
|
||||
type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
|
||||
type OutputTokenStream<T: TokenStream> = StemmerTokenStream<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
|
||||
StemmerFilter {
|
||||
stemmer_algorithm: self.stemmer_algorithm,
|
||||
inner: tokenizer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct StemmerFilter<T> {
|
||||
stemmer_algorithm: Algorithm,
|
||||
inner: T,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
|
||||
type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
||||
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
||||
StemmerTokenStream {
|
||||
tail: self.inner.token_stream(text),
|
||||
tail: token_stream,
|
||||
stemmer,
|
||||
buffer: String::new(),
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ use rustc_hash::FxHashSet;
|
||||
|
||||
#[cfg(feature = "stopwords")]
|
||||
use super::Language;
|
||||
use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// `TokenFilter` that removes stop words from a token stream
|
||||
#[derive(Clone)]
|
||||
@@ -72,29 +72,12 @@ impl StopWordFilter {
|
||||
}
|
||||
|
||||
impl TokenFilter for StopWordFilter {
|
||||
type Tokenizer<T: Tokenizer> = StopWordFilterWrapper<T>;
|
||||
type OutputTokenStream<T: TokenStream> = StopWordFilterStream<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> StopWordFilterWrapper<T> {
|
||||
StopWordFilterWrapper {
|
||||
words: self.words,
|
||||
inner: tokenizer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct StopWordFilterWrapper<T> {
|
||||
words: Arc<FxHashSet<String>>,
|
||||
inner: T,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
|
||||
type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T> {
|
||||
StopWordFilterStream {
|
||||
words: self.words.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
tail: token_stream,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use dyn_clone::DynClone;
|
||||
/// The tokenizer module contains all of the tools used to process
|
||||
/// text in `tantivy`.
|
||||
use tokenizer_api::{TokenFilter, TokenStream, Tokenizer};
|
||||
use tokenizer_api::{FilteredTokenizer, TokenFilter, TokenStream, Tokenizer};
|
||||
|
||||
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
||||
|
||||
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
|
||||
#[derive(Clone)]
|
||||
pub struct TextAnalyzer {
|
||||
tokenizer: Box<dyn BoxableTokenizer>,
|
||||
token_filters: Vec<BoxTokenFilter>,
|
||||
}
|
||||
|
||||
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
||||
@@ -25,32 +25,30 @@ impl<T: Tokenizer> BoxableTokenizer for T {
|
||||
|
||||
dyn_clone::clone_trait_object!(BoxableTokenizer);
|
||||
|
||||
/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
|
||||
#[derive(Clone)]
|
||||
struct BoxTokenizer(Box<dyn BoxableTokenizer>);
|
||||
|
||||
impl Tokenizer for BoxTokenizer {
|
||||
type TokenStream<'a> = Box<dyn TokenStream + 'a>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.0.box_token_stream(text).into()
|
||||
}
|
||||
}
|
||||
|
||||
/// A boxable `TokenFilter`, with its input and output `TokenStream` types erased.
|
||||
trait BoxableTokenFilter: 'static + Send + Sync {
|
||||
/// Wraps a `BoxedTokenizer` and returns a new one.
|
||||
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
|
||||
trait BoxableTokenFilter: 'static + Send + Sync + DynClone {
|
||||
/// Transforms a boxed token stream into a new one.
|
||||
fn box_transform<'a>(
|
||||
&self,
|
||||
token_stream: Box<dyn TokenStream + 'a>,
|
||||
) -> Box<dyn TokenStream + 'a>;
|
||||
}
|
||||
|
||||
impl<T: TokenFilter> BoxableTokenFilter for T {
|
||||
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
|
||||
let tokenizer = self.clone().transform(tokenizer);
|
||||
BoxTokenizer(Box::new(tokenizer))
|
||||
fn box_transform<'a>(
|
||||
&self,
|
||||
token_stream: Box<dyn TokenStream + 'a>,
|
||||
) -> Box<dyn TokenStream + 'a> {
|
||||
Box::new(self.clone().filter(token_stream))
|
||||
}
|
||||
}
|
||||
|
||||
/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
|
||||
dyn_clone::clone_trait_object!(BoxableTokenFilter);
|
||||
|
||||
/// Simple wrapper of `Box<dyn BoxableTokenFilter>`.
|
||||
///
|
||||
/// See [`TokenFilter`] for more information.
|
||||
#[derive(Clone)]
|
||||
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
|
||||
|
||||
impl<T: TokenFilter> From<T> for BoxTokenFilter {
|
||||
@@ -59,6 +57,19 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TextAnalyzer {
|
||||
fn clone(&self) -> Self {
|
||||
TextAnalyzer {
|
||||
tokenizer: self.tokenizer.clone(),
|
||||
token_filters: self
|
||||
.token_filters
|
||||
.iter()
|
||||
.map(|token_filter| token_filter.clone())
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TextAnalyzer {
|
||||
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
|
||||
///
|
||||
@@ -71,7 +82,7 @@ impl TextAnalyzer {
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let en_stem = TextAnalyzer::build(
|
||||
/// let en_stem = TextAnalyzer::new(
|
||||
/// SimpleTokenizer::default(),
|
||||
/// vec![
|
||||
/// BoxTokenFilter::from(RemoveLongFilter::limit(40)),
|
||||
@@ -79,27 +90,25 @@ impl TextAnalyzer {
|
||||
/// BoxTokenFilter::from(Stemmer::default()),
|
||||
/// ]);
|
||||
/// ```
|
||||
pub fn build<T: Tokenizer>(
|
||||
tokenizer: T,
|
||||
boxed_token_filters: Vec<BoxTokenFilter>,
|
||||
) -> TextAnalyzer {
|
||||
let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
|
||||
for filter in boxed_token_filters.into_iter() {
|
||||
boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
|
||||
}
|
||||
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
|
||||
TextAnalyzer {
|
||||
tokenizer: boxed_tokenizer.0,
|
||||
tokenizer: Box::new(tokenizer),
|
||||
token_filters,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new TextAnalyzerBuilder
|
||||
/// Create a new TextAnalyzerBuilder.
|
||||
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
|
||||
TextAnalyzerBuilder { tokenizer }
|
||||
}
|
||||
|
||||
/// Creates a token stream for a given `str`.
|
||||
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
|
||||
self.tokenizer.box_token_stream(text)
|
||||
let mut token_stream = self.tokenizer.box_token_stream(text);
|
||||
for token_filter in &self.token_filters {
|
||||
token_stream = token_filter.0.box_transform(token_stream);
|
||||
}
|
||||
token_stream
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,7 +143,10 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
||||
/// .filter(Stemmer::default())
|
||||
/// .build();
|
||||
/// ```
|
||||
pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
|
||||
pub fn filter<F: TokenFilter>(
|
||||
self,
|
||||
token_filter: F,
|
||||
) -> TextAnalyzerBuilder<FilteredTokenizer<T, F>> {
|
||||
TextAnalyzerBuilder {
|
||||
tokenizer: token_filter.transform(self.tokenizer),
|
||||
}
|
||||
@@ -144,6 +156,7 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
||||
pub fn build(self) -> TextAnalyzer {
|
||||
TextAnalyzer {
|
||||
tokenizer: Box::new(self.tokenizer),
|
||||
token_filters: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,7 +181,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_text_analyzer_with_filters_boxed() {
|
||||
let mut analyzer = TextAnalyzer::build(
|
||||
let mut analyzer = TextAnalyzer::new(
|
||||
WhitespaceTokenizer::default(),
|
||||
vec![
|
||||
BoxTokenFilter::from(AlphaNumOnlyFilter),
|
||||
|
||||
@@ -115,9 +115,31 @@ pub trait TokenStream {
|
||||
pub trait TokenFilter: 'static + Send + Sync + Clone {
|
||||
/// The token stream type returned by this filter, typically parametrized by the underlying
|
||||
/// token stream.
|
||||
type Tokenizer<T: Tokenizer>: Tokenizer;
|
||||
type OutputTokenStream<T: TokenStream>: TokenStream;
|
||||
/// Filter a token stream and returns a new one.
|
||||
fn filter<T: TokenStream>(&self, token_stream: T) -> Self::OutputTokenStream<T>;
|
||||
/// Wraps a Tokenizer and returns a new one.
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> FilteredTokenizer<T, Self> {
|
||||
FilteredTokenizer {
|
||||
tokenizer,
|
||||
token_filter: self,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FilteredTokenizer<T: Tokenizer, F: TokenFilter> {
|
||||
tokenizer: T,
|
||||
token_filter: F,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer, F: TokenFilter> Tokenizer for FilteredTokenizer<T, F> {
|
||||
type TokenStream<'a> = F::OutputTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
let token_stream = self.tokenizer.token_stream(text);
|
||||
self.token_filter.filter(token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user