Dynamic analyzer (#755)

* Removed generics in tokenizers

* lowercaser

* Added TokenizerExt

* Introducing BoxedTokenizer

* Introducing BoxXXXXX helper struct

* Closes #762.

* Introducing a TextAnalyzer
This commit is contained in:
Paul Masurel
2020-01-29 18:23:37 +09:00
committed by GitHub
parent f6847c46d7
commit 811fd0cb9e
22 changed files with 348 additions and 442 deletions

View File

@@ -2,7 +2,7 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = SimpleTokenizer
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5));
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
@@ -13,6 +13,7 @@
//! ```
//!
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
@@ -31,56 +32,27 @@ impl RemoveLongFilter {
}
}
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
impl<'a> RemoveLongFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
token.text.len() < self.token_length_limit
}
}
fn wrap(
token_length_limit: usize,
tail: TailTokenStream,
) -> RemoveLongFilterStream<TailTokenStream> {
RemoveLongFilterStream {
token_length_limit,
tail,
}
impl TokenFilter for RemoveLongFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(RemoveLongFilterStream {
token_length_limit: self.length_limit,
tail: token_stream,
})
}
}
impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
where
TailTokenStream: TokenStream,
{
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
RemoveLongFilterStream::wrap(self.length_limit, token_stream)
}
}
pub struct RemoveLongFilterStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
pub struct RemoveLongFilterStream<'a> {
token_length_limit: usize,
tail: TailTokenStream,
tail: BoxTokenStream<'a>,
}
impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
impl<'a> TokenStream for RemoveLongFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
@@ -89,4 +61,12 @@ where
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}