mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-31 15:40:40 +00:00
Dynamic analyzer (#755)
* Removed generics in tokenizers * lowercaser * Added TokenizerExt * Introducing BoxedTokenizer * Introducing BoxXXXXX helper struct * Closes #762. * Introducing a TextAnalyzer
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = SimpleTokenizer
|
||||
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
//! .filter(RemoveLongFilter::limit(5));
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("toolong nice");
|
||||
@@ -13,6 +13,7 @@
|
||||
//! ```
|
||||
//!
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use crate::tokenizer::BoxTokenStream;
|
||||
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
/// than a given number of bytes (in UTF-8 representation).
|
||||
@@ -31,56 +32,27 @@ impl RemoveLongFilter {
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
impl<'a> RemoveLongFilterStream<'a> {
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
token.text.len() < self.token_length_limit
|
||||
}
|
||||
}
|
||||
|
||||
fn wrap(
|
||||
token_length_limit: usize,
|
||||
tail: TailTokenStream,
|
||||
) -> RemoveLongFilterStream<TailTokenStream> {
|
||||
RemoveLongFilterStream {
|
||||
token_length_limit,
|
||||
tail,
|
||||
}
|
||||
impl TokenFilter for RemoveLongFilter {
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream::from(RemoveLongFilterStream {
|
||||
token_length_limit: self.length_limit,
|
||||
tail: token_stream,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
RemoveLongFilterStream::wrap(self.length_limit, token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RemoveLongFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
pub struct RemoveLongFilterStream<'a> {
|
||||
token_length_limit: usize,
|
||||
tail: TailTokenStream,
|
||||
tail: BoxTokenStream<'a>,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for RemoveLongFilterStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
@@ -89,4 +61,12 @@ where
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user