//! # Example
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//!     .filter(RemoveLongFilter::limit(5));
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered
//! // out of the token stream.
//! assert_eq!(stream.next().unwrap().text, "nice");
//! assert!(stream.next().is_none());
//! ```
//!
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;

/// `RemoveLongFilter` removes tokens whose length in bytes
/// (in their UTF-8 representation) reaches or exceeds a given limit.
///
/// It is especially useful when indexing unconstrained content,
/// e.g. mail containing base-64 encoded pictures.
#[derive(Clone)]
pub struct RemoveLongFilter {
    length_limit: usize,
}

impl RemoveLongFilter {
    /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
    pub fn limit(length_limit: usize) -> RemoveLongFilter {
        RemoveLongFilter { length_limit }
    }
}

impl<'a> RemoveLongFilterStream<'a> {
    /// Returns `true` if the token is short enough to be kept.
    fn predicate(&self, token: &Token) -> bool {
        token.text.len() < self.token_length_limit
    }
}

impl TokenFilter for RemoveLongFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(RemoveLongFilterStream {
            token_length_limit: self.length_limit,
            tail: token_stream,
        })
    }
}

/// Token stream returned by `RemoveLongFilter`: it wraps the underlying
/// stream and yields only the tokens that pass the length predicate.
pub struct RemoveLongFilterStream<'a> {
    token_length_limit: usize,
    tail: BoxTokenStream<'a>,
}

impl<'a> TokenStream for RemoveLongFilterStream<'a> {
    fn advance(&mut self) -> bool {
        // Pull tokens from the wrapped stream until one passes the length
        // predicate; return `false` once the underlying stream is exhausted.
        while self.tail.advance() {
            if self.predicate(self.tail.token()) {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
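
// Illustrative addition, not part of the original file: a minimal test sketch
// showing that `RemoveLongFilter` measures the UTF-8 *byte* length of a token,
// not its character count, and that a token exactly at the limit is removed too.
// It reuses only the API already shown in the module example above
// (`TextAnalyzer`, `SimpleTokenizer`, `RemoveLongFilter::limit`, `token_stream`,
// `next`) and assumes `SimpleTokenizer` keeps accented letters inside a single
// token, since it splits on non-alphanumeric characters.
#[cfg(test)]
mod tests {
    use super::RemoveLongFilter;
    use crate::tokenizer::{SimpleTokenizer, TextAnalyzer};

    #[test]
    fn remove_long_filter_counts_utf8_bytes() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(5));
        // "déjà" is only 4 characters but 6 bytes in UTF-8, so it is removed;
        // "vu" (2 bytes) is kept.
        let mut stream = tokenizer.token_stream("déjà vu");
        assert_eq!(stream.next().unwrap().text, "vu");
        assert!(stream.next().is_none());
    }

    #[test]
    fn remove_long_filter_removes_tokens_at_the_limit() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(5));
        // "abcde" is exactly 5 bytes: with `limit(5)` it is removed as well.
        let mut stream = tokenizer.token_stream("abcde abcd");
        assert_eq!(stream.next().unwrap().text, "abcd");
        assert!(stream.next().is_none());
    }
}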