Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-28 04:52:55 +00:00)

Compare commits: test_parse...fmassot/ad (8 commits)
| Author | SHA1 | Date |
|---|---|---|
| | f777de12ea | |
| | f6a6b4a2ff | |
| | 2cab111f99 | |
| | dc783f8328 | |
| | b82cd08f5d | |
| | 54f43135f2 | |
| | 6c6b97d4ef | |
| | ad9b825067 | |
@@ -19,6 +19,7 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
dyn-clone = "1.0.11"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"

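Two crates in this list tie into the rest of the diff: `aho-corasick` backs the `SplitCompoundWords` filter, and `dyn-clone` is what lets the boxed `BoxableTokenizer` / `BoxableTokenFilter` trait objects introduced further down stay cloneable without a hand-written `box_clone` method. A minimal, stand-alone sketch of that `dyn-clone` pattern (the `BoxableThing` and `Widget` names are illustrative, not taken from the diff):

```rust
use dyn_clone::DynClone;

// Any trait object we want to clone just needs DynClone as a supertrait.
trait BoxableThing: DynClone {
    fn describe(&self) -> String;
}

// Generates `impl Clone for Box<dyn BoxableThing>` by delegating to DynClone.
dyn_clone::clone_trait_object!(BoxableThing);

#[derive(Clone)]
struct Widget(u32);

impl BoxableThing for Widget {
    fn describe(&self) -> String {
        format!("widget #{}", self.0)
    }
}

fn main() {
    let original: Box<dyn BoxableThing> = Box::new(Widget(7));
    let copy = original.clone(); // works thanks to clone_trait_object!
    assert_eq!(original.describe(), copy.describe());
}
```

This is the same shape used below: `dyn_clone::clone_trait_object!(BoxableTokenizer)` replaces the old manual `impl Clone for TextAnalyzer`.
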
@@ -1,5 +1,7 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
use tantivy::tokenizer::{
    BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

const ALICE_TXT: &str = include_str!("alice.txt");

@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
            assert_eq!(word_count, 30_731);
        })
    });
    let token_filters = vec![
        BoxTokenFilter::from(RemoveLongFilter::limit(40)),
        BoxTokenFilter::from(LowerCaser),
    ];
    let mut dynamic_analyzer = TextAnalyzer::new(SimpleTokenizer::default(), token_filters);
    c.bench_function("default-dynamic-tokenize-alice", |b| {
        b.iter(|| {
            let mut word_count = 0;
            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
            while token_stream.advance() {
                word_count += 1;
            }
            assert_eq!(word_count, 30_731);
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_group! {
    name = benches;
    config = Criterion::default().sample_size(200);
    targets = criterion_benchmark
}
criterion_main!(benches);

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
    // This tokenizer lowers all of the text (to help with stop word matching)
    // then removes all instances of `the` and `and` from the corpus
    let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .filter(LowerCaser::default())
        .filter(StopWordFilter::remove(vec![
            "the".to_string(),
            "and".to_string(),

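The hunk above is cut off inside the `StopWordFilter::remove(...)` call. For orientation, a complete chain under this branch's API would look roughly like the following sketch (the sample text and the printing loop are illustrative additions, not part of the example file):

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer};

fn main() {
    // Lowercase first so stop-word matching is case-insensitive,
    // then drop "the" and "and" from the token stream.
    let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser::default())
        .filter(StopWordFilter::remove(vec![
            "the".to_string(),
            "and".to_string(),
        ]))
        .build();

    let mut stream = tokenizer.token_stream("The fox and the hound");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}
```
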
@@ -1209,7 +1209,7 @@ mod tests {
    ff_tokenizer_manager.register(
        "custom_lowercase",
        TextAnalyzer::builder(RawTokenizer::default())
            .filter(LowerCaser)
            .filter(LowerCaser::default())
            .build(),
    );

@@ -209,7 +209,7 @@ impl SegmentWriter {
        for value in values {
            let mut token_stream = match value {
                Value::PreTokStr(tok_str) => {
                    PreTokenizedStream::from(tok_str.clone()).into()
                    Box::new(PreTokenizedStream::from(tok_str.clone()))
                }
                Value::Str(ref text) => {
                    let text_analyzer =

@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{
    BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::{DocAddress, Result, Searcher, TantivyError};

#[derive(Debug, PartialEq)]

@@ -206,8 +204,7 @@ impl MoreLikeThis {
        for value in values {
            match value {
                Value::PreTokStr(tok_str) => {
                    let mut token_stream: BoxTokenStream =
                        PreTokenizedStream::from(tok_str.clone()).into();
                    let mut token_stream = PreTokenizedStream::from(tok_str.clone());
                    token_stream.process(&mut |token| {
                        if !self.is_noise_word(token.text.clone()) {
                            let term = Term::from_field_text(field, &token.text);

@@ -960,7 +960,8 @@ mod test {
    tokenizer_manager.register(
        "en_with_stop_words",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .filter(LowerCaser::default())
            .filter(LowerCaser::default())
            .filter(StopWordFilter::remove(vec!["the".to_string()]))
            .build(),
    );

@@ -21,7 +21,7 @@
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.

@@ -39,23 +39,10 @@ impl<T> AlphaNumOnlyFilterStream<T> {
}

impl TokenFilter for AlphaNumOnlyFilter {
    type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
    type OutputTokenStream<'a, T: TokenStream> = AlphaNumOnlyFilterStream<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper<T> {
        AlphaNumOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct AlphaNumOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
    type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        AlphaNumOnlyFilterStream {
            tail: self.0.token_stream(text),
        }
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        AlphaNumOnlyFilterStream { tail: token_stream }
    }
}

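As with the other filters in this diff, `AlphaNumOnlyFilter` loses its dedicated wrapper tokenizer: it now only declares an `OutputTokenStream` type and a `filter` method, and the generic `FilteredTokenizer` (defined in the tokenizer-api hunk near the end of this diff) does the wrapping. Usage stays the same; a sketch mirroring the module's own doc example and the builder test later in this diff (sample text and assertions are illustrative):

```rust
use tantivy::tokenizer::{AlphaNumOnlyFilter, TextAnalyzer, WhitespaceTokenizer};

fn main() {
    // Tokens that contain any non-(ASCII-)alphanumeric character are dropped whole.
    let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
        .filter(AlphaNumOnlyFilter)
        .build();

    let mut stream = analyzer.token_stream("hello there 💣");
    let mut kept = Vec::new();
    while stream.advance() {
        kept.push(stream.token().text.clone());
    }
    // The emoji token is removed; the two plain words survive.
    assert_eq!(kept, vec!["hello".to_string(), "there".to_string()]);
}
```
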
@@ -1,38 +1,21 @@
use std::mem;

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
#[derive(Clone)]
pub struct AsciiFoldingFilter;
#[derive(Clone, Default)]
pub struct AsciiFoldingFilter(String);

impl TokenFilter for AsciiFoldingFilter {
    type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
    type OutputTokenStream<'a, T: TokenStream> = AsciiFoldingFilterTokenStream<'a, T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
        AsciiFoldingFilterWrapper {
            tokenizer,
            buffer: String::new(),
        }
    }
}

#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T> {
    tokenizer: T,
    buffer: String,
}

impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
    type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        self.buffer.clear();
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        self.0.clear();
        AsciiFoldingFilterTokenStream {
            buffer: &mut self.buffer,
            tail: self.tokenizer.token_stream(text),
            buffer: &mut self.0,
            tail: token_stream,
        }
    }
}

@@ -1581,7 +1564,7 @@ mod tests {
    fn folding_helper(text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(AsciiFoldingFilter)
            .filter(AsciiFoldingFilter::default())
            .build()
            .token_stream(text)
            .process(&mut |token| {

@@ -1592,7 +1575,7 @@ mod tests {

    fn folding_using_raw_tokenizer_helper(text: &str) -> String {
        let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
            .filter(AsciiFoldingFilter)
            .filter(AsciiFoldingFilter::default())
            .build();
        let mut token_stream = tokenizer.token_stream(text);
        token_stream.advance();

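A side effect of the new `filter(&'a mut self, ...)` signature is visible here: the scratch `String` that used to live in each wrapper tokenizer (`AsciiFoldingFilterWrapper`, `LowerCaserFilter`) moves into the filter itself (`AsciiFoldingFilter(String)`, and `LowerCaser(String)` in the next file), and the returned token stream borrows it for `'a`. A stripped-down sketch of that buffer-reuse pattern, independent of tantivy's types (all names here are illustrative):

```rust
// A filter that owns a reusable scratch buffer; each call to `apply`
// clears the buffer and hands out a view that borrows it mutably.
#[derive(Clone, Default)]
struct FoldingFilter(String);

struct FoldingView<'a> {
    buffer: &'a mut String,
}

impl FoldingFilter {
    // The output borrows `self`, so the buffer is reused across calls
    // without a per-call allocation.
    fn apply<'a>(&'a mut self, input: &str) -> FoldingView<'a> {
        self.0.clear();
        for c in input.chars() {
            // Toy stand-in transformation (ASCII lowercasing); the real filter
            // maps non-ASCII characters to ASCII equivalents.
            self.0.push(c.to_ascii_lowercase());
        }
        FoldingView { buffer: &mut self.0 }
    }
}

fn main() {
    let mut filter = FoldingFilter::default();
    let view = filter.apply("HeLLo");
    assert_eq!(view.buffer.as_str(), "hello");
}
```
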
@@ -1,36 +1,19 @@
use std::mem;

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// Token filter that lowercase terms.
#[derive(Clone)]
pub struct LowerCaser;
#[derive(Clone, Default)]
pub struct LowerCaser(String);

impl TokenFilter for LowerCaser {
    type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
    type OutputTokenStream<'a, T: TokenStream> = LowerCaserTokenStream<'a, T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
        LowerCaserFilter {
            tokenizer,
            buffer: String::new(),
        }
    }
}

#[derive(Clone)]
pub struct LowerCaserFilter<T> {
    tokenizer: T,
    buffer: String,
}

impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
    type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        self.buffer.clear();
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        self.0.clear();
        LowerCaserTokenStream {
            tail: self.tokenizer.token_stream(text),
            buffer: &mut self.buffer,
            tail: token_stream,
            buffer: &mut self.0,
        }
    }
}

@@ -94,7 +77,7 @@ mod tests {

    fn token_stream_helper(text: &str) -> Vec<Token> {
        let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .filter(LowerCaser::default())
            .build();

        let mut token_stream = token_stream.token_stream(text);

@@ -68,7 +68,7 @@
//!
//! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
//!   .filter(RemoveLongFilter::limit(40))
//!   .filter(LowerCaser)
//!   .filter(LowerCaser::default())
//!   .filter(Stemmer::new(Language::English))
//!   .build();
//! ```

@@ -115,7 +115,7 @@
//! // We need to register our tokenizer :
//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
//!   .filter(RemoveLongFilter::limit(40))
//!   .filter(LowerCaser)
//!   .filter(LowerCaser::default())
//!   .build();
//! index
//!   .tokenizers()

@@ -139,7 +139,7 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;

pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};

pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;

@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;

@@ -233,7 +233,7 @@ pub mod tests {
        "el_stem",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(LowerCaser::default())
            .filter(Stemmer::new(Language::Greek))
            .build(),
    );

@@ -12,7 +12,7 @@
//! assert_eq!(stream.next().unwrap().text, "nice");
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).

@@ -38,29 +38,12 @@ impl<T> RemoveLongFilterStream<T> {
}

impl TokenFilter for RemoveLongFilter {
    type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
    type OutputTokenStream<'a, T: TokenStream> = RemoveLongFilterStream<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> RemoveLongFilterWrapper<T> {
        RemoveLongFilterWrapper {
            length_limit: self.length_limit,
            inner: tokenizer,
        }
    }
}

#[derive(Clone)]
pub struct RemoveLongFilterWrapper<T: Tokenizer> {
    length_limit: usize,
    inner: T,
}

impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
    type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        RemoveLongFilterStream {
            token_length_limit: self.length_limit,
            tail: self.inner.token_stream(text),
            tail: token_stream,
        }
    }
}

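For reference, `RemoveLongFilter::limit(n)` drops tokens whose UTF-8 byte length reaches `n`; the builder test later in this diff drops the six-byte token "bullet" with `limit(6)`. A short usage sketch against this branch's API (the sample text and assertion are illustrative):

```rust
use tantivy::tokenizer::{RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

fn main() {
    // Keep only tokens shorter than 6 bytes.
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(6))
        .build();

    let mut stream = analyzer.token_stream("short extraordinarily word");
    let mut kept = Vec::new();
    while stream.advance() {
        kept.push(stream.token().text.clone());
    }
    assert_eq!(kept, vec!["short".to_string(), "word".to_string()]);
}
```
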
@@ -1,6 +1,6 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.

@@ -80,29 +80,12 @@ impl SplitCompoundWords {
}

impl TokenFilter for SplitCompoundWords {
    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T>;
    type OutputTokenStream<'a, T: TokenStream> = SplitCompoundWordsTokenStream<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T> {
        SplitCompoundWordsFilter {
            dict: self.dict,
            inner: tokenizer,
        }
    }
}

#[derive(Clone)]
pub struct SplitCompoundWordsFilter<T> {
    dict: AhoCorasick,
    inner: T,
}

impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        SplitCompoundWordsTokenStream {
            dict: self.dict.clone(),
            tail: self.inner.token_stream(text),
            tail: token_stream,
            cuts: Vec::new(),
            parts: Vec::new(),
        }

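`SplitCompoundWords` keeps its dictionary-driven behaviour; only the trait plumbing changes. A usage sketch, assuming the pre-existing `from_dictionary` constructor (not shown in this diff) and an illustrative two-entry dictionary:

```rust
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};

fn main() -> tantivy::Result<()> {
    // Split compounds only when the whole token is covered by dictionary entries.
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(SplitCompoundWords::from_dictionary(["dampf", "schiff"])?)
        .build();

    let mut stream = analyzer.token_stream("dampfschiff");
    let mut parts = Vec::new();
    while stream.advance() {
        parts.push(stream.token().text.clone());
    }
    // Assumes both parts are in the dictionary, so the compound is split.
    assert_eq!(parts, vec!["dampf".to_string(), "schiff".to_string()]);
    Ok(())
}
```
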
@@ -4,7 +4,7 @@ use std::mem;
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};

use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// Available stemmer languages.
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]

@@ -81,29 +81,12 @@ impl Default for Stemmer {
}

impl TokenFilter for Stemmer {
    type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
    type OutputTokenStream<'a, T: TokenStream> = StemmerTokenStream<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
        StemmerFilter {
            stemmer_algorithm: self.stemmer_algorithm,
            inner: tokenizer,
        }
    }
}

#[derive(Clone)]
pub struct StemmerFilter<T> {
    stemmer_algorithm: Algorithm,
    inner: T,
}

impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
    type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
        StemmerTokenStream {
            tail: self.inner.token_stream(text),
            tail: token_stream,
            stemmer,
            buffer: String::new(),
        }

@@ -21,7 +21,7 @@ use rustc_hash::FxHashSet;

#[cfg(feature = "stopwords")]
use super::Language;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
use super::{Token, TokenFilter, TokenStream};

/// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)]

@@ -72,29 +72,12 @@ impl StopWordFilter {
}

impl TokenFilter for StopWordFilter {
    type Tokenizer<T: Tokenizer> = StopWordFilterWrapper<T>;
    type OutputTokenStream<'a, T: TokenStream> = StopWordFilterStream<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> StopWordFilterWrapper<T> {
        StopWordFilterWrapper {
            words: self.words,
            inner: tokenizer,
        }
    }
}

#[derive(Clone)]
pub struct StopWordFilterWrapper<T> {
    words: Arc<FxHashSet<String>>,
    inner: T,
}

impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
    type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        StopWordFilterStream {
            words: self.words.clone(),
            tail: self.inner.token_stream(text),
            tail: token_stream,
        }
    }
}

@@ -1,37 +1,103 @@
use dyn_clone::DynClone;
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use tokenizer_api::{FilteredTokenizer, TokenFilter, TokenStream, Tokenizer};

use crate::tokenizer::empty_tokenizer::EmptyTokenizer;

/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
#[derive(Clone)]
pub struct TextAnalyzer {
    tokenizer: Box<dyn BoxableTokenizer>,
    token_filters: Vec<BoxTokenFilter>,
}

/// A boxable `Tokenizer`, with its `TokenStream` type erased.
trait BoxableTokenizer: 'static + Send + Sync {
trait BoxableTokenizer: 'static + Send + Sync + DynClone {
    /// Creates a boxed token stream for a given `str`.
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
    /// Clone this tokenizer.
    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
}

impl<T: Tokenizer> BoxableTokenizer for T {
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        self.token_stream(text).into()
    }
    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
        Box::new(self.clone())
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        Box::new(self.token_stream(text))
    }
}

impl Clone for TextAnalyzer {
    fn clone(&self) -> Self {
dyn_clone::clone_trait_object!(BoxableTokenizer);

/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
trait BoxableTokenFilter: 'static + Send + Sync + DynClone {
    /// Transforms a boxed token stream into a new one.
    fn box_filter<'a>(
        &'a mut self,
        token_stream: Box<dyn TokenStream + 'a>,
    ) -> Box<dyn TokenStream + 'a>;
}

impl<T: TokenFilter> BoxableTokenFilter for T {
    fn box_filter<'a>(
        &'a mut self,
        token_stream: Box<dyn TokenStream + 'a>,
    ) -> Box<dyn TokenStream + 'a> {
        Box::new(self.filter(token_stream))
    }
}

dyn_clone::clone_trait_object!(BoxableTokenFilter);

/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See [`TokenFilter`] for more information.
#[derive(Clone)]
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);

impl<T: TokenFilter> From<T> for BoxTokenFilter {
    fn from(tokenizer: T) -> BoxTokenFilter {
        BoxTokenFilter(Box::new(tokenizer))
    }
}

impl TextAnalyzer {
    /// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
    ///
    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
    /// will be more performant and create less boxes.
    ///
    /// # Example
    ///
    /// ```rust
    /// use tantivy::tokenizer::*;
    ///
    /// let en_stem = TextAnalyzer::new(
    ///     SimpleTokenizer::default(),
    ///     vec![
    ///         BoxTokenFilter::from(RemoveLongFilter::limit(40)),
    ///         BoxTokenFilter::from(LowerCaser),
    ///         BoxTokenFilter::from(Stemmer::default()),
    ///     ]);
    /// ```
    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
        TextAnalyzer {
            tokenizer: self.tokenizer.box_clone(),
            tokenizer: Box::new(tokenizer),
            token_filters,
        }
    }

    /// Create a new TextAnalyzerBuilder.
    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
        TextAnalyzerBuilder { tokenizer }
    }

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        let mut token_stream = self.tokenizer.box_token_stream(text);
        for token_filter in self.token_filters.iter_mut() {
            token_stream = token_filter.0.box_filter(token_stream);
        }
        token_stream
    }
}

impl Default for TextAnalyzer {

@@ -46,20 +112,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
    }
}

impl TextAnalyzer {
    /// Create a new TextAnalyzerBuilder
    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
        TextAnalyzerBuilder { tokenizer }
    }

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        self.tokenizer.box_token_stream(text)
    }
}

/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T> {
pub struct TextAnalyzerBuilder<T: Tokenizer> {
    tokenizer: T,
}

@@ -77,7 +131,10 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
    ///   .filter(Stemmer::default())
    ///   .build();
    /// ```
    pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
    pub fn filter<F: TokenFilter>(
        self,
        token_filter: F,
    ) -> TextAnalyzerBuilder<FilteredTokenizer<T, F>> {
        TextAnalyzerBuilder {
            tokenizer: token_filter.transform(self.tokenizer),
        }

@@ -87,6 +144,41 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
    pub fn build(self) -> TextAnalyzer {
        TextAnalyzer {
            tokenizer: Box::new(self.tokenizer),
            token_filters: Vec::new(),
        }
    }
}

#[cfg(test)]
mod tests {

    use super::*;
    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};

    #[test]
    fn test_text_analyzer_builder() {
        let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
            .filter(AlphaNumOnlyFilter)
            .filter(RemoveLongFilter::limit(6))
            .filter(LowerCaser::default())
            .build();
        let mut stream = analyzer.token_stream("- first bullet point");
        assert_eq!(stream.next().unwrap().text, "first");
        assert_eq!(stream.next().unwrap().text, "point");
    }

    #[test]
    fn test_text_analyzer_with_filters_boxed() {
        let mut analyzer = TextAnalyzer::new(
            WhitespaceTokenizer::default(),
            vec![
                BoxTokenFilter::from(AlphaNumOnlyFilter),
                BoxTokenFilter::from(LowerCaser::default()),
                BoxTokenFilter::from(RemoveLongFilter::limit(6)),
            ],
        );
        let mut stream = analyzer.token_stream("- first bullet point");
        assert_eq!(stream.next().unwrap().text, "first");
        assert_eq!(stream.next().unwrap().text, "point");
    }
}

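The two tests above exercise the same pipeline built statically (the builder) and dynamically (`TextAnalyzer::new` with a `Vec<BoxTokenFilter>`). The boxed form exists for the case where the filter chain is only known at runtime; a sketch of that use case (the `AnalyzerConfig` type and `build_analyzer` helper are hypothetical, not from the diff):

```rust
use tantivy::tokenizer::{
    BoxTokenFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer,
};

// Hypothetical runtime configuration: which filters to enable is decided at
// runtime, which is exactly what the boxed constructor is for.
struct AnalyzerConfig {
    lowercase: bool,
    max_token_len: Option<usize>,
}

fn build_analyzer(config: &AnalyzerConfig) -> TextAnalyzer {
    let mut filters: Vec<BoxTokenFilter> = Vec::new();
    if let Some(limit) = config.max_token_len {
        filters.push(BoxTokenFilter::from(RemoveLongFilter::limit(limit)));
    }
    if config.lowercase {
        filters.push(BoxTokenFilter::from(LowerCaser::default()));
    }
    TextAnalyzer::new(SimpleTokenizer::default(), filters)
}

fn main() {
    let config = AnalyzerConfig { lowercase: true, max_token_len: Some(40) };
    let mut analyzer = build_analyzer(&config);
    let mut stream = analyzer.token_stream("Boxed Filters Compose At Runtime");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}
```
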
@@ -63,14 +63,14 @@ impl Default for TokenizerManager {
        "default",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(LowerCaser::default())
            .build(),
    );
    manager.register(
        "en_stem",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(LowerCaser::default())
            .filter(Stemmer::new(Language::English))
            .build(),
    );

@@ -6,7 +6,6 @@
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.

use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};

use serde::{Deserialize, Serialize};

@@ -60,30 +59,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}

/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);

impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
    fn from(token_stream: T) -> BoxTokenStream<'a> {
        BoxTokenStream(Box::new(token_stream))
    }
}

impl<'a> Deref for BoxTokenStream<'a> {
    type Target = dyn TokenStream + 'a;

    fn deref(&self) -> &Self::Target {
        &*self.0
    }
}
impl<'a> DerefMut for BoxTokenStream<'a> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut *self.0
    }
}

impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
    fn advance(&mut self) -> bool {
        let token_stream: &mut dyn TokenStream = self.borrow_mut();

@@ -137,12 +112,34 @@ pub trait TokenStream {
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync {
pub trait TokenFilter: 'static + Send + Sync + Clone {
    /// The Tokenizer type returned by this filter, typically parametrized by the underlying
    /// Tokenizer.
    type Tokenizer<T: Tokenizer>: Tokenizer;
    type OutputTokenStream<'a, T: TokenStream>: TokenStream;
    /// Filter a token stream and returns a new one.
    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T>;
    /// Wraps a Tokenizer and returns a new one.
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
    fn transform<T: Tokenizer>(self, tokenizer: T) -> FilteredTokenizer<T, Self> {
        FilteredTokenizer {
            tokenizer,
            token_filter: self,
        }
    }
}

#[derive(Clone)]
pub struct FilteredTokenizer<T: Tokenizer, F: TokenFilter> {
    tokenizer: T,
    token_filter: F,
}

impl<T: Tokenizer, F: TokenFilter> Tokenizer for FilteredTokenizer<T, F> {
    type TokenStream<'a> = F::OutputTokenStream<'a, T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        let token_stream = self.tokenizer.token_stream(text);
        self.token_filter.filter(token_stream)
    }
}

#[cfg(test)]

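Implementing a filter against the reworked `TokenFilter` trait now means providing an `OutputTokenStream` associated type and a `filter` method; the provided `transform` default wraps any tokenizer in `FilteredTokenizer` for the builder. A sketch of a custom filter written against this proposed API (`ShoutFilter` is hypothetical, not part of the branch):

```rust
use tantivy::tokenizer::{Token, TokenFilter, TokenStream};

// A hypothetical filter that upper-cases every token: no wrapper Tokenizer,
// just an output token-stream type plus `filter`, mirroring the branch's
// AlphaNumOnlyFilter implementation.
#[derive(Clone)]
pub struct ShoutFilter;

pub struct ShoutTokenStream<T>(T);

impl TokenFilter for ShoutFilter {
    type OutputTokenStream<'a, T: TokenStream> = ShoutTokenStream<T>;

    fn filter<'a, T: TokenStream>(&'a mut self, token_stream: T) -> Self::OutputTokenStream<'a, T> {
        ShoutTokenStream(token_stream)
    }
}

impl<T: TokenStream> TokenStream for ShoutTokenStream<T> {
    fn advance(&mut self) -> bool {
        if !self.0.advance() {
            return false;
        }
        // Rewrite the wrapped token's text in place.
        let upper = self.0.token().text.to_uppercase();
        self.0.token_mut().text = upper;
        true
    }

    fn token(&self) -> &Token {
        self.0.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.0.token_mut()
    }
}
```

With the default `transform`, `TextAnalyzer::builder(SimpleTokenizer::default()).filter(ShoutFilter).build()` would then compose a `FilteredTokenizer<SimpleTokenizer, ShoutFilter>` under the hood.
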