refactor tokenization pipeline to use GATs (#1924)

* refactor tokenization pipeline to use GATs

* fix doctests

* fix clippy lints

* remove commented code
Author: trinity-1686a
Date: 2023-03-09 09:39:37 +01:00
Committed by: GitHub
Parent: a42a96f470
Commit: 064518156f
23 changed files with 353 additions and 239 deletions
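
The user-facing effect of this refactor shows up at every call site below: a TextAnalyzer is no longer assembled by chaining boxed filters onto `TextAnalyzer::from(...)`, but through a typed builder that is finalized with `.build()`. A minimal sketch of the new call pattern, using only names that appear in the diffs below:

use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

// Each `.filter()` call wraps the previous tokenizer in the filter's wrapper
// type, so the whole chain stays statically typed until `.build()` erases it.
let analyzer = TextAnalyzer::builder(SimpleTokenizer)
    .filter(RemoveLongFilter::limit(40))
    .filter(LowerCaser)
    .build();

let mut stream = analyzer.token_stream("Hello, Happy Tax Payer!");
while let Some(token) = stream.next() {
    println!("{}", token.text);
}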


@@ -12,7 +12,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;


@@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> {
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
]))
.build();
index.tokenizers().register("stoppy", tokenizer);


@@ -188,7 +188,7 @@ impl SegmentWriter {
let mut indexing_position = IndexingPosition::default();
postings_writer.index_text(
doc_id,
&mut *facet_tokenizer,
&mut facet_tokenizer,
term_buffer,
ctx,
&mut indexing_position,


@@ -4,7 +4,9 @@ use std::collections::{BinaryHeap, HashMap};
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer};
use crate::tokenizer::{
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::{DocAddress, Result, Searcher, TantivyError};
#[derive(Debug, PartialEq)]


@@ -913,9 +913,10 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::from(SimpleTokenizer)
TextAnalyzer::builder(SimpleTokenizer)
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()])),
.filter(StopWordFilter::remove(vec!["the".to_string()]))
.build(),
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}


@@ -2,16 +2,18 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(RawTokenizer)
//! .filter(AlphaNumOnlyFilter);
//! let tokenizer = TextAnalyzer::builder(RawTokenizer)
//! .filter(AlphaNumOnlyFilter)
//! .build();
//!
//! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw filter emits one token that
//! // contains a space
//! assert!(stream.next().is_none());
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter);
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter)
//! .build();
//!
//! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some());
@@ -19,30 +21,45 @@
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.
#[derive(Clone)]
pub struct AlphaNumOnlyFilter;
pub struct AlphaNumOnlyFilterStream<'a> {
tail: BoxTokenStream<'a>,
pub struct AlphaNumOnlyFilterStream<T> {
tail: T,
}
impl<'a> AlphaNumOnlyFilterStream<'a> {
impl<T> AlphaNumOnlyFilterStream<T> {
fn predicate(&self, token: &Token) -> bool {
token.text.chars().all(|c| c.is_ascii_alphanumeric())
}
}
impl TokenFilter for AlphaNumOnlyFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper<T> {
AlphaNumOnlyFilterWrapper(tokenizer)
}
}
impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
#[derive(Clone)]
pub struct AlphaNumOnlyFilterWrapper<T>(T);
impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
AlphaNumOnlyFilterStream {
tail: self.0.token_stream(text),
}
}
}
impl<T: TokenStream> TokenStream for AlphaNumOnlyFilterStream<T> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
@@ -79,7 +96,9 @@ mod tests {
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter);
let a = TextAnalyzer::builder(SimpleTokenizer)
.filter(AlphaNumOnlyFilter)
.build();
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
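
For third-party filters, the hunk above is the template to follow: the old `transform(&self, BoxTokenStream) -> BoxTokenStream` hook becomes a consuming `transform` that returns a wrapper Tokenizer, and the wrapper names its stream type through a GAT. A minimal sketch of a custom filter written against this shape (the `CapitalizeFilter` names are hypothetical; only the traits and method signatures come from this diff):

use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

/// Hypothetical filter that uppercases the first character of every token.
#[derive(Clone)]
pub struct CapitalizeFilter;

impl TokenFilter for CapitalizeFilter {
    type Tokenizer<T: Tokenizer> = CapitalizeFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
        CapitalizeFilterWrapper(tokenizer)
    }
}

/// The wrapper is itself a Tokenizer, so further filters can stack on top of it.
#[derive(Clone)]
pub struct CapitalizeFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CapitalizeFilterWrapper<T> {
    type TokenStream<'a> = CapitalizeFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
        CapitalizeFilterStream {
            tail: self.0.token_stream(text),
        }
    }
}

pub struct CapitalizeFilterStream<T> {
    tail: T,
}

impl<T: TokenStream> TokenStream for CapitalizeFilterStream<T> {
    fn advance(&mut self) -> bool {
        if !self.tail.advance() {
            return false;
        }
        // Rewrite the token text in place: uppercase its first character.
        let text = &mut self.tail.token_mut().text;
        if let Some(first) = text.chars().next() {
            let capitalized: String =
                first.to_uppercase().chain(text.chars().skip(1)).collect();
            *text = capitalized;
        }
        true
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

With that in place, `TextAnalyzer::builder(SimpleTokenizer).filter(CapitalizeFilter).build()` composes it exactly like the built-in filters.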


@@ -1,6 +1,6 @@
use std::mem;
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
@@ -9,20 +9,33 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
pub struct AsciiFoldingFilter;
impl TokenFilter for AsciiFoldingFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
From::from(AsciiFoldingFilterTokenStream {
tail: token_stream,
buffer: String::with_capacity(100),
})
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
AsciiFoldingFilterWrapper(tokenizer)
}
}
pub struct AsciiFoldingFilterTokenStream<'a> {
buffer: String,
tail: BoxTokenStream<'a>,
#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T>(T);
impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
type TokenStream<'a> = AsciiFoldingFilterTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
AsciiFoldingFilterTokenStream {
buffer: String::with_capacity(100),
tail: self.0.token_stream(text),
}
}
}
impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
pub struct AsciiFoldingFilterTokenStream<T> {
buffer: String,
tail: T,
}
impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
@@ -1560,8 +1573,9 @@ mod tests {
fn folding_helper(text: &str) -> Vec<String> {
let mut tokens = Vec::new();
TextAnalyzer::from(SimpleTokenizer)
TextAnalyzer::builder(SimpleTokenizer)
.filter(AsciiFoldingFilter)
.build()
.token_stream(text)
.process(&mut |token| {
tokens.push(token.text.clone());
@@ -1570,8 +1584,9 @@ mod tests {
}
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
let mut token_stream = TextAnalyzer::from(RawTokenizer)
let mut token_stream = TextAnalyzer::builder(RawTokenizer)
.filter(AsciiFoldingFilter)
.build()
.token_stream(text);
token_stream.advance();
token_stream.token().text.clone()


@@ -1,16 +1,17 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};
use crate::tokenizer::{Token, TokenStream, Tokenizer};
#[derive(Clone)]
pub(crate) struct EmptyTokenizer;
impl Tokenizer for EmptyTokenizer {
fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> {
EmptyTokenStream::default().into()
type TokenStream<'a> = EmptyTokenStream;
fn token_stream(&self, _text: &str) -> EmptyTokenStream {
EmptyTokenStream::default()
}
}
#[derive(Default)]
struct EmptyTokenStream {
pub struct EmptyTokenStream {
token: Token,
}
@@ -30,7 +31,7 @@ impl TokenStream for EmptyTokenStream {
#[cfg(test)]
mod tests {
use crate::tokenizer::Tokenizer;
use crate::tokenizer::{TokenStream, Tokenizer};
#[test]
fn test_empty_tokenizer() {


@@ -1,4 +1,4 @@
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use super::{Token, TokenStream, Tokenizer};
use crate::schema::FACET_SEP_BYTE;
/// The `FacetTokenizer` process a `Facet` binary representation
@@ -26,7 +26,8 @@ pub struct FacetTokenStream<'a> {
}
impl Tokenizer for FacetTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
type TokenStream<'a> = FacetTokenStream<'a>;
fn token_stream<'a>(&self, text: &'a str) -> FacetTokenStream<'a> {
let token = Token {
position: 0,
..Default::default()
@@ -36,7 +37,6 @@ impl Tokenizer for FacetTokenizer {
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
token,
}
.into()
}
}
@@ -87,7 +87,7 @@ mod tests {
use super::FacetTokenizer;
use crate::schema::Facet;
use crate::tokenizer::{Token, Tokenizer};
use crate::tokenizer::{Token, TokenStream, Tokenizer};
#[test]
fn test_facet_tokenizer() {


@@ -1,29 +1,42 @@
use std::mem;
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
impl TokenFilter for LowerCaser {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(LowerCaserTokenStream {
tail: token_stream,
buffer: String::with_capacity(100),
})
}
}
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// Token filter that lowercase terms.
#[derive(Clone)]
pub struct LowerCaser;
pub struct LowerCaserTokenStream<'a> {
impl TokenFilter for LowerCaser {
type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
LowerCaserFilter(tokenizer)
}
}
#[derive(Clone)]
pub struct LowerCaserFilter<T>(T);
impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
type TokenStream<'a> = LowerCaserTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
LowerCaserTokenStream {
tail: self.0.token_stream(text),
buffer: String::new(),
}
}
}
pub struct LowerCaserTokenStream<T> {
buffer: String,
tail: BoxTokenStream<'a>,
tail: T,
}
// writes a lowercased version of text into output.
fn to_lowercase_unicode(text: &str, output: &mut String) {
output.clear();
output.reserve(50);
for c in text.chars() {
// Contrary to the std, we do not take care of the sigma special case.
// This will have a normalization effect, which is ok for search.
@@ -31,7 +44,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
}
}
impl<'a> TokenStream for LowerCaserTokenStream<'a> {
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
@@ -73,8 +86,9 @@ mod tests {
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
let mut token_stream = TextAnalyzer::builder(SimpleTokenizer)
.filter(LowerCaser)
.build()
.token_stream(text);
let mut tokens = vec![];
let mut add_token = |token: &Token| {


@@ -66,10 +66,11 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
//! let en_stem = TextAnalyzer::builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser)
//! .filter(Stemmer::new(Language::English));
//! .filter(Stemmer::new(Language::English))
//! .build();
//! ```
//!
//! Once your tokenizer is defined, you need to
@@ -112,9 +113,10 @@
//! let index = Index::create_in_ram(schema);
//!
//! // We need to register our tokenizer :
//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser);
//! .filter(LowerCaser)
//! .build();
//! index
//! .tokenizers()
//! .register("custom_en", custom_en_tokenizer);
@@ -137,9 +139,7 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub use tokenizer_api::{
BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -237,10 +237,11 @@ pub mod tests {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::from(SimpleTokenizer)
TextAnalyzer::builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::Greek)),
.filter(Stemmer::new(Language::Greek))
.build(),
);
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
let mut tokens: Vec<Token> = vec![];


@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
@@ -132,8 +131,9 @@ pub struct NgramTokenStream<'a> {
}
impl Tokenizer for NgramTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
From::from(NgramTokenStream {
type TokenStream<'a> = NgramTokenStream<'a>;
fn token_stream<'a>(&self, text: &'a str) -> NgramTokenStream<'a> {
NgramTokenStream {
ngram_charidx_iterator: StutteringIterator::new(
CodepointFrontiers::for_str(text),
self.min_gram,
@@ -142,7 +142,7 @@ impl Tokenizer for NgramTokenizer {
prefix_only: self.prefix_only,
text,
token: Token::default(),
})
}
}
}
@@ -303,9 +303,9 @@ mod tests {
use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::{BoxTokenStream, Token, Tokenizer};
use crate::tokenizer::{Token, TokenStream, Tokenizer};
fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![];
tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
tokens


@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;
/// For each value of the field, emit a single unprocessed token.
#[derive(Clone)]
@@ -11,7 +10,8 @@ pub struct RawTokenStream {
}
impl Tokenizer for RawTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
type TokenStream<'a> = RawTokenStream;
fn token_stream(&self, text: &str) -> RawTokenStream {
let token = Token {
offset_from: 0,
offset_to: text.len(),
@@ -23,7 +23,6 @@ impl Tokenizer for RawTokenizer {
token,
has_token: true,
}
.into()
}
}


@@ -1,6 +1,6 @@
use regex::Regex;
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use super::{Token, TokenStream, Tokenizer};
use crate::TantivyError;
/// Tokenize the text by using a regex pattern to split.
@@ -60,13 +60,14 @@ impl RegexTokenizer {
}
impl Tokenizer for RegexTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::from(RegexTokenStream {
type TokenStream<'a> = RegexTokenStream<'a>;
fn token_stream<'a>(&self, text: &'a str) -> RegexTokenStream<'a> {
RegexTokenStream {
regex: self.regex.clone(),
text,
token: Token::default(),
cursor: 0,
})
}
}
}


@@ -2,8 +2,9 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5));
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5))
//! .build();
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered
@@ -11,8 +12,7 @@
//! assert_eq!(stream.next().unwrap().text, "nice");
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
@@ -31,27 +31,46 @@ impl RemoveLongFilter {
}
}
impl<'a> RemoveLongFilterStream<'a> {
impl<T> RemoveLongFilterStream<T> {
fn predicate(&self, token: &Token) -> bool {
token.text.len() < self.token_length_limit
}
}
impl TokenFilter for RemoveLongFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(RemoveLongFilterStream {
token_length_limit: self.length_limit,
tail: token_stream,
})
type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> RemoveLongFilterWrapper<T> {
RemoveLongFilterWrapper {
length_limit: self.length_limit,
inner: tokenizer,
}
}
}
pub struct RemoveLongFilterStream<'a> {
token_length_limit: usize,
tail: BoxTokenStream<'a>,
#[derive(Clone)]
pub struct RemoveLongFilterWrapper<T: Tokenizer> {
length_limit: usize,
inner: T,
}
impl<'a> TokenStream for RemoveLongFilterStream<'a> {
impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
RemoveLongFilterStream {
token_length_limit: self.length_limit,
tail: self.inner.token_stream(text),
}
}
}
pub struct RemoveLongFilterStream<T> {
token_length_limit: usize,
tail: T,
}
impl<T: TokenStream> TokenStream for RemoveLongFilterStream<T> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
@@ -84,7 +103,9 @@ mod tests {
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(6));
let a = TextAnalyzer::builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(6))
.build();
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {


@@ -1,6 +1,6 @@
use std::str::CharIndices;
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use super::{Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting on whitespaces and punctuation.
#[derive(Clone)]
@@ -13,12 +13,13 @@ pub struct SimpleTokenStream<'a> {
}
impl Tokenizer for SimpleTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::from(SimpleTokenStream {
type TokenStream<'a> = SimpleTokenStream<'a>;
fn token_stream<'a>(&self, text: &'a str) -> SimpleTokenStream<'a> {
SimpleTokenStream {
text,
chars: text.char_indices(),
token: Token::default(),
})
}
}
}


@@ -2,7 +2,7 @@ use std::sync::Arc;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
@@ -23,9 +23,11 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
/// TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
/// TextAnalyzer::builder(SimpleTokenizer)
/// .filter(SplitCompoundWords::from_dictionary([
/// "dampf", "schiff", "fahrt", "brot", "backen", "automat",
/// ]));
/// ]))
/// .build();
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
@@ -76,24 +78,45 @@ impl<S: StateID> SplitCompoundWords<S> {
}
impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: stream,
cuts: Vec::new(),
parts: Vec::new(),
})
type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T, S>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T, S> {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
}
}
}
struct SplitCompoundWordsTokenStream<'a, S: StateID> {
#[derive(Clone)]
pub struct SplitCompoundWordsFilter<T, S: StateID> {
dict: Arc<AhoCorasick<S>>,
tail: BoxTokenStream<'a>,
inner: T,
}
impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
for SplitCompoundWordsFilter<T, S>
{
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>, S>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
pub struct SplitCompoundWordsTokenStream<T, S: StateID> {
dict: Arc<AhoCorasick<S>>,
tail: T,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
@@ -129,7 +152,7 @@ impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
}
}
impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
impl<T: TokenStream, S: StateID> TokenStream for SplitCompoundWordsTokenStream<T, S> {
fn advance(&mut self) -> bool {
self.parts.pop();
@@ -165,8 +188,9 @@ mod tests {
#[test]
fn splitting_compound_words_works() {
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]))
.build();
{
let mut stream = tokenizer.token_stream("");


@@ -4,8 +4,7 @@ use std::mem;
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// Available stemmer languages.
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -82,23 +81,42 @@ impl Default for Stemmer {
}
impl TokenFilter for Stemmer {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
BoxTokenStream::from(StemmerTokenStream {
tail: token_stream,
stemmer: inner_stemmer,
buffer: String::new(),
})
type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
StemmerFilter {
stemmer_algorithm: self.stemmer_algorithm,
inner: tokenizer,
}
}
}
pub struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>,
#[derive(Clone)]
pub struct StemmerFilter<T> {
stemmer_algorithm: Algorithm,
inner: T,
}
impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
StemmerTokenStream {
tail: self.inner.token_stream(text),
stemmer,
buffer: String::new(),
}
}
}
pub struct StemmerTokenStream<T> {
tail: T,
stemmer: rust_stemmers::Stemmer,
buffer: String,
}
impl<'a> TokenStream for StemmerTokenStream<'a> {
impl<T: TokenStream> TokenStream for StemmerTokenStream<T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;


@@ -2,8 +2,9 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]))
//! .build();
//!
//! let mut stream = tokenizer.token_stream("the fox is crafty");
//! assert_eq!(stream.next().unwrap().text, "fox");
@@ -20,7 +21,7 @@ use rustc_hash::FxHashSet;
#[cfg(feature = "stopwords")]
use super::Language;
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)]
@@ -69,27 +70,46 @@ impl StopWordFilter {
}
}
pub struct StopWordFilterStream<'a> {
words: Arc<FxHashSet<String>>,
tail: BoxTokenStream<'a>,
}
impl TokenFilter for StopWordFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(StopWordFilterStream {
words: self.words.clone(),
tail: token_stream,
})
type Tokenizer<T: Tokenizer> = StopWordFilterWrapper<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> StopWordFilterWrapper<T> {
StopWordFilterWrapper {
words: self.words,
inner: tokenizer,
}
}
}
impl<'a> StopWordFilterStream<'a> {
#[derive(Clone)]
pub struct StopWordFilterWrapper<T> {
words: Arc<FxHashSet<String>>,
inner: T,
}
impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
StopWordFilterStream {
words: self.words.clone(),
tail: self.inner.token_stream(text),
}
}
}
pub struct StopWordFilterStream<T> {
words: Arc<FxHashSet<String>>,
tail: T,
}
impl<T> StopWordFilterStream<T> {
fn predicate(&self, token: &Token) -> bool {
!self.words.contains(&token.text)
}
}
impl<'a> TokenStream for StopWordFilterStream<'a> {
impl<T: TokenStream> TokenStream for StopWordFilterStream<T> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
@@ -131,7 +151,9 @@ mod tests {
"am".to_string(),
"i".to_string(),
];
let a = TextAnalyzer::from(SimpleTokenizer).filter(StopWordFilter::remove(stops));
let a = TextAnalyzer::builder(SimpleTokenizer)
.filter(StopWordFilter::remove(stops))
.build();
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {


@@ -1,15 +1,12 @@
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer};
use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer {
tokenizer: Box<dyn Tokenizer>,
token_filters: Vec<BoxTokenFilter>,
tokenizer: Box<dyn BoxableTokenizer>,
}
impl Default for TextAnalyzer {
@@ -18,52 +15,21 @@ impl Default for TextAnalyzer {
}
}
impl<T: Tokenizer> From<T> for TextAnalyzer {
impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
fn from(tokenizer: T) -> Self {
TextAnalyzer::new(tokenizer, Vec::new())
TextAnalyzer::builder(tokenizer).build()
}
}
impl TextAnalyzer {
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
TextAnalyzer {
tokenizer: Box::new(tokenizer),
token_filters,
}
}
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `TokenStream` and returns a
/// new one.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser)
/// .filter(Stemmer::default());
/// ```
#[must_use]
pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
self.token_filters.push(token_filter.into());
self
/// Create a new TextAnalyzerBuilder
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let mut token_stream = self.tokenizer.token_stream(text);
for token_filter in &self.token_filters {
token_stream = token_filter.transform(token_stream);
}
token_stream
self.tokenizer.box_token_stream(text)
}
}
@@ -71,11 +37,39 @@ impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
token_filters: self
.token_filters
.iter()
.map(|token_filter| token_filter.box_clone())
.collect(),
}
}
}
/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T> {
tokenizer: T,
}
impl<T: Tokenizer> TextAnalyzerBuilder<T> {
/// Appends a token filter to the current builder.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::builder(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser)
/// .filter(Stemmer::default())
/// .build();
/// ```
pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
TextAnalyzerBuilder {
tokenizer: token_filter.transform(self.tokenizer),
}
}
/// Finalize building the TextAnalyzer
pub fn build(self) -> TextAnalyzer {
TextAnalyzer {
tokenizer: Box::new(self.tokenizer),
}
}
}
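
The design choice worth noting in this builder: `filter` consumes the builder and returns a `TextAnalyzerBuilder<F::Tokenizer<T>>`, so the whole filter chain is one statically known, nested type until `build()` boxes it a single time behind the object-safe `BoxableTokenizer`. A rough sketch of how the types grow for the default chain, assuming the wrapper names introduced elsewhere in this diff:

use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

// The builder's type parameter is rewrapped by each filter, roughly:
//   TextAnalyzerBuilder<SimpleTokenizer>
//   -> TextAnalyzerBuilder<RemoveLongFilterWrapper<SimpleTokenizer>>
//   -> TextAnalyzerBuilder<LowerCaserFilter<RemoveLongFilterWrapper<SimpleTokenizer>>>
let builder = TextAnalyzer::builder(SimpleTokenizer)
    .filter(RemoveLongFilter::limit(40))
    .filter(LowerCaser);

// Only `build()` erases the type, so a token stream crosses one `Box<dyn ...>`
// boundary in total rather than one per filter layer as before.
let analyzer: TextAnalyzer = builder.build();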


@@ -61,16 +61,18 @@ impl Default for TokenizerManager {
manager.register("raw", RawTokenizer);
manager.register(
"default",
TextAnalyzer::from(SimpleTokenizer)
TextAnalyzer::builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser),
.filter(LowerCaser)
.build(),
);
manager.register(
"en_stem",
TextAnalyzer::from(SimpleTokenizer)
TextAnalyzer::builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English)),
.filter(Stemmer::new(Language::English))
.build(),
);
manager.register("whitespace", WhitespaceTokenizer);
manager


@@ -1,6 +1,6 @@
use std::str::CharIndices;
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use super::{Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting on whitespaces.
#[derive(Clone)]
@@ -13,12 +13,13 @@ pub struct WhitespaceTokenStream<'a> {
}
impl Tokenizer for WhitespaceTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::from(WhitespaceTokenStream {
type TokenStream<'a> = WhitespaceTokenStream<'a>;
fn token_stream<'a>(&self, text: &'a str) -> WhitespaceTokenStream<'a> {
WhitespaceTokenStream {
text,
chars: text.char_indices(),
token: Token::default(),
})
}
}
}


@@ -42,28 +42,31 @@ impl Default for Token {
/// `Tokenizer` are in charge of splitting text into a stream of token
/// before indexing.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
pub trait Tokenizer: 'static + Clone + Send + Sync {
/// The token stream returned by this Tokenizer.
type TokenStream<'a>: TokenStream;
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
}
pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
pub trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}
impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
self.token_stream(text).into()
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See [`TokenStream`] for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
@@ -139,39 +142,13 @@ pub trait TokenStream {
}
}
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See [`TokenFilter`] for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> BoxTokenFilter {
BoxTokenFilter::from(self.clone())
}
pub trait TokenFilter: 'static + Send + Sync {
/// The Tokenizer type returned by this filter, typically parametrized by the underlying
/// Tokenizer.
type Tokenizer<T: Tokenizer>: Tokenizer;
/// Wraps a Tokenizer and returns a new one.
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}
#[cfg(test)]
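
Finally, for tokenizer implementors the reshaped `Tokenizer` trait above is the heart of the change: the concrete stream type is named through the `TokenStream<'a>` GAT, and boxing becomes opt-in via `BoxableTokenizer`. A minimal sketch of a tokenizer written against the new trait; the comma-splitting tokenizer itself is hypothetical, and the `wrapping_add` trick assumes `Token::default()` starts `position` at `usize::MAX`, as the built-in tokenizers in this diff do:

use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

#[derive(Clone)]
pub struct CommaTokenizer;

pub struct CommaTokenStream<'a> {
    fields: std::str::Split<'a, char>,
    offset: usize,
    token: Token,
}

impl Tokenizer for CommaTokenizer {
    type TokenStream<'a> = CommaTokenStream<'a>;

    fn token_stream<'a>(&self, text: &'a str) -> CommaTokenStream<'a> {
        CommaTokenStream {
            fields: text.split(','),
            offset: 0,
            token: Token::default(),
        }
    }
}

impl<'a> TokenStream for CommaTokenStream<'a> {
    fn advance(&mut self) -> bool {
        match self.fields.next() {
            Some(field) => {
                self.token.text.clear();
                self.token.text.push_str(field);
                self.token.offset_from = self.offset;
                self.token.offset_to = self.offset + field.len();
                // First token gets position 0 if Token::default() starts at usize::MAX.
                self.token.position = self.token.position.wrapping_add(1);
                self.offset += field.len() + 1; // skip the consumed comma
                true
            }
            None => false,
        }
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

Since `SimpleTokenizer` and `WhitespaceTokenizer` in this diff follow the same structure, porting an existing tokenizer is mostly a matter of naming its stream type in the associated type instead of boxing it.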