Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-06 01:02:55 +00:00
refactor tokenization pipeline to use GATs (#1924)
* refactor tokenization pipeline to use GATs
* fix doctests
* fix clippy lints
* remove commented code
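The user-visible change in this commit is that a `TextAnalyzer` is now assembled through a builder instead of `TextAnalyzer::from(...)` followed by boxed `.filter(...)` calls, with each filter statically wrapping the tokenizer through a generic associated type. A minimal sketch of the new call pattern, based on the call sites updated in the diff below (assumes the tantivy revision from this commit, where `SimpleTokenizer` is still a unit struct):

    use tantivy::tokenizer::{
        LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenStream,
    };

    fn main() {
        // Old API (before this commit):
        //     let analyzer = TextAnalyzer::from(SimpleTokenizer)
        //         .filter(RemoveLongFilter::limit(40))
        //         .filter(LowerCaser);
        //
        // New API: the builder wraps SimpleTokenizer with each filter at compile
        // time, and `build()` erases the concrete type into a `TextAnalyzer`.
        let analyzer = TextAnalyzer::builder(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .build();

        // The `TokenStream` trait must now be in scope to drive the returned
        // stream, which is why the example imports below gain `TokenStream`.
        let mut stream = analyzer.token_stream("Hello, World!");
        while stream.advance() {
            println!("{:?}", stream.token());
        }
    }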
@@ -12,7 +12,7 @@
 use tantivy::collector::{Count, TopDocs};
 use tantivy::query::TermQuery;
 use tantivy::schema::*;
-use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
+use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
 use tantivy::{doc, Index, ReloadPolicy};
 use tempfile::TempDir;
 
@@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> {
 
     // This tokenizer lowers all of the text (to help with stop word matching)
    // then removes all instances of `the` and `and` from the corpus
-    let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+    let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
         .filter(LowerCaser)
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),
-        ]));
+        ]))
+        .build();
 
     index.tokenizers().register("stoppy", tokenizer);
 
@@ -188,7 +188,7 @@ impl SegmentWriter {
                 let mut indexing_position = IndexingPosition::default();
                 postings_writer.index_text(
                     doc_id,
-                    &mut *facet_tokenizer,
+                    &mut facet_tokenizer,
                     term_buffer,
                     ctx,
                     &mut indexing_position,
@@ -4,7 +4,9 @@ use std::collections::{BinaryHeap, HashMap};
 use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
-use crate::tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer};
+use crate::tokenizer::{
+    BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
+};
 use crate::{DocAddress, Result, Searcher, TantivyError};
 
 #[derive(Debug, PartialEq)]
 
@@ -913,9 +913,10 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
             "en_with_stop_words",
-            TextAnalyzer::from(SimpleTokenizer)
+            TextAnalyzer::builder(SimpleTokenizer)
                 .filter(LowerCaser)
-                .filter(StopWordFilter::remove(vec!["the".to_string()])),
+                .filter(StopWordFilter::remove(vec!["the".to_string()]))
+                .build(),
         );
         QueryParser::new(schema, default_fields, tokenizer_manager)
     }
 
@@ -2,16 +2,18 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = TextAnalyzer::from(RawTokenizer)
-//!     .filter(AlphaNumOnlyFilter);
+//! let tokenizer = TextAnalyzer::builder(RawTokenizer)
+//!     .filter(AlphaNumOnlyFilter)
+//!     .build();
 //!
 //! let mut stream = tokenizer.token_stream("hello there");
 //! // is none because the raw filter emits one token that
 //! // contains a space
 //! assert!(stream.next().is_none());
 //!
-//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-//!     .filter(AlphaNumOnlyFilter);
+//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
+//!     .filter(AlphaNumOnlyFilter)
+//!     .build();
 //!
 //! let mut stream = tokenizer.token_stream("hello there 💣");
 //! assert!(stream.next().is_some());
@@ -19,30 +21,45 @@
 //! // the "emoji" is dropped because its not an alphanum
 //! assert!(stream.next().is_none());
 //! ```
-use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
 #[derive(Clone)]
 pub struct AlphaNumOnlyFilter;
 
-pub struct AlphaNumOnlyFilterStream<'a> {
-    tail: BoxTokenStream<'a>,
+pub struct AlphaNumOnlyFilterStream<T> {
+    tail: T,
 }
 
-impl<'a> AlphaNumOnlyFilterStream<'a> {
+impl<T> AlphaNumOnlyFilterStream<T> {
     fn predicate(&self, token: &Token) -> bool {
         token.text.chars().all(|c| c.is_ascii_alphanumeric())
     }
 }
 
 impl TokenFilter for AlphaNumOnlyFilter {
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
+    type Tokenizer<T: Tokenizer> = AlphaNumOnlyFilterWrapper<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> AlphaNumOnlyFilterWrapper<T> {
+        AlphaNumOnlyFilterWrapper(tokenizer)
     }
 }
 
-impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilterWrapper<T>(T);
+
+impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
+    type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+        AlphaNumOnlyFilterStream {
+            tail: self.0.token_stream(text),
+        }
+    }
+}
+
+impl<T: TokenStream> TokenStream for AlphaNumOnlyFilterStream<T> {
     fn advance(&mut self) -> bool {
         while self.tail.advance() {
             if self.predicate(self.tail.token()) {
@@ -79,7 +96,9 @@ mod tests {
     }
 
     fn token_stream_helper(text: &str) -> Vec<Token> {
-        let a = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter);
+        let a = TextAnalyzer::builder(SimpleTokenizer)
+            .filter(AlphaNumOnlyFilter)
+            .build();
         let mut token_stream = a.token_stream(text);
         let mut tokens: Vec<Token> = vec![];
         let mut add_token = |token: &Token| {
 
@@ -1,6 +1,6 @@
 use std::mem;
 
-use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// This class converts alphabetic, numeric, and symbolic Unicode characters
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
@@ -9,20 +9,33 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 pub struct AsciiFoldingFilter;
 
 impl TokenFilter for AsciiFoldingFilter {
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
-        From::from(AsciiFoldingFilterTokenStream {
-            tail: token_stream,
-            buffer: String::with_capacity(100),
-        })
+    type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
+        AsciiFoldingFilterWrapper(tokenizer)
     }
 }
 
-pub struct AsciiFoldingFilterTokenStream<'a> {
-    buffer: String,
-    tail: BoxTokenStream<'a>,
+#[derive(Clone)]
+pub struct AsciiFoldingFilterWrapper<T>(T);
+
+impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
+    type TokenStream<'a> = AsciiFoldingFilterTokenStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+        AsciiFoldingFilterTokenStream {
+            buffer: String::with_capacity(100),
+            tail: self.0.token_stream(text),
+        }
+    }
 }
 
-impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
+pub struct AsciiFoldingFilterTokenStream<T> {
+    buffer: String,
+    tail: T,
+}
+
+impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
@@ -1560,8 +1573,9 @@ mod tests {
 
     fn folding_helper(text: &str) -> Vec<String> {
         let mut tokens = Vec::new();
-        TextAnalyzer::from(SimpleTokenizer)
+        TextAnalyzer::builder(SimpleTokenizer)
             .filter(AsciiFoldingFilter)
+            .build()
             .token_stream(text)
             .process(&mut |token| {
                 tokens.push(token.text.clone());
@@ -1570,8 +1584,9 @@ mod tests {
     }
 
     fn folding_using_raw_tokenizer_helper(text: &str) -> String {
-        let mut token_stream = TextAnalyzer::from(RawTokenizer)
+        let mut token_stream = TextAnalyzer::builder(RawTokenizer)
             .filter(AsciiFoldingFilter)
+            .build()
             .token_stream(text);
         token_stream.advance();
         token_stream.token().text.clone()
 
@@ -1,16 +1,17 @@
-use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};
+use crate::tokenizer::{Token, TokenStream, Tokenizer};
 
 #[derive(Clone)]
 pub(crate) struct EmptyTokenizer;
 
 impl Tokenizer for EmptyTokenizer {
-    fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> {
-        EmptyTokenStream::default().into()
+    type TokenStream<'a> = EmptyTokenStream;
+    fn token_stream(&self, _text: &str) -> EmptyTokenStream {
+        EmptyTokenStream::default()
     }
 }
 
 #[derive(Default)]
-struct EmptyTokenStream {
+pub struct EmptyTokenStream {
     token: Token,
 }
 
@@ -30,7 +31,7 @@ impl TokenStream for EmptyTokenStream {
 
 #[cfg(test)]
 mod tests {
-    use crate::tokenizer::Tokenizer;
+    use crate::tokenizer::{TokenStream, Tokenizer};
 
     #[test]
     fn test_empty_tokenizer() {
 
@@ -1,4 +1,4 @@
-use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
+use super::{Token, TokenStream, Tokenizer};
 use crate::schema::FACET_SEP_BYTE;
 
 /// The `FacetTokenizer` process a `Facet` binary representation
@@ -26,7 +26,8 @@ pub struct FacetTokenStream<'a> {
 }
 
 impl Tokenizer for FacetTokenizer {
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+    type TokenStream<'a> = FacetTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> FacetTokenStream<'a> {
         let token = Token {
             position: 0,
             ..Default::default()
@@ -36,7 +37,6 @@ impl Tokenizer for FacetTokenizer {
             state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
             token,
         }
-        .into()
     }
 }
 
@@ -87,7 +87,7 @@ mod tests {
 
     use super::FacetTokenizer;
     use crate::schema::Facet;
-    use crate::tokenizer::{Token, Tokenizer};
+    use crate::tokenizer::{Token, TokenStream, Tokenizer};
 
     #[test]
     fn test_facet_tokenizer() {
 
@@ -1,29 +1,42 @@
 use std::mem;
 
-use super::{Token, TokenFilter, TokenStream};
-use crate::tokenizer::BoxTokenStream;
-
-impl TokenFilter for LowerCaser {
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(LowerCaserTokenStream {
-            tail: token_stream,
-            buffer: String::with_capacity(100),
-        })
-    }
-}
+use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// Token filter that lowercase terms.
 #[derive(Clone)]
 pub struct LowerCaser;
 
-pub struct LowerCaserTokenStream<'a> {
+impl TokenFilter for LowerCaser {
+    type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
+        LowerCaserFilter(tokenizer)
+    }
+}
+
+#[derive(Clone)]
+pub struct LowerCaserFilter<T>(T);
+
+impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
+    type TokenStream<'a> = LowerCaserTokenStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+        LowerCaserTokenStream {
+            tail: self.0.token_stream(text),
+            buffer: String::new(),
+        }
+    }
+}
+
+pub struct LowerCaserTokenStream<T> {
     buffer: String,
-    tail: BoxTokenStream<'a>,
+    tail: T,
 }
 
 // writes a lowercased version of text into output.
 fn to_lowercase_unicode(text: &str, output: &mut String) {
     output.clear();
     output.reserve(50);
     for c in text.chars() {
         // Contrary to the std, we do not take care of sigma special case.
         // This will have an normalizationo effect, which is ok for search.
@@ -31,7 +44,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
     }
 }
 
-impl<'a> TokenStream for LowerCaserTokenStream<'a> {
+impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
@@ -73,8 +86,9 @@ mod tests {
     }
 
     fn token_stream_helper(text: &str) -> Vec<Token> {
-        let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
+        let mut token_stream = TextAnalyzer::builder(SimpleTokenizer)
             .filter(LowerCaser)
+            .build()
             .token_stream(text);
         let mut tokens = vec![];
         let mut add_token = |token: &Token| {
 
@@ -66,10 +66,11 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
+//! let en_stem = TextAnalyzer::builder(SimpleTokenizer)
 //!     .filter(RemoveLongFilter::limit(40))
 //!     .filter(LowerCaser)
-//!     .filter(Stemmer::new(Language::English));
+//!     .filter(Stemmer::new(Language::English))
+//!     .build();
 //! ```
 //!
 //! Once your tokenizer is defined, you need to
@@ -112,9 +113,10 @@
 //! let index = Index::create_in_ram(schema);
 //!
 //! // We need to register our tokenizer :
-//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
+//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer)
 //!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser);
+//!     .filter(LowerCaser)
+//!     .build();
 //! index
 //!     .tokenizers()
 //!     .register("custom_en", custom_en_tokenizer);
@@ -137,9 +139,7 @@ mod tokenizer;
 mod tokenizer_manager;
 mod whitespace_tokenizer;
 
-pub use tokenizer_api::{
-    BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
-};
+pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
 
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -237,10 +237,11 @@ pub mod tests {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
             "el_stem",
-            TextAnalyzer::from(SimpleTokenizer)
+            TextAnalyzer::builder(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::Greek)),
+                .filter(Stemmer::new(Language::Greek))
+                .build(),
         );
         let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
 
@@ -1,5 +1,4 @@
 use super::{Token, TokenStream, Tokenizer};
-use crate::tokenizer::BoxTokenStream;
 
 /// Tokenize the text by splitting words into n-grams of the given size(s)
 ///
@@ -132,8 +131,9 @@ pub struct NgramTokenStream<'a> {
 }
 
 impl Tokenizer for NgramTokenizer {
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        From::from(NgramTokenStream {
+    type TokenStream<'a> = NgramTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> NgramTokenStream<'a> {
+        NgramTokenStream {
             ngram_charidx_iterator: StutteringIterator::new(
                 CodepointFrontiers::for_str(text),
                 self.min_gram,
@@ -142,7 +142,7 @@ impl Tokenizer for NgramTokenizer {
             prefix_only: self.prefix_only,
             text,
             token: Token::default(),
-        })
+        }
     }
 }
 
@@ -303,9 +303,9 @@ mod tests {
 
     use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
     use crate::tokenizer::tests::assert_token;
-    use crate::tokenizer::{BoxTokenStream, Token, Tokenizer};
+    use crate::tokenizer::{Token, TokenStream, Tokenizer};
 
-    fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
+    fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
         let mut tokens: Vec<Token> = vec![];
         tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
         tokens
 
@@ -1,5 +1,4 @@
 use super::{Token, TokenStream, Tokenizer};
-use crate::tokenizer::BoxTokenStream;
 
 /// For each value of the field, emit a single unprocessed token.
 #[derive(Clone)]
@@ -11,7 +10,8 @@ pub struct RawTokenStream {
 }
 
 impl Tokenizer for RawTokenizer {
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+    type TokenStream<'a> = RawTokenStream;
+    fn token_stream(&self, text: &str) -> RawTokenStream {
         let token = Token {
             offset_from: 0,
             offset_to: text.len(),
@@ -23,7 +23,6 @@ impl Tokenizer for RawTokenizer {
             token,
             has_token: true,
         }
-        .into()
     }
 }
 
@@ -1,6 +1,6 @@
 use regex::Regex;
 
-use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
+use super::{Token, TokenStream, Tokenizer};
 use crate::TantivyError;
 
 /// Tokenize the text by using a regex pattern to split.
@@ -60,13 +60,14 @@ impl RegexTokenizer {
 }
 
 impl Tokenizer for RegexTokenizer {
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(RegexTokenStream {
+    type TokenStream<'a> = RegexTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> RegexTokenStream<'a> {
+        RegexTokenStream {
             regex: self.regex.clone(),
             text,
             token: Token::default(),
             cursor: 0,
-        })
+        }
     }
 }
 
@@ -2,8 +2,9 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-//!     .filter(RemoveLongFilter::limit(5));
+//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
+//!     .filter(RemoveLongFilter::limit(5))
+//!     .build();
 //!
 //! let mut stream = tokenizer.token_stream("toolong nice");
 //! // because `toolong` is more than 5 characters, it is filtered
@@ -11,8 +12,7 @@
 //! assert_eq!(stream.next().unwrap().text, "nice");
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream};
-use crate::tokenizer::BoxTokenStream;
+use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// `RemoveLongFilter` removes tokens that are longer
 /// than a given number of bytes (in UTF-8 representation).
@@ -31,27 +31,46 @@ impl RemoveLongFilter {
     }
 }
 
-impl<'a> RemoveLongFilterStream<'a> {
+impl<T> RemoveLongFilterStream<T> {
     fn predicate(&self, token: &Token) -> bool {
         token.text.len() < self.token_length_limit
     }
 }
 
 impl TokenFilter for RemoveLongFilter {
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(RemoveLongFilterStream {
-            token_length_limit: self.length_limit,
-            tail: token_stream,
-        })
+    type Tokenizer<T: Tokenizer> = RemoveLongFilterWrapper<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> RemoveLongFilterWrapper<T> {
+        RemoveLongFilterWrapper {
+            length_limit: self.length_limit,
+            inner: tokenizer,
+        }
     }
 }
 
-pub struct RemoveLongFilterStream<'a> {
-    token_length_limit: usize,
-    tail: BoxTokenStream<'a>,
+#[derive(Clone)]
+pub struct RemoveLongFilterWrapper<T: Tokenizer> {
+    length_limit: usize,
+    inner: T,
 }
 
-impl<'a> TokenStream for RemoveLongFilterStream<'a> {
+impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
+    type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+        RemoveLongFilterStream {
+            token_length_limit: self.length_limit,
+            tail: self.inner.token_stream(text),
+        }
+    }
+}
+
+pub struct RemoveLongFilterStream<T> {
+    token_length_limit: usize,
+    tail: T,
+}
+
+impl<T: TokenStream> TokenStream for RemoveLongFilterStream<T> {
     fn advance(&mut self) -> bool {
         while self.tail.advance() {
             if self.predicate(self.tail.token()) {
@@ -84,7 +103,9 @@ mod tests {
     }
 
     fn token_stream_helper(text: &str) -> Vec<Token> {
-        let a = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(6));
+        let a = TextAnalyzer::builder(SimpleTokenizer)
+            .filter(RemoveLongFilter::limit(6))
+            .build();
         let mut token_stream = a.token_stream(text);
         let mut tokens: Vec<Token> = vec![];
         let mut add_token = |token: &Token| {
 
@@ -1,6 +1,6 @@
 use std::str::CharIndices;
 
-use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
+use super::{Token, TokenStream, Tokenizer};
 
 /// Tokenize the text by splitting on whitespaces and punctuation.
 #[derive(Clone)]
@@ -13,12 +13,13 @@ pub struct SimpleTokenStream<'a> {
 }
 
 impl Tokenizer for SimpleTokenizer {
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(SimpleTokenStream {
+    type TokenStream<'a> = SimpleTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> SimpleTokenStream<'a> {
+        SimpleTokenStream {
             text,
             chars: text.char_indices(),
             token: Token::default(),
-        })
+        }
     }
 }
 
@@ -2,7 +2,7 @@ use std::sync::Arc;
 
 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind, StateID};
 
-use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// A [`TokenFilter`] which splits compound words into their parts
 /// based on a given dictionary.
@@ -23,9 +23,11 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 /// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
 ///
-/// let tokenizer =
-///     TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
+/// let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
+///     .filter(SplitCompoundWords::from_dictionary([
 ///         "dampf", "schiff", "fahrt", "brot", "backen", "automat",
-///     ]));
+///     ]))
+///     .build();
 ///
 /// let mut stream = tokenizer.token_stream("dampfschifffahrt");
 /// assert_eq!(stream.next().unwrap().text, "dampf");
@@ -76,24 +78,45 @@ impl<S: StateID> SplitCompoundWords<S> {
 }
 
 impl<S: StateID + Send + Sync + 'static> TokenFilter for SplitCompoundWords<S> {
-    fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(SplitCompoundWordsTokenStream {
-            dict: self.dict.clone(),
-            tail: stream,
-            cuts: Vec::new(),
-            parts: Vec::new(),
-        })
+    type Tokenizer<T: Tokenizer> = SplitCompoundWordsFilter<T, S>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> SplitCompoundWordsFilter<T, S> {
+        SplitCompoundWordsFilter {
+            dict: self.dict,
+            inner: tokenizer,
+        }
     }
 }
 
-struct SplitCompoundWordsTokenStream<'a, S: StateID> {
+#[derive(Clone)]
+pub struct SplitCompoundWordsFilter<T, S: StateID> {
     dict: Arc<AhoCorasick<S>>,
-    tail: BoxTokenStream<'a>,
+    inner: T,
 }
 
+impl<T: Tokenizer, S: StateID + Send + Sync + 'static> Tokenizer
+    for SplitCompoundWordsFilter<T, S>
+{
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>, S>;
+
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+        SplitCompoundWordsTokenStream {
+            dict: self.dict.clone(),
+            tail: self.inner.token_stream(text),
+            cuts: Vec::new(),
+            parts: Vec::new(),
+        }
+    }
+}
+
+pub struct SplitCompoundWordsTokenStream<T, S: StateID> {
+    dict: Arc<AhoCorasick<S>>,
+    tail: T,
+    cuts: Vec<usize>,
+    parts: Vec<Token>,
+}
+
-impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
+impl<T: TokenStream, S: StateID> SplitCompoundWordsTokenStream<T, S> {
     // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
     // can fully be split into consecutive matches against `self.dict`.
     fn split(&mut self) {
@@ -129,7 +152,7 @@ impl<'a, S: StateID> SplitCompoundWordsTokenStream<'a, S> {
     }
 }
 
-impl<'a, S: StateID> TokenStream for SplitCompoundWordsTokenStream<'a, S> {
+impl<T: TokenStream, S: StateID> TokenStream for SplitCompoundWordsTokenStream<T, S> {
     fn advance(&mut self) -> bool {
         self.parts.pop();
 
@@ -165,8 +188,9 @@ mod tests {
 
     #[test]
    fn splitting_compound_words_works() {
-        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]));
+        let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
+            .filter(SplitCompoundWords::from_dictionary(["foo", "bar"]))
+            .build();
 
         {
             let mut stream = tokenizer.token_stream("");
 
@@ -4,8 +4,7 @@ use std::mem;
 use rust_stemmers::{self, Algorithm};
 use serde::{Deserialize, Serialize};
 
-use super::{Token, TokenFilter, TokenStream};
-use crate::tokenizer::BoxTokenStream;
+use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -82,23 +81,42 @@ impl Default for Stemmer {
 }
 
 impl TokenFilter for Stemmer {
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
-        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
-        BoxTokenStream::from(StemmerTokenStream {
-            tail: token_stream,
-            stemmer: inner_stemmer,
-            buffer: String::new(),
-        })
+    type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
+        StemmerFilter {
+            stemmer_algorithm: self.stemmer_algorithm,
+            inner: tokenizer,
+        }
     }
 }
 
-pub struct StemmerTokenStream<'a> {
-    tail: BoxTokenStream<'a>,
+#[derive(Clone)]
+pub struct StemmerFilter<T> {
+    stemmer_algorithm: Algorithm,
+    inner: T,
+}
+
+impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
+    type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+        let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
+        StemmerTokenStream {
+            tail: self.inner.token_stream(text),
+            stemmer,
+            buffer: String::new(),
+        }
+    }
+}
+
+pub struct StemmerTokenStream<T> {
+    tail: T,
     stemmer: rust_stemmers::Stemmer,
     buffer: String,
 }
 
-impl<'a> TokenStream for StemmerTokenStream<'a> {
+impl<T: TokenStream> TokenStream for StemmerTokenStream<T> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
 
@@ -2,8 +2,9 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-//!     .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
+//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
+//!     .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]))
+//!     .build();
 //!
 //! let mut stream = tokenizer.token_stream("the fox is crafty");
 //! assert_eq!(stream.next().unwrap().text, "fox");
@@ -20,7 +21,7 @@ use rustc_hash::FxHashSet;
 
 #[cfg(feature = "stopwords")]
 use super::Language;
-use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// `TokenFilter` that removes stop words from a token stream
 #[derive(Clone)]
@@ -69,27 +70,46 @@ impl StopWordFilter {
     }
 }
 
-pub struct StopWordFilterStream<'a> {
-    words: Arc<FxHashSet<String>>,
-    tail: BoxTokenStream<'a>,
-}
-
 impl TokenFilter for StopWordFilter {
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(StopWordFilterStream {
-            words: self.words.clone(),
-            tail: token_stream,
-        })
+    type Tokenizer<T: Tokenizer> = StopWordFilterWrapper<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> StopWordFilterWrapper<T> {
+        StopWordFilterWrapper {
+            words: self.words,
+            inner: tokenizer,
+        }
     }
 }
 
-impl<'a> StopWordFilterStream<'a> {
+#[derive(Clone)]
+pub struct StopWordFilterWrapper<T> {
+    words: Arc<FxHashSet<String>>,
+    inner: T,
+}
+
+impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
+    type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+        StopWordFilterStream {
+            words: self.words.clone(),
+            tail: self.inner.token_stream(text),
+        }
+    }
+}
+
+pub struct StopWordFilterStream<T> {
+    words: Arc<FxHashSet<String>>,
+    tail: T,
+}
+
+impl<T> StopWordFilterStream<T> {
     fn predicate(&self, token: &Token) -> bool {
         !self.words.contains(&token.text)
     }
 }
 
-impl<'a> TokenStream for StopWordFilterStream<'a> {
+impl<T: TokenStream> TokenStream for StopWordFilterStream<T> {
     fn advance(&mut self) -> bool {
         while self.tail.advance() {
             if self.predicate(self.tail.token()) {
@@ -131,7 +151,9 @@ mod tests {
             "am".to_string(),
             "i".to_string(),
         ];
-        let a = TextAnalyzer::from(SimpleTokenizer).filter(StopWordFilter::remove(stops));
+        let a = TextAnalyzer::builder(SimpleTokenizer)
+            .filter(StopWordFilter::remove(stops))
+            .build();
         let mut token_stream = a.token_stream(text);
         let mut tokens: Vec<Token> = vec![];
         let mut add_token = |token: &Token| {
 
@@ -1,15 +1,12 @@
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer};
+use tokenizer_api::{BoxTokenStream, BoxableTokenizer, TokenFilter, Tokenizer};
 
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
 pub struct TextAnalyzer {
-    tokenizer: Box<dyn Tokenizer>,
-    token_filters: Vec<BoxTokenFilter>,
+    tokenizer: Box<dyn BoxableTokenizer>,
 }
 
 impl Default for TextAnalyzer {
@@ -18,52 +15,21 @@ impl Default for TextAnalyzer {
     }
 }
 
-impl<T: Tokenizer> From<T> for TextAnalyzer {
+impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
     fn from(tokenizer: T) -> Self {
-        TextAnalyzer::new(tokenizer, Vec::new())
+        TextAnalyzer::builder(tokenizer).build()
     }
 }
 
 impl TextAnalyzer {
-    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
-    ///
-    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
-    /// `TextAnalyzer::from(tokenizer)`.
-    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
-        TextAnalyzer {
-            tokenizer: Box::new(tokenizer),
-            token_filters,
-        }
-    }
-
-    /// Appends a token filter to the current tokenizer.
-    ///
-    /// The method consumes the current `TokenStream` and returns a
-    /// new one.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use tantivy::tokenizer::*;
-    ///
-    /// let en_stem = TextAnalyzer::from(SimpleTokenizer)
-    ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser)
-    ///     .filter(Stemmer::default());
-    /// ```
-    #[must_use]
-    pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
-        self.token_filters.push(token_filter.into());
-        self
+    /// Create a new TextAnalyzerBuilder
+    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
+        TextAnalyzerBuilder { tokenizer }
     }
 
     /// Creates a token stream for a given `str`.
     pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        let mut token_stream = self.tokenizer.token_stream(text);
-        for token_filter in &self.token_filters {
-            token_stream = token_filter.transform(token_stream);
-        }
-        token_stream
+        self.tokenizer.box_token_stream(text)
     }
 }
 
@@ -71,11 +37,39 @@ impl Clone for TextAnalyzer {
     fn clone(&self) -> Self {
         TextAnalyzer {
             tokenizer: self.tokenizer.box_clone(),
-            token_filters: self
-                .token_filters
-                .iter()
-                .map(|token_filter| token_filter.box_clone())
-                .collect(),
         }
     }
 }
 
+/// Builder helper for [`TextAnalyzer`]
+pub struct TextAnalyzerBuilder<T> {
+    tokenizer: T,
+}
+
+impl<T: Tokenizer> TextAnalyzerBuilder<T> {
+    /// Appends a token filter to the current builder.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let en_stem = TextAnalyzer::builder(SimpleTokenizer)
+    ///     .filter(RemoveLongFilter::limit(40))
+    ///     .filter(LowerCaser)
+    ///     .filter(Stemmer::default())
+    ///     .build();
+    /// ```
+    pub fn filter<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder<F::Tokenizer<T>> {
+        TextAnalyzerBuilder {
+            tokenizer: token_filter.transform(self.tokenizer),
+        }
+    }
+
+    /// Finalize building the TextAnalyzer
+    pub fn build(self) -> TextAnalyzer {
+        TextAnalyzer {
+            tokenizer: Box::new(self.tokenizer),
+        }
+    }
+}
 
@@ -61,16 +61,18 @@ impl Default for TokenizerManager {
         manager.register("raw", RawTokenizer);
         manager.register(
             "default",
-            TextAnalyzer::from(SimpleTokenizer)
+            TextAnalyzer::builder(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser),
+                .filter(LowerCaser)
+                .build(),
         );
         manager.register(
             "en_stem",
-            TextAnalyzer::from(SimpleTokenizer)
+            TextAnalyzer::builder(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::English)),
+                .filter(Stemmer::new(Language::English))
+                .build(),
         );
         manager.register("whitespace", WhitespaceTokenizer);
         manager
 
@@ -1,6 +1,6 @@
 use std::str::CharIndices;
 
-use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
+use super::{Token, TokenStream, Tokenizer};
 
 /// Tokenize the text by splitting on whitespaces.
 #[derive(Clone)]
@@ -13,12 +13,13 @@ pub struct WhitespaceTokenStream<'a> {
 }
 
 impl Tokenizer for WhitespaceTokenizer {
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        BoxTokenStream::from(WhitespaceTokenStream {
+    type TokenStream<'a> = WhitespaceTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> WhitespaceTokenStream<'a> {
+        WhitespaceTokenStream {
             text,
             chars: text.char_indices(),
             token: Token::default(),
-        })
+        }
     }
 }
 
@@ -42,28 +42,31 @@ impl Default for Token {
 
 /// `Tokenizer` are in charge of splitting text into a stream of token
 /// before indexing.
-///
-/// # Warning
-///
-/// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+pub trait Tokenizer: 'static + Clone + Send + Sync {
+    /// The token stream returned by this Tokenizer.
+    type TokenStream<'a>: TokenStream;
     /// Creates a token stream for a given `str`.
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
 }
 
-pub trait TokenizerClone {
-    fn box_clone(&self) -> Box<dyn Tokenizer>;
+/// A boxable `Tokenizer`, with its `TokenStream` type erased.
+pub trait BoxableTokenizer: 'static + Send + Sync {
+    /// Creates a boxed token stream for a given `str`.
+    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+    /// Clone this tokenizer.
+    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
 }
 
-impl<T: Tokenizer + Clone> TokenizerClone for T {
-    fn box_clone(&self) -> Box<dyn Tokenizer> {
+impl<T: Tokenizer> BoxableTokenizer for T {
+    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        self.token_stream(text).into()
+    }
+    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
         Box::new(self.clone())
     }
 }
 
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
 ///
 /// See [`TokenStream`] for more information.
 pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
 
 impl<'a, T> From<T> for BoxTokenStream<'a>
@@ -139,39 +142,13 @@ pub trait TokenStream {
     }
 }
 
-/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
-///
-/// See [`TokenFilter`] for more information.
-pub struct BoxTokenFilter(Box<dyn TokenFilter>);
-
-impl Deref for BoxTokenFilter {
-    type Target = dyn TokenFilter;
-
-    fn deref(&self) -> &dyn TokenFilter {
-        &*self.0
-    }
-}
-
-impl<T: TokenFilter> From<T> for BoxTokenFilter {
-    fn from(tokenizer: T) -> BoxTokenFilter {
-        BoxTokenFilter(Box::new(tokenizer))
-    }
-}
-
-pub trait TokenFilterClone {
-    fn box_clone(&self) -> BoxTokenFilter;
-}
-
 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
-    /// Wraps a token stream and returns the modified one.
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
-}
-
-impl<T: TokenFilter + Clone> TokenFilterClone for T {
-    fn box_clone(&self) -> BoxTokenFilter {
-        BoxTokenFilter::from(self.clone())
-    }
+pub trait TokenFilter: 'static + Send + Sync {
+    /// The Tokenizer type returned by this filter, typically parametrized by the underlying
+    /// Tokenizer.
+    type Tokenizer<T: Tokenizer>: Tokenizer;
+    /// Wraps a Tokenizer and returns a new one.
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
 }
 
 #[cfg(test)]
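For filter authors, `TokenFilter::transform` no longer wraps a `BoxTokenStream` at analysis time; it consumes the filter and wraps the `Tokenizer` itself, and the wrapper exposes its stream type through the `TokenStream<'a>` GAT. A hedged sketch of a custom filter written against the trait shapes shown in this diff (the `ShortTokenOnlyFilter` name and its length predicate are illustrative, not part of the commit):

    use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

    // Illustrative filter: keep only tokens of at most 3 characters.
    #[derive(Clone)]
    pub struct ShortTokenOnlyFilter;

    impl TokenFilter for ShortTokenOnlyFilter {
        // The filter now names the Tokenizer type it produces, generic over
        // the tokenizer it wraps.
        type Tokenizer<T: Tokenizer> = ShortTokenOnlyFilterWrapper<T>;

        fn transform<T: Tokenizer>(self, tokenizer: T) -> ShortTokenOnlyFilterWrapper<T> {
            ShortTokenOnlyFilterWrapper(tokenizer)
        }
    }

    #[derive(Clone)]
    pub struct ShortTokenOnlyFilterWrapper<T>(T);

    impl<T: Tokenizer> Tokenizer for ShortTokenOnlyFilterWrapper<T> {
        // The stream type is a GAT built on the inner tokenizer's stream.
        type TokenStream<'a> = ShortTokenOnlyFilterStream<T::TokenStream<'a>>;

        fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
            ShortTokenOnlyFilterStream {
                tail: self.0.token_stream(text),
            }
        }
    }

    pub struct ShortTokenOnlyFilterStream<T> {
        tail: T,
    }

    impl<T: TokenStream> TokenStream for ShortTokenOnlyFilterStream<T> {
        fn advance(&mut self) -> bool {
            // Skip tokens failing the predicate, as the filters in this diff do.
            while self.tail.advance() {
                if self.tail.token().text.chars().count() <= 3 {
                    return true;
                }
            }
            false
        }

        fn token(&self) -> &Token {
            self.tail.token()
        }

        fn token_mut(&mut self) -> &mut Token {
            self.tail.token_mut()
        }
    }

Under those assumptions it would plug into the builder as `TextAnalyzer::builder(SimpleTokenizer).filter(ShortTokenOnlyFilter).build()`.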