Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-13 04:22:54 +00:00
Fix bug. Cleanup some rough spots. Renamed functions. Fixed tests and docs.
@@ -5,7 +5,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::{NgramTokenizer, TextAnalyzer};
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
@@ -52,10 +52,9 @@ fn main() -> tantivy::Result<()> {

// here we are registering our custom tokenizer
// this will store tokens of 3 characters each
index.tokenizers().register(
"ngram3",
TextAnalyzer::new(NgramTokenizer::new(3, 3, false)),
);
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false));

// To insert document we need an index writer.
// There must be only one writer at a time.

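For context, a short sketch of what the "ngram3" tokenizer registered above emits. The sample word "ferris" and the expected trigrams are illustrative, not part of the commit; the output follows from min_gram = max_gram = 3 with prefix-only disabled.

    use tantivy::tokenizer::*;

    // A minimal sketch: with min_gram and max_gram both set to 3 (and
    // prefix-only off), the tokenizer emits every 3-character window.
    let grams: Vec<String> = NgramTokenizer::new(3, 3, false)
        .token_stream("ferris")
        .map(|token| token.text)
        .collect();
    assert_eq!(grams, vec!["fer", "err", "rri", "ris"]);
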
@@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> {

// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::new(SimpleTokenizer)
let tokenizer = analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.filter(StopWordFilter::new(vec![
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
]))
.build();

index.tokenizers().register("stoppy", tokenizer);

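A hedged usage sketch of the rebuilt "stoppy" analyzer; the sample sentence is invented for illustration and only exercises the API shown in the diff above.

    use tantivy::tokenizer::*;

    // A minimal sketch: lowercase first so stop-word matching works,
    // then drop "the" and "and" from the stream.
    let stoppy = analyzer_builder(SimpleTokenizer)
        .filter(LowerCaser::new())
        .filter(StopWordFilter::remove(vec!["the".to_string(), "and".to_string()]))
        .build();
    // Token streams are plain iterators now, so the surviving terms can be
    // collected directly.
    let terms: Vec<String> = stoppy
        .token_stream("The fox AND the hound")
        .map(|token| token.text)
        .collect();
    assert_eq!(terms, vec!["fox", "hound"]);
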
@@ -20,8 +20,8 @@ use crate::reader::IndexReaderBuilder;
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
use crate::tokenizer::TextAnalyzerT;
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::tokenizer::Tokenizer;
use crate::tokenizer::{TextAnalyzer, TextAnalyzerT, TokenizerManager};
use crate::IndexWriter;
use std::collections::HashSet;
use std::fmt;

@@ -13,8 +13,8 @@ use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{DynTokenStreamChain, TextAnalyzerT, TokenStreamChain, Tokenizer};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain, Tokenizer};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT};
use crate::Opstamp;
use crate::{DocId, SegmentComponent};

@@ -50,7 +50,9 @@ pub mod tests {
use crate::schema::{Field, TextOptions};
use crate::schema::{IndexRecordOption, TextFieldIndexing};
use crate::schema::{Schema, Term, INDEXED, TEXT};
use crate::tokenizer::{SimpleTokenizer, TextAnalyzer, MAX_TOKEN_LEN};
use crate::tokenizer::{
analyzer_builder, SimpleTokenizer, TextAnalyzer, TextAnalyzerT, MAX_TOKEN_LEN,
};
use crate::DocId;
use crate::HasLen;
use crate::Score;
@@ -167,7 +169,7 @@ pub mod tests {
let index = Index::create_in_ram(schema.clone());
index
.tokenizers()
.register("simple_no_truncation", TextAnalyzer::new(SimpleTokenizer));
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));

@@ -573,14 +573,13 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
#[cfg(test)]
mod test {
use super::super::logical_ast::*;
use super::QueryParser;
use super::QueryParserError;
use super::*;
use crate::query::Query;
use crate::schema::Field;
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
use crate::tokenizer::{
LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
analyzer_builder, LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer,
};
use crate::Index;
use matches::assert_matches;
@@ -619,9 +618,10 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::new(SimpleTokenizer)
analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.filter(StopWordFilter::new(vec!["the".to_string()])),
.filter(StopWordFilter::remove(vec!["the".to_string()]))
.build(),
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}
@@ -978,7 +978,7 @@ mod test {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("customtokenizer", TextAnalyzer::new(SimpleTokenizer));
.register("customtokenizer", SimpleTokenizer);
let query_parser = QueryParser::for_index(&index, vec![title]);
assert_eq!(
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),

@@ -1,7 +1,7 @@
use crate::query::Query;
use crate::schema::Field;
use crate::schema::Value;
use crate::tokenizer::{TextAnalyzerT, Token};
use crate::tokenizer::{TextAnalyzerT, Token, Tokenizer};
use crate::Searcher;
use crate::{Document, Score};
use htmlescape::encode_minimal;
@@ -350,8 +350,13 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") => 1.0,
String::from("language") => 0.9
};
let fragments =
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 100);

let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
100,
);
assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -378,8 +383,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>1.0,
String::from("language") => 0.9
};
let fragments =
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
{
let first = &fragments[0];
assert_eq!(first.score, 1.0);
@@ -393,8 +402,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>0.9,
String::from("language") => 1.0
};
let fragments =
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
//assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -413,7 +426,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0);

let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);

assert_eq!(fragments.len(), 1);
{
@@ -435,7 +453,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0);

let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);

assert_eq!(fragments.len(), 2);
{
@@ -458,7 +481,12 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9);

let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 7);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
7,
);

assert_eq!(fragments.len(), 2);
{
@@ -480,7 +508,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0);

let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);

assert_eq!(fragments.len(), 0);

@@ -494,7 +527,12 @@ Survey in 2016, 2017, and 2018."#;
let text = "a b c d";

let terms = BTreeMap::new();
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 0);

let snippet = select_best_fragment_combination(&fragments[..], &text);

@@ -2,16 +2,16 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(RawTokenizer)
//!     .filter(AlphaNumOnlyFilter);
//! let tokenizer = analyzer_builder(RawTokenizer)
//!     .filter(AlphaNumOnlyFilter).build();
//!
//! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw filter emits one token that
//! // contains a space
//! assert!(stream.next().is_none());
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter);
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter).build();
//!
//! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some());
@@ -23,14 +23,14 @@ use super::{Token, TokenFilter, TokenStream};

/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.
#[derive(Clone)]
#[derive(Clone, Debug, Default)]
pub struct AlphaNumOnlyFilter;

impl TokenFilter for AlphaNumOnlyFilter {
fn transform(&mut self, token: Token) -> Option<Token> {
if token.text.chars().all(|c| c.is_ascii_alphanumeric()) {
return None;
return Some(token);
}
Some(token)
None
}
}

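A small sketch of the corrected filter semantics ("Fix bug." in the commit message): only tokens made entirely of ASCII alphanumeric characters survive. The input string is illustrative, not from the commit.

    use tantivy::tokenizer::*;

    // A minimal sketch, assuming the builder API shown above.
    let tokenizer = analyzer_builder(SimpleTokenizer)
        .filter(AlphaNumOnlyFilter)
        .build();
    // "naïve" contains a non-ASCII character and is removed; "code" survives.
    let kept: Vec<String> = tokenizer
        .token_stream("naïve code")
        .map(|token| token.text)
        .collect();
    assert_eq!(kept, vec!["code"]);
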
@@ -1,10 +1,10 @@
use super::{Token, TokenFilter, TokenStream};
use super::{analyzer_builder, Token, TokenFilter, TokenStream};
use std::mem;

/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct AsciiFolding {
buffer: String,
}
@@ -1543,8 +1543,9 @@ mod tests {
}

fn folding_helper(text: &str) -> Vec<String> {
let tokens = TextAnalyzer::new(SimpleTokenizer)
let tokens = analyzer_builder(SimpleTokenizer)
.filter(AsciiFolding::new())
.build()
.token_stream(text)
.map(|token| token.text.clone())
.collect();
@@ -1552,8 +1553,9 @@ mod tests {
}

fn folding_using_raw_tokenizer_helper(text: &str) -> String {
let mut token_stream = TextAnalyzer::new(RawTokenizer)
let mut token_stream = analyzer_builder(RawTokenizer)
.filter(AsciiFolding::new())
.build()
.token_stream(text);
let Token { text, .. } = token_stream.next().unwrap();
text

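A hedged usage sketch mirroring folding_helper above. It assumes AsciiFolding is re-exported from tantivy::tokenizer like the other filters, and the accented sample text is illustrative.

    use tantivy::tokenizer::*;

    // A minimal sketch: accented Latin characters are folded to their ASCII
    // equivalents, as described in the doc comment above.
    let folded: Vec<String> = analyzer_builder(SimpleTokenizer)
        .filter(AsciiFolding::new())
        .build()
        .token_stream("Café déjà vu")
        .map(|token| token.text)
        .collect();
    assert_eq!(folded, vec!["Cafe", "deja", "vu"]);
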
@@ -9,7 +9,7 @@ use crate::schema::FACET_SEP_BYTE;
/// - `/america/north_america/canada`
/// - `/america/north_america`
/// - `/america`
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct FacetTokenizer;

#[derive(Clone, Debug)]
@@ -40,13 +40,13 @@ impl Tokenizer for FacetTokenizer {
impl Iterator for FacetTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
match self.state {
self.state = match self.state {
State::RootFacetNotEmitted => {
self.state = if self.text.is_empty() {
if self.text.is_empty() {
State::Terminated
} else {
State::UpToPosition(0)
};
}
}
State::UpToPosition(cursor) => {
if let Some(next_sep_pos) = self.text.as_bytes()[cursor + 1..]
@@ -56,11 +56,11 @@ impl Iterator for FacetTokenStream {
{
let facet_part = &self.text[cursor..next_sep_pos];
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
State::UpToPosition(next_sep_pos)
} else {
let facet_part = &self.text[cursor..];
self.token.text.push_str(facet_part);
self.state = State::Terminated;
State::Terminated
}
}
State::Terminated => return None,

@@ -1,4 +1,4 @@
use super::{Token, TokenFilter};
use super::{analyzer_builder, TextAnalyzerT, Token, TokenFilter};
use std::mem;

impl TokenFilter for LowerCaser {
@@ -15,7 +15,7 @@ impl TokenFilter for LowerCaser {
}

/// Token filter that lowercases terms.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct LowerCaser {
buffer: String,
}
@@ -46,15 +46,13 @@ mod tests {

#[test]
fn test_to_lower_case() {
assert_eq!(
lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]
);
assert_eq!(lowercase_helper("Русский текст"), vec!["русский", "текст"]);
}

fn lowercase_helper(text: &str) -> Vec<String> {
TextAnalyzer::new(SimpleTokenizer)
analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.build()
.token_stream(text)
.map(|token| {
let Token { text, .. } = token;
@@ -65,7 +63,7 @@ mod tests {

#[test]
fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
assert_eq!(lowercase_helper("Tree"), vec!["tree"]);
assert_eq!(lowercase_helper("Русский"), vec!["русский"]);
}
}

@@ -64,10 +64,10 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
//! let en_stem = analyzer_builder(SimpleTokenizer)
//!     .filter(RemoveLongFilter::limit(40))
//!     .filter(LowerCaser)
//!     .filter(Stemmer::new(Language::English));
//!     .filter(LowerCaser::new())
//!     .filter(Stemmer::new(Language::English)).build();
//! ```
//!
//! Once your tokenizer is defined, you need to
@@ -109,9 +109,9 @@
//! let index = Index::create_in_ram(schema);
//!
//! // We need to register our tokenizer :
//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! let custom_en_tokenizer = analyzer_builder(SimpleTokenizer)
//!     .filter(RemoveLongFilter::limit(40))
//!     .filter(LowerCaser);
//!     .filter(LowerCaser::new()).build();
//! index
//!     .tokenizers()
//!     .register("custom_en", custom_en_tokenizer);
@@ -146,7 +146,8 @@ pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain}

pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream, Tokenizer,
analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream,
Tokenizer,
};

pub use self::tokenizer_manager::TokenizerManager;
@@ -215,10 +216,11 @@ pub mod tests {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::new(SimpleTokenizer)
.filter(RemoveLongFilter::new(40))
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser::new())
.filter(Stemmer::new(Language::Greek)),
.filter(Stemmer::new(Language::Greek))
.build(),
);
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
let tokens: Vec<Token> = en_tokenizer

@@ -78,7 +78,7 @@ use super::{Token, TokenStream, Tokenizer};
/// }
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,

@@ -1,13 +1,12 @@
use super::{Token, TokenStream, Tokenizer};

/// For each value of the field, emit a single unprocessed token.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct RawTokenizer;

#[derive(Clone, Debug)]
pub struct RawTokenStream {
token: Token,
has_token: bool,
token: Option<Token>,
}

impl Tokenizer for RawTokenizer {
@@ -21,8 +20,7 @@ impl Tokenizer for RawTokenizer {
position_length: 1,
};
RawTokenStream {
token,
has_token: true,
token: Some(token),
}
}
}
@@ -30,12 +28,7 @@ impl Tokenizer for RawTokenizer {
impl Iterator for RawTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Token> {
if self.has_token {
self.has_token = false;
Some(self.token.clone())
} else {
None
}
self.token.take()
}
}

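A short sketch of the one-shot behaviour that the Option<Token>/take() rewrite preserves; the input text is illustrative.

    use tantivy::tokenizer::*;

    // A minimal sketch: RawTokenizer emits the whole input as a single token,
    // and the Option-backed stream yields it exactly once.
    let mut stream = RawTokenizer.token_stream("Hello, World!");
    assert_eq!(stream.next().unwrap().text, "Hello, World!");
    assert!(stream.next().is_none());
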
@@ -2,8 +2,8 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//!     .filter(RemoveLongFilter::limit(5));
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//!     .filter(RemoveLongFilter::limit(5)).build();
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered
@@ -26,7 +26,7 @@ pub struct RemoveLongFilter {

impl RemoveLongFilter {
/// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
pub fn new(limit: usize) -> RemoveLongFilter {
pub fn limit(limit: usize) -> RemoveLongFilter {
RemoveLongFilter { limit }
}
}

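A hedged sketch spelling out the doc example above end to end with the renamed constructor.

    use tantivy::tokenizer::*;

    // A minimal sketch: tokens longer than the limit are dropped.
    let short_only: Vec<String> = analyzer_builder(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(5))
        .build()
        .token_stream("toolong nice")
        .map(|token| token.text)
        .collect();
    assert_eq!(short_only, vec!["nice"]);
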
@@ -1,37 +1,36 @@
use super::{Token, TokenStream, Tokenizer};
use std::str::CharIndices;

impl TokenStream for SimpleTokenizerStream {}

/// Tokenize the text by splitting on whitespaces and punctuation.
#[derive(Clone, Debug)]
pub struct SimpleTokenizer;
impl Tokenizer for SimpleTokenizer {
type Iter = SimpleTokenizerStream;
fn token_stream(&self, text: &str) -> Self::Iter {
let vec: Vec<_> = text.char_indices().collect();
SimpleTokenizerStream {
text: text.to_string(),
chars: vec.into_iter(),
position: usize::max_value(),
}
}
}

#[derive(Clone, Debug)]
pub struct SimpleTokenizerStream {
text: String,
idx: usize,
chars: Vec<(usize, char)>,
token: Token,
}

impl Tokenizer for SimpleTokenizer {
type Iter = SimpleTokenizerStream;
fn token_stream(&self, text: &str) -> Self::Iter {
SimpleTokenizerStream {
text: text.to_string(),
chars: text.char_indices().collect(),
idx: 0,
token: Token::default(),
}
}
chars: std::vec::IntoIter<(usize, char)>,
position: usize,
}

impl SimpleTokenizerStream {
// search for the end of the current token.
fn search_token_end(&mut self) -> usize {
(&mut self.chars)
.iter()
.filter(|&&(_, ref c)| !c.is_alphanumeric())
.map(|(offset, _)| *offset)
.filter(|&(_, c)| !c.is_alphanumeric())
.map(|(offset, _)| offset)
.next()
.unwrap_or_else(|| self.text.len())
}
@@ -40,37 +39,39 @@ impl SimpleTokenizerStream {
impl Iterator for SimpleTokenizerStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
if self.idx >= self.chars.len() {
return None;
}
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
while self.idx < self.chars.len() {
let (offset_from, c) = self.chars[self.idx];
self.position = self.position.wrapping_add(1);
while let Some((offset_from, c)) = self.chars.next() {
if c.is_alphanumeric() {
let offset_to = self.search_token_end();
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.push_str(&self.text[offset_from..offset_to]);
return Some(self.token.clone());
let token = Token {
text: self.text[offset_from..offset_to].into(),
offset_from,
offset_to,
position: self.position,
..Default::default()
};
return Some(token);
}
self.idx += 1;
}
None
}
}

impl TokenStream for SimpleTokenizerStream {}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_empty() {
let mut empty = SimpleTokenizer.token_stream("");
assert_eq!(empty.next(), None);
}

#[test]
fn simple_tokenizer() {
let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
dbg!(stream.next());
dbg!(stream.next());
dbg!(stream.next());
let mut simple = SimpleTokenizer.token_stream("tokenizer hello world");
assert_eq!(simple.next().unwrap().text, "tokenizer");
assert_eq!(simple.next().unwrap().text, "hello");
assert_eq!(simple.next().unwrap().text, "world");
}
}

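Because Tokenizer::token_stream now returns a plain Iterator, standard iterator adapters compose directly with it. A small sketch with an invented sample sentence:

    use tantivy::tokenizer::*;

    // A minimal sketch: the stream returned by SimpleTokenizer is an
    // Iterator<Item = Token>, so map/collect work directly on it.
    let texts: Vec<String> = SimpleTokenizer
        .token_stream("Hello, happy tax payer!")
        .map(|token| token.text)
        .collect();
    assert_eq!(texts, vec!["Hello", "happy", "tax", "payer"]);
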
@@ -2,8 +2,8 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//!     .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//!     .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])).build();
//!
//! let mut stream = tokenizer.token_stream("the fox is crafty");
//! assert_eq!(stream.next().unwrap().text, "fox");
@@ -27,7 +27,7 @@ pub struct StopWordFilter {

impl StopWordFilter {
/// Creates a `StopWordFilter` given a list of words to remove
pub fn new(words: Vec<String>) -> StopWordFilter {
pub fn remove(words: Vec<String>) -> StopWordFilter {
let mut set = StopWordHashSet::default();

for word in words {
@@ -44,7 +44,7 @@ impl StopWordFilter {
"there", "these", "they", "this", "to", "was", "will", "with",
];

StopWordFilter::new(words.iter().map(|&s| s.to_string()).collect())
StopWordFilter::remove(words.iter().map(|&s| s.to_string()).collect())
}
}

@@ -2,31 +2,64 @@ use crate::tokenizer::{Token, TokenStream, Tokenizer};

const POSITION_GAP: usize = 2;

pub(crate) struct TokenStreamChain<I> {
streams_with_offsets: I,
token: Token,
pub(crate) struct TokenStreamChain<Inner, Outer> {
streams_with_offsets: Outer,
current: Option<(Inner, usize)>,
position: usize,
position_shift: usize,
}

impl<'a, Out> TokenStreamChain<Out> {
pub fn new<In>(streams_with_offsets: Out) -> TokenStreamChain<Out>
where
In: Iterator<Item = Token>,
Out: Iterator<Item = (In, usize)>,
{
impl<'a, Inner, Outer> TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
Outer: Iterator<Item = (Inner, usize)>,
{
pub fn new(mut streams_with_offsets: Outer) -> TokenStreamChain<Inner, Outer> {
let current = streams_with_offsets.next();
TokenStreamChain {
streams_with_offsets,
token: Token::default(),
streams_with_offsets: streams_with_offsets,
current,
position: usize::max_value(),
position_shift: 0,
}
}
}

impl<'a, Inner, Outer: Iterator<Item = (Inner, usize)>> TokenStream
for TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
{
}

impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
Outer: Iterator<Item = (Inner, usize)>,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some((ref mut token_stream, offset_offset)) = self.current {
if let Some(mut token) = token_stream.next() {
token.offset_from += offset_offset;
token.offset_to += offset_offset;
token.position += self.position_shift;
self.position = token.position;
return Some(token);
}
self.position_shift = self.position.wrapping_add(POSITION_GAP);
self.current = self.streams_with_offsets.next();
}
None
}
}

impl DynTokenStreamChain {
pub fn from_vec(streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>) -> impl TokenStream {
DynTokenStreamChain {
streams_with_offsets,
idx: 0,
token: Token::default(),
position: usize::max_value(),
position_shift: 0,
}
}
@@ -35,7 +68,7 @@ impl DynTokenStreamChain {
pub(crate) struct DynTokenStreamChain {
streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>,
idx: usize,
token: Token,
position: usize,
position_shift: usize,
}

@@ -44,48 +77,17 @@ impl<'a> TokenStream for DynTokenStreamChain {}
impl Iterator for DynTokenStreamChain {
type Item = Token;
fn next(&mut self) -> Option<Token> {
if self.idx >= self.streams_with_offsets.len() {
return None;
};
while self.idx < self.streams_with_offsets.len() {
let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.idx];
if let Some(token) = token_stream.next() {
self.token = token;
self.token.offset_from += offset_offset;
self.token.offset_to += offset_offset;
self.token.position += self.position_shift;
return Some(self.token.clone());
} else {
self.idx += 1;
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
}
}
None
}
}

impl<'a, In, Out: Iterator<Item = (In, usize)>> TokenStream for TokenStreamChain<Out> where
In: Iterator<Item = Token>
{
}

impl<'a, In, Out> Iterator for TokenStreamChain<Out>
where
In: Iterator<Item = Token>,
Out: Iterator<Item = (In, usize)>,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some((ref mut token_stream, offset_offset)) = self.streams_with_offsets.next() {
if let Some(token) = token_stream.next() {
self.token = token;
self.token.offset_from += offset_offset;
self.token.offset_to += offset_offset;
self.token.position += self.position_shift;
return Some(self.token.clone());
} else {
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
while let Some((token_stream, offset_offset)) = self.streams_with_offsets.get_mut(self.idx)
{
if let Some(mut token) = token_stream.next() {
token.offset_from += *offset_offset;
token.offset_to += *offset_offset;
token.position += self.position_shift;
self.position = token.position;
return Some(token);
}
self.idx += 1;
self.position_shift = self.position.wrapping_add(POSITION_GAP);
}
None
}
@@ -103,11 +105,16 @@ mod tests {
(SimpleTokenizer.token_stream("hello world"), 0),
];
let mut token_chain = TokenStreamChain::new(token_streams.into_iter());
let token = token_chain.next().unwrap();
assert_eq!(token.text, "hello");
assert_eq!(token.offset_from, 0);
assert_eq!(token.offset_to, 5);
assert_eq!(token.position, POSITION_GAP - 1);
let token = token_chain.next();

let expect = Token {
offset_from: 0,
offset_to: 5,
position: POSITION_GAP - 1,
text: "hello".into(),
..Token::default()
};
assert_eq!(token.unwrap(), expect);

let token = token_chain.next().unwrap();
assert_eq!(token.text, "world");

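For orientation, a crate-internal sketch of driving the reworked generic chain. TokenStreamChain is pub(crate), so this only compiles inside the crate (for example in the test module shown above), and the inputs and offsets are invented for illustration:

    // A minimal sketch (crate-internal): chain two token streams, shifting the
    // second stream's byte offsets by the supplied offset.
    let streams = vec![
        (SimpleTokenizer.token_stream("hello world"), 0),
        (SimpleTokenizer.token_stream("goodbye"), 12),
    ];
    let texts_and_offsets: Vec<(String, usize)> = TokenStreamChain::new(streams.into_iter())
        .map(|token| (token.text, token.offset_from))
        .collect();
    // "goodbye" starts at 0 in its own stream and is shifted by the offset 12.
    assert_eq!(
        texts_and_offsets,
        vec![
            ("hello".to_string(), 0),
            ("world".to_string(), 6),
            ("goodbye".to_string(), 12),
        ]
    );
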
@@ -97,8 +97,12 @@ mod tests {
],
};

let token_stream: Vec<_> = PreTokenizedStream::from(tok_text.clone()).collect();
assert_eq!(token_stream, tok_text.tokens);
let mut token_stream = PreTokenizedStream::from(tok_text.clone());

for expected_token in tok_text.tokens {
assert_eq!(token_stream.next().unwrap(), expected_token);
}
assert!(token_stream.next().is_none());
}

#[test]
@@ -125,7 +129,7 @@ mod tests {

let chain_parts = vec![&tok_text, &tok_text];

let token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);

let expected_tokens = vec![
Token {
@@ -157,6 +161,10 @@ mod tests {
position_length: 1,
},
];
assert_eq!(token_stream.collect::<Vec<_>>(), expected_tokens);

for expected_token in expected_tokens {
assert_eq!(token_stream.next().unwrap(), expected_token);
}
assert!(token_stream.next().is_none());
}
}

@@ -36,16 +36,101 @@ impl Default for Token {
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer<T> {
tokenizer: T,
filters: Vec<Box<dyn TokenFilter>>,
#[derive(Clone, Debug, Default)]
pub struct TextAnalyzer<T>(T);

/// Identity `TokenFilter`
#[derive(Clone, Debug, Default)]
pub struct Identity;

impl TokenFilter for Identity {
fn transform(&mut self, token: Token) -> Option<Token> {
Some(token)
}
}

/// Top-level trait for hiding the types contained in it.
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
/// Top-level method that calls the corresponding `token_stream` on the
/// contained type.
fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
#[derive(Clone, Debug, Default)]
pub struct AnalyzerBuilder<T, F> {
tokenizer: T,
f: F,
}

/// Construct an `AnalyzerBuilder` on which to apply `TokenFilter`.
pub fn analyzer_builder<T: Tokenizer>(tokenizer: T) -> AnalyzerBuilder<T, Identity> {
AnalyzerBuilder {
tokenizer,
f: Identity,
}
}

impl<T, F> AnalyzerBuilder<T, F>
where
T: Tokenizer,
F: TokenFilter,
{
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `TokenStream` and returns a
/// new one.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = analyzer_builder(SimpleTokenizer)
///     .filter(RemoveLongFilter::limit(40))
///     .filter(LowerCaser::new())
///     .filter(Stemmer::default()).build();
/// ```
///
pub fn filter<G: TokenFilter>(self, f: G) -> AnalyzerBuilder<AnalyzerBuilder<T, F>, G> {
AnalyzerBuilder { tokenizer: self, f }
}
/// Finalize the build process.
pub fn build(self) -> TextAnalyzer<AnalyzerBuilder<T, F>> {
TextAnalyzer(self)
}
}

impl<T: Tokenizer, F: TokenFilter> Tokenizer for AnalyzerBuilder<T, F> {
type Iter = Filter<T::Iter, F>;
fn token_stream(&self, text: &str) -> Self::Iter {
Filter {
iter: self.tokenizer.token_stream(text),
f: self.f.clone(),
}
}
}

/// `Filter` is a wrapper around a `TokenStream` and a `TokenFilter` which modifies the `TokenStream`.
#[derive(Clone, Default, Debug)]
pub struct Filter<I, F> {
iter: I,
f: F,
}

impl<I, F> Iterator for Filter<I, F>
where
I: TokenStream,
F: TokenFilter,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some(token) = self.iter.next() {
if let Some(tok) = self.f.transform(token) {
return Some(tok);
}
}
None
}
}

impl<I, F> TokenStream for Filter<I, F>
where
I: TokenStream,
F: TokenFilter,
{
}

pub trait TextAnalyzerClone {
@@ -58,112 +143,25 @@ impl Clone for Box<dyn TextAnalyzerT> {
}
}

impl Clone for Box<dyn TokenFilter> {
fn clone(&self) -> Self {
(**self).box_clone()
}
}

impl<T: Clone + Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
impl<T: Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
fn box_clone(&self) -> Box<dyn TextAnalyzerT> {
Box::new(TextAnalyzer {
tokenizer: self.tokenizer.clone(),
filters: self.filters.clone(),
})
Box::new(TextAnalyzer(self.0.clone()))
}
}

impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
fn token_stream(&self, text: &str) -> Box<dyn TokenStream> {
let tokens = self.tokenizer.token_stream(text);
Box::new(TextIter {
tokens,
// TODO: remove clone
filters: self.filters.clone(),
})
Box::new(self.0.token_stream(text))
}
}

impl<T> TextAnalyzer<T>
where
T: Tokenizer,
{
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
pub fn new(tokenizer: T) -> TextAnalyzer<T> {
TextAnalyzer {
tokenizer,
filters: vec![],
}
}

/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `TokenStream` and returns a
/// new one.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::from(SimpleTokenizer)
///     .filter(RemoveLongFilter::limit(40))
///     .filter(LowerCaser)
///     .filter(Stemmer::default());
/// ```
///
pub fn filter<F: TokenFilter>(mut self, token_filter: F) -> Self {
self.filters.push(Box::new(token_filter));
self
}

/// Tokenize an array of `&str`
///
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent accidental `PhraseQuery` to match across two terms.

/// Creates a token stream for a given `str`.
pub fn token_stream(&self, text: &str) -> TextIter<T::Iter> {
let tokens = self.tokenizer.token_stream(text);
TextIter {
tokens,
// TODO: remove clone
filters: self.filters.clone(),
}
}
/// 'Top-level' trait hiding concrete types, below which static dispatch occurs.
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
/// 'Top-level' dynamic dispatch function hiding concrete types of the statically
/// dispatched `token_stream` from the `Tokenizer` trait.
fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
}

pub struct TextIter<I> {
tokens: I,
filters: Vec<Box<dyn TokenFilter>>,
}

impl<I> Iterator for TextIter<I>
where
I: Iterator<Item = Token>,
{
type Item = I::Item;
fn next(&mut self) -> Option<Self::Item> {
'outer: while let Some(mut token) = self.tokens.next() {
for filter in self.filters.iter_mut() {
if let Some(tok) = filter.transform(token) {
token = tok;
continue;
};
continue 'outer;
}
return Some(token);
}
None
}
}

impl<I: Iterator<Item = Token>> TokenStream for TextIter<I> {}

/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
///
@@ -193,22 +191,12 @@ pub trait Tokenizer: 'static + Send + Sync + Clone {
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// Take a `Token` and transform it or return `None` if it's to be removed
/// from the output stream.
fn transform(&mut self, token: Token) -> Option<Token>;
}

pub trait TokenFilterClone {
fn box_clone(&self) -> Box<dyn TokenFilter>;
}

impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> Box<dyn TokenFilter> {
Box::new(self.clone())
}
}

/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
@@ -218,9 +206,9 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// let tokenizer = analyzer_builder(SimpleTokenizer)
///     .filter(RemoveLongFilter::limit(40))
///     .filter(LowerCaser);
///     .filter(LowerCaser::new()).build();
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
///     let token = token_stream.next().unwrap();
@@ -239,6 +227,12 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
/// ```
pub trait TokenStream: Iterator<Item = Token> {}

impl<T: Tokenizer> From<T> for TextAnalyzer<T> {
fn from(src: T) -> TextAnalyzer<T> {
TextAnalyzer(src)
}
}

#[cfg(test)]
mod test {
use super::*;
@@ -263,7 +257,7 @@ mod test {

#[test]
fn text_analyzer() {
let mut stream = TextAnalyzer::new(SimpleTokenizer).token_stream("tokenizer hello world");
let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
dbg!(stream.next());
dbg!(stream.next());
dbg!(stream.next());

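To make the new extension point concrete, a hedged sketch of a user-defined filter written against the reworked TokenFilter trait. The DropNumbers filter and the sample text are invented for illustration; only the trait shape and builder API shown above are assumed.

    use tantivy::tokenizer::*;

    // Hypothetical filter, shown only to illustrate the new trait shape:
    // `TokenFilter` is now `Clone` and transforms one `Token` at a time,
    // returning `None` to drop it from the stream.
    #[derive(Clone, Debug, Default)]
    struct DropNumbers;

    impl TokenFilter for DropNumbers {
        fn transform(&mut self, token: Token) -> Option<Token> {
            if token.text.chars().all(|c| c.is_ascii_digit()) {
                None
            } else {
                Some(token)
            }
        }
    }

    let analyzer = analyzer_builder(SimpleTokenizer)
        .filter(LowerCaser::new())
        .filter(DropNumbers)
        .build();
    let terms: Vec<String> = analyzer
        .token_stream("Rust 2018 Edition")
        .map(|token| token.text)
        .collect();
    assert_eq!(terms, vec!["rust", "edition"]);
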
@@ -1,5 +1,5 @@
use crate::tokenizer::stemmer::Language;
use crate::tokenizer::tokenizer::{TextAnalyzer, TextAnalyzerT, Tokenizer};
use crate::tokenizer::tokenizer::{analyzer_builder, TextAnalyzer, TextAnalyzerT, Tokenizer};
use crate::tokenizer::LowerCaser;
use crate::tokenizer::RawTokenizer;
use crate::tokenizer::RemoveLongFilter;
@@ -27,14 +27,14 @@ pub struct TokenizerManager {

impl TokenizerManager {
/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
pub fn register<U: Tokenizer, T>(&self, tokenizer_name: &str, tokenizer: T)
where
T: TextAnalyzerT,
T: Into<TextAnalyzer<U>>,
{
self.tokenizers
.write()
.expect("Acquiring the lock should never fail")
.insert(tokenizer_name.to_string(), Box::new(tokenizer));
.insert(tokenizer_name.to_string(), Box::new(tokenizer.into()));
}

/// Accessing a tokenizer given its name.
@@ -57,19 +57,21 @@ impl Default for TokenizerManager {
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
};
manager.register("raw", TextAnalyzer::new(RawTokenizer));
manager.register("raw", RawTokenizer);
manager.register(
"default",
TextAnalyzer::new(SimpleTokenizer)
.filter(RemoveLongFilter::new(40))
.filter(LowerCaser::new()),
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser::new())
.build(),
);
manager.register(
"en_stem",
TextAnalyzer::new(SimpleTokenizer)
.filter(RemoveLongFilter::new(40))
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser::new())
.filter(Stemmer::new(Language::English)),
.filter(Stemmer::new(Language::English))
.build(),
);
manager
}

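Finally, a hedged sketch of calling the reworked register from user code; the tokenizer names "raw_copy" and "lowercase_only" are invented for illustration.

    use tantivy::tokenizer::*;

    let manager = TokenizerManager::default();

    // A minimal sketch: `register` now accepts anything convertible into a
    // `TextAnalyzer`, so both a bare tokenizer and a built analyzer work.
    manager.register("raw_copy", RawTokenizer);
    manager.register(
        "lowercase_only",
        analyzer_builder(SimpleTokenizer)
            .filter(LowerCaser::new())
            .build(),
    );
    assert!(manager.get("lowercase_only").is_some());
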