Fix bug. Clean up some rough spots. Rename functions. Fix tests and docs.

dcraven
2020-12-30 13:28:27 +01:00
committed by Paul Masurel
parent 4e6b341422
commit ca6fd5effc
21 changed files with 360 additions and 313 deletions
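
In short, this commit replaces the `TextAnalyzer::new(...)`/`TextAnalyzer::from(...)` chains with an `analyzer_builder(...) ... .build()` builder, renames `RemoveLongFilter::new` to `RemoveLongFilter::limit` and `StopWordFilter::new` to `StopWordFilter::remove`, and fixes an inverted condition in `AlphaNumOnlyFilter::transform`. A hedged before/after sketch of a typical call site touched below:

use tantivy::tokenizer::*;

// Before this commit:
//     let analyzer = TextAnalyzer::new(SimpleTokenizer)
//         .filter(RemoveLongFilter::new(40))
//         .filter(LowerCaser::new());
//
// After this commit:
let analyzer = analyzer_builder(SimpleTokenizer)
    .filter(RemoveLongFilter::limit(40))
    .filter(LowerCaser::new())
    .build();

// Bare tokenizers can also be registered directly now, via the new
// `impl<T: Tokenizer> From<T> for TextAnalyzer<T>`:
//     index.tokenizers().register("raw", RawTokenizer);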

View File

@@ -5,7 +5,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::{NgramTokenizer, TextAnalyzer};
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
@@ -52,10 +52,9 @@ fn main() -> tantivy::Result<()> {
// here we are registering our custom tokenizer
// this will store tokens of 3 characters each
index.tokenizers().register(
"ngram3",
TextAnalyzer::new(NgramTokenizer::new(3, 3, false)),
);
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false));
// To insert documents we need an index writer.
// There must be only one writer at a time.

View File

@@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> {
// This tokenizer lowercases all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::new(SimpleTokenizer)
let tokenizer = analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.filter(StopWordFilter::new(vec![
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
]))
.build();
index.tokenizers().register("stoppy", tokenizer);

View File

@@ -20,8 +20,8 @@ use crate::reader::IndexReaderBuilder;
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
use crate::tokenizer::TextAnalyzerT;
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::tokenizer::Tokenizer;
use crate::tokenizer::{TextAnalyzer, TextAnalyzerT, TokenizerManager};
use crate::IndexWriter;
use std::collections::HashSet;
use std::fmt;

View File

@@ -13,8 +13,8 @@ use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{DynTokenStreamChain, TextAnalyzerT, TokenStreamChain, Tokenizer};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain, Tokenizer};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT};
use crate::Opstamp;
use crate::{DocId, SegmentComponent};

View File

@@ -50,7 +50,9 @@ pub mod tests {
use crate::schema::{Field, TextOptions};
use crate::schema::{IndexRecordOption, TextFieldIndexing};
use crate::schema::{Schema, Term, INDEXED, TEXT};
use crate::tokenizer::{SimpleTokenizer, TextAnalyzer, MAX_TOKEN_LEN};
use crate::tokenizer::{
analyzer_builder, SimpleTokenizer, TextAnalyzer, TextAnalyzerT, MAX_TOKEN_LEN,
};
use crate::DocId;
use crate::HasLen;
use crate::Score;
@@ -167,7 +169,7 @@ pub mod tests {
let index = Index::create_in_ram(schema.clone());
index
.tokenizers()
.register("simple_no_truncation", TextAnalyzer::new(SimpleTokenizer));
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));

View File

@@ -573,14 +573,13 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
#[cfg(test)]
mod test {
use super::super::logical_ast::*;
use super::QueryParser;
use super::QueryParserError;
use super::*;
use crate::query::Query;
use crate::schema::Field;
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
use crate::tokenizer::{
LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
analyzer_builder, LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer,
};
use crate::Index;
use matches::assert_matches;
@@ -619,9 +618,10 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::new(SimpleTokenizer)
analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.filter(StopWordFilter::new(vec!["the".to_string()])),
.filter(StopWordFilter::remove(vec!["the".to_string()]))
.build(),
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}
@@ -978,7 +978,7 @@ mod test {
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("customtokenizer", TextAnalyzer::new(SimpleTokenizer));
.register("customtokenizer", SimpleTokenizer);
let query_parser = QueryParser::for_index(&index, vec![title]);
assert_eq!(
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),

View File

@@ -1,7 +1,7 @@
use crate::query::Query;
use crate::schema::Field;
use crate::schema::Value;
use crate::tokenizer::{TextAnalyzerT, Token};
use crate::tokenizer::{TextAnalyzerT, Token, Tokenizer};
use crate::Searcher;
use crate::{Document, Score};
use htmlescape::encode_minimal;
@@ -350,8 +350,13 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") => 1.0,
String::from("language") => 0.9
};
let fragments =
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 100);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
100,
);
assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -378,8 +383,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>1.0,
String::from("language") => 0.9
};
let fragments =
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
{
let first = &fragments[0];
assert_eq!(first.score, 1.0);
@@ -393,8 +402,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>0.9,
String::from("language") => 1.0
};
let fragments =
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
//assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -413,7 +426,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0);
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 1);
{
@@ -435,7 +453,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0);
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 2);
{
@@ -458,7 +481,12 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9);
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 7);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
7,
);
assert_eq!(fragments.len(), 2);
{
@@ -480,7 +508,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0);
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 0);
@@ -494,7 +527,12 @@ Survey in 2016, 2017, and 2018."#;
let text = "a b c d";
let terms = BTreeMap::new();
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text);
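
The verbose `Into::<TextAnalyzer<_>>::into(SimpleTokenizer)` turbofish in these tests relies on the `From<T> for TextAnalyzer<T>` impl added later in this commit. The same conversion can be written as a plain binding; a sketch, assuming `search_fragments` accepts any `&impl TextAnalyzerT` as the calls above suggest (binding names are hypothetical):

// Hypothetical rewrite of one call site, not part of the diff:
let analyzer: TextAnalyzer<SimpleTokenizer> = SimpleTokenizer.into();
let fragments = search_fragments(&analyzer, TEST_TEXT, &terms, 100);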

View File

@@ -2,16 +2,16 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(RawTokenizer)
//! .filter(AlphaNumOnlyFilter);
//! let tokenizer = analyzer_builder(RawTokenizer)
//! .filter(AlphaNumOnlyFilter).build();
//!
//! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw filter emits one token that
//! // contains a space
//! assert!(stream.next().is_none());
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter);
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter).build();
//!
//! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some());
@@ -23,14 +23,14 @@ use super::{Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes all tokens containing characters that are
/// not ASCII alphanumeric.
#[derive(Clone)]
#[derive(Clone, Debug, Default)]
pub struct AlphaNumOnlyFilter;
impl TokenFilter for AlphaNumOnlyFilter {
fn transform(&mut self, token: Token) -> Option<Token> {
if token.text.chars().all(|c| c.is_ascii_alphanumeric()) {
return None;
return Some(token);
}
Some(token)
None
}
}
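
This hunk is the bug fix named in the commit message: the old `transform` returned `None` for tokens made entirely of ASCII alphanumeric characters and kept everything else, the opposite of what the filter documents. With the corrected logic the filter behaves as sketched below (using the builder introduced in this commit):

use tantivy::tokenizer::*;

let tokenizer = analyzer_builder(SimpleTokenizer)
    .filter(AlphaNumOnlyFilter)
    .build();

let mut stream = tokenizer.token_stream("café 123");
// "café" contains 'é', which is not ASCII alphanumeric, so it is removed;
// "123" passes through unchanged.
assert_eq!(stream.next().unwrap().text, "123");
assert!(stream.next().is_none());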

View File

@@ -1,10 +1,10 @@
use super::{Token, TokenFilter, TokenStream};
use super::{analyzer_builder, Token, TokenFilter, TokenStream};
use std::mem;
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct AsciiFolding {
buffer: String,
}
@@ -1543,8 +1543,9 @@ mod tests {
}
fn folding_helper(text: &str) -> Vec<String> {
let tokens = TextAnalyzer::new(SimpleTokenizer)
let tokens = analyzer_builder(SimpleTokenizer)
.filter(AsciiFolding::new())
.build()
.token_stream(text)
.map(|token| token.text.clone())
.collect();
@@ -1552,8 +1553,9 @@ mod tests {
}
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
let mut token_stream = TextAnalyzer::new(RawTokenizer)
let mut token_stream = analyzer_builder(RawTokenizer)
.filter(AsciiFolding::new())
.build()
.token_stream(text);
let Token { text, .. } = token_stream.next().unwrap();
text

View File

@@ -9,7 +9,7 @@ use crate::schema::FACET_SEP_BYTE;
/// - `/america/north_america/canada`
/// - `/america/north_america`
/// - `/america`
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct FacetTokenizer;
#[derive(Clone, Debug)]
@@ -40,13 +40,13 @@ impl Tokenizer for FacetTokenizer {
impl Iterator for FacetTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
match self.state {
self.state = match self.state {
State::RootFacetNotEmitted => {
self.state = if self.text.is_empty() {
if self.text.is_empty() {
State::Terminated
} else {
State::UpToPosition(0)
};
}
}
State::UpToPosition(cursor) => {
if let Some(next_sep_pos) = self.text.as_bytes()[cursor + 1..]
@@ -56,11 +56,11 @@ impl Iterator for FacetTokenStream {
{
let facet_part = &self.text[cursor..next_sep_pos];
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
State::UpToPosition(next_sep_pos)
} else {
let facet_part = &self.text[cursor..];
self.token.text.push_str(facet_part);
self.state = State::Terminated;
State::Terminated
}
}
State::Terminated => return None,
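
The change in this file is purely structural: instead of assigning `self.state` inside each match arm, every arm now evaluates to the next state and a single outer `self.state = match self.state { ... }` performs the assignment. A minimal sketch of the idiom (standalone, not tantivy code):

#[derive(Clone, Copy)]
enum State {
    Start,
    Running(usize),
    Done,
}

struct Machine {
    state: State,
}

impl Machine {
    fn step(&mut self) {
        // Each arm returns the successor state; the assignment happens once.
        self.state = match self.state {
            State::Start => State::Running(0),
            State::Running(n) if n < 3 => State::Running(n + 1),
            State::Running(_) | State::Done => State::Done,
        };
    }
}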

View File

@@ -1,4 +1,4 @@
use super::{Token, TokenFilter};
use super::{analyzer_builder, TextAnalyzerT, Token, TokenFilter};
use std::mem;
impl TokenFilter for LowerCaser {
@@ -15,7 +15,7 @@ impl TokenFilter for LowerCaser {
}
/// Token filter that lowercase terms.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct LowerCaser {
buffer: String,
}
@@ -46,15 +46,13 @@ mod tests {
#[test]
fn test_to_lower_case() {
assert_eq!(
lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]
);
assert_eq!(lowercase_helper("Русский текст"), vec!["русский", "текст"]);
}
fn lowercase_helper(text: &str) -> Vec<String> {
TextAnalyzer::new(SimpleTokenizer)
analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.build()
.token_stream(text)
.map(|token| {
let Token { text, .. } = token;
@@ -65,7 +63,7 @@ mod tests {
#[test]
fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
assert_eq!(lowercase_helper("Tree"), vec!["tree"]);
assert_eq!(lowercase_helper("Русский"), vec!["русский"]);
}
}

View File

@@ -64,10 +64,10 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
//! let en_stem = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser)
//! .filter(Stemmer::new(Language::English));
//! .filter(LowerCaser::new())
//! .filter(Stemmer::new(Language::English)).build();
//! ```
//!
//! Once your tokenizer is defined, you need to
@@ -109,9 +109,9 @@
//! let index = Index::create_in_ram(schema);
//!
//! // We need to register our tokenizer :
//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! let custom_en_tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser);
//! .filter(LowerCaser::new()).build();
//! index
//! .tokenizers()
//! .register("custom_en", custom_en_tokenizer);
@@ -146,7 +146,8 @@ pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain}
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream, Tokenizer,
analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream,
Tokenizer,
};
pub use self::tokenizer_manager::TokenizerManager;
@@ -215,10 +216,11 @@ pub mod tests {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::new(SimpleTokenizer)
.filter(RemoveLongFilter::new(40))
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser::new())
.filter(Stemmer::new(Language::Greek)),
.filter(Stemmer::new(Language::Greek))
.build(),
);
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
let tokens: Vec<Token> = en_tokenizer

View File

@@ -78,7 +78,7 @@ use super::{Token, TokenStream, Tokenizer};
/// }
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,

View File

@@ -1,13 +1,12 @@
use super::{Token, TokenStream, Tokenizer};
/// For each value of the field, emit a single unprocessed token.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct RawTokenizer;
#[derive(Clone, Debug)]
pub struct RawTokenStream {
token: Token,
has_token: bool,
token: Option<Token>,
}
impl Tokenizer for RawTokenizer {
@@ -21,8 +20,7 @@ impl Tokenizer for RawTokenizer {
position_length: 1,
};
RawTokenStream {
token,
has_token: true,
token: Some(token),
}
}
}
@@ -30,12 +28,7 @@ impl Tokenizer for RawTokenizer {
impl Iterator for RawTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Token> {
if self.has_token {
self.has_token = false;
Some(self.token.clone())
} else {
None
}
self.token.take()
}
}

View File

@@ -2,8 +2,8 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5));
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5)).build();
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered
@@ -26,7 +26,7 @@ pub struct RemoveLongFilter {
impl RemoveLongFilter {
/// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
pub fn new(limit: usize) -> RemoveLongFilter {
pub fn limit(limit: usize) -> RemoveLongFilter {
RemoveLongFilter { limit }
}
}

View File

@@ -1,37 +1,36 @@
use super::{Token, TokenStream, Tokenizer};
use std::str::CharIndices;
impl TokenStream for SimpleTokenizerStream {}
/// Tokenize the text by splitting on whitespace and punctuation.
#[derive(Clone, Debug)]
pub struct SimpleTokenizer;
impl Tokenizer for SimpleTokenizer {
type Iter = SimpleTokenizerStream;
fn token_stream(&self, text: &str) -> Self::Iter {
let vec: Vec<_> = text.char_indices().collect();
SimpleTokenizerStream {
text: text.to_string(),
chars: vec.into_iter(),
position: usize::max_value(),
}
}
}
#[derive(Clone, Debug)]
pub struct SimpleTokenizerStream {
text: String,
idx: usize,
chars: Vec<(usize, char)>,
token: Token,
}
impl Tokenizer for SimpleTokenizer {
type Iter = SimpleTokenizerStream;
fn token_stream(&self, text: &str) -> Self::Iter {
SimpleTokenizerStream {
text: text.to_string(),
chars: text.char_indices().collect(),
idx: 0,
token: Token::default(),
}
}
chars: std::vec::IntoIter<(usize, char)>,
position: usize,
}
impl SimpleTokenizerStream {
// search for the end of the current token.
fn search_token_end(&mut self) -> usize {
(&mut self.chars)
.iter()
.filter(|&&(_, ref c)| !c.is_alphanumeric())
.map(|(offset, _)| *offset)
.filter(|&(_, c)| !c.is_alphanumeric())
.map(|(offset, _)| offset)
.next()
.unwrap_or_else(|| self.text.len())
}
@@ -40,37 +39,39 @@ impl SimpleTokenizerStream {
impl Iterator for SimpleTokenizerStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
if self.idx >= self.chars.len() {
return None;
}
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
while self.idx < self.chars.len() {
let (offset_from, c) = self.chars[self.idx];
self.position = self.position.wrapping_add(1);
while let Some((offset_from, c)) = self.chars.next() {
if c.is_alphanumeric() {
let offset_to = self.search_token_end();
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.push_str(&self.text[offset_from..offset_to]);
return Some(self.token.clone());
let token = Token {
text: self.text[offset_from..offset_to].into(),
offset_from,
offset_to,
position: self.position,
..Default::default()
};
return Some(token);
}
self.idx += 1;
}
None
}
}
impl TokenStream for SimpleTokenizerStream {}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty() {
let mut empty = SimpleTokenizer.token_stream("");
assert_eq!(empty.next(), None);
}
#[test]
fn simple_tokenizer() {
let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
dbg!(stream.next());
dbg!(stream.next());
dbg!(stream.next());
let mut simple = SimpleTokenizer.token_stream("tokenizer hello world");
assert_eq!(simple.next().unwrap().text, "tokenizer");
assert_eq!(simple.next().unwrap().text, "hello");
assert_eq!(simple.next().unwrap().text, "world");
}
}
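
After the rewrite, `SimpleTokenizerStream` is a plain `Iterator<Item = Token>` that builds each `Token` from scratch instead of mutating and cloning a cached one. A sketch of the emitted tokens, with offsets and positions inferred from the code above:

use tantivy::tokenizer::{SimpleTokenizer, Tokenizer};

let mut stream = SimpleTokenizer.token_stream("hello world");

let first = stream.next().unwrap();
assert_eq!(first.text, "hello");
assert_eq!((first.offset_from, first.offset_to), (0, 5));
assert_eq!(first.position, 0); // position starts at usize::MAX and wraps to 0

let second = stream.next().unwrap();
assert_eq!(second.text, "world");
assert_eq!((second.offset_from, second.offset_to), (6, 11));
assert_eq!(second.position, 1);

assert!(stream.next().is_none());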

View File

@@ -2,8 +2,8 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])).build();
//!
//! let mut stream = tokenizer.token_stream("the fox is crafty");
//! assert_eq!(stream.next().unwrap().text, "fox");
@@ -27,7 +27,7 @@ pub struct StopWordFilter {
impl StopWordFilter {
/// Creates a `StopWordFilter` given a list of words to remove
pub fn new(words: Vec<String>) -> StopWordFilter {
pub fn remove(words: Vec<String>) -> StopWordFilter {
let mut set = StopWordHashSet::default();
for word in words {
@@ -44,7 +44,7 @@ impl StopWordFilter {
"there", "these", "they", "this", "to", "was", "will", "with",
];
StopWordFilter::new(words.iter().map(|&s| s.to_string()).collect())
StopWordFilter::remove(words.iter().map(|&s| s.to_string()).collect())
}
}

View File

@@ -2,31 +2,64 @@ use crate::tokenizer::{Token, TokenStream, Tokenizer};
const POSITION_GAP: usize = 2;
pub(crate) struct TokenStreamChain<I> {
streams_with_offsets: I,
token: Token,
pub(crate) struct TokenStreamChain<Inner, Outer> {
streams_with_offsets: Outer,
current: Option<(Inner, usize)>,
position: usize,
position_shift: usize,
}
impl<'a, Out> TokenStreamChain<Out> {
pub fn new<In>(streams_with_offsets: Out) -> TokenStreamChain<Out>
where
In: Iterator<Item = Token>,
Out: Iterator<Item = (In, usize)>,
{
impl<'a, Inner, Outer> TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
Outer: Iterator<Item = (Inner, usize)>,
{
pub fn new(mut streams_with_offsets: Outer) -> TokenStreamChain<Inner, Outer> {
let current = streams_with_offsets.next();
TokenStreamChain {
streams_with_offsets,
token: Token::default(),
streams_with_offsets,
current,
position: usize::max_value(),
position_shift: 0,
}
}
}
impl<'a, Inner, Outer: Iterator<Item = (Inner, usize)>> TokenStream
for TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
{
}
impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
Outer: Iterator<Item = (Inner, usize)>,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some((ref mut token_stream, offset_offset)) = self.current {
if let Some(mut token) = token_stream.next() {
token.offset_from += offset_offset;
token.offset_to += offset_offset;
token.position += self.position_shift;
self.position = token.position;
return Some(token);
}
self.position_shift = self.position.wrapping_add(POSITION_GAP);
self.current = self.streams_with_offsets.next();
}
None
}
}
impl DynTokenStreamChain {
pub fn from_vec(streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>) -> impl TokenStream {
DynTokenStreamChain {
streams_with_offsets,
idx: 0,
token: Token::default(),
position: usize::max_value(),
position_shift: 0,
}
}
@@ -35,7 +68,7 @@ impl DynTokenStreamChain {
pub(crate) struct DynTokenStreamChain {
streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>,
idx: usize,
token: Token,
position: usize,
position_shift: usize,
}
@@ -44,48 +77,17 @@ impl<'a> TokenStream for DynTokenStreamChain {}
impl Iterator for DynTokenStreamChain {
type Item = Token;
fn next(&mut self) -> Option<Token> {
if self.idx >= self.streams_with_offsets.len() {
return None;
};
while self.idx < self.streams_with_offsets.len() {
let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.idx];
if let Some(token) = token_stream.next() {
self.token = token;
self.token.offset_from += offset_offset;
self.token.offset_to += offset_offset;
self.token.position += self.position_shift;
return Some(self.token.clone());
} else {
self.idx += 1;
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
}
}
None
}
}
impl<'a, In, Out: Iterator<Item = (In, usize)>> TokenStream for TokenStreamChain<Out> where
In: Iterator<Item = Token>
{
}
impl<'a, In, Out> Iterator for TokenStreamChain<Out>
where
In: Iterator<Item = Token>,
Out: Iterator<Item = (In, usize)>,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some((ref mut token_stream, offset_offset)) = self.streams_with_offsets.next() {
if let Some(token) = token_stream.next() {
self.token = token;
self.token.offset_from += offset_offset;
self.token.offset_to += offset_offset;
self.token.position += self.position_shift;
return Some(self.token.clone());
} else {
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
while let Some((token_stream, offset_offset)) = self.streams_with_offsets.get_mut(self.idx)
{
if let Some(mut token) = token_stream.next() {
token.offset_from += *offset_offset;
token.offset_to += *offset_offset;
token.position += self.position_shift;
self.position = token.position;
return Some(token);
}
self.idx += 1;
self.position_shift = self.position.wrapping_add(POSITION_GAP);
}
None
}
@@ -103,11 +105,16 @@ mod tests {
(SimpleTokenizer.token_stream("hello world"), 0),
];
let mut token_chain = TokenStreamChain::new(token_streams.into_iter());
let token = token_chain.next().unwrap();
assert_eq!(token.text, "hello");
assert_eq!(token.offset_from, 0);
assert_eq!(token.offset_to, 5);
assert_eq!(token.position, POSITION_GAP - 1);
let token = token_chain.next();
let expect = Token {
offset_from: 0,
offset_to: 5,
position: POSITION_GAP - 1,
text: "hello".into(),
..Token::default()
};
assert_eq!(token.unwrap(), expect);
let token = token_chain.next().unwrap();
assert_eq!(token.text, "world");
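
To make the position bookkeeping concrete: positions keep counting across chained streams with an artificial gap of `POSITION_GAP` (2) between them, and offsets are shifted by each stream's offset. A sketch as it might sit next to the test above, hedged because `TokenStreamChain` is crate-private and the full test fixture is not shown in this hunk:

let token_streams = vec![
    (SimpleTokenizer.token_stream("hello world"), 0),
    (SimpleTokenizer.token_stream("fox"), 12),
];
let mut chain = TokenStreamChain::new(token_streams.into_iter());

assert_eq!(chain.next().unwrap().position, 0); // "hello"
assert_eq!(chain.next().unwrap().position, 1); // "world"

let fox = chain.next().unwrap();
assert_eq!(fox.text, "fox");
assert_eq!(fox.offset_from, 12);            // shifted by the per-stream offset
assert_eq!(fox.position, 1 + POSITION_GAP); // gap of 2 between the two streams
assert!(chain.next().is_none());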

View File

@@ -97,8 +97,12 @@ mod tests {
],
};
let token_stream: Vec<_> = PreTokenizedStream::from(tok_text.clone()).collect();
assert_eq!(token_stream, tok_text.tokens);
let mut token_stream = PreTokenizedStream::from(tok_text.clone());
for expected_token in tok_text.tokens {
assert_eq!(token_stream.next().unwrap(), expected_token);
}
assert!(token_stream.next().is_none());
}
#[test]
@@ -125,7 +129,7 @@ mod tests {
let chain_parts = vec![&tok_text, &tok_text];
let token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
let expected_tokens = vec![
Token {
@@ -157,6 +161,10 @@ mod tests {
position_length: 1,
},
];
assert_eq!(token_stream.collect::<Vec<_>>(), expected_tokens);
for expected_token in expected_tokens {
assert_eq!(token_stream.next().unwrap(), expected_token);
}
assert!(token_stream.next().is_none());
}
}

View File

@@ -36,16 +36,101 @@ impl Default for Token {
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It wraps a `Tokenizer` together with the chain of `TokenFilter`s that are applied sequentially.
pub struct TextAnalyzer<T> {
tokenizer: T,
filters: Vec<Box<dyn TokenFilter>>,
#[derive(Clone, Debug, Default)]
pub struct TextAnalyzer<T>(T);
/// Identity `TokenFilter`
#[derive(Clone, Debug, Default)]
pub struct Identity;
impl TokenFilter for Identity {
fn transform(&mut self, token: Token) -> Option<Token> {
Some(token)
}
}
/// Top-level trait for hiding the types contained in it.
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
/// Top-level method that calls the corresponding `token_stream` on the
/// contained type.
fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
#[derive(Clone, Debug, Default)]
pub struct AnalyzerBuilder<T, F> {
tokenizer: T,
f: F,
}
/// Construct an `AnalyzerBuilder` on which to apply `TokenFilter`.
pub fn analyzer_builder<T: Tokenizer>(tokenizer: T) -> AnalyzerBuilder<T, Identity> {
AnalyzerBuilder {
tokenizer,
f: Identity,
}
}
impl<T, F> AnalyzerBuilder<T, F>
where
T: Tokenizer,
F: TokenFilter,
{
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current builder and returns a new one that
/// wraps it together with the given filter.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = analyzer_builder(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser::new())
/// .filter(Stemmer::default()).build();
/// ```
///
pub fn filter<G: TokenFilter>(self, f: G) -> AnalyzerBuilder<AnalyzerBuilder<T, F>, G> {
AnalyzerBuilder { tokenizer: self, f }
}
/// Finalize the build process.
pub fn build(self) -> TextAnalyzer<AnalyzerBuilder<T, F>> {
TextAnalyzer(self)
}
}
impl<T: Tokenizer, F: TokenFilter> Tokenizer for AnalyzerBuilder<T, F> {
type Iter = Filter<T::Iter, F>;
fn token_stream(&self, text: &str) -> Self::Iter {
Filter {
iter: self.tokenizer.token_stream(text),
f: self.f.clone(),
}
}
}
/// `Filter` is a wrapper around a `TokenStream` and a `TokenFilter` which modifies the `TokenStream`.
#[derive(Clone, Default, Debug)]
pub struct Filter<I, F> {
iter: I,
f: F,
}
impl<I, F> Iterator for Filter<I, F>
where
I: TokenStream,
F: TokenFilter,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some(token) = self.iter.next() {
if let Some(tok) = self.f.transform(token) {
return Some(tok);
}
}
None
}
}
impl<I, F> TokenStream for Filter<I, F>
where
I: TokenStream,
F: TokenFilter,
{
}
pub trait TextAnalyzerClone {
@@ -58,112 +143,25 @@ impl Clone for Box<dyn TextAnalyzerT> {
}
}
impl Clone for Box<dyn TokenFilter> {
fn clone(&self) -> Self {
(**self).box_clone()
}
}
impl<T: Clone + Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
impl<T: Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
fn box_clone(&self) -> Box<dyn TextAnalyzerT> {
Box::new(TextAnalyzer {
tokenizer: self.tokenizer.clone(),
filters: self.filters.clone(),
})
Box::new(TextAnalyzer(self.0.clone()))
}
}
impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
fn token_stream(&self, text: &str) -> Box<dyn TokenStream> {
let tokens = self.tokenizer.token_stream(text);
Box::new(TextIter {
tokens,
// TODO: remove clone
filters: self.filters.clone(),
})
Box::new(self.0.token_stream(text))
}
}
impl<T> TextAnalyzer<T>
where
T: Tokenizer,
{
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
pub fn new(tokenizer: T) -> TextAnalyzer<T> {
TextAnalyzer {
tokenizer,
filters: vec![],
}
}
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `TokenStream` and returns a
/// new one.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser)
/// .filter(Stemmer::default());
/// ```
///
pub fn filter<F: TokenFilter>(mut self, token_filter: F) -> Self {
self.filters.push(Box::new(token_filter));
self
}
/// Tokenize an array`&str`
///
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent accidental `PhraseQuery` to match accross two terms.
/// Creates a token stream for a given `str`.
pub fn token_stream(&self, text: &str) -> TextIter<T::Iter> {
let tokens = self.tokenizer.token_stream(text);
TextIter {
tokens,
// TODO: remove clone
filters: self.filters.clone(),
}
}
/// 'Top-level' trait hiding concrete types, below which static dispatch occurs.
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
/// 'Top-level' dynamic dispatch function hiding concrete types of the statically
/// dispatched `token_stream` from the `Tokenizer` trait.
fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
}
pub struct TextIter<I> {
tokens: I,
filters: Vec<Box<dyn TokenFilter>>,
}
impl<I> Iterator for TextIter<I>
where
I: Iterator<Item = Token>,
{
type Item = I::Item;
fn next(&mut self) -> Option<Self::Item> {
'outer: while let Some(mut token) = self.tokens.next() {
for filter in self.filters.iter_mut() {
if let Some(tok) = filter.transform(token) {
token = tok;
continue;
};
continue 'outer;
}
return Some(token);
}
None
}
}
impl<I: Iterator<Item = Token>> TokenStream for TextIter<I> {}
/// `Tokenizer` are in charge of splitting text into a stream of token
/// before indexing.
///
@@ -193,22 +191,12 @@ pub trait Tokenizer: 'static + Send + Sync + Clone {
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// Take a `Token` and transform it or return `None` if it's to be removed
/// from the output stream.
fn transform(&mut self, token: Token) -> Option<Token>;
}
pub trait TokenFilterClone {
fn box_clone(&self) -> Box<dyn TokenFilter>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> Box<dyn TokenFilter> {
Box::new(self.clone())
}
}
/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
@@ -218,9 +206,9 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// let tokenizer = analyzer_builder(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// .filter(LowerCaser::new()).build();
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
@@ -239,6 +227,12 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
/// ```
pub trait TokenStream: Iterator<Item = Token> {}
impl<T: Tokenizer> From<T> for TextAnalyzer<T> {
fn from(src: T) -> TextAnalyzer<T> {
TextAnalyzer(src)
}
}
#[cfg(test)]
mod test {
use super::*;
@@ -263,7 +257,7 @@ mod test {
#[test]
fn text_analyzer() {
let mut stream = TextAnalyzer::new(SimpleTokenizer).token_stream("tokenizer hello world");
let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
assert_eq!(stream.next().unwrap().text, "tokenizer");
assert_eq!(stream.next().unwrap().text, "hello");
assert_eq!(stream.next().unwrap().text, "world");
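
The net design of this file: `analyzer_builder` composes filters by nesting types, so dispatch inside the chain is static, while the `TextAnalyzerT` impl on `TextAnalyzer<T>` erases the whole chain behind a boxed trait object for storage in `TokenizerManager`. A sketch combining the two layers, using only items from the `pub use` list in this commit:

use tantivy::tokenizer::*;

// Statically dispatched: the filter chain is encoded in the type, roughly
// `TextAnalyzer<AnalyzerBuilder<AnalyzerBuilder<SimpleTokenizer, Identity>, LowerCaser>>`.
let analyzer = analyzer_builder(SimpleTokenizer)
    .filter(LowerCaser::new())
    .build();

// Dynamically dispatched: the concrete type is erased behind the object-safe
// `TextAnalyzerT`, which is what `TokenizerManager` stores internally.
let erased: Box<dyn TextAnalyzerT> = Box::new(analyzer);
let mut stream = erased.token_stream("Hello, Happy Tax Payer");
assert_eq!(stream.next().unwrap().text, "hello");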

View File

@@ -1,5 +1,5 @@
use crate::tokenizer::stemmer::Language;
use crate::tokenizer::tokenizer::{TextAnalyzer, TextAnalyzerT, Tokenizer};
use crate::tokenizer::tokenizer::{analyzer_builder, TextAnalyzer, TextAnalyzerT, Tokenizer};
use crate::tokenizer::LowerCaser;
use crate::tokenizer::RawTokenizer;
use crate::tokenizer::RemoveLongFilter;
@@ -27,14 +27,14 @@ pub struct TokenizerManager {
impl TokenizerManager {
/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
pub fn register<U: Tokenizer, T>(&self, tokenizer_name: &str, tokenizer: T)
where
T: TextAnalyzerT,
T: Into<TextAnalyzer<U>>,
{
self.tokenizers
.write()
.expect("Acquiring the lock should never fail")
.insert(tokenizer_name.to_string(), Box::new(tokenizer));
.insert(tokenizer_name.to_string(), Box::new(tokenizer.into()));
}
/// Accessing a tokenizer given its name.
@@ -57,19 +57,21 @@ impl Default for TokenizerManager {
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
};
manager.register("raw", TextAnalyzer::new(RawTokenizer));
manager.register("raw", RawTokenizer);
manager.register(
"default",
TextAnalyzer::new(SimpleTokenizer)
.filter(RemoveLongFilter::new(40))
.filter(LowerCaser::new()),
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser::new())
.build(),
);
manager.register(
"en_stem",
TextAnalyzer::new(SimpleTokenizer)
.filter(RemoveLongFilter::new(40))
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser::new())
.filter(Stemmer::new(Language::English)),
.filter(Stemmer::new(Language::English))
.build(),
);
manager
}
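
Taken together, the new `register` bound (`T: Into<TextAnalyzer<U>>`) accepts both bare tokenizers and built analyzers, and `get` hands back the type-erased analyzer. A hedged end-to-end sketch, assuming `get` still returns an `Option` of the boxed analyzer as the `el_stem` test earlier in this diff suggests (the registered names here are made up):

use tantivy::tokenizer::*;

let manager = TokenizerManager::default();

// A bare tokenizer goes through the new `From<T> for TextAnalyzer<T>`.
manager.register("raw_copy", RawTokenizer);

// A built analyzer is already a `TextAnalyzer<_>` and is accepted as-is.
manager.register(
    "lower_only",
    analyzer_builder(SimpleTokenizer)
        .filter(LowerCaser::new())
        .build(),
);

let analyzer = manager.get("lower_only").unwrap();
let mut stream = analyzer.token_stream("Hello World");
assert_eq!(stream.next().unwrap().text, "hello");
assert_eq!(stream.next().unwrap().text, "world");
assert!(stream.next().is_none());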