Compare commits

...

22 Commits

Author SHA1 Message Date
Paul Masurel
a106e71b9b Adapted benchmark to the analyzer's Iterator<Item=Token> form 2021-01-06 22:26:58 +09:00
Paul Masurel
ee8e61a062 Wrapping stemmer into an Arc 2021-01-06 20:20:08 +09:00
dcraven
da023f33c0 Fix imports 2021-01-06 19:20:21 +09:00
dcraven
de5a8bfab3 Remove unused imports. 2021-01-06 19:20:21 +09:00
dcraven
0356b7d779 Remove patched rust-stemmers from Cargo.toml 2021-01-06 19:20:21 +09:00
dcraven
50812d0081 Remove TokenStream trait. 2021-01-06 19:20:21 +09:00
dcraven
4a68c8a712 Reorder for more linear reading. 2021-01-06 19:20:21 +09:00
dcraven
ca6fd5effc Fix bug. Cleanup some rough spots. Renamed functions. Fixed tests and docs. 2021-01-06 19:20:21 +09:00
dcraven
4e6b341422 Tests compile. 2021-01-06 19:20:21 +09:00
dcraven
a56330b234 Fix generic types in TokenChainIterator. Fix filter implementations. 2021-01-06 19:20:21 +09:00
dcraven
6af6c11ec2 Formulate more as iterators. 2021-01-06 19:20:21 +09:00
dcraven
9633d2e657 Small changes. 2021-01-06 19:20:21 +09:00
dcraven
39e8739ea5 Reformulate as Iterators, Checkpoint 2. Finished, now bubble up changes. 2021-01-06 19:20:21 +09:00
dcraven
801c82a5e1 Formulate as Iterators, Checkpoint 1. 2021-01-06 19:20:21 +09:00
dcraven
ccd0f3ccc9 Checkpoint converting to Iterators and static dispatch. 2021-01-06 19:20:21 +09:00
dcraven
f1973759ef Remove forgotten code. 2021-01-06 19:20:21 +09:00
dcraven
2bd1d8230c Remove unnecessary lifetime. 2021-01-06 19:20:21 +09:00
dcraven
c7407cc2a7 Simplify control flow. 2021-01-06 19:20:21 +09:00
dcraven
0150b406c5 Remove BoxTokenFilter. 2021-01-06 19:20:21 +09:00
dcraven
f73209a868 Reduced number of allocations. 2021-01-06 19:20:21 +09:00
dcraven
09375decc2 Removed unnecessary lifetimes. 2021-01-06 19:20:21 +09:00
dcraven
b8c7f1fe9c Removed unnecessary trait impls 2021-01-06 19:20:21 +09:00
29 changed files with 738 additions and 1015 deletions

View File

@@ -10,7 +10,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
b.iter(|| { b.iter(|| {
let mut word_count = 0; let mut word_count = 0;
let mut token_stream = tokenizer.token_stream(ALICE_TXT); let mut token_stream = tokenizer.token_stream(ALICE_TXT);
while token_stream.advance() { for token in token_stream {
word_count += 1; word_count += 1;
} }
assert_eq!(word_count, 30_731); assert_eq!(word_count, 30_731);

View File

@@ -17,12 +17,7 @@ use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir; use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> { fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut token_stream = SimpleTokenizer.token_stream(text); SimpleTokenizer.token_stream(text).collect()
let mut tokens = vec![];
while token_stream.advance() {
tokens.push(token_stream.token().clone());
}
tokens
} }
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs { for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?; let doc = searcher.doc(doc_address)?;

View File

@@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> {
// This tokenizer lowers all of the text (to help with stop word matching) // This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus // then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::from(SimpleTokenizer) let tokenizer = analyzer_builder(SimpleTokenizer)
.filter(LowerCaser) .filter(LowerCaser::new())
.filter(StopWordFilter::remove(vec![ .filter(StopWordFilter::remove(vec![
"the".to_string(), "the".to_string(),
"and".to_string(), "and".to_string(),
])); ]))
.build();
index.tokenizers().register("stoppy", tokenizer); index.tokenizers().register("stoppy", tokenizer);

View File

@@ -20,7 +20,7 @@ use crate::reader::IndexReaderBuilder;
use crate::schema::Field; use crate::schema::Field;
use crate::schema::FieldType; use crate::schema::FieldType;
use crate::schema::Schema; use crate::schema::Schema;
use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::tokenizer::{TextAnalyzerT, TokenizerManager};
use crate::IndexWriter; use crate::IndexWriter;
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt; use std::fmt;
@@ -119,13 +119,12 @@ impl Index {
return Index::create(dir, schema); return Index::create(dir, schema);
} }
let index = Index::open(dir)?; let index = Index::open(dir)?;
if index.schema() == schema { if index.schema() != schema {
Ok(index) return Err(TantivyError::SchemaError(
} else {
Err(TantivyError::SchemaError(
"An index exists but the schema does not match.".to_string(), "An index exists but the schema does not match.".to_string(),
)) ));
} }
Ok(index)
} }
/// Creates a new index in a temp directory. /// Creates a new index in a temp directory.
@@ -181,11 +180,11 @@ impl Index {
} }
/// Helper to access the tokenizer associated to a specific field. /// Helper to access the tokenizer associated to a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> { pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<Box<dyn TextAnalyzerT>> {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type(); let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers(); let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type { let tokenizer_name_opt: Option<Box<dyn TextAnalyzerT>> = match field_type {
FieldType::Str(text_options) => text_options FieldType::Str(text_options) => text_options
.get_indexing_options() .get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())

View File

@@ -310,7 +310,7 @@ impl SegmentReader {
} }
/// Returns an iterator that will iterate over the alive document ids /// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a { pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc)) (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
} }

View File

@@ -10,10 +10,9 @@ use crate::schema::FieldType;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::Term; use crate::schema::Term;
use crate::schema::Value; use crate::schema::Value;
use crate::schema::{Field, FieldEntry}; use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream}; use crate::tokenizer::{DynTokenStreamChain, Tokenizer};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{FacetTokenizer, TextAnalyzerT, Token};
use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp; use crate::Opstamp;
use crate::{DocId, SegmentComponent}; use crate::{DocId, SegmentComponent};
@@ -23,7 +22,7 @@ use crate::{DocId, SegmentComponent};
fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> { fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
let table_memory_upper_bound = per_thread_memory_budget / 3; let table_memory_upper_bound = per_thread_memory_budget / 3;
if let Some(limit) = (10..) if let Some(limit) = (10..)
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound) .take_while(|&num_bits| compute_table_size(num_bits) < table_memory_upper_bound)
.last() .last()
{ {
Ok(limit.min(19)) // we cap it at 2^19 = 512K. Ok(limit.min(19)) // we cap it at 2^19 = 512K.
@@ -45,7 +44,8 @@ pub struct SegmentWriter {
fast_field_writers: FastFieldsWriter, fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter, fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>, doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>, // TODO: change type
tokenizers: Vec<Option<Box<dyn TextAnalyzerT>>>,
term_buffer: Term, term_buffer: Term,
} }
@@ -70,17 +70,17 @@ impl SegmentWriter {
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits); let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers = schema let tokenizers = schema
.fields() .fields()
.map( .map(|(_, field_entry)| match field_entry.field_type() {
|(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() { FieldType::Str(text_options) => {
FieldType::Str(ref text_options) => text_options text_options
.get_indexing_options() .get_indexing_options()
.and_then(|text_index_option| { .and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer(); let tokenizer_name = &text_index_option.tokenizer();
tokenizer_manager.get(tokenizer_name) tokenizer_manager.get(tokenizer_name)
}), })
_ => None, }
}, _ => None,
) })
.collect(); .collect();
Ok(SegmentWriter { Ok(SegmentWriter {
max_doc: 0, max_doc: 0,
@@ -141,13 +141,13 @@ impl SegmentWriter {
} }
let (term_buffer, multifield_postings) = let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings); (&mut self.term_buffer, &mut self.multifield_postings);
match *field_entry.field_type() { match field_entry.field_type() {
FieldType::HierarchicalFacet => { FieldType::HierarchicalFacet => {
term_buffer.set_field(field); term_buffer.set_field(field);
let facets = let facets =
field_values field_values
.iter() .iter()
.flat_map(|field_value| match *field_value.value() { .flat_map(|field_value| match field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()), Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => { _ => {
panic!("Expected hierarchical facet"); panic!("Expected hierarchical facet");
@@ -157,12 +157,13 @@ impl SegmentWriter {
let mut unordered_term_id_opt = None; let mut unordered_term_id_opt = None;
FacetTokenizer FacetTokenizer
.token_stream(facet_str) .token_stream(facet_str)
.process(&mut |token| { .map(|token| {
term_buffer.set_text(&token.text); term_buffer.set_text(&token.text);
let unordered_term_id = let unordered_term_id =
multifield_postings.subscribe(doc_id, &term_buffer); multifield_postings.subscribe(doc_id, &term_buffer);
unordered_term_id_opt = Some(unordered_term_id); unordered_term_id_opt = Some(unordered_term_id);
}); })
.count();
if let Some(unordered_term_id) = unordered_term_id_opt { if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers self.fast_field_writers
.get_multivalue_writer(field) .get_multivalue_writer(field)
@@ -172,37 +173,38 @@ impl SegmentWriter {
} }
} }
FieldType::Str(_) => { FieldType::Str(_) => {
let mut token_streams: Vec<BoxTokenStream> = vec![]; let mut streams_with_offsets = vec![];
let mut offsets = vec![];
let mut total_offset = 0; let mut total_offset = 0;
for field_value in field_values { for field_value in field_values {
match field_value.value() { match field_value.value() {
Value::PreTokStr(tok_str) => { Value::PreTokStr(tok_str) => {
offsets.push(total_offset); streams_with_offsets.push((
Box::new(PreTokenizedStream::from(tok_str.clone()))
as Box<dyn Iterator<Item = Token>>,
total_offset,
));
if let Some(last_token) = tok_str.tokens.last() { if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to; total_offset += last_token.offset_to;
} }
token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into());
} }
Value::Str(ref text) => { Value::Str(text) => {
if let Some(ref mut tokenizer) = if let Some(ref mut tokenizer) =
self.tokenizers[field.field_id() as usize] self.tokenizers[field.field_id() as usize]
{ {
offsets.push(total_offset); streams_with_offsets
.push((tokenizer.token_stream(text), total_offset));
total_offset += text.len(); total_offset += text.len();
token_streams.push(tokenizer.token_stream(text));
} }
} }
_ => (), _ => (),
} }
} }
let num_tokens = if token_streams.is_empty() { let num_tokens = if streams_with_offsets.is_empty() {
0 0
} else { } else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams); let mut token_stream = DynTokenStreamChain::from_vec(streams_with_offsets);
multifield_postings.index_text( multifield_postings.index_text(
doc_id, doc_id,
field, field,
@@ -213,71 +215,62 @@ impl SegmentWriter {
self.fieldnorms_writer.record(doc_id, field, num_tokens); self.fieldnorms_writer.record(doc_id, field, num_tokens);
} }
FieldType::U64(ref int_option) => { FieldType::U64(int_option) if int_option.is_indexed() => {
if int_option.is_indexed() { for field_value in field_values {
for field_value in field_values { term_buffer.set_field(field_value.field());
term_buffer.set_field(field_value.field()); let u64_val = field_value
let u64_val = field_value .value()
.value() .u64_value()
.u64_value() .ok_or_else(make_schema_error)?;
.ok_or_else(make_schema_error)?; term_buffer.set_u64(u64_val);
term_buffer.set_u64(u64_val); multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, &term_buffer);
}
} }
} }
FieldType::Date(ref int_option) => { FieldType::Date(int_option) if int_option.is_indexed() => {
if int_option.is_indexed() { for field_value in field_values {
for field_value in field_values { term_buffer.set_field(field_value.field());
term_buffer.set_field(field_value.field()); let date_val = field_value
let date_val = field_value .value()
.value() .date_value()
.date_value() .ok_or_else(make_schema_error)?;
.ok_or_else(make_schema_error)?; term_buffer.set_i64(date_val.timestamp());
term_buffer.set_i64(date_val.timestamp()); multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, &term_buffer);
}
} }
} }
FieldType::I64(ref int_option) => { FieldType::I64(int_option) if int_option.is_indexed() => {
if int_option.is_indexed() { for field_value in field_values {
for field_value in field_values { term_buffer.set_field(field_value.field());
term_buffer.set_field(field_value.field()); let i64_val = field_value
let i64_val = field_value .value()
.value() .i64_value()
.i64_value() .ok_or_else(make_schema_error)?;
.ok_or_else(make_schema_error)?; term_buffer.set_i64(i64_val);
term_buffer.set_i64(i64_val); multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, &term_buffer);
}
} }
} }
FieldType::F64(ref int_option) => { FieldType::F64(int_option) if int_option.is_indexed() => {
if int_option.is_indexed() { for field_value in field_values {
for field_value in field_values { term_buffer.set_field(field_value.field());
term_buffer.set_field(field_value.field()); let f64_val = field_value
let f64_val = field_value .value()
.value() .f64_value()
.f64_value() .ok_or_else(make_schema_error)?;
.ok_or_else(make_schema_error)?; term_buffer.set_f64(f64_val);
term_buffer.set_f64(f64_val); multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, &term_buffer);
}
} }
} }
FieldType::Bytes(ref option) => { FieldType::Bytes(option) if option.is_indexed() => {
if option.is_indexed() { for field_value in field_values {
for field_value in field_values { term_buffer.set_field(field_value.field());
term_buffer.set_field(field_value.field()); let bytes = field_value
let bytes = field_value .value()
.value() .bytes_value()
.bytes_value() .ok_or_else(make_schema_error)?;
.ok_or_else(make_schema_error)?; term_buffer.set_bytes(bytes);
term_buffer.set_bytes(bytes); self.multifield_postings.subscribe(doc_id, &term_buffer);
self.multifield_postings.subscribe(doc_id, &term_buffer);
}
} }
} }
_ => {}
} }
} }
doc.filter_fields(|field| schema.get_field_entry(field).is_stored()); doc.filter_fields(|field| schema.get_field_entry(field).is_stored());

View File

@@ -132,7 +132,7 @@ impl PositionReader {
"offset arguments should be increasing." "offset arguments should be increasing."
); );
let delta_to_block_offset = offset as i64 - self.block_offset as i64; let delta_to_block_offset = offset as i64 - self.block_offset as i64;
if delta_to_block_offset < 0 || delta_to_block_offset >= 128 { if !(0..128).contains(&delta_to_block_offset) {
// The first position is not within the first block. // The first position is not within the first block.
// We need to decompress the first block. // We need to decompress the first block.
let delta_to_anchor_offset = offset - self.anchor_offset; let delta_to_anchor_offset = offset - self.anchor_offset;

View File

@@ -109,9 +109,9 @@ impl BlockSearcher {
/// The results should be equivalent to /// The results should be equivalent to
/// ```compile_fail /// ```compile_fail
/// block[..] /// block[..]
// .iter() /// .iter()
// .take_while(|&&val| val < target) /// .take_while(|&&val| val < target)
// .count() /// .count()
/// ``` /// ```
/// ///
/// The `start` argument is just used to hint that the response is /// The `start` argument is just used to hint that the response is

View File

@@ -9,7 +9,6 @@ use crate::postings::{FieldSerializer, InvertedIndexSerializer};
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term}; use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
use crate::termdict::TermOrdinal; use crate::termdict::TermOrdinal;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{Token, MAX_TOKEN_LEN}; use crate::tokenizer::{Token, MAX_TOKEN_LEN};
use crate::DocId; use crate::DocId;
use fnv::FnvHashMap; use fnv::FnvHashMap;
@@ -100,12 +99,10 @@ impl MultiFieldPostingsWriter {
&mut self, &mut self,
doc: DocId, doc: DocId,
field: Field, field: Field,
token_stream: &mut dyn TokenStream, token_stream: &mut dyn Iterator<Item = Token>,
term_buffer: &mut Term, term_buffer: &mut Term,
) -> u32 { ) -> u32 {
let postings_writer = self.per_field_postings_writers[field.field_id() as usize].index_text(
self.per_field_postings_writers[field.field_id() as usize].deref_mut();
postings_writer.index_text(
&mut self.term_index, &mut self.term_index,
doc, doc,
field, field,
@@ -217,7 +214,7 @@ pub trait PostingsWriter {
term_index: &mut TermHashMap, term_index: &mut TermHashMap,
doc_id: DocId, doc_id: DocId,
field: Field, field: Field,
token_stream: &mut dyn TokenStream, token_stream: &mut dyn Iterator<Item = Token>,
heap: &mut MemoryArena, heap: &mut MemoryArena,
term_buffer: &mut Term, term_buffer: &mut Term,
) -> u32 { ) -> u32 {
@@ -242,7 +239,7 @@ pub trait PostingsWriter {
); );
} }
}; };
token_stream.process(&mut sink) token_stream.map(|tok| sink(&tok)).count() as u32
} }
fn total_num_tokens(&self) -> u64; fn total_num_tokens(&self) -> u64;

View File

@@ -289,7 +289,7 @@ impl QueryParser {
let field_name = field_entry.name().to_string(); let field_name = field_entry.name().to_string();
return Err(QueryParserError::FieldNotIndexed(field_name)); return Err(QueryParserError::FieldNotIndexed(field_name));
} }
match *field_type { match field_type {
FieldType::I64(_) => { FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?; let val: i64 = i64::from_str(phrase)?;
let term = Term::from_field_i64(field, val); let term = Term::from_field_i64(field, val);
@@ -312,7 +312,7 @@ impl QueryParser {
let term = Term::from_field_u64(field, val); let term = Term::from_field_u64(field, val);
Ok(vec![(0, term)]) Ok(vec![(0, term)])
} }
FieldType::Str(ref str_options) => { FieldType::Str(str_options) => {
if let Some(option) = str_options.get_indexing_options() { if let Some(option) = str_options.get_indexing_options() {
let tokenizer = let tokenizer =
self.tokenizer_manager self.tokenizer_manager
@@ -323,15 +323,14 @@ impl QueryParser {
option.tokenizer().to_string(), option.tokenizer().to_string(),
) )
})?; })?;
let mut terms: Vec<(usize, Term)> = Vec::new(); let token_stream = tokenizer.token_stream(phrase);
let mut token_stream = tokenizer.token_stream(phrase); let terms: Vec<_> = token_stream
token_stream.process(&mut |token| { .map(|token| {
let term = Term::from_field_text(field, &token.text); let term = Term::from_field_text(field, &token.text);
terms.push((token.position, term)); (token.position, term)
}); })
if terms.is_empty() { .collect();
Ok(vec![]) if terms.len() <= 1 {
} else if terms.len() == 1 {
Ok(terms) Ok(terms)
} else { } else {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
@@ -414,7 +413,7 @@ impl QueryParser {
&self, &self,
given_field: &Option<String>, given_field: &Option<String>,
) -> Result<Cow<'_, [Field]>, QueryParserError> { ) -> Result<Cow<'_, [Field]>, QueryParserError> {
match *given_field { match given_field {
None => { None => {
if self.default_fields.is_empty() { if self.default_fields.is_empty() {
Err(QueryParserError::NoDefaultFieldDeclared) Err(QueryParserError::NoDefaultFieldDeclared)
@@ -422,7 +421,7 @@ impl QueryParser {
Ok(Cow::from(&self.default_fields[..])) Ok(Cow::from(&self.default_fields[..]))
} }
} }
Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])), Some(field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
} }
} }
@@ -574,15 +573,12 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::super::logical_ast::*; use super::super::logical_ast::*;
use super::QueryParser; use super::*;
use super::QueryParserError;
use crate::query::Query; use crate::query::Query;
use crate::schema::Field; use crate::schema::Field;
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT}; use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
use crate::tokenizer::{ use crate::tokenizer::{analyzer_builder, LowerCaser, SimpleTokenizer, StopWordFilter};
LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
};
use crate::Index; use crate::Index;
use matches::assert_matches; use matches::assert_matches;
@@ -620,9 +616,10 @@ mod test {
let tokenizer_manager = TokenizerManager::default(); let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register( tokenizer_manager.register(
"en_with_stop_words", "en_with_stop_words",
TextAnalyzer::from(SimpleTokenizer) analyzer_builder(SimpleTokenizer)
.filter(LowerCaser) .filter(LowerCaser::new())
.filter(StopWordFilter::remove(vec!["the".to_string()])), .filter(StopWordFilter::remove(vec!["the".to_string()]))
.build(),
); );
QueryParser::new(schema, default_fields, tokenizer_manager) QueryParser::new(schema, default_fields, tokenizer_manager)
} }

View File

@@ -1,7 +1,7 @@
use crate::query::Query; use crate::query::Query;
use crate::schema::Field; use crate::schema::Field;
use crate::schema::Value; use crate::schema::Value;
use crate::tokenizer::{TextAnalyzer, Token}; use crate::tokenizer::{TextAnalyzerT, Token};
use crate::Searcher; use crate::Searcher;
use crate::{Document, Score}; use crate::{Document, Score};
use htmlescape::encode_minimal; use htmlescape::encode_minimal;
@@ -139,9 +139,9 @@ impl Snippet {
/// ///
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
/// has to be a valid string. /// has to be a valid string.
fn search_fragments<'a>( fn search_fragments(
tokenizer: &TextAnalyzer, tokenizer: &dyn TextAnalyzerT,
text: &'a str, text: &str,
terms: &BTreeMap<String, Score>, terms: &BTreeMap<String, Score>,
max_num_chars: usize, max_num_chars: usize,
) -> Vec<FragmentCandidate> { ) -> Vec<FragmentCandidate> {
@@ -155,7 +155,7 @@ fn search_fragments<'a>(
}; };
fragment = FragmentCandidate::new(next.offset_from); fragment = FragmentCandidate::new(next.offset_from);
} }
fragment.try_add_token(next, &terms); fragment.try_add_token(&next, &terms);
} }
if fragment.score > 0.0 { if fragment.score > 0.0 {
fragments.push(fragment) fragments.push(fragment)
@@ -249,7 +249,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// ``` /// ```
pub struct SnippetGenerator { pub struct SnippetGenerator {
terms_text: BTreeMap<String, Score>, terms_text: BTreeMap<String, Score>,
tokenizer: TextAnalyzer, tokenizer: Box<dyn TextAnalyzerT>,
field: Field, field: Field,
max_num_chars: usize, max_num_chars: usize,
} }
@@ -297,33 +297,37 @@ impl SnippetGenerator {
/// ///
/// This method extract the text associated to the `SnippetGenerator`'s field /// This method extract the text associated to the `SnippetGenerator`'s field
/// and computes a snippet. /// and computes a snippet.
pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { pub fn snippet_from_doc(&mut self, doc: &Document) -> Snippet {
let text: String = doc let text: String = doc
.get_all(self.field) .get_all(self.field)
.flat_map(Value::text) .flat_map(Value::text)
.collect::<Vec<&str>>() .collect::<Vec<&str>>()
.join(" "); .join(" ");
self.snippet(&text) self.snippet(text.as_ref())
} }
/// Generates a snippet for the given text. /// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet { pub fn snippet(&mut self, text: &str) -> Snippet {
let fragment_candidates = let fragment_candidates = search_fragments(
search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars); &mut *self.tokenizer,
select_best_fragment_combination(&fragment_candidates[..], &text) text,
&self.terms_text,
self.max_num_chars,
);
select_best_fragment_combination(&fragment_candidates[..], text)
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{search_fragments, select_best_fragment_combination}; use super::*;
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::tokenizer::SimpleTokenizer; use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::TextAnalyzer;
use crate::Index; use crate::Index;
use crate::SnippetGenerator; use crate::SnippetGenerator;
use maplit::btreemap; use maplit::btreemap;
use std::collections::BTreeMap;
use std::iter::Iterator; use std::iter::Iterator;
const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by
@@ -346,7 +350,13 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") => 1.0, String::from("rust") => 1.0,
String::from("language") => 0.9 String::from("language") => 0.9
}; };
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
100,
);
assert_eq!(fragments.len(), 7); assert_eq!(fragments.len(), 7);
{ {
let first = &fragments[0]; let first = &fragments[0];
@@ -373,7 +383,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>1.0, String::from("rust") =>1.0,
String::from("language") => 0.9 String::from("language") => 0.9
}; };
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
{ {
let first = &fragments[0]; let first = &fragments[0];
assert_eq!(first.score, 1.0); assert_eq!(first.score, 1.0);
@@ -387,7 +402,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>0.9, String::from("rust") =>0.9,
String::from("language") => 1.0 String::from("language") => 1.0
}; };
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
//assert_eq!(fragments.len(), 7); //assert_eq!(fragments.len(), 7);
{ {
let first = &fragments[0]; let first = &fragments[0];
@@ -406,7 +426,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new(); let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0); terms.insert(String::from("c"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 1); assert_eq!(fragments.len(), 1);
{ {
@@ -428,7 +453,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new(); let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0); terms.insert(String::from("f"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 2); assert_eq!(fragments.len(), 2);
{ {
@@ -451,7 +481,12 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0); terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9); terms.insert(String::from("a"), 0.9);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7); let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
7,
);
assert_eq!(fragments.len(), 2); assert_eq!(fragments.len(), 2);
{ {
@@ -473,7 +508,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new(); let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0); terms.insert(String::from("z"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 0); assert_eq!(fragments.len(), 0);
@@ -487,7 +527,12 @@ Survey in 2016, 2017, and 2018."#;
let text = "a b c d"; let text = "a b c d";
let terms = BTreeMap::new(); let terms = BTreeMap::new();
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3); let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 0); assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text); let snippet = select_best_fragment_combination(&fragments[..], &text);
@@ -572,12 +617,12 @@ Survey in 2016, 2017, and 2018."#;
let mut snippet_generator = let mut snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap(); SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
{ {
let snippet = snippet_generator.snippet(TEST_TEXT); let snippet = snippet_generator.snippet(TEST_TEXT.into());
assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety"); assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
} }
{ {
snippet_generator.set_max_num_chars(90); snippet_generator.set_max_num_chars(90);
let snippet = snippet_generator.snippet(TEST_TEXT); let snippet = snippet_generator.snippet(TEST_TEXT.into());
assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to"); assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
} }
} }

View File

@@ -35,11 +35,11 @@ struct Layer {
} }
impl Layer { impl Layer {
fn cursor<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a { fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.cursor_at_offset(0u64) self.cursor_at_offset(0u64)
} }
fn cursor_at_offset<'a>(&'a self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + 'a { fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
let data = &self.data.as_slice(); let data = &self.data.as_slice();
LayerCursor { LayerCursor {
remaining: &data[start_offset as usize..], remaining: &data[start_offset as usize..],
@@ -59,7 +59,7 @@ pub struct SkipIndex {
} }
impl SkipIndex { impl SkipIndex {
pub(crate) fn checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a { pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.layers self.layers
.last() .last()
.into_iter() .into_iter()

View File

@@ -46,7 +46,7 @@ impl StoreReader {
}) })
} }
pub(crate) fn block_checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a { pub(crate) fn block_checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.skip_index.checkpoints() self.skip_index.checkpoints()
} }

View File

@@ -2,16 +2,16 @@
//! ```rust //! ```rust
//! use tantivy::tokenizer::*; //! use tantivy::tokenizer::*;
//! //!
//! let tokenizer = TextAnalyzer::from(RawTokenizer) //! let tokenizer = analyzer_builder(RawTokenizer)
//! .filter(AlphaNumOnlyFilter); //! .filter(AlphaNumOnlyFilter).build();
//! //!
//! let mut stream = tokenizer.token_stream("hello there"); //! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw filter emits one token that //! // is none because the raw filter emits one token that
//! // contains a space //! // contains a space
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! //!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) //! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter); //! .filter(AlphaNumOnlyFilter).build();
//! //!
//! let mut stream = tokenizer.token_stream("hello there 💣"); //! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some()); //! assert!(stream.next().is_some());
@@ -19,45 +19,18 @@
//! // the "emoji" is dropped because its not an alphanum //! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! ``` //! ```
use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter};
/// `TokenFilter` that removes all tokens that contain non /// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters. /// ascii alphanumeric characters.
#[derive(Clone)] #[derive(Clone, Debug, Default)]
pub struct AlphaNumOnlyFilter; pub struct AlphaNumOnlyFilter;
pub struct AlphaNumOnlyFilterStream<'a> {
tail: BoxTokenStream<'a>,
}
impl<'a> AlphaNumOnlyFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
token.text.chars().all(|c| c.is_ascii_alphanumeric())
}
}
impl TokenFilter for AlphaNumOnlyFilter { impl TokenFilter for AlphaNumOnlyFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform(&mut self, token: Token) -> Option<Token> {
BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream }) if token.text.chars().all(|c| c.is_ascii_alphanumeric()) {
} return Some(token);
}
impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
} }
None
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
} }
} }

View File

@@ -1,45 +1,31 @@
use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter};
use std::mem; use std::mem;
/// This class converts alphabetic, numeric, and symbolic Unicode characters /// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists. /// block) into their ASCII equivalents, if one exists.
#[derive(Clone)] #[derive(Clone, Debug, Default)]
pub struct AsciiFoldingFilter; pub struct AsciiFolding {
impl TokenFilter for AsciiFoldingFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
From::from(AsciiFoldingFilterTokenStream {
tail: token_stream,
buffer: String::with_capacity(100),
})
}
}
pub struct AsciiFoldingFilterTokenStream<'a> {
buffer: String, buffer: String,
tail: BoxTokenStream<'a>,
} }
impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> { impl AsciiFolding {
fn advance(&mut self) -> bool { /// Construct a new `AsciiFolding` filter.
if !self.tail.advance() { pub fn new() -> Self {
return false; Self {
buffer: String::with_capacity(100),
} }
if !self.token_mut().text.is_ascii() { }
}
impl TokenFilter for AsciiFolding {
fn transform(&mut self, mut token: Token) -> Option<Token> {
if !token.text.is_ascii() {
// ignore its already ascii // ignore its already ascii
to_ascii(&mut self.tail.token_mut().text, &mut self.buffer); to_ascii(&token.text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); mem::swap(&mut token.text, &mut self.buffer);
} }
true Some(token)
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
} }
} }
@@ -1526,7 +1512,7 @@ fn fold_non_ascii_char(c: char) -> Option<&'static str> {
} }
// https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java#L187 // https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java#L187
fn to_ascii(text: &mut String, output: &mut String) { fn to_ascii(text: &String, output: &mut String) {
output.clear(); output.clear();
for c in text.chars() { for c in text.chars() {
@@ -1540,11 +1526,8 @@ fn to_ascii(text: &mut String, output: &mut String) {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::to_ascii; use super::super::*;
use crate::tokenizer::AsciiFoldingFilter; use super::*;
use crate::tokenizer::RawTokenizer;
use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::TextAnalyzer;
use std::iter; use std::iter;
#[test] #[test]
@@ -1560,22 +1543,22 @@ mod tests {
} }
fn folding_helper(text: &str) -> Vec<String> { fn folding_helper(text: &str) -> Vec<String> {
let mut tokens = Vec::new(); let tokens = analyzer_builder(SimpleTokenizer)
TextAnalyzer::from(SimpleTokenizer) .filter(AsciiFolding::new())
.filter(AsciiFoldingFilter) .build()
.token_stream(text) .token_stream(text)
.process(&mut |token| { .map(|token| token.text.clone())
tokens.push(token.text.clone()); .collect();
});
tokens tokens
} }
fn folding_using_raw_tokenizer_helper(text: &str) -> String { fn folding_using_raw_tokenizer_helper(text: &str) -> String {
let mut token_stream = TextAnalyzer::from(RawTokenizer) let mut token_stream = analyzer_builder(RawTokenizer)
.filter(AsciiFoldingFilter) .filter(AsciiFolding::new())
.build()
.token_stream(text); .token_stream(text);
token_stream.advance(); let Token { text, .. } = token_stream.next().unwrap();
token_stream.token().text.clone() text
} }
#[test] #[test]
@@ -1626,9 +1609,9 @@ mod tests {
#[test] #[test]
fn test_to_ascii() { fn test_to_ascii() {
let mut input = "Rámon".to_string(); let input = "Rámon".to_string();
let mut buffer = String::new(); let mut buffer = String::new();
to_ascii(&mut input, &mut buffer); to_ascii(&input, &mut buffer);
assert_eq!("Ramon", buffer); assert_eq!("Ramon", buffer);
} }

View File

@@ -1,4 +1,4 @@
use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; use super::{Token, Tokenizer};
use crate::schema::FACET_SEP_BYTE; use crate::schema::FACET_SEP_BYTE;
/// The `FacetTokenizer` process a `Facet` binary representation /// The `FacetTokenizer` process a `Facet` binary representation
@@ -9,72 +9,63 @@ use crate::schema::FACET_SEP_BYTE;
/// - `/america/north_america/canada` /// - `/america/north_america/canada`
/// - `/america/north_america` /// - `/america/north_america`
/// - `/america` /// - `/america`
#[derive(Clone)] #[derive(Clone, Debug, Default)]
pub struct FacetTokenizer; pub struct FacetTokenizer;
#[derive(Debug)] #[derive(Clone, Debug)]
enum State { enum State {
RootFacetNotEmitted, RootFacetNotEmitted,
UpToPosition(usize), //< we already emitted facet prefix up to &text[..cursor] UpToPosition(usize), //< we already emitted facet prefix up to &text[..cursor]
Terminated, Terminated,
} }
pub struct FacetTokenStream<'a> { #[derive(Clone, Debug)]
text: &'a str, pub struct FacetTokenStream {
text: String,
state: State, state: State,
token: Token, token: Token,
} }
impl Tokenizer for FacetTokenizer { impl Tokenizer for FacetTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { type Iter = FacetTokenStream;
fn token_stream(&self, text: &str) -> Self::Iter {
FacetTokenStream { FacetTokenStream {
text, text: text.to_string(),
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
token: Token::default(), token: Token::default(),
} }
.into()
} }
} }
impl<'a> TokenStream for FacetTokenStream<'a> { impl Iterator for FacetTokenStream {
fn advance(&mut self) -> bool { type Item = Token;
match self.state { fn next(&mut self) -> Option<Self::Item> {
self.state = match self.state {
State::RootFacetNotEmitted => { State::RootFacetNotEmitted => {
self.state = if self.text.is_empty() { if self.text.is_empty() {
State::Terminated State::Terminated
} else { } else {
State::UpToPosition(0) State::UpToPosition(0)
}; }
true
} }
State::UpToPosition(cursor) => { State::UpToPosition(cursor) => {
let bytes: &[u8] = self.text.as_bytes(); if let Some(next_sep_pos) = self.text.as_bytes()[cursor + 1..]
if let Some(next_sep_pos) = bytes[cursor + 1..]
.iter() .iter()
.cloned() .position(|&b| b == FACET_SEP_BYTE)
.position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos) .map(|pos| cursor + 1 + pos)
{ {
let facet_part = &self.text[cursor..next_sep_pos]; let facet_part = &self.text[cursor..next_sep_pos];
self.token.text.push_str(facet_part); self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos); State::UpToPosition(next_sep_pos)
} else { } else {
let facet_part = &self.text[cursor..]; let facet_part = &self.text[cursor..];
self.token.text.push_str(facet_part); self.token.text.push_str(facet_part);
self.state = State::Terminated; State::Terminated
} }
true
} }
State::Terminated => false, State::Terminated => return None,
} };
} Some(self.token.clone())
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
} }
} }
@@ -83,21 +74,19 @@ mod tests {
use super::FacetTokenizer; use super::FacetTokenizer;
use crate::schema::Facet; use crate::schema::Facet;
use crate::tokenizer::{Token, Tokenizer}; use crate::tokenizer::Tokenizer;
#[test] #[test]
fn test_facet_tokenizer() { fn test_facet_tokenizer() {
let facet = Facet::from_path(vec!["top", "a", "b"]); let facet = Facet::from_path(vec!["top", "a", "b"]);
let mut tokens = vec![]; let tokens: Vec<_> = FacetTokenizer
{ .token_stream(facet.encoded_str())
let mut add_token = |token: &Token| { .map(|token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); Facet::from_encoded(token.text.as_bytes().to_owned())
tokens.push(format!("{}", facet)); .unwrap()
}; .to_string()
FacetTokenizer })
.token_stream(facet.encoded_str()) .collect();
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4); assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0], "/"); assert_eq!(tokens[0], "/");
assert_eq!(tokens[1], "/top"); assert_eq!(tokens[1], "/top");
@@ -108,16 +97,14 @@ mod tests {
#[test] #[test]
fn test_facet_tokenizer_root_facets() { fn test_facet_tokenizer_root_facets() {
let facet = Facet::root(); let facet = Facet::root();
let mut tokens = vec![]; let tokens: Vec<_> = FacetTokenizer
{ .token_stream(facet.encoded_str())
let mut add_token = |token: &Token| { .map(|token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test Facet::from_encoded(token.text.as_bytes().to_owned())
tokens.push(format!("{}", facet)); .unwrap()
}; .to_string()
FacetTokenizer })
.token_stream(facet.encoded_str()) // ok test .collect();
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1); assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], "/"); assert_eq!(tokens[0], "/");
} }

View File

@@ -1,27 +1,36 @@
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter};
use crate::tokenizer::BoxTokenStream;
use std::mem; use std::mem;
impl TokenFilter for LowerCaser { impl TokenFilter for LowerCaser {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform(&mut self, mut token: Token) -> Option<Token> {
BoxTokenStream::from(LowerCaserTokenStream { if token.text.is_ascii() {
tail: token_stream, // fast track for ascii.
buffer: String::with_capacity(100), token.text.make_ascii_lowercase();
}) } else {
to_lowercase_unicode(&token.text, &mut self.buffer);
mem::swap(&mut token.text, &mut self.buffer);
}
Some(token)
} }
} }
/// Token filter that lowercase terms. /// Token filter that lowercase terms.
#[derive(Clone)] #[derive(Clone, Debug, Default)]
pub struct LowerCaser; pub struct LowerCaser {
pub struct LowerCaserTokenStream<'a> {
buffer: String, buffer: String,
tail: BoxTokenStream<'a>, }
impl LowerCaser {
/// Initialize the `LowerCaser`
pub fn new() -> Self {
LowerCaser {
buffer: String::with_capacity(100),
}
}
} }
// writes a lowercased version of text into output. // writes a lowercased version of text into output.
fn to_lowercase_unicode(text: &mut String, output: &mut String) { fn to_lowercase_unicode(text: &String, output: &mut String) {
output.clear(); output.clear();
for c in text.chars() { for c in text.chars() {
// Contrary to the std, we do not take care of sigma special case. // Contrary to the std, we do not take care of sigma special case.
@@ -30,57 +39,31 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) {
} }
} }
impl<'a> TokenStream for LowerCaserTokenStream<'a> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
if self.token_mut().text.is_ascii() {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer}; use super::*;
use crate::tokenizer::{analyzer_builder, LowerCaser, SimpleTokenizer, TextAnalyzerT};
#[test] #[test]
fn test_to_lower_case() { fn test_to_lower_case() {
assert_eq!( assert_eq!(lowercase_helper("Русский текст"), vec!["русский", "текст"]);
lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]
);
} }
fn lowercase_helper(text: &str) -> Vec<String> { fn lowercase_helper(text: &str) -> Vec<String> {
let mut tokens = vec![]; analyzer_builder(SimpleTokenizer)
let mut token_stream = TextAnalyzer::from(SimpleTokenizer) .filter(LowerCaser::new())
.filter(LowerCaser) .build()
.token_stream(text); .token_stream(text)
while token_stream.advance() { .map(|token| {
let token_text = token_stream.token().text.clone(); let Token { text, .. } = token;
tokens.push(token_text); text
} })
tokens .collect()
} }
#[test] #[test]
fn test_lowercaser() { fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); assert_eq!(lowercase_helper("Tree"), vec!["tree"]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]); assert_eq!(lowercase_helper("Русский"), vec!["русский"]);
} }
} }

View File

@@ -64,10 +64,10 @@
//! ```rust //! ```rust
//! use tantivy::tokenizer::*; //! use tantivy::tokenizer::*;
//! //!
//! let en_stem = TextAnalyzer::from(SimpleTokenizer) //! let en_stem = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40)) //! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser) //! .filter(LowerCaser::new())
//! .filter(Stemmer::new(Language::English)); //! .filter(Stemmer::new(Language::English)).build();
//! ``` //! ```
//! //!
//! Once your tokenizer is defined, you need to //! Once your tokenizer is defined, you need to
@@ -109,9 +109,9 @@
//! let index = Index::create_in_ram(schema); //! let index = Index::create_in_ram(schema);
//! //!
//! // We need to register our tokenizer : //! // We need to register our tokenizer :
//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer) //! let custom_en_tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40)) //! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser); //! .filter(LowerCaser::new()).build();
//! index //! index
//! .tokenizers() //! .tokenizers()
//! .register("custom_en", custom_en_tokenizer); //! .register("custom_en", custom_en_tokenizer);
@@ -133,7 +133,7 @@ mod tokenizer;
mod tokenizer_manager; mod tokenizer_manager;
pub use self::alphanum_only::AlphaNumOnlyFilter; pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter; pub use self::ascii_folding_filter::AsciiFolding;
pub use self::facet_tokenizer::FacetTokenizer; pub use self::facet_tokenizer::FacetTokenizer;
pub use self::lower_caser::LowerCaser; pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer; pub use self::ngram_tokenizer::NgramTokenizer;
@@ -142,11 +142,11 @@ pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer; pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer}; pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter; pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain; pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain};
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{ pub use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer, analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, Tokenizer,
}; };
pub use self::tokenizer_manager::TokenizerManager; pub use self::tokenizer_manager::TokenizerManager;
@@ -160,10 +160,7 @@ pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
use super::{ use super::*;
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
};
use crate::tokenizer::TextAnalyzer;
/// This is a function that can be used in tests and doc tests /// This is a function that can be used in tests and doc tests
/// to assert a token's correctness. /// to assert a token's correctness.
@@ -190,15 +187,9 @@ pub mod tests {
fn test_raw_tokenizer() { fn test_raw_tokenizer() {
let tokenizer_manager = TokenizerManager::default(); let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("raw").unwrap(); let en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<Token> = vec![]; let tokens: Vec<Token> = en_tokenizer
{ .token_stream("Hello, happy tax payer!")
let mut add_token = |token: &Token| { .collect();
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1); assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23); assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
} }
@@ -208,15 +199,9 @@ pub mod tests {
let tokenizer_manager = TokenizerManager::default(); let tokenizer_manager = TokenizerManager::default();
assert!(tokenizer_manager.get("en_doesnotexist").is_none()); assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<Token> = vec![]; let tokens: Vec<Token> = en_tokenizer
{ .token_stream("Hello, happy tax payer!")
let mut add_token = |token: &Token| { .collect();
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4); assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hello", 0, 5); assert_token(&tokens[0], 0, "hello", 0, 5);
@@ -230,21 +215,16 @@ pub mod tests {
let tokenizer_manager = TokenizerManager::default(); let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register( tokenizer_manager.register(
"el_stem", "el_stem",
TextAnalyzer::from(SimpleTokenizer) analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40)) .filter(RemoveLongFilter::limit(40))
.filter(LowerCaser) .filter(LowerCaser::new())
.filter(Stemmer::new(Language::Greek)), .filter(Stemmer::new(Language::Greek))
.build(),
); );
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap(); let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
let mut tokens: Vec<Token> = vec![]; let tokens: Vec<Token> = en_tokenizer
{ .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
let mut add_token = |token: &Token| { .collect();
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 3); assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "καλημερ", 0, 16); assert_token(&tokens[0], 0, "καλημερ", 0, 16);
@@ -256,25 +236,9 @@ pub mod tests {
fn test_tokenizer_empty() { fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default(); let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap(); let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{ let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
let mut tokens: Vec<Token> = vec![]; assert!(tokens.is_empty());
{ let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
let mut add_token = |token: &Token| { assert!(tokens.is_empty());
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
{
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
} }
} }

View File

@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer}; use super::{Token, Tokenizer};
use crate::tokenizer::BoxTokenStream;
/// Tokenize the text by splitting words into n-grams of the given size(s) /// Tokenize the text by splitting words into n-grams of the given size(s)
/// ///
@@ -79,7 +78,7 @@ use crate::tokenizer::BoxTokenStream;
/// } /// }
/// assert!(stream.next().is_none()); /// assert!(stream.next().is_none());
/// ``` /// ```
#[derive(Clone)] #[derive(Clone, Debug, Default)]
pub struct NgramTokenizer { pub struct NgramTokenizer {
/// min size of the n-gram /// min size of the n-gram
min_gram: usize, min_gram: usize,
@@ -119,54 +118,48 @@ impl NgramTokenizer {
} }
/// TokenStream associate to the `NgramTokenizer` /// TokenStream associate to the `NgramTokenizer`
pub struct NgramTokenStream<'a> { pub struct NgramTokenStream {
/// parameters /// parameters
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>, ngram_charidx_iterator: StutteringIterator<CodepointFrontiers>,
/// true if the NgramTokenStream is in prefix mode. /// true if the NgramTokenStream is in prefix mode.
prefix_only: bool, prefix_only: bool,
/// input /// input
text: &'a str, text: String,
/// output /// output
token: Token, token: Token,
} }
impl Tokenizer for NgramTokenizer { impl Tokenizer for NgramTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { type Iter = NgramTokenStream;
From::from(NgramTokenStream { fn token_stream(&self, text: &str) -> Self::Iter {
NgramTokenStream {
ngram_charidx_iterator: StutteringIterator::new( ngram_charidx_iterator: StutteringIterator::new(
CodepointFrontiers::for_str(text), CodepointFrontiers::for_str(text),
self.min_gram, self.min_gram,
self.max_gram, self.max_gram,
), ),
prefix_only: self.prefix_only, prefix_only: self.prefix_only,
text, text: text.to_string(),
token: Token::default(), token: Token::default(),
}) }
} }
} }
impl<'a> TokenStream for NgramTokenStream<'a> { impl Iterator for NgramTokenStream {
fn advance(&mut self) -> bool { type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() { if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
if self.prefix_only && offset_from > 0 { if self.prefix_only && offset_from > 0 {
return false; return None;
} }
self.token.position = 0; self.token.position = 0;
self.token.offset_from = offset_from; self.token.offset_from = offset_from;
self.token.offset_to = offset_to; self.token.offset_to = offset_to;
self.token.text.clear(); self.token.text.clear();
self.token.text.push_str(&self.text[offset_from..offset_to]); self.token.text.push_str(&self.text[offset_from..offset_to]);
true return Some(self.token.clone());
} else { };
false None
}
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
} }
} }
@@ -257,21 +250,21 @@ where
/// or a codepoint ends. /// or a codepoint ends.
/// ///
/// By convention, we emit [0] for the empty string. /// By convention, we emit [0] for the empty string.
struct CodepointFrontiers<'a> { struct CodepointFrontiers {
s: &'a str, s: String,
next_el: Option<usize>, next_el: Option<usize>,
} }
impl<'a> CodepointFrontiers<'a> { impl CodepointFrontiers {
fn for_str(s: &'a str) -> Self { fn for_str(s: &str) -> Self {
CodepointFrontiers { CodepointFrontiers {
s, s: s.to_string(),
next_el: Some(0), next_el: Some(0),
} }
} }
} }
impl<'a> Iterator for CodepointFrontiers<'a> { impl<'a> Iterator for CodepointFrontiers {
type Item = usize; type Item = usize;
fn next(&mut self) -> Option<usize> { fn next(&mut self) -> Option<usize> {
@@ -280,7 +273,7 @@ impl<'a> Iterator for CodepointFrontiers<'a> {
self.next_el = None; self.next_el = None;
} else { } else {
let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]); let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
self.s = &self.s[first_codepoint_width..]; self.s = (&self.s[first_codepoint_width..]).to_string();
self.next_el = Some(offset + first_codepoint_width); self.next_el = Some(offset + first_codepoint_width);
} }
offset offset
@@ -301,20 +294,8 @@ fn utf8_codepoint_width(b: u8) -> usize {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
use super::utf8_codepoint_width;
use super::CodepointFrontiers;
use super::NgramTokenizer;
use super::StutteringIterator;
use crate::tokenizer::tests::assert_token; use crate::tokenizer::tests::assert_token;
use crate::tokenizer::tokenizer::Tokenizer;
use crate::tokenizer::{BoxTokenStream, Token};
fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![];
tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
tokens
}
#[test] #[test]
fn test_utf8_codepoint_width() { fn test_utf8_codepoint_width() {
@@ -351,7 +332,9 @@ mod tests {
#[test] #[test]
fn test_ngram_tokenizer_1_2_false() { fn test_ngram_tokenizer_1_2_false() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello")); let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
.token_stream("hello")
.collect();
assert_eq!(tokens.len(), 9); assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1); assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2); assert_token(&tokens[1], 0, "he", 0, 2);
@@ -366,7 +349,9 @@ mod tests {
#[test] #[test]
fn test_ngram_tokenizer_min_max_equal() { fn test_ngram_tokenizer_min_max_equal() {
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello")); let tokens: Vec<_> = NgramTokenizer::all_ngrams(3, 3)
.token_stream("hello")
.collect();
assert_eq!(tokens.len(), 3); assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3); assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 0, "ell", 1, 4); assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -375,7 +360,9 @@ mod tests {
#[test] #[test]
fn test_ngram_tokenizer_2_5_prefix() { fn test_ngram_tokenizer_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein")); let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
.token_stream("frankenstein")
.collect();
assert_eq!(tokens.len(), 4); assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2); assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3); assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -385,7 +372,9 @@ mod tests {
#[test] #[test]
fn test_ngram_non_ascii_1_2() { fn test_ngram_non_ascii_1_2() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo")); let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
.token_stream("hεllo")
.collect();
assert_eq!(tokens.len(), 9); assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1); assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "", 0, 3); assert_token(&tokens[1], 0, "", 0, 3);
@@ -400,7 +389,9 @@ mod tests {
#[test] #[test]
fn test_ngram_non_ascii_2_5_prefix() { fn test_ngram_non_ascii_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo")); let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
.token_stream("hεllo")
.collect();
assert_eq!(tokens.len(), 4); assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "", 0, 3); assert_token(&tokens[0], 0, "", 0, 3);
assert_token(&tokens[1], 0, "hεl", 0, 4); assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -410,16 +401,16 @@ mod tests {
#[test] #[test]
fn test_ngram_empty() { fn test_ngram_empty() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream("")); let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 5).token_stream("").collect();
assert!(tokens.is_empty()); assert!(tokens.is_empty());
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream("")); let tokens: Vec<_> = NgramTokenizer::all_ngrams(2, 5).token_stream("").collect();
assert!(tokens.is_empty()); assert!(tokens.is_empty());
} }
#[test] #[test]
#[should_panic(expected = "min_gram must be greater than 0")] #[should_panic(expected = "min_gram must be greater than 0")]
fn test_ngram_min_max_interval_empty() { fn test_ngram_min_max_interval_empty() {
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss")); NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss");
} }
#[test] #[test]

View File

@@ -1,17 +1,17 @@
use super::{Token, TokenStream, Tokenizer}; use super::{Token, Tokenizer};
use crate::tokenizer::BoxTokenStream;
/// For each value of the field, emit a single unprocessed token. /// For each value of the field, emit a single unprocessed token.
#[derive(Clone)] #[derive(Clone, Debug, Default)]
pub struct RawTokenizer; pub struct RawTokenizer;
#[derive(Clone, Debug)]
pub struct RawTokenStream { pub struct RawTokenStream {
token: Token, token: Option<Token>,
has_token: bool,
} }
impl Tokenizer for RawTokenizer { impl Tokenizer for RawTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { type Iter = RawTokenStream;
fn token_stream(&self, text: &str) -> Self::Iter {
let token = Token { let token = Token {
offset_from: 0, offset_from: 0,
offset_to: text.len(), offset_to: text.len(),
@@ -19,26 +19,13 @@ impl Tokenizer for RawTokenizer {
text: text.to_string(), text: text.to_string(),
position_length: 1, position_length: 1,
}; };
RawTokenStream { RawTokenStream { token: Some(token) }
token,
has_token: true,
}
.into()
} }
} }
impl TokenStream for RawTokenStream { impl Iterator for RawTokenStream {
fn advance(&mut self) -> bool { type Item = Token;
let result = self.has_token; fn next(&mut self) -> Option<Token> {
self.has_token = false; self.token.take()
result
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
} }
} }

View File

@@ -2,8 +2,8 @@
//! ```rust //! ```rust
//! use tantivy::tokenizer::*; //! use tantivy::tokenizer::*;
//! //!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) //! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5)); //! .filter(RemoveLongFilter::limit(5)).build();
//! //!
//! let mut stream = tokenizer.token_stream("toolong nice"); //! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered //! // because `toolong` is more than 5 characters, it is filtered
@@ -12,61 +12,30 @@
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! ``` //! ```
//! //!
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter};
use crate::tokenizer::BoxTokenStream;
/// `RemoveLongFilter` removes tokens that are longer /// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation). /// than a given number of bytes (in UTF-8 representation).
/// ///
/// It is especially useful when indexing unconstrained content. /// It is especially useful when indexing unconstrained content.
/// e.g. Mail containing base-64 encoded pictures etc. /// e.g. Mail containing base-64 encoded pictures etc.
#[derive(Clone)] #[derive(Clone, Debug)]
pub struct RemoveLongFilter { pub struct RemoveLongFilter {
length_limit: usize, limit: usize,
} }
impl RemoveLongFilter { impl RemoveLongFilter {
/// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation. /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
pub fn limit(length_limit: usize) -> RemoveLongFilter { pub fn limit(limit: usize) -> RemoveLongFilter {
RemoveLongFilter { length_limit } RemoveLongFilter { limit }
}
}
impl<'a> RemoveLongFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
token.text.len() < self.token_length_limit
} }
} }
impl TokenFilter for RemoveLongFilter { impl TokenFilter for RemoveLongFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform(&mut self, token: Token) -> Option<Token> {
BoxTokenStream::from(RemoveLongFilterStream { if token.text.len() >= self.limit {
token_length_limit: self.length_limit, return None;
tail: token_stream,
})
}
}
pub struct RemoveLongFilterStream<'a> {
token_length_limit: usize,
tail: BoxTokenStream<'a>,
}
impl<'a> TokenStream for RemoveLongFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
} }
false Some(token)
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
} }
} }

View File

@@ -1,59 +1,74 @@
use super::BoxTokenStream; use super::{Token, Tokenizer};
use super::{Token, TokenStream, Tokenizer};
use std::str::CharIndices;
/// Tokenize the text by splitting on whitespaces and punctuation. /// Tokenize the text by splitting on whitespaces and punctuation.
#[derive(Clone)] #[derive(Clone, Debug)]
pub struct SimpleTokenizer; pub struct SimpleTokenizer;
pub struct SimpleTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: Token,
}
impl Tokenizer for SimpleTokenizer { impl Tokenizer for SimpleTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { type Iter = SimpleTokenizerStream;
BoxTokenStream::from(SimpleTokenStream { fn token_stream(&self, text: &str) -> Self::Iter {
text, let vec: Vec<_> = text.char_indices().collect();
chars: text.char_indices(), SimpleTokenizerStream {
token: Token::default(), text: text.to_string(),
}) chars: vec.into_iter(),
position: usize::max_value(),
}
} }
} }
impl<'a> SimpleTokenStream<'a> { #[derive(Clone, Debug)]
pub struct SimpleTokenizerStream {
text: String,
chars: std::vec::IntoIter<(usize, char)>,
position: usize,
}
impl SimpleTokenizerStream {
// search for the end of the current token. // search for the end of the current token.
fn search_token_end(&mut self) -> usize { fn search_token_end(&mut self) -> usize {
(&mut self.chars) (&mut self.chars)
.filter(|&(_, ref c)| !c.is_alphanumeric()) .filter(|&(_, c)| !c.is_alphanumeric())
.map(|(offset, _)| offset) .map(|(offset, _)| offset)
.next() .next()
.unwrap_or_else(|| self.text.len()) .unwrap_or_else(|| self.text.len())
} }
} }
impl<'a> TokenStream for SimpleTokenStream<'a> { impl Iterator for SimpleTokenizerStream {
fn advance(&mut self) -> bool { type Item = Token;
self.token.text.clear(); fn next(&mut self) -> Option<Self::Item> {
self.token.position = self.token.position.wrapping_add(1); self.position = self.position.wrapping_add(1);
while let Some((offset_from, c)) = self.chars.next() { while let Some((offset_from, c)) = self.chars.next() {
if c.is_alphanumeric() { if c.is_alphanumeric() {
let offset_to = self.search_token_end(); let offset_to = self.search_token_end();
self.token.offset_from = offset_from; let token = Token {
self.token.offset_to = offset_to; text: self.text[offset_from..offset_to].into(),
self.token.text.push_str(&self.text[offset_from..offset_to]); offset_from,
return true; offset_to,
position: self.position,
..Default::default()
};
return Some(token);
} }
} }
false None
} }
}
fn token(&self) -> &Token {
&self.token #[cfg(test)]
} mod tests {
use super::*;
fn token_mut(&mut self) -> &mut Token {
&mut self.token #[test]
fn test_empty() {
let mut empty = SimpleTokenizer.token_stream("");
assert_eq!(empty.next(), None);
}
#[test]
fn simple_tokenizer() {
let mut simple = SimpleTokenizer.token_stream("tokenizer hello world");
assert_eq!(simple.next().unwrap().text, "tokenizer");
assert_eq!(simple.next().unwrap().text, "hello");
assert_eq!(simple.next().unwrap().text, "world");
} }
} }

View File

@@ -1,5 +1,6 @@
use super::{Token, TokenFilter, TokenStream}; use std::sync::Arc;
use crate::tokenizer::BoxTokenStream;
use super::{Token, TokenFilter};
use rust_stemmers::{self, Algorithm}; use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -58,14 +59,15 @@ impl Language {
/// Tokens are expected to be lowercased beforehand. /// Tokens are expected to be lowercased beforehand.
#[derive(Clone)] #[derive(Clone)]
pub struct Stemmer { pub struct Stemmer {
stemmer_algorithm: Algorithm, stemmer: Arc<rust_stemmers::Stemmer>,
} }
impl Stemmer { impl Stemmer {
/// Creates a new Stemmer `TokenFilter` for a given language algorithm. /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
pub fn new(language: Language) -> Stemmer { pub fn new(language: Language) -> Stemmer {
let stemmer = rust_stemmers::Stemmer::create(language.algorithm());
Stemmer { Stemmer {
stemmer_algorithm: language.algorithm(), stemmer: Arc::new(stemmer),
} }
} }
} }
@@ -78,37 +80,12 @@ impl Default for Stemmer {
} }
impl TokenFilter for Stemmer { impl TokenFilter for Stemmer {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform(&mut self, mut token: Token) -> Option<Token> {
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
BoxTokenStream::from(StemmerTokenStream {
tail: token_stream,
stemmer: inner_stemmer,
})
}
}
pub struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>,
stemmer: rust_stemmers::Stemmer,
}
impl<'a> TokenStream for StemmerTokenStream<'a> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
// TODO remove allocation // TODO remove allocation
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned(); let stemmed_str: String = self.stemmer.stem(&token.text).into_owned();
self.token_mut().text.clear(); // TODO remove clear
self.token_mut().text.push_str(&stemmed_str); token.text.clear();
true token.text.push_str(&stemmed_str);
} Some(token)
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
} }
} }

View File

@@ -2,16 +2,15 @@
//! ```rust //! ```rust
//! use tantivy::tokenizer::*; //! use tantivy::tokenizer::*;
//! //!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer) //! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])); //! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])).build();
//! //!
//! let mut stream = tokenizer.token_stream("the fox is crafty"); //! let mut stream = tokenizer.token_stream("the fox is crafty");
//! assert_eq!(stream.next().unwrap().text, "fox"); //! assert_eq!(stream.next().unwrap().text, "fox");
//! assert_eq!(stream.next().unwrap().text, "crafty"); //! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! ``` //! ```
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter};
use crate::tokenizer::BoxTokenStream;
use fnv::FnvHasher; use fnv::FnvHasher;
use std::collections::HashSet; use std::collections::HashSet;
use std::hash::BuildHasherDefault; use std::hash::BuildHasherDefault;
@@ -49,42 +48,12 @@ impl StopWordFilter {
} }
} }
pub struct StopWordFilterStream<'a> {
words: StopWordHashSet,
tail: BoxTokenStream<'a>,
}
impl TokenFilter for StopWordFilter { impl TokenFilter for StopWordFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform(&mut self, token: Token) -> Option<Token> {
BoxTokenStream::from(StopWordFilterStream { if self.words.contains(&token.text) {
words: self.words.clone(), return None;
tail: token_stream,
})
}
}
impl<'a> StopWordFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
!self.words.contains(&token.text)
}
}
impl<'a> TokenStream for StopWordFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
} }
false Some(token)
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
} }
} }

View File

@@ -1,95 +1,121 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream}; use crate::tokenizer::Token;
use std::ops::DerefMut;
const POSITION_GAP: usize = 2; const POSITION_GAP: usize = 2;
pub(crate) struct TokenStreamChain<'a> { pub(crate) struct TokenStreamChain<Inner, Outer> {
offsets: Vec<usize>, streams_with_offsets: Outer,
token_streams: Vec<BoxTokenStream<'a>>, current: Option<(Inner, usize)>,
position: usize,
position_shift: usize, position_shift: usize,
stream_idx: usize,
token: Token,
} }
impl<'a> TokenStreamChain<'a> { impl<'a, Inner, Outer> TokenStreamChain<Inner, Outer>
pub fn new( where
offsets: Vec<usize>, Inner: Iterator<Item = Token>,
token_streams: Vec<BoxTokenStream<'a>>, Outer: Iterator<Item = (Inner, usize)>,
) -> TokenStreamChain<'a> { {
pub fn new(mut streams_with_offsets: Outer) -> TokenStreamChain<Inner, Outer> {
let current = streams_with_offsets.next();
TokenStreamChain { TokenStreamChain {
offsets, streams_with_offsets: streams_with_offsets,
stream_idx: 0, current,
token_streams, position: usize::max_value(),
position_shift: 0, position_shift: 0,
token: Token::default(),
} }
} }
} }
impl<'a> TokenStream for TokenStreamChain<'a> { impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
fn advance(&mut self) -> bool { where
while self.stream_idx < self.token_streams.len() { Inner: Iterator<Item = Token>,
let token_stream = self.token_streams[self.stream_idx].deref_mut(); Outer: Iterator<Item = (Inner, usize)>,
if token_stream.advance() { {
let token = token_stream.token(); type Item = Token;
let offset_offset = self.offsets[self.stream_idx]; fn next(&mut self) -> Option<Token> {
self.token.offset_from = token.offset_from + offset_offset; while let Some((ref mut token_stream, offset_offset)) = self.current {
self.token.offset_to = token.offset_to + offset_offset; if let Some(mut token) = token_stream.next() {
self.token.position = token.position + self.position_shift; token.offset_from += offset_offset;
self.token.text.clear(); token.offset_to += offset_offset;
self.token.text.push_str(token.text.as_str()); token.position += self.position_shift;
return true; self.position = token.position;
} else { return Some(token);
self.stream_idx += 1;
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
} }
self.position_shift = self.position.wrapping_add(POSITION_GAP);
self.current = self.streams_with_offsets.next();
} }
false None
} }
}
fn token(&self) -> &Token { impl DynTokenStreamChain {
assert!( pub fn from_vec(
self.stream_idx <= self.token_streams.len(), streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
"You called .token(), after the end of the token stream has been reached" ) -> impl Iterator<Item = Token> {
); DynTokenStreamChain {
&self.token streams_with_offsets,
idx: 0,
position: usize::max_value(),
position_shift: 0,
}
} }
}
fn token_mut(&mut self) -> &mut Token { pub(crate) struct DynTokenStreamChain {
assert!( streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
self.stream_idx <= self.token_streams.len(), idx: usize,
"You called .token(), after the end of the token stream has been reached" position: usize,
); position_shift: usize,
&mut self.token }
impl Iterator for DynTokenStreamChain {
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some((token_stream, offset_offset)) = self.streams_with_offsets.get_mut(self.idx)
{
if let Some(mut token) = token_stream.next() {
token.offset_from += *offset_offset;
token.offset_to += *offset_offset;
token.position += self.position_shift;
self.position = token.position;
return Some(token);
}
self.idx += 1;
self.position_shift = self.position.wrapping_add(POSITION_GAP);
}
None
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::super::{SimpleTokenizer, TokenStream, Tokenizer}; use super::super::tokenizer::Tokenizer;
use super::TokenStreamChain; use super::super::SimpleTokenizer;
use super::POSITION_GAP; use super::*;
#[test] #[test]
fn test_chain_first_emits_no_tokens() { fn test_chain_first_emits_no_tokens() {
let token_streams = vec![ let token_streams = vec![
SimpleTokenizer.token_stream(""), (SimpleTokenizer.token_stream(""), 0),
SimpleTokenizer.token_stream("hello world"), (SimpleTokenizer.token_stream("hello world"), 0),
]; ];
let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams); let mut token_chain = TokenStreamChain::new(token_streams.into_iter());
let token = token_chain.next();
assert!(token_chain.advance()); let expect = Token {
assert_eq!(token_chain.token().text, "hello"); offset_from: 0,
assert_eq!(token_chain.token().offset_from, 0); offset_to: 5,
assert_eq!(token_chain.token().offset_to, 5); position: POSITION_GAP - 1,
assert_eq!(token_chain.token().position, POSITION_GAP - 1); text: "hello".into(),
..Token::default()
};
assert_eq!(token.unwrap(), expect);
assert!(token_chain.advance()); let token = token_chain.next().unwrap();
assert_eq!(token_chain.token().text, "world"); assert_eq!(token.text, "world");
assert_eq!(token_chain.token().offset_from, 6); assert_eq!(token.offset_from, 6);
assert_eq!(token_chain.token().offset_to, 11); assert_eq!(token.offset_to, 11);
assert_eq!(token_chain.token().position, POSITION_GAP); assert_eq!(token.position, POSITION_GAP);
assert!(!token_chain.advance()); assert!(token_chain.next().is_none());
} }
} }

View File

@@ -1,4 +1,4 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain}; use crate::tokenizer::{Token, TokenStreamChain};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::cmp::Ordering; use std::cmp::Ordering;
@@ -26,14 +26,14 @@ impl PartialOrd for PreTokenizedString {
/// TokenStream implementation which wraps PreTokenizedString /// TokenStream implementation which wraps PreTokenizedString
pub struct PreTokenizedStream { pub struct PreTokenizedStream {
tokenized_string: PreTokenizedString, tokenized_string: PreTokenizedString,
current_token: i64, current_token: usize,
} }
impl From<PreTokenizedString> for PreTokenizedStream { impl From<PreTokenizedString> for PreTokenizedStream {
fn from(s: PreTokenizedString) -> PreTokenizedStream { fn from(s: PreTokenizedString) -> PreTokenizedStream {
PreTokenizedStream { PreTokenizedStream {
tokenized_string: s, tokenized_string: s,
current_token: -1, current_token: 0,
} }
} }
} }
@@ -41,49 +41,28 @@ impl From<PreTokenizedString> for PreTokenizedStream {
impl PreTokenizedStream { impl PreTokenizedStream {
/// Creates a TokenStream from PreTokenizedString array /// Creates a TokenStream from PreTokenizedString array
pub fn chain_tokenized_strings<'a>( pub fn chain_tokenized_strings<'a>(
tok_strings: &'a [&'a PreTokenizedString], tok_strings: &'a [&PreTokenizedString],
) -> BoxTokenStream { ) -> impl Iterator<Item = Token> + 'a {
if tok_strings.len() == 1 { let streams_with_offsets = tok_strings.iter().scan(0, |total_offset, tok_string| {
PreTokenizedStream::from((*tok_strings[0]).clone()).into() let next = Some((
} else { PreTokenizedStream::from((*tok_string).to_owned()),
let mut offsets = vec![]; *total_offset,
let mut total_offset = 0; ));
for &tok_string in tok_strings { if let Some(last_token) = tok_string.tokens.last() {
offsets.push(total_offset); *total_offset += last_token.offset_to;
if let Some(last_token) = tok_string.tokens.last() {
total_offset += last_token.offset_to;
}
} }
// TODO remove the string cloning. next
let token_streams: Vec<BoxTokenStream<'static>> = tok_strings });
.iter() TokenStreamChain::new(streams_with_offsets)
.map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
.collect();
TokenStreamChain::new(offsets, token_streams).into()
}
} }
} }
impl TokenStream for PreTokenizedStream { impl Iterator for PreTokenizedStream {
fn advance(&mut self) -> bool { type Item = Token;
fn next(&mut self) -> Option<Token> {
let token = self.tokenized_string.tokens.get(self.current_token)?;
self.current_token += 1; self.current_token += 1;
self.current_token < self.tokenized_string.tokens.len() as i64 Some(token.clone())
}
fn token(&self) -> &Token {
assert!(
self.current_token >= 0,
"TokenStream not initialized. You should call advance() at least once."
);
&self.tokenized_string.tokens[self.current_token as usize]
}
fn token_mut(&mut self) -> &mut Token {
assert!(
self.current_token >= 0,
"TokenStream not initialized. You should call advance() at least once."
);
&mut self.tokenized_string.tokens[self.current_token as usize]
} }
} }
@@ -119,10 +98,9 @@ mod tests {
let mut token_stream = PreTokenizedStream::from(tok_text.clone()); let mut token_stream = PreTokenizedStream::from(tok_text.clone());
for expected_token in tok_text.tokens { for expected_token in tok_text.tokens {
assert!(token_stream.advance()); assert_eq!(token_stream.next().unwrap(), expected_token);
assert_eq!(token_stream.token(), &expected_token);
} }
assert!(!token_stream.advance()); assert!(token_stream.next().is_none());
} }
#[test] #[test]
@@ -183,9 +161,8 @@ mod tests {
]; ];
for expected_token in expected_tokens { for expected_token in expected_tokens {
assert!(token_stream.advance()); assert_eq!(token_stream.next().unwrap(), expected_token);
assert_eq!(token_stream.token(), &expected_token);
} }
assert!(!token_stream.advance()); assert!(token_stream.next().is_none());
} }
} }

View File

@@ -2,8 +2,23 @@ use crate::tokenizer::TokenStreamChain;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
/// The tokenizer module contains all of the tools used to process /// The tokenizer module contains all of the tools used to process
/// text in `tantivy`. /// text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut}; pub trait TextAnalyzerClone {
fn box_clone(&self) -> Box<dyn TextAnalyzerT>;
}
/// 'Top-level' trait hiding concrete types, below which static dispatch occurs.
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
/// 'Top-level' dynamic dispatch function hiding concrete types of the statically
/// dispatched `token_stream` from the `Tokenizer` trait.
fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>>;
}
impl Clone for Box<dyn TextAnalyzerT> {
fn clone(&self) -> Self {
(**self).box_clone()
}
}
/// Token /// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
@@ -35,35 +50,116 @@ impl Default for Token {
} }
} }
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`. /// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// Take a `Token` and transform it or return `None` if it's to be removed
/// from the output stream.
fn transform(&mut self, token: Token) -> Option<Token>;
}
/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
///
/// See the [module documentation](./index.html) for more detail.
pub trait Tokenizer: 'static + Send + Sync + Clone {
/// The iterator type returned by `token_stream`.
type Iter: Iterator<Item = Token>;
/// Creates a token stream for a given `str`.
fn token_stream(&self, text: &str) -> Self::Iter;
/// Tokenize an array of `&str`.
///
/// The resulting `Token` stream is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent a `PhraseQuery` from accidentally matching across two terms.
fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn Iterator<Item = Token> + 'a> {
let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
let temp = *total_offset;
*total_offset += text.len();
Some((self.token_stream(text), temp))
});
Box::new(TokenStreamChain::new(streams_with_offsets))
}
}
/// `TextAnalyzer` wraps the tokenization of an input text and its modification by any filters applied onto it.
/// ///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially. /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer { #[derive(Clone, Debug, Default)]
tokenizer: Box<dyn Tokenizer>, pub struct TextAnalyzer<T>(T);
token_filters: Vec<BoxTokenFilter>,
}
impl<T: Tokenizer> From<T> for TextAnalyzer { impl<T: Tokenizer> From<T> for TextAnalyzer<T> {
fn from(tokenizer: T) -> Self { fn from(src: T) -> TextAnalyzer<T> {
TextAnalyzer::new(tokenizer, Vec::new()) TextAnalyzer(src)
} }
} }
impl TextAnalyzer { impl<T: Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`. fn box_clone(&self) -> Box<dyn TextAnalyzerT> {
/// Box::new(TextAnalyzer(self.0.clone()))
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using }
/// `TextAnalyzer::from(tokenizer)`. }
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
TextAnalyzer { impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
tokenizer: Box::new(tokenizer), fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>> {
token_filters, Box::new(self.0.token_stream(text))
}
}
/// Identity `TokenFilter`
#[derive(Clone, Debug, Default)]
pub struct Identity;
impl TokenFilter for Identity {
fn transform(&mut self, token: Token) -> Option<Token> {
Some(token)
}
}
/// `Filter` is a wrapper around a `Token` stream and a `TokenFilter` which modifies it.
#[derive(Clone, Default, Debug)]
pub struct Filter<I, F> {
iter: I,
f: F,
}
impl<I, F> Iterator for Filter<I, F>
where
I: Iterator<Item = Token>,
F: TokenFilter,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some(token) = self.iter.next() {
if let Some(tok) = self.f.transform(token) {
return Some(tok);
}
} }
None
} }
}
#[derive(Clone, Debug, Default)]
pub struct AnalyzerBuilder<T, F> {
tokenizer: T,
f: F,
}
/// Construct an `AnalyzerBuilder` on which to apply `TokenFilter`.
pub fn analyzer_builder<T: Tokenizer>(tokenizer: T) -> AnalyzerBuilder<T, Identity> {
AnalyzerBuilder {
tokenizer,
f: Identity,
}
}
impl<T, F> AnalyzerBuilder<T, F>
where
T: Tokenizer,
F: TokenFilter,
{
/// Appends a token filter to the current tokenizer. /// Appends a token filter to the current tokenizer.
/// ///
/// The method consumes the current `TokenStream` and returns a /// The method consumes the current `Token` and returns a
/// new one. /// new one.
/// ///
/// # Example /// # Example
@@ -71,248 +167,35 @@ impl TextAnalyzer {
/// ```rust /// ```rust
/// use tantivy::tokenizer::*; /// use tantivy::tokenizer::*;
/// ///
/// let en_stem = TextAnalyzer::from(SimpleTokenizer) /// let en_stem = analyzer_builder(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40)) /// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser) /// .filter(LowerCaser::new())
/// .filter(Stemmer::default()); /// .filter(Stemmer::default()).build();
/// ``` /// ```
/// ///
pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self { pub fn filter<G: TokenFilter>(self, f: G) -> AnalyzerBuilder<AnalyzerBuilder<T, F>, G> {
self.token_filters.push(token_filter.into()); AnalyzerBuilder { tokenizer: self, f }
self
} }
/// Finalize the build process.
pub fn build(self) -> TextAnalyzer<AnalyzerBuilder<T, F>> {
TextAnalyzer(self)
}
}
/// Tokenize an array`&str` impl<T: Tokenizer, F: TokenFilter> Tokenizer for AnalyzerBuilder<T, F> {
/// type Iter = Filter<T::Iter, F>;
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were fn token_stream(&self, text: &str) -> Self::Iter {
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields Filter {
/// to prevent accidental `PhraseQuery` to match accross two terms. iter: self.tokenizer.token_stream(text),
pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> { f: self.f.clone(),
assert!(!texts.is_empty());
if texts.len() == 1 {
self.token_stream(texts[0])
} else {
let mut offsets = vec![];
let mut total_offset = 0;
for &text in texts {
offsets.push(total_offset);
total_offset += text.len();
}
let token_streams: Vec<BoxTokenStream<'a>> = texts
.iter()
.cloned()
.map(|text| self.token_stream(text))
.collect();
From::from(TokenStreamChain::new(offsets, token_streams))
} }
} }
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let mut token_stream = self.tokenizer.token_stream(text);
for token_filter in &self.token_filters {
token_stream = token_filter.transform(token_stream);
}
token_stream
}
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
token_filters: self
.token_filters
.iter()
.map(|token_filter| token_filter.box_clone())
.collect(),
}
}
}
/// `Tokenizer` are in charge of splitting text into a stream of token
/// before indexing.
///
/// See the [module documentation](./index.html) for more detail.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
Box::new(self.clone())
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
token_stream.advance()
}
fn token<'b>(&'b self) -> &'b Token {
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
token_stream.token()
}
fn token_mut<'b>(&'b mut self) -> &'b mut Token {
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
token_stream.token_mut()
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where
T: TokenStream + 'a,
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
/// `TokenStream` is the result of the tokenization.
///
/// It consists consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// ```
///
pub trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
fn advance(&mut self) -> bool;
/// Returns a reference to the current token.
fn token(&self) -> &Token;
/// Returns a mutable reference to the current token.
fn token_mut(&mut self) -> &mut Token;
/// Helper to iterate over tokens. It
/// simply combines a call to `.advance()`
/// and `.token()`.
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// while let Some(token) = token_stream.next() {
/// println!("Token {:?}", token.text);
/// }
/// ```
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
} else {
None
}
}
/// Helper function to consume the entire `TokenStream`
/// and push the tokens to a sink function.
///
/// Remove this.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
let mut num_tokens_pushed = 0u32;
while self.advance() {
sink(self.token());
num_tokens_pushed += 1u32;
}
num_tokens_pushed
}
}
pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}
/// Trait for the pluggable components of `Tokenizer`s.
///
/// Implementors must be `'static`, `Send` and `Sync`, and must also provide
/// `TokenFilterClone` (the blanket impl in this file supplies it for `Clone` types).
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
///
/// The incoming `token_stream` is taken by value; the returned stream carries
/// the same lifetime `'a` as the input.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
// Blanket implementation: any `TokenFilter` that is also `Clone` gets `box_clone`.
impl<T: TokenFilter + Clone> TokenFilterClone for T {
// Clones the concrete filter and converts the clone with `BoxTokenFilter::from`.
fn box_clone(&self) -> BoxTokenFilter {
BoxTokenFilter::from(self.clone())
}
} }
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::Token; use super::*;
use crate::tokenizer::SimpleTokenizer;
#[test] #[test]
fn clone() { fn clone() {
@@ -330,4 +213,15 @@ mod test {
assert_eq!(t1.offset_to, t2.offset_to); assert_eq!(t1.offset_to, t2.offset_to);
assert_eq!(t1.text, t2.text); assert_eq!(t1.text, t2.text);
} }
#[test]
fn text_analyzer() {
    // Verifies that `SimpleTokenizer` yields the three words of the input, in order,
    // and then reports exhaustion. The previous version only printed each result with
    // `dbg!`, so it could never fail on wrong tokenization.
    let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
    for expected in &["tokenizer", "hello", "world"] {
        let token = stream
            .next()
            .expect("stream ended before every word was produced");
        assert_eq!(token.text.as_str(), *expected);
    }
    assert!(stream.next().is_none());
    // The original smoke test polled the stream six times in total. Keep polling past
    // the end so a panic there is still caught; the returned values are deliberately
    // not asserted.
    let _ = stream.next();
    let _ = stream.next();
}
} }

View File

@@ -1,5 +1,5 @@
use crate::tokenizer::stemmer::Language; use crate::tokenizer::stemmer::Language;
use crate::tokenizer::tokenizer::TextAnalyzer; use crate::tokenizer::tokenizer::{analyzer_builder, TextAnalyzer, TextAnalyzerT, Tokenizer};
use crate::tokenizer::LowerCaser; use crate::tokenizer::LowerCaser;
use crate::tokenizer::RawTokenizer; use crate::tokenizer::RawTokenizer;
use crate::tokenizer::RemoveLongFilter; use crate::tokenizer::RemoveLongFilter;
@@ -22,24 +22,23 @@ use std::sync::{Arc, RwLock};
/// search engine. /// search engine.
#[derive(Clone)] #[derive(Clone)]
pub struct TokenizerManager { pub struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>, tokenizers: Arc<RwLock<HashMap<String, Box<dyn TextAnalyzerT>>>>,
} }
impl TokenizerManager { impl TokenizerManager {
/// Registers a new tokenizer associated with a given name. /// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T) pub fn register<U: Tokenizer, T>(&self, tokenizer_name: &str, tokenizer: T)
where where
TextAnalyzer: From<T>, T: Into<TextAnalyzer<U>>,
{ {
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
self.tokenizers self.tokenizers
.write() .write()
.expect("Acquiring the lock should never fail") .expect("Acquiring the lock should never fail")
.insert(tokenizer_name.to_string(), boxed_tokenizer); .insert(tokenizer_name.to_string(), Box::new(tokenizer.into()));
} }
/// Accessing a tokenizer given its name. /// Accessing a tokenizer given its name.
pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> { pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn TextAnalyzerT>> {
self.tokenizers self.tokenizers
.read() .read()
.expect("Acquiring the lock should never fail") .expect("Acquiring the lock should never fail")
@@ -54,23 +53,25 @@ impl Default for TokenizerManager {
/// - simple /// - simple
/// - en_stem /// - en_stem
/// - ja /// - ja
fn default() -> TokenizerManager { fn default() -> Self {
let manager = TokenizerManager { let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new())), tokenizers: Arc::new(RwLock::new(HashMap::new())),
}; };
manager.register("raw", RawTokenizer); manager.register("raw", RawTokenizer);
manager.register( manager.register(
"default", "default",
TextAnalyzer::from(SimpleTokenizer) analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40)) .filter(RemoveLongFilter::limit(40))
.filter(LowerCaser), .filter(LowerCaser::new())
.build(),
); );
manager.register( manager.register(
"en_stem", "en_stem",
TextAnalyzer::from(SimpleTokenizer) analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40)) .filter(RemoveLongFilter::limit(40))
.filter(LowerCaser) .filter(LowerCaser::new())
.filter(Stemmer::new(Language::English)), .filter(Stemmer::new(Language::English))
.build(),
); );
manager manager
} }