mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-29 22:50:41 +00:00
Tests compile.
This commit is contained in:
@@ -87,6 +87,9 @@ members = ["query-grammar"]
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
[patch.crates-io]
|
||||
rust-stemmers = {path = "src/vendor/rust-stemmers"}
|
||||
|
||||
# Following the "fail" crate best practices, we isolate
|
||||
# tests that define specific behavior in fail check points
|
||||
# in a different binary.
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::NgramTokenizer;
|
||||
use tantivy::tokenizer::{NgramTokenizer, TextAnalyzer};
|
||||
use tantivy::{doc, Index};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
@@ -52,9 +52,10 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
// here we are registering our custom tokenizer
|
||||
// this will store tokens of 3 characters each
|
||||
index
|
||||
.tokenizers()
|
||||
.register("ngram3", NgramTokenizer::new(3, 3, false));
|
||||
index.tokenizers().register(
|
||||
"ngram3",
|
||||
TextAnalyzer::new(NgramTokenizer::new(3, 3, false)),
|
||||
);
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
|
||||
@@ -17,12 +17,7 @@ use tantivy::{doc, Index, ReloadPolicy};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn pre_tokenize_text(text: &str) -> Vec<Token> {
|
||||
let mut token_stream = SimpleTokenizer.token_stream(text);
|
||||
let mut tokens = vec![];
|
||||
while token_stream.advance() {
|
||||
tokens.push(token_stream.token().clone());
|
||||
}
|
||||
tokens
|
||||
SimpleTokenizer.token_stream(text).collect()
|
||||
}
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
|
||||
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||
|
||||
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
|
||||
let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
|
||||
|
||||
for (score, doc_address) in top_docs {
|
||||
let doc = searcher.doc(doc_address)?;
|
||||
|
||||
@@ -50,9 +50,9 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
// This tokenizer lowers all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
let tokenizer = TextAnalyzer::new(SimpleTokenizer)
|
||||
.filter(LowerCaser::new())
|
||||
.filter(StopWordFilter::new(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
|
||||
@@ -37,12 +37,14 @@ fn load_metas(
|
||||
) -> crate::Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
let meta_string = String::from_utf8_lossy(&meta_data);
|
||||
IndexMeta::deserialize(&meta_string, &inventory).map_err(|e| {
|
||||
DataCorruption::new(
|
||||
META_FILEPATH.to_path_buf(),
|
||||
format!("Meta file cannot be deserialized. {:?}.", e),
|
||||
)
|
||||
})?
|
||||
IndexMeta::deserialize(&meta_string, &inventory)
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
META_FILEPATH.to_path_buf(),
|
||||
format!("Meta file cannot be deserialized. {:?}.", e),
|
||||
)
|
||||
})
|
||||
.map_err(From::from)
|
||||
}
|
||||
|
||||
/// Search Index
|
||||
@@ -179,11 +181,11 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Helper to access the tokenizer associated with a specific field.
|
||||
pub fn tokenizer_for_field(&'a self, field: Field) -> crate::Result<Box<dyn TextAnalyzerT<'a>>> {
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<Box<dyn TextAnalyzerT>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager<'a> = self.tokenizers();
|
||||
let tokenizer_name_opt: Option<Box<dyn TextAnalyzerT<'static>>> = match field_type {
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let tokenizer_name_opt: Option<Box<dyn TextAnalyzerT>> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
|
||||
@@ -13,8 +13,8 @@ use crate::schema::Value;
|
||||
use crate::schema::{Field, FieldEntry};
|
||||
use crate::tokenizer::PreTokenizedStream;
|
||||
use crate::tokenizer::TokenStream;
|
||||
use crate::tokenizer::{DynTokenStreamChain, TextAnalyzerT, TokenStreamChain, Tokenizer};
|
||||
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
|
||||
use crate::tokenizer::{TextAnalyzerT, TokenStreamChain, Tokenizer};
|
||||
use crate::Opstamp;
|
||||
use crate::{DocId, SegmentComponent};
|
||||
|
||||
@@ -24,7 +24,7 @@ use crate::{DocId, SegmentComponent};
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
|
||||
let table_memory_upper_bound = per_thread_memory_budget / 3;
|
||||
if let Some(limit) = (10..)
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
|
||||
.take_while(|&num_bits| compute_table_size(num_bits) < table_memory_upper_bound)
|
||||
.last()
|
||||
{
|
||||
Ok(limit.min(19)) // we cap it at 2^19 = 512K.
|
||||
@@ -46,8 +46,8 @@ pub struct SegmentWriter {
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
doc_opstamps: Vec<Opstamp>,
|
||||
// TODO: redo ugly trait
|
||||
tokenizers: Vec<Option<Box<dyn TextAnalyzerT<'static>>>>,
|
||||
// TODO: change type
|
||||
tokenizers: Vec<Option<Box<dyn TextAnalyzerT>>>,
|
||||
term_buffer: Term,
|
||||
}
|
||||
|
||||
@@ -72,17 +72,17 @@ impl SegmentWriter {
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.map(
|
||||
|(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.map(|(_, field_entry)| match field_entry.field_type() {
|
||||
FieldType::Str(text_options) => {
|
||||
text_options
|
||||
.get_indexing_options()
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
}),
|
||||
_ => None,
|
||||
},
|
||||
)
|
||||
})
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
@@ -159,12 +159,13 @@ impl SegmentWriter {
|
||||
let mut unordered_term_id_opt = None;
|
||||
FacetTokenizer
|
||||
.token_stream(facet_str)
|
||||
.process(&mut |token| {
|
||||
.map(|token| {
|
||||
term_buffer.set_text(&token.text);
|
||||
let unordered_term_id =
|
||||
multifield_postings.subscribe(doc_id, &term_buffer);
|
||||
unordered_term_id_opt = Some(unordered_term_id);
|
||||
});
|
||||
})
|
||||
.count();
|
||||
if let Some(unordered_term_id) = unordered_term_id_opt {
|
||||
self.fast_field_writers
|
||||
.get_multivalue_writer(field)
|
||||
@@ -189,7 +190,7 @@ impl SegmentWriter {
|
||||
total_offset += last_token.offset_to;
|
||||
}
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
Value::Str(text) => {
|
||||
if let Some(ref mut tokenizer) =
|
||||
self.tokenizers[field.field_id() as usize]
|
||||
{
|
||||
@@ -205,7 +206,7 @@ impl SegmentWriter {
|
||||
let num_tokens = if streams_with_offsets.is_empty() {
|
||||
0
|
||||
} else {
|
||||
let mut token_stream = TokenStreamChain::new(streams_with_offsets);
|
||||
let mut token_stream = DynTokenStreamChain::from_vec(streams_with_offsets);
|
||||
multifield_postings.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
@@ -271,6 +272,7 @@ impl SegmentWriter {
|
||||
self.multifield_postings.subscribe(doc_id, &term_buffer);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
#![warn(missing_docs)]
|
||||
#![allow(unused_imports)]
|
||||
|
||||
//! # `tantivy`
|
||||
//!
|
||||
|
||||
@@ -50,7 +50,7 @@ pub mod tests {
|
||||
use crate::schema::{Field, TextOptions};
|
||||
use crate::schema::{IndexRecordOption, TextFieldIndexing};
|
||||
use crate::schema::{Schema, Term, INDEXED, TEXT};
|
||||
use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
|
||||
use crate::tokenizer::{SimpleTokenizer, TextAnalyzer, MAX_TOKEN_LEN};
|
||||
use crate::DocId;
|
||||
use crate::HasLen;
|
||||
use crate::Score;
|
||||
@@ -167,7 +167,7 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
.register("simple_no_truncation", TextAnalyzer::new(SimpleTokenizer));
|
||||
let reader = index.reader().unwrap();
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
|
||||
@@ -240,7 +240,7 @@ pub trait PostingsWriter {
|
||||
);
|
||||
}
|
||||
};
|
||||
token_stream.process(&mut sink)
|
||||
token_stream.map(|tok| sink(&tok)).count() as u32
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64;
|
||||
|
||||
@@ -289,7 +289,7 @@ impl QueryParser {
|
||||
let field_name = field_entry.name().to_string();
|
||||
return Err(QueryParserError::FieldNotIndexed(field_name));
|
||||
}
|
||||
match *field_type {
|
||||
match field_type {
|
||||
FieldType::I64(_) => {
|
||||
let val: i64 = i64::from_str(phrase)?;
|
||||
let term = Term::from_field_i64(field, val);
|
||||
@@ -312,7 +312,7 @@ impl QueryParser {
|
||||
let term = Term::from_field_u64(field, val);
|
||||
Ok(vec![(0, term)])
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
FieldType::Str(str_options) => {
|
||||
if let Some(option) = str_options.get_indexing_options() {
|
||||
let tokenizer =
|
||||
self.tokenizer_manager
|
||||
@@ -323,12 +323,13 @@ impl QueryParser {
|
||||
option.tokenizer().to_string(),
|
||||
)
|
||||
})?;
|
||||
let mut terms: Vec<(usize, Term)> = Vec::new();
|
||||
let mut token_stream = tokenizer.token_stream(phrase);
|
||||
token_stream.process(&mut |token| {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
terms.push((token.position, term));
|
||||
});
|
||||
let token_stream = tokenizer.token_stream(phrase);
|
||||
let terms: Vec<_> = token_stream
|
||||
.map(|token| {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
(token.position, term)
|
||||
})
|
||||
.collect();
|
||||
if terms.len() <= 1 {
|
||||
Ok(terms)
|
||||
} else {
|
||||
@@ -412,7 +413,7 @@ impl QueryParser {
|
||||
&self,
|
||||
given_field: &Option<String>,
|
||||
) -> Result<Cow<'_, [Field]>, QueryParserError> {
|
||||
match *given_field {
|
||||
match given_field {
|
||||
None => {
|
||||
if self.default_fields.is_empty() {
|
||||
Err(QueryParserError::NoDefaultFieldDeclared)
|
||||
@@ -420,7 +421,7 @@ impl QueryParser {
|
||||
Ok(Cow::from(&self.default_fields[..]))
|
||||
}
|
||||
}
|
||||
Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
|
||||
Some(field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -618,9 +619,9 @@ mod test {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register(
|
||||
"en_with_stop_words",
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()])),
|
||||
TextAnalyzer::new(SimpleTokenizer)
|
||||
.filter(LowerCaser::new())
|
||||
.filter(StopWordFilter::new(vec!["the".to_string()])),
|
||||
);
|
||||
QueryParser::new(schema, default_fields, tokenizer_manager)
|
||||
}
|
||||
@@ -977,7 +978,7 @@ mod test {
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("customtokenizer", SimpleTokenizer);
|
||||
.register("customtokenizer", TextAnalyzer::new(SimpleTokenizer));
|
||||
let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
assert_eq!(
|
||||
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),
|
||||
|
||||
@@ -139,13 +139,13 @@ impl Snippet {
|
||||
///
|
||||
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
|
||||
/// has to be a valid string.
|
||||
fn search_fragments<'a>(
|
||||
tokenizer: &(dyn TextAnalyzerT<'a> + 'a),
|
||||
text: String,
|
||||
fn search_fragments(
|
||||
tokenizer: &dyn TextAnalyzerT,
|
||||
text: &str,
|
||||
terms: &BTreeMap<String, Score>,
|
||||
max_num_chars: usize,
|
||||
) -> Vec<FragmentCandidate> {
|
||||
let mut token_stream = tokenizer.token_stream(text.as_ref());
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
let mut fragment = FragmentCandidate::new(0);
|
||||
let mut fragments: Vec<FragmentCandidate> = vec![];
|
||||
while let Some(next) = token_stream.next() {
|
||||
@@ -249,7 +249,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
||||
/// ```
|
||||
pub struct SnippetGenerator {
|
||||
terms_text: BTreeMap<String, Score>,
|
||||
tokenizer: Box<dyn TextAnalyzerT<'static>>,
|
||||
tokenizer: Box<dyn TextAnalyzerT>,
|
||||
field: Field,
|
||||
max_num_chars: usize,
|
||||
}
|
||||
@@ -297,33 +297,37 @@ impl SnippetGenerator {
|
||||
///
|
||||
/// This method extracts the text associated with the `SnippetGenerator`'s field
|
||||
/// and computes a snippet.
|
||||
pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
|
||||
pub fn snippet_from_doc(&mut self, doc: &Document) -> Snippet {
|
||||
let text: String = doc
|
||||
.get_all(self.field)
|
||||
.flat_map(Value::text)
|
||||
.collect::<Vec<&str>>()
|
||||
.join(" ");
|
||||
self.snippet(text)
|
||||
self.snippet(text.as_ref())
|
||||
}
|
||||
|
||||
/// Generates a snippet for the given text.
|
||||
pub fn snippet(&self, text: String) -> Snippet {
|
||||
let fragment_candidates =
|
||||
search_fragments(&mut *self.tokenizer, text, &self.terms_text, self.max_num_chars);
|
||||
select_best_fragment_combination(&fragment_candidates[..], &text)
|
||||
pub fn snippet(&mut self, text: &str) -> Snippet {
|
||||
let fragment_candidates = search_fragments(
|
||||
&mut *self.tokenizer,
|
||||
text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
);
|
||||
select_best_fragment_combination(&fragment_candidates[..], text)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{search_fragments, select_best_fragment_combination};
|
||||
use super::*;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
|
||||
use crate::tokenizer::SimpleTokenizer;
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
use crate::Index;
|
||||
use crate::SnippetGenerator;
|
||||
use maplit::btreemap;
|
||||
use std::collections::BTreeMap;
|
||||
use std::iter::Iterator;
|
||||
|
||||
const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by
|
||||
@@ -346,7 +350,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") => 1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
|
||||
let fragments =
|
||||
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 100);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -373,7 +378,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") =>1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments =
|
||||
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
@@ -387,7 +393,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") =>0.9,
|
||||
String::from("language") => 1.0
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments =
|
||||
search_fragments(&TextAnalyzer::new(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
//assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -406,7 +413,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("c"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
@@ -428,7 +435,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -451,7 +458,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
terms.insert(String::from("a"), 0.9);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7);
|
||||
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 7);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -473,7 +480,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("z"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
@@ -487,7 +494,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let text = "a b c d";
|
||||
|
||||
let terms = BTreeMap::new();
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&TextAnalyzer::new(SimpleTokenizer), &text, &terms, 3);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], &text);
|
||||
@@ -572,12 +579,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut snippet_generator =
|
||||
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
|
||||
{
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT);
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT.into());
|
||||
assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
|
||||
}
|
||||
{
|
||||
snippet_generator.set_max_num_chars(90);
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT);
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT.into());
|
||||
assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,38 +26,11 @@ use super::{Token, TokenFilter, TokenStream};
|
||||
#[derive(Clone)]
|
||||
pub struct AlphaNumOnlyFilter;
|
||||
|
||||
pub struct AlphaNumOnlyFilterStream<'a> {
|
||||
tail: Box<dyn TokenStream + 'a>,
|
||||
}
|
||||
|
||||
impl<'a> AlphaNumOnlyFilterStream<'a> {
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
token.text.chars().all(|c| c.is_ascii_alphanumeric())
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenFilter for AlphaNumOnlyFilter {
|
||||
fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
|
||||
Box::new(AlphaNumOnlyFilterStream { tail: token_stream })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
fn transform(&mut self, token: Token) -> Option<Token> {
|
||||
if token.text.chars().all(|c| c.is_ascii_alphanumeric()) {
|
||||
return None;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
Some(token)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::{Token, TokenStream, TokenFilter};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use std::mem;
|
||||
|
||||
/// This class converts alphabetic, numeric, and symbolic Unicode characters
|
||||
@@ -9,14 +9,23 @@ pub struct AsciiFolding {
|
||||
buffer: String,
|
||||
}
|
||||
|
||||
impl AsciiFolding {
|
||||
/// Construct a new `AsciiFolding` filter.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
buffer: String::with_capacity(100),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenFilter for AsciiFolding {
|
||||
fn transform(&mut self, mut token: Token) -> Option<Token> {
|
||||
let token = &mut token;
|
||||
if !token.text.is_ascii() {
|
||||
// ignore if it's already ascii
|
||||
to_ascii(&mut token.text, &mut self.buffer);
|
||||
mem::swap(&mut token, &mut self.buffer);
|
||||
to_ascii(&token.text, &mut self.buffer);
|
||||
mem::swap(&mut token.text, &mut self.buffer);
|
||||
}
|
||||
Some(token)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1517,11 +1526,8 @@ fn to_ascii(text: &String, output: &mut String) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::to_ascii;
|
||||
use crate::tokenizer::AsciiFoldingFilter;
|
||||
use crate::tokenizer::RawTokenizer;
|
||||
use crate::tokenizer::SimpleTokenizer;
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
use super::super::*;
|
||||
use super::*;
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
@@ -1537,22 +1543,20 @@ mod tests {
|
||||
}
|
||||
|
||||
fn folding_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = Vec::new();
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
.filter(AsciiFoldingFilter)
|
||||
let tokens = TextAnalyzer::new(SimpleTokenizer)
|
||||
.filter(AsciiFolding::new())
|
||||
.token_stream(text)
|
||||
.process(&mut |token| {
|
||||
tokens.push(token.text.clone());
|
||||
});
|
||||
.map(|token| token.text.clone())
|
||||
.collect();
|
||||
tokens
|
||||
}
|
||||
|
||||
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
|
||||
let mut token_stream = TextAnalyzer::from(RawTokenizer)
|
||||
.filter(AsciiFoldingFilter)
|
||||
let mut token_stream = TextAnalyzer::new(RawTokenizer)
|
||||
.filter(AsciiFolding::new())
|
||||
.token_stream(text);
|
||||
token_stream.advance();
|
||||
token_stream.token().text.clone()
|
||||
let Token { text, .. } = token_stream.next().unwrap();
|
||||
text
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1603,9 +1607,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_to_ascii() {
|
||||
let mut input = "Rámon".to_string();
|
||||
let input = "Rámon".to_string();
|
||||
let mut buffer = String::new();
|
||||
to_ascii(&mut input, &mut buffer);
|
||||
to_ascii(&input, &mut buffer);
|
||||
assert_eq!("Ramon", buffer);
|
||||
}
|
||||
|
||||
|
||||
@@ -20,24 +20,24 @@ enum State {
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FacetTokenStream<'a> {
|
||||
text: &'a str,
|
||||
pub struct FacetTokenStream {
|
||||
text: String,
|
||||
state: State,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> for FacetTokenizer {
|
||||
type Iter = FacetTokenStream<'a>;
|
||||
fn token_stream(&self, text: &'a str) -> Self::Iter {
|
||||
impl Tokenizer for FacetTokenizer {
|
||||
type Iter = FacetTokenStream;
|
||||
fn token_stream(&self, text: &str) -> Self::Iter {
|
||||
FacetTokenStream {
|
||||
text,
|
||||
text: text.to_string(),
|
||||
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
|
||||
token: Token::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for FacetTokenStream<'a> {
|
||||
impl Iterator for FacetTokenStream {
|
||||
type Item = Token;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.state {
|
||||
@@ -69,7 +69,7 @@ impl<'a> Iterator for FacetTokenStream<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for FacetTokenStream<'a> {}
|
||||
impl TokenStream for FacetTokenStream {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -81,16 +81,14 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_tokenizer() {
|
||||
let facet = Facet::from_path(vec!["top", "a", "b"]);
|
||||
let mut tokens = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
.token_stream(facet.encoded_str())
|
||||
.process(&mut add_token);
|
||||
}
|
||||
let tokens: Vec<_> = FacetTokenizer
|
||||
.token_stream(facet.encoded_str())
|
||||
.map(|token| {
|
||||
Facet::from_encoded(token.text.as_bytes().to_owned())
|
||||
.unwrap()
|
||||
.to_string()
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_eq!(tokens[0], "/");
|
||||
assert_eq!(tokens[1], "/top");
|
||||
@@ -101,16 +99,14 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_tokenizer_root_facets() {
|
||||
let facet = Facet::root();
|
||||
let mut tokens = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
.token_stream(facet.encoded_str()) // ok test
|
||||
.process(&mut add_token);
|
||||
}
|
||||
let tokens: Vec<_> = FacetTokenizer
|
||||
.token_stream(facet.encoded_str())
|
||||
.map(|token| {
|
||||
Facet::from_encoded(token.text.as_bytes().to_owned())
|
||||
.unwrap()
|
||||
.to_string()
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(tokens.len(), 1);
|
||||
assert_eq!(tokens[0], "/");
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ pub struct LowerCaser {
|
||||
}
|
||||
|
||||
impl LowerCaser {
|
||||
/// Initialize the `LowerCaser`
|
||||
pub fn new() -> Self {
|
||||
LowerCaser {
|
||||
buffer: String::with_capacity(100),
|
||||
@@ -40,6 +41,7 @@ fn to_lowercase_unicode(text: &String, output: &mut String) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
|
||||
|
||||
#[test]
|
||||
@@ -51,13 +53,14 @@ mod tests {
|
||||
}
|
||||
|
||||
fn lowercase_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = vec![];
|
||||
let mut token_stream = TextAnalyzer::new(SimpleTokenizer, text).filter(LowerCaser::new());
|
||||
while let Some(token) = token_stream.next() {
|
||||
let token_text = token.text.clone();
|
||||
tokens.push(token_text);
|
||||
}
|
||||
tokens
|
||||
TextAnalyzer::new(SimpleTokenizer)
|
||||
.filter(LowerCaser::new())
|
||||
.token_stream(text)
|
||||
.map(|token| {
|
||||
let Token { text, .. } = token;
|
||||
text
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -117,8 +117,8 @@
|
||||
//! .register("custom_en", custom_en_tokenizer);
|
||||
//! ```
|
||||
//!
|
||||
// mod alphanum_only;
|
||||
// mod ascii_folding_filter;
|
||||
mod alphanum_only;
|
||||
mod ascii_folding_filter;
|
||||
mod facet_tokenizer;
|
||||
mod lower_caser;
|
||||
mod ngram_tokenizer;
|
||||
@@ -126,14 +126,14 @@ mod raw_tokenizer;
|
||||
mod remove_long;
|
||||
mod simple_tokenizer;
|
||||
mod stemmer;
|
||||
// mod stop_word_filter;
|
||||
mod stop_word_filter;
|
||||
mod token_stream_chain;
|
||||
mod tokenized_string;
|
||||
mod tokenizer;
|
||||
mod tokenizer_manager;
|
||||
|
||||
// pub use self::alphanum_only::AlphaNumOnlyFilter;
|
||||
// pub use self::ascii_folding_filter::AsciiFolding;
|
||||
pub use self::alphanum_only::AlphaNumOnlyFilter;
|
||||
pub use self::ascii_folding_filter::AsciiFolding;
|
||||
pub use self::facet_tokenizer::FacetTokenizer;
|
||||
pub use self::lower_caser::LowerCaser;
|
||||
pub use self::ngram_tokenizer::NgramTokenizer;
|
||||
@@ -141,8 +141,8 @@ pub use self::raw_tokenizer::RawTokenizer;
|
||||
pub use self::remove_long::RemoveLongFilter;
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::{Language, Stemmer};
|
||||
// pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain};
|
||||
|
||||
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
|
||||
pub use self::tokenizer::{
|
||||
@@ -187,15 +187,9 @@ pub mod tests {
|
||||
fn test_raw_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let en_tokenizer = tokenizer_manager.get("raw").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
en_tokenizer
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
let tokens: Vec<Token> = en_tokenizer
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.collect();
|
||||
assert_eq!(tokens.len(), 1);
|
||||
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
|
||||
}
|
||||
@@ -205,15 +199,9 @@ pub mod tests {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
|
||||
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
en_tokenizer
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
let tokens: Vec<Token> = en_tokenizer
|
||||
.token_stream("Hello, happy tax payer!")
|
||||
.collect();
|
||||
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_token(&tokens[0], 0, "hello", 0, 5);
|
||||
@@ -228,20 +216,14 @@ pub mod tests {
|
||||
tokenizer_manager.register(
|
||||
"el_stem",
|
||||
TextAnalyzer::new(SimpleTokenizer)
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(RemoveLongFilter::new(40))
|
||||
.filter(LowerCaser::new())
|
||||
.filter(Stemmer::new(Language::Greek)),
|
||||
);
|
||||
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
en_tokenizer
|
||||
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
let tokens: Vec<Token> = en_tokenizer
|
||||
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
|
||||
.collect();
|
||||
|
||||
assert_eq!(tokens.len(), 3);
|
||||
assert_token(&tokens[0], 0, "καλημερ", 0, 16);
|
||||
@@ -253,25 +235,9 @@ pub mod tests {
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
{
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
en_tokenizer.token_stream(" ").process(&mut add_token);
|
||||
}
|
||||
assert!(tokens.is_empty());
|
||||
}
|
||||
{
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
en_tokenizer.token_stream(" ").process(&mut add_token);
|
||||
}
|
||||
assert!(tokens.is_empty());
|
||||
}
|
||||
let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
|
||||
assert!(tokens.is_empty());
|
||||
let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
|
||||
assert!(tokens.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,20 +118,20 @@ impl NgramTokenizer {
|
||||
}
|
||||
|
||||
/// TokenStream associated with the `NgramTokenizer`
|
||||
pub struct NgramTokenStream<'a> {
|
||||
pub struct NgramTokenStream {
|
||||
/// parameters
|
||||
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
|
||||
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers>,
|
||||
/// true if the NgramTokenStream is in prefix mode.
|
||||
prefix_only: bool,
|
||||
/// input
|
||||
text: &'a str,
|
||||
text: String,
|
||||
/// output
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||
type Iter = NgramTokenStream<'a>;
|
||||
fn token_stream(&self, text: &'a str) -> Self::Iter {
|
||||
impl Tokenizer for NgramTokenizer {
|
||||
type Iter = NgramTokenStream;
|
||||
fn token_stream(&self, text: &str) -> Self::Iter {
|
||||
NgramTokenStream {
|
||||
ngram_charidx_iterator: StutteringIterator::new(
|
||||
CodepointFrontiers::for_str(text),
|
||||
@@ -139,15 +139,15 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||
self.max_gram,
|
||||
),
|
||||
prefix_only: self.prefix_only,
|
||||
text,
|
||||
text: text.to_string(),
|
||||
token: Token::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for NgramTokenStream<'a> {}
|
||||
impl TokenStream for NgramTokenStream {}
|
||||
|
||||
impl<'a> Iterator for NgramTokenStream<'a> {
|
||||
impl Iterator for NgramTokenStream {
|
||||
type Item = Token;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
|
||||
@@ -252,21 +252,21 @@ where
|
||||
/// or a codepoint ends.
|
||||
///
|
||||
/// By convention, we emit [0] for the empty string.
|
||||
struct CodepointFrontiers<'a> {
|
||||
s: &'a str,
|
||||
struct CodepointFrontiers {
|
||||
s: String,
|
||||
next_el: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'a> CodepointFrontiers<'a> {
|
||||
fn for_str(s: &'a str) -> Self {
|
||||
impl CodepointFrontiers {
|
||||
fn for_str(s: &str) -> Self {
|
||||
CodepointFrontiers {
|
||||
s,
|
||||
s: s.to_string(),
|
||||
next_el: Some(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for CodepointFrontiers<'a> {
|
||||
impl<'a> Iterator for CodepointFrontiers {
|
||||
type Item = usize;
|
||||
|
||||
fn next(&mut self) -> Option<usize> {
|
||||
@@ -275,7 +275,7 @@ impl<'a> Iterator for CodepointFrontiers<'a> {
|
||||
self.next_el = None;
|
||||
} else {
|
||||
let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
|
||||
self.s = &self.s[first_codepoint_width..];
|
||||
self.s = (&self.s[first_codepoint_width..]).to_string();
|
||||
self.next_el = Some(offset + first_codepoint_width);
|
||||
}
|
||||
offset
|
||||
@@ -305,10 +305,8 @@ mod tests {
|
||||
use crate::tokenizer::tokenizer::Tokenizer;
|
||||
use crate::tokenizer::{Token, TokenStream};
|
||||
|
||||
fn test_helper(mut tokenizer: Box<dyn TokenStream>) -> Vec<Token> {
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
|
||||
tokens
|
||||
fn test_helper<T: TokenStream>(tokens: T) -> Vec<Token> {
|
||||
tokens.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -10,9 +10,9 @@ pub struct RawTokenStream {
|
||||
has_token: bool,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> for RawTokenizer {
|
||||
impl Tokenizer for RawTokenizer {
|
||||
type Iter = RawTokenStream;
|
||||
fn token_stream(&self, text: &'a str) -> Self::Iter {
|
||||
fn token_stream(&self, text: &str) -> Self::Iter {
|
||||
let token = Token {
|
||||
offset_from: 0,
|
||||
offset_to: text.len(),
|
||||
|
||||
@@ -32,7 +32,7 @@ impl RemoveLongFilter {
|
||||
}
|
||||
|
||||
impl TokenFilter for RemoveLongFilter {
|
||||
fn transform(&mut self, mut token: Token) -> Option<Token> {
|
||||
fn transform(&mut self, token: Token) -> Option<Token> {
|
||||
if token.text.len() >= self.limit {
|
||||
return None;
|
||||
}
|
||||
|
||||
@@ -6,40 +6,47 @@ use std::str::CharIndices;
|
||||
pub struct SimpleTokenizer;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SimpleTokenizerStream<'a> {
|
||||
text: &'a str,
|
||||
chars: CharIndices<'a>,
|
||||
pub struct SimpleTokenizerStream {
|
||||
text: String,
|
||||
idx: usize,
|
||||
chars: Vec<(usize, char)>,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> for SimpleTokenizer {
|
||||
type Iter = SimpleTokenizerStream<'a>;
|
||||
fn token_stream(&self, text: &'a str) -> Self::Iter {
|
||||
impl Tokenizer for SimpleTokenizer {
|
||||
type Iter = SimpleTokenizerStream;
|
||||
fn token_stream(&self, text: &str) -> Self::Iter {
|
||||
SimpleTokenizerStream {
|
||||
text,
|
||||
chars: text.char_indices(),
|
||||
text: text.to_string(),
|
||||
chars: text.char_indices().collect(),
|
||||
idx: 0,
|
||||
token: Token::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> SimpleTokenizerStream<'a> {
|
||||
impl SimpleTokenizerStream {
|
||||
// search for the end of the current token.
|
||||
fn search_token_end(&mut self) -> usize {
|
||||
(&mut self.chars)
|
||||
.filter(|&(_, ref c)| !c.is_alphanumeric())
|
||||
.map(|(offset, _)| offset)
|
||||
.iter()
|
||||
.filter(|&&(_, ref c)| !c.is_alphanumeric())
|
||||
.map(|(offset, _)| *offset)
|
||||
.next()
|
||||
.unwrap_or_else(|| self.text.len())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SimpleTokenizerStream<'a> {
|
||||
impl Iterator for SimpleTokenizerStream {
|
||||
type Item = Token;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.idx >= self.chars.len() {
|
||||
return None;
|
||||
}
|
||||
self.token.text.clear();
|
||||
self.token.position = self.token.position.wrapping_add(1);
|
||||
while let Some((offset_from, c)) = self.chars.next() {
|
||||
while self.idx < self.chars.len() {
|
||||
let (offset_from, c) = self.chars[self.idx];
|
||||
if c.is_alphanumeric() {
|
||||
let offset_to = self.search_token_end();
|
||||
self.token.offset_from = offset_from;
|
||||
@@ -47,9 +54,23 @@ impl<'a> Iterator for SimpleTokenizerStream<'a> {
|
||||
self.token.text.push_str(&self.text[offset_from..offset_to]);
|
||||
return Some(self.token.clone());
|
||||
}
|
||||
self.idx += 1;
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for SimpleTokenizerStream<'a> {}
|
||||
impl TokenStream for SimpleTokenizerStream {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_tokenizer() {
|
||||
let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
|
||||
dbg!(stream.next());
|
||||
dbg!(stream.next());
|
||||
dbg!(stream.next());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ pub struct StopWordFilter {
|
||||
|
||||
impl StopWordFilter {
|
||||
/// Creates a `StopWordFilter` given a list of words to remove
|
||||
pub fn remove(words: Vec<String>) -> StopWordFilter {
|
||||
pub fn new(words: Vec<String>) -> StopWordFilter {
|
||||
let mut set = StopWordHashSet::default();
|
||||
|
||||
for word in words {
|
||||
@@ -44,46 +44,16 @@ impl StopWordFilter {
|
||||
"there", "these", "they", "this", "to", "was", "will", "with",
|
||||
];
|
||||
|
||||
StopWordFilter::remove(words.iter().map(|&s| s.to_string()).collect())
|
||||
StopWordFilter::new(words.iter().map(|&s| s.to_string()).collect())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StopWordFilterStream<'a> {
|
||||
words: StopWordHashSet,
|
||||
tail: Box<dyn TokenStream + 'a>,
|
||||
}
|
||||
|
||||
impl TokenFilter for StopWordFilter {
|
||||
fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
|
||||
Box::new(StopWordFilterStream {
|
||||
words: self.words.clone(),
|
||||
tail: token_stream,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> StopWordFilterStream<'a> {
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
!self.words.contains(&token.text)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for StopWordFilterStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
fn transform(&mut self, token: Token) -> Option<Token> {
|
||||
if self.words.contains(&token.text) {
|
||||
return None;
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
Some(token)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,6 +21,48 @@ impl<'a, Out> TokenStreamChain<Out> {
|
||||
}
|
||||
}
|
||||
}
|
||||
impl DynTokenStreamChain {
|
||||
pub fn from_vec(streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>) -> impl TokenStream {
|
||||
DynTokenStreamChain {
|
||||
streams_with_offsets,
|
||||
idx: 0,
|
||||
token: Token::default(),
|
||||
position_shift: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct DynTokenStreamChain {
|
||||
streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>,
|
||||
idx: usize,
|
||||
token: Token,
|
||||
position_shift: usize,
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for DynTokenStreamChain {}
|
||||
|
||||
impl Iterator for DynTokenStreamChain {
|
||||
type Item = Token;
|
||||
fn next(&mut self) -> Option<Token> {
|
||||
if self.idx >= self.streams_with_offsets.len() {
|
||||
return None;
|
||||
};
|
||||
while self.idx < self.streams_with_offsets.len() {
|
||||
let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.idx];
|
||||
if let Some(token) = token_stream.next() {
|
||||
self.token = token;
|
||||
self.token.offset_from += offset_offset;
|
||||
self.token.offset_to += offset_offset;
|
||||
self.token.position += self.position_shift;
|
||||
return Some(self.token.clone());
|
||||
} else {
|
||||
self.idx += 1;
|
||||
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, In, Out: Iterator<Item = (In, usize)>> TokenStream for TokenStreamChain<Out> where
|
||||
In: Iterator<Item = Token>
|
||||
@@ -61,19 +103,18 @@ mod tests {
|
||||
(SimpleTokenizer.token_stream("hello world"), 0),
|
||||
];
|
||||
let mut token_chain = TokenStreamChain::new(token_streams.into_iter());
|
||||
let token = token_chain.next().unwrap();
|
||||
assert_eq!(token.text, "hello");
|
||||
assert_eq!(token.offset_from, 0);
|
||||
assert_eq!(token.offset_to, 5);
|
||||
assert_eq!(token.position, POSITION_GAP - 1);
|
||||
|
||||
assert!(token_chain.advance());
|
||||
assert_eq!(token_chain.token().text, "hello");
|
||||
assert_eq!(token_chain.token().offset_from, 0);
|
||||
assert_eq!(token_chain.token().offset_to, 5);
|
||||
assert_eq!(token_chain.token().position, POSITION_GAP - 1);
|
||||
let token = token_chain.next().unwrap();
|
||||
assert_eq!(token.text, "world");
|
||||
assert_eq!(token.offset_from, 6);
|
||||
assert_eq!(token.offset_to, 11);
|
||||
assert_eq!(token.position, POSITION_GAP);
|
||||
|
||||
assert!(token_chain.advance());
|
||||
assert_eq!(token_chain.token().text, "world");
|
||||
assert_eq!(token_chain.token().offset_from, 6);
|
||||
assert_eq!(token_chain.token().offset_to, 11);
|
||||
assert_eq!(token_chain.token().position, POSITION_GAP);
|
||||
|
||||
assert!(!token_chain.advance());
|
||||
assert!(token_chain.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,13 +97,8 @@ mod tests {
|
||||
],
|
||||
};
|
||||
|
||||
let mut token_stream = PreTokenizedStream::from(tok_text.clone());
|
||||
|
||||
for expected_token in tok_text.tokens {
|
||||
assert!(token_stream.advance());
|
||||
assert_eq!(token_stream.token(), &expected_token);
|
||||
}
|
||||
assert!(!token_stream.advance());
|
||||
let token_stream: Vec<_> = PreTokenizedStream::from(tok_text.clone()).collect();
|
||||
assert_eq!(token_stream, tok_text.tokens);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -130,7 +125,7 @@ mod tests {
|
||||
|
||||
let chain_parts = vec![&tok_text, &tok_text];
|
||||
|
||||
let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
|
||||
let token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
|
||||
|
||||
let expected_tokens = vec![
|
||||
Token {
|
||||
@@ -162,11 +157,6 @@ mod tests {
|
||||
position_length: 1,
|
||||
},
|
||||
];
|
||||
|
||||
for expected_token in expected_tokens {
|
||||
assert!(token_stream.advance());
|
||||
assert_eq!(token_stream.token(), &expected_token);
|
||||
}
|
||||
assert!(!token_stream.advance());
|
||||
assert_eq!(token_stream.collect::<Vec<_>>(), expected_tokens);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::tokenizer::TokenStreamChain;
|
||||
use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain};
|
||||
use serde::{Deserialize, Serialize};
|
||||
/// The tokenizer module contains all of the tools used to process
|
||||
/// text in `tantivy`.
|
||||
@@ -41,27 +41,31 @@ pub struct TextAnalyzer<T> {
|
||||
filters: Vec<Box<dyn TokenFilter>>,
|
||||
}
|
||||
|
||||
pub trait TextAnalyzerT<'a>: 'static + Send + Sync + TextAnalyzerClone<'a> {
|
||||
fn token_stream(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
|
||||
/// Top-level trait for hiding the types contained in it.
|
||||
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
|
||||
/// Top-level method that calls the corresponding `token_stream` on the
|
||||
/// contained type.
|
||||
fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
|
||||
}
|
||||
|
||||
pub trait TextAnalyzerClone<'a> {
|
||||
fn box_clone(&self) -> Box<dyn TextAnalyzerT<'a>>;
|
||||
pub trait TextAnalyzerClone {
|
||||
fn box_clone(&self) -> Box<dyn TextAnalyzerT>;
|
||||
}
|
||||
|
||||
impl<'a> Clone for Box<dyn TextAnalyzerT<'a>> {
|
||||
impl Clone for Box<dyn TextAnalyzerT> {
|
||||
fn clone(&self) -> Self {
|
||||
(**self).box_clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for Box<dyn TokenFilter> {
|
||||
fn clone(&self) -> Self {
|
||||
(**self).box_clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Clone + Tokenizer<'a>> TextAnalyzerClone<'a> for TextAnalyzer<T> {
|
||||
fn box_clone(&self) -> Box<dyn TextAnalyzerT<'a>> {
|
||||
impl<T: Clone + Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
|
||||
fn box_clone(&self) -> Box<dyn TextAnalyzerT> {
|
||||
Box::new(TextAnalyzer {
|
||||
tokenizer: self.tokenizer.clone(),
|
||||
filters: self.filters.clone(),
|
||||
@@ -69,8 +73,8 @@ impl<'a, T: Clone + Tokenizer<'a>> TextAnalyzerClone<'a> for TextAnalyzer<T> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Tokenizer<'a>> TextAnalyzerT<'a> for TextAnalyzer<T> {
|
||||
fn token_stream(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
|
||||
impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
|
||||
fn token_stream(&self, text: &str) -> Box<dyn TokenStream> {
|
||||
let tokens = self.tokenizer.token_stream(text);
|
||||
Box::new(TextIter {
|
||||
tokens,
|
||||
@@ -80,9 +84,9 @@ impl<'a, T: Tokenizer<'a>> TextAnalyzerT<'a> for TextAnalyzer<T> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> TextAnalyzer<T>
|
||||
impl<T> TextAnalyzer<T>
|
||||
where
|
||||
T: Tokenizer<'a>,
|
||||
T: Tokenizer,
|
||||
{
|
||||
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
|
||||
///
|
||||
@@ -123,7 +127,7 @@ where
|
||||
/// to prevent an accidental `PhraseQuery` from matching across two terms.
|
||||
|
||||
/// Creates a token stream for a given `str`.
|
||||
pub fn token_stream(&self, text: &'a str) -> TextIter<T::Iter> {
|
||||
pub fn token_stream(&self, text: &str) -> TextIter<T::Iter> {
|
||||
let tokens = self.tokenizer.token_stream(text);
|
||||
TextIter {
|
||||
tokens,
|
||||
@@ -133,12 +137,12 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
struct TextIter<I> {
|
||||
pub struct TextIter<I> {
|
||||
tokens: I,
|
||||
filters: Vec<Box<dyn TokenFilter>>,
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for TextIter<I>
|
||||
impl<I> Iterator for TextIter<I>
|
||||
where
|
||||
I: Iterator<Item = Token>,
|
||||
{
|
||||
@@ -152,6 +156,7 @@ where
|
||||
};
|
||||
continue 'outer;
|
||||
}
|
||||
return Some(token);
|
||||
}
|
||||
None
|
||||
}
|
||||
@@ -167,27 +172,30 @@ impl<I: Iterator<Item = Token>> TokenStream for TextIter<I> {}
|
||||
/// # Warning
|
||||
///
|
||||
/// This API may change to use associated types.
|
||||
pub trait Tokenizer<'a>: 'static + Send + Sync + Clone {
|
||||
type Iter: Iterator<Item = Token> + 'a;
|
||||
pub trait Tokenizer: 'static + Send + Sync + Clone {
|
||||
/// The token stream type returned by `token_stream`.
|
||||
type Iter: TokenStream;
|
||||
/// Creates a token stream for a given `str`.
|
||||
// TODO: make clone unnecessary
|
||||
fn token_stream(&self, text: &'a str) -> Self::Iter;
|
||||
}
|
||||
|
||||
fn token_stream_texts<'a, T: Tokenizer<'a>>(
|
||||
tokenizer: &'a T,
|
||||
texts: &'a [&str],
|
||||
) -> impl TokenStream + 'a {
|
||||
let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
|
||||
let temp = *total_offset;
|
||||
*total_offset += text.len();
|
||||
Some((tokenizer.token_stream(text), temp))
|
||||
});
|
||||
TokenStreamChain::new(streams_with_offsets)
|
||||
fn token_stream(&self, text: &str) -> Self::Iter;
|
||||
/// Tokenizes an array of `&str`.
|
||||
///
|
||||
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
|
||||
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
|
||||
/// to prevent an accidental `PhraseQuery` from matching across two terms.
|
||||
fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
|
||||
let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
|
||||
let temp = *total_offset;
|
||||
*total_offset += text.len();
|
||||
Some((self.token_stream(text), temp))
|
||||
});
|
||||
Box::new(TokenStreamChain::new(streams_with_offsets))
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait for the pluggable components of `Tokenizer`s.
|
||||
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
|
||||
/// Take a `Token` and transform it or return `None` if it's to be removed
|
||||
/// from the output stream.
|
||||
fn transform(&mut self, token: Token) -> Option<Token>;
|
||||
}
|
||||
|
||||
@@ -201,35 +209,66 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
|
||||
}
|
||||
}
|
||||
|
||||
pub trait TokenStream: Iterator<Item = Token> {
|
||||
fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
|
||||
let mut num_tokens_pushed = 0u32;
|
||||
while let Some(token) = self.next() {
|
||||
sink(&token);
|
||||
num_tokens_pushed += 1u32;
|
||||
}
|
||||
num_tokens_pushed
|
||||
/// `TokenStream` is the result of the tokenization.
|
||||
///
|
||||
/// It consists of a consumable stream of `Token`s.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser);
|
||||
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
|
||||
/// {
|
||||
/// let token = token_stream.next().unwrap();
|
||||
/// assert_eq!(&token.text, "hello");
|
||||
/// assert_eq!(token.offset_from, 0);
|
||||
/// assert_eq!(token.offset_to, 5);
|
||||
/// assert_eq!(token.position, 0);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = token_stream.next().unwrap();
|
||||
/// assert_eq!(&token.text, "happy");
|
||||
/// assert_eq!(token.offset_from, 7);
|
||||
/// assert_eq!(token.offset_to, 12);
|
||||
/// assert_eq!(token.position, 1);
|
||||
/// }
|
||||
/// ```
|
||||
pub trait TokenStream: Iterator<Item = Token> {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::tokenizer::SimpleTokenizer;
|
||||
|
||||
#[test]
|
||||
fn clone() {
|
||||
let t1 = Token {
|
||||
position: 1,
|
||||
offset_from: 2,
|
||||
offset_to: 3,
|
||||
text: "abc".to_string(),
|
||||
position_length: 1,
|
||||
};
|
||||
let t2 = t1.clone();
|
||||
|
||||
assert_eq!(t1.position, t2.position);
|
||||
assert_eq!(t1.offset_from, t2.offset_from);
|
||||
assert_eq!(t1.offset_to, t2.offset_to);
|
||||
assert_eq!(t1.text, t2.text);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn text_analyzer() {
|
||||
let mut stream = TextAnalyzer::new(SimpleTokenizer).token_stream("tokenizer hello world");
|
||||
dbg!(stream.next());
|
||||
dbg!(stream.next());
|
||||
dbg!(stream.next());
|
||||
dbg!(stream.next());
|
||||
dbg!(stream.next());
|
||||
dbg!(stream.next());
|
||||
}
|
||||
}
|
||||
|
||||
// #[cfg(test)]
|
||||
// mod test {
|
||||
// use super::Token;
|
||||
|
||||
// #[test]
|
||||
// fn clone() {
|
||||
// let t1 = Token {
|
||||
// position: 1,
|
||||
// offset_from: 2,
|
||||
// offset_to: 3,
|
||||
// text: "abc".to_string(),
|
||||
// position_length: 1,
|
||||
// };
|
||||
// let t2 = t1.clone();
|
||||
|
||||
// assert_eq!(t1.position, t2.position);
|
||||
// assert_eq!(t1.offset_from, t2.offset_from);
|
||||
// assert_eq!(t1.offset_to, t2.offset_to);
|
||||
// assert_eq!(t1.text, t2.text);
|
||||
// }
|
||||
// }
|
||||
|
||||
@@ -21,15 +21,15 @@ use std::sync::{Arc, RwLock};
|
||||
/// resulting tokens. Stemming can improve the recall of your
|
||||
/// search engine.
|
||||
#[derive(Clone)]
|
||||
pub struct TokenizerManager<'a> {
|
||||
tokenizers: Arc<RwLock<HashMap<String, Box<dyn TextAnalyzerT<'a>>>>>,
|
||||
pub struct TokenizerManager {
|
||||
tokenizers: Arc<RwLock<HashMap<String, Box<dyn TextAnalyzerT>>>>,
|
||||
}
|
||||
|
||||
impl<'a> TokenizerManager<'a> {
|
||||
impl TokenizerManager {
|
||||
/// Registers a new tokenizer associated with a given name.
|
||||
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
|
||||
where
|
||||
T: TextAnalyzerT<'a>,
|
||||
T: TextAnalyzerT,
|
||||
{
|
||||
self.tokenizers
|
||||
.write()
|
||||
@@ -38,7 +38,7 @@ impl<'a> TokenizerManager<'a> {
|
||||
}
|
||||
|
||||
/// Accessing a tokenizer given its name.
|
||||
pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn TextAnalyzerT<'a>>> {
|
||||
pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn TextAnalyzerT>> {
|
||||
self.tokenizers
|
||||
.read()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
@@ -47,7 +47,7 @@ impl<'a> TokenizerManager<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Default for TokenizerManager<'a> {
|
||||
impl Default for TokenizerManager {
|
||||
/// Creates a `TokenizerManager` prepopulated with
|
||||
/// the default pre-configured tokenizers of `tantivy`.
|
||||
/// - simple
|
||||
|
||||
Reference in New Issue
Block a user