Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-04 16:22:55 +00:00

Compare commits: 8 commits (`troublesco...debugging-`)

Commits compared:

- 08f7706973
- bf6e6e8a7c
- 203b0256a3
- caf2a38b7e
- 96f24b078e
- 332b50a4eb
- 8ca0954b3b
- 36343e2de8
````diff
@@ -47,6 +47,7 @@ murmurhash32 = "0.2"
 chrono = "0.4"
 smallvec = "1"
 rayon = "1"
+env_logger = "0.8"
 lru = "0.6"
 
 [target.'cfg(windows)'.dependencies]
````
````diff
@@ -10,7 +10,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     b.iter(|| {
         let mut word_count = 0;
         let mut token_stream = tokenizer.token_stream(ALICE_TXT);
-        for token in token_stream {
+        while token_stream.advance() {
             word_count += 1;
         }
         assert_eq!(word_count, 30_731);
````
````diff
@@ -17,7 +17,12 @@ use tantivy::{doc, Index, ReloadPolicy};
 use tempfile::TempDir;
 
 fn pre_tokenize_text(text: &str) -> Vec<Token> {
-    SimpleTokenizer.token_stream(text).collect()
+    let mut token_stream = SimpleTokenizer.token_stream(text);
+    let mut tokens = vec![];
+    while token_stream.advance() {
+        tokens.push(token_stream.token().clone());
+    }
+    tokens
 }
 
 fn main() -> tantivy::Result<()> {
````
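A note on the API shape above: in these commits a `TokenStream` is consumed by pulling — `advance()` steps to the next token and `token()` borrows the current one. A minimal standalone sketch of the same loop (the crate paths and the `Tokenizer` trait import are assumed from the hunks above, not verified against a specific release):

```rust
use tantivy::tokenizer::{SimpleTokenizer, Token, Tokenizer};

/// Pull-style consumption: advance() returns false once the stream is done.
fn collect_tokens(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokens = Vec::new();
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}

fn main() {
    let tokens = collect_tokens("hello, tantivy");
    assert_eq!(tokens.len(), 2);
}
```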
````diff
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
 
     let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
 
-    let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
+    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
 
     for (score, doc_address) in top_docs {
         let doc = searcher.doc(doc_address)?;
````
````diff
@@ -50,13 +50,12 @@ fn main() -> tantivy::Result<()> {
 
     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
-    let tokenizer = analyzer_builder(SimpleTokenizer)
-        .filter(LowerCaser::new())
+    let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+        .filter(LowerCaser)
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),
-        ]))
-        .build();
+        ]));
 
     index.tokenizers().register("stoppy", tokenizer);
 
````
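Registering the analyzer is only half of the wiring: a text field only picks it up when its indexing options name it. A hedged sketch of the schema side (the `set_tokenizer`/`set_indexing_options` calls are the usual tantivy schema API, assumed here rather than shown in these commits):

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

fn schema_with_custom_tokenizer() -> Schema {
    // The string passed to set_tokenizer must match the name used in
    // `index.tokenizers().register("stoppy", tokenizer)` above.
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("stoppy")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let options = TextOptions::default().set_indexing_options(indexing);

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", options);
    schema_builder.build()
}
```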
````diff
@@ -20,7 +20,7 @@ use crate::reader::IndexReaderBuilder;
 use crate::schema::Field;
 use crate::schema::FieldType;
 use crate::schema::Schema;
-use crate::tokenizer::{TextAnalyzerT, TokenizerManager};
+use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::IndexWriter;
 use std::collections::HashSet;
 use std::fmt;
````
````diff
@@ -35,12 +35,18 @@ fn load_metas(
     inventory: &SegmentMetaInventory,
 ) -> crate::Result<IndexMeta> {
     let meta_data = directory.atomic_read(&META_FILEPATH)?;
-    let meta_string = String::from_utf8_lossy(&meta_data);
+    let meta_string = String::from_utf8(meta_data)
+        .map_err(|utf8_err| {
+            DataCorruption::new(
+                META_FILEPATH.to_path_buf(),
+                format!("Meta file is not valid utf-8. {:?}", utf8_err)
+            )
+        })?;
     IndexMeta::deserialize(&meta_string, &inventory)
         .map_err(|e| {
             DataCorruption::new(
                 META_FILEPATH.to_path_buf(),
-                format!("Meta file cannot be deserialized. {:?}.", e),
+                format!("Meta file cannot be deserialized. {:?}. content = {}", e, meta_string),
             )
         })
         .map_err(From::from)
````
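The switch from `from_utf8_lossy` to `from_utf8` matters here: the lossy variant silently replaces invalid bytes with U+FFFD, while the strict one surfaces the corruption so it can be reported as `DataCorruption`. A standalone illustration using only the standard library:

```rust
fn main() {
    let bytes = vec![b'o', b'k', 0xFF]; // 0xFF is never valid UTF-8

    // Lossy conversion hides the problem behind a replacement character.
    assert_eq!(String::from_utf8_lossy(&bytes), "ok\u{FFFD}");

    // Strict conversion reports it, which load_metas can now wrap in a
    // DataCorruption error instead of deserializing mangled JSON.
    assert!(String::from_utf8(bytes).is_err());
}
```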
````diff
@@ -119,12 +125,13 @@ impl Index {
             return Index::create(dir, schema);
         }
         let index = Index::open(dir)?;
-        if index.schema() != schema {
-            return Err(TantivyError::SchemaError(
+        if index.schema() == schema {
+            Ok(index)
+        } else {
+            Err(TantivyError::SchemaError(
                 "An index exists but the schema does not match.".to_string(),
-            ));
+            ))
         }
-        Ok(index)
     }
 
     /// Creates a new index in a temp directory.
````
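The rewrite above turns an early `return Err(...)` plus a trailing `Ok(index)` into a single if/else whose arms are the function's tail expression. A minimal sketch of the pattern, detached from tantivy's types:

```rust
fn check(matches: bool) -> Result<&'static str, String> {
    // Both arms evaluate to the Result, so there is no early `return`
    // and no trailing `Ok(..)` after the conditional.
    if matches {
        Ok("index")
    } else {
        Err("An index exists but the schema does not match.".to_string())
    }
}

fn main() {
    assert!(check(true).is_ok());
    assert!(check(false).is_err());
}
```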
````diff
@@ -180,11 +187,11 @@ impl Index {
     }
 
     /// Helper to access the tokenizer associated to a specific field.
-    pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<Box<dyn TextAnalyzerT>> {
+    pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
         let tokenizer_manager: &TokenizerManager = self.tokenizers();
-        let tokenizer_name_opt: Option<Box<dyn TextAnalyzerT>> = match field_type {
+        let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
             FieldType::Str(text_options) => text_options
                 .get_indexing_options()
                 .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
````
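With the return type back to a plain `TextAnalyzer`, callers no longer juggle a boxed trait object. A hedged usage sketch — the `"body"` field name is hypothetical, and `get_field` returning an `Option` is assumed for this era of the API:

```rust
use tantivy::schema::Schema;
use tantivy::Index;

fn debug_field_tokens(index: &Index, schema: &Schema) -> tantivy::Result<()> {
    // Look up the analyzer configured for a field and tokenize some text.
    let field = schema.get_field("body").expect("field should exist");
    let analyzer = index.tokenizer_for_field(field)?;
    let mut token_stream = analyzer.token_stream("Hello, tantivy!");
    while token_stream.advance() {
        println!("{:?}", token_stream.token());
    }
    Ok(())
}
```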
````diff
@@ -310,7 +310,7 @@ impl SegmentReader {
     }
 
     /// Returns an iterator that will iterate over the alive document ids
-    pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
+    pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
         (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
     }
 
````
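This commit (like the similar ones further down for `Layer::cursor` and `block_checkpoints`) only trades the anonymous `'_` lifetime for a named `'a`; the two signatures are interchangeable. A compact illustration:

```rust
// The anonymous-lifetime and named-lifetime forms below are equivalent:
// `'_` simply lets the compiler name the borrow for us.
fn alive_anon(max: u32, deleted: &[u32]) -> impl Iterator<Item = u32> + '_ {
    (0..max).filter(move |doc| !deleted.contains(doc))
}

fn alive_named<'a>(max: u32, deleted: &'a [u32]) -> impl Iterator<Item = u32> + 'a {
    (0..max).filter(move |doc| !deleted.contains(doc))
}

fn main() {
    let deleted = vec![1, 3];
    assert!(alive_anon(4, &deleted).eq(alive_named(4, &deleted)));
}
```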
````diff
@@ -1,31 +1,77 @@
-use rand::thread_rng;
-use std::collections::HashSet;
-
-use crate::schema::*;
 use crate::Index;
 use crate::Searcher;
+use crate::{doc, schema::*};
+use rand::thread_rng;
 use rand::Rng;
+use std::collections::HashSet;
 
-fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
+fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
     assert!(searcher.segment_readers().len() < 20);
     assert_eq!(searcher.num_docs() as usize, vals.len());
+    for segment_reader in searcher.segment_readers() {
+        let store_reader = segment_reader.get_store_reader()?;
+        for doc_id in 0..segment_reader.max_doc() {
+            let _doc = store_reader.get(doc_id)?;
+        }
+    }
+    Ok(())
 }
 
 #[test]
 #[ignore]
-fn test_indexing() {
+fn test_functional_store() -> crate::Result<()> {
+    env_logger::init();
+    let mut schema_builder = Schema::builder();
+
+    let id_field = schema_builder.add_u64_field("id", INDEXED | STORED);
+    let schema = schema_builder.build();
+
+    let index = Index::create_in_ram(schema);
+    let reader = index.reader()?;
+
+    let mut rng = thread_rng();
+
+    let mut index_writer = index.writer_with_num_threads(3, 12_000_000)?;
+
+    let mut doc_set: Vec<u64> = Vec::new();
+
+    let mut doc_id = 0u64;
+    for iteration in 0.. {
+        let num_docs: usize = rng.gen_range(0..4);
+        if doc_set.len() >= 1 {
+            let doc_to_remove_id = rng.gen_range(0..doc_set.len());
+            let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
+            index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
+        }
+        for _ in 0..num_docs {
+            doc_set.push(doc_id);
+            index_writer.add_document(doc!(id_field=>doc_id));
+            doc_id += 1;
+        }
+        index_writer.commit()?;
+        reader.reload()?;
+        let searcher = reader.searcher();
+        println!("#{} - {}", iteration, searcher.segment_readers().len());
+        check_index_content(&searcher, &doc_set)?;
+    }
+    Ok(())
+}
+
+#[test]
+#[ignore]
+fn test_functional_indexing() -> crate::Result<()> {
     let mut schema_builder = Schema::builder();
 
     let id_field = schema_builder.add_u64_field("id", INDEXED);
     let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
     let schema = schema_builder.build();
 
-    let index = Index::create_from_tempdir(schema).unwrap();
-    let reader = index.reader().unwrap();
+    let index = Index::create_from_tempdir(schema)?;
+    let reader = index.reader()?;
 
     let mut rng = thread_rng();
 
-    let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
+    let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
 
     let mut committed_docs: HashSet<u64> = HashSet::new();
     let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -33,13 +79,16 @@ fn test_indexing() {
     for _ in 0..200 {
         let random_val = rng.gen_range(0..20);
         if random_val == 0 {
-            index_writer.commit().expect("Commit failed");
+            index_writer.commit()?;
             committed_docs.extend(&uncommitted_docs);
             uncommitted_docs.clear();
-            reader.reload().unwrap();
+            reader.reload()?;
             let searcher = reader.searcher();
             // check that everything is correct.
-            check_index_content(&searcher, &committed_docs);
+            check_index_content(
+                &searcher,
+                &committed_docs.iter().cloned().collect::<Vec<u64>>(),
+            )?;
         } else {
             if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
                 let doc_id_term = Term::from_field_u64(id_field, random_val);
@@ -55,4 +104,5 @@ fn test_indexing() {
             }
         }
     }
+    Ok(())
 }
````
````diff
@@ -10,9 +10,10 @@ use crate::schema::FieldType;
 use crate::schema::Schema;
 use crate::schema::Term;
 use crate::schema::Value;
-use crate::tokenizer::PreTokenizedStream;
-use crate::tokenizer::{DynTokenStreamChain, Tokenizer};
-use crate::tokenizer::{FacetTokenizer, TextAnalyzerT, Token};
+use crate::schema::{Field, FieldEntry};
+use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
+use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
+use crate::tokenizer::{TokenStreamChain, Tokenizer};
 use crate::Opstamp;
 use crate::{DocId, SegmentComponent};
 
@@ -22,7 +23,7 @@ use crate::{DocId, SegmentComponent};
 fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
     let table_memory_upper_bound = per_thread_memory_budget / 3;
     if let Some(limit) = (10..)
-        .take_while(|&num_bits| compute_table_size(num_bits) < table_memory_upper_bound)
+        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
         .last()
     {
         Ok(limit.min(19)) // we cap it at 2^19 = 512K.
@@ -44,8 +45,7 @@ pub struct SegmentWriter {
     fast_field_writers: FastFieldsWriter,
     fieldnorms_writer: FieldNormsWriter,
     doc_opstamps: Vec<Opstamp>,
-    // TODO: change type
-    tokenizers: Vec<Option<Box<dyn TextAnalyzerT>>>,
+    tokenizers: Vec<Option<TextAnalyzer>>,
     term_buffer: Term,
 }
 
@@ -70,17 +70,17 @@ impl SegmentWriter {
         let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
         let tokenizers = schema
             .fields()
-            .map(|(_, field_entry)| match field_entry.field_type() {
-                FieldType::Str(text_options) => {
-                    text_options
+            .map(
+                |(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
+                    FieldType::Str(ref text_options) => text_options
                         .get_indexing_options()
                         .and_then(|text_index_option| {
                             let tokenizer_name = &text_index_option.tokenizer();
                             tokenizer_manager.get(tokenizer_name)
-                        })
-                }
-                _ => None,
-            })
+                        }),
+                    _ => None,
+                },
+            )
             .collect();
         Ok(SegmentWriter {
             max_doc: 0,
@@ -141,13 +141,13 @@ impl SegmentWriter {
         }
         let (term_buffer, multifield_postings) =
             (&mut self.term_buffer, &mut self.multifield_postings);
-        match field_entry.field_type() {
+        match *field_entry.field_type() {
             FieldType::HierarchicalFacet => {
                 term_buffer.set_field(field);
                 let facets =
                     field_values
                         .iter()
-                        .flat_map(|field_value| match field_value.value() {
+                        .flat_map(|field_value| match *field_value.value() {
                             Value::Facet(ref facet) => Some(facet.encoded_str()),
                             _ => {
                                 panic!("Expected hierarchical facet");
@@ -157,13 +157,12 @@ impl SegmentWriter {
                     let mut unordered_term_id_opt = None;
                     FacetTokenizer
                         .token_stream(facet_str)
-                        .map(|token| {
+                        .process(&mut |token| {
                             term_buffer.set_text(&token.text);
                             let unordered_term_id =
                                 multifield_postings.subscribe(doc_id, &term_buffer);
                             unordered_term_id_opt = Some(unordered_term_id);
-                        })
-                        .count();
+                        });
                     if let Some(unordered_term_id) = unordered_term_id_opt {
                         self.fast_field_writers
                             .get_multivalue_writer(field)
@@ -173,38 +172,37 @@ impl SegmentWriter {
                 }
             }
             FieldType::Str(_) => {
-                let mut streams_with_offsets = vec![];
+                let mut token_streams: Vec<BoxTokenStream> = vec![];
+                let mut offsets = vec![];
                 let mut total_offset = 0;
 
                 for field_value in field_values {
                     match field_value.value() {
                         Value::PreTokStr(tok_str) => {
-                            streams_with_offsets.push((
-                                Box::new(PreTokenizedStream::from(tok_str.clone()))
-                                    as Box<dyn Iterator<Item = Token>>,
-                                total_offset,
-                            ));
+                            offsets.push(total_offset);
                             if let Some(last_token) = tok_str.tokens.last() {
                                 total_offset += last_token.offset_to;
                             }
+                            token_streams
+                                .push(PreTokenizedStream::from(tok_str.clone()).into());
                         }
-                        Value::Str(text) => {
+                        Value::Str(ref text) => {
                             if let Some(ref mut tokenizer) =
                                 self.tokenizers[field.field_id() as usize]
                             {
-                                streams_with_offsets
-                                    .push((tokenizer.token_stream(text), total_offset));
+                                offsets.push(total_offset);
                                 total_offset += text.len();
+                                token_streams.push(tokenizer.token_stream(text));
                             }
                         }
                         _ => (),
                     }
                 }
 
-                let num_tokens = if streams_with_offsets.is_empty() {
+                let num_tokens = if token_streams.is_empty() {
                     0
                 } else {
-                    let mut token_stream = DynTokenStreamChain::from_vec(streams_with_offsets);
+                    let mut token_stream = TokenStreamChain::new(offsets, token_streams);
                     multifield_postings.index_text(
                         doc_id,
                         field,
@@ -215,62 +213,71 @@ impl SegmentWriter {
 
                 self.fieldnorms_writer.record(doc_id, field, num_tokens);
             }
-            FieldType::U64(int_option) if int_option.is_indexed() => {
-                for field_value in field_values {
-                    term_buffer.set_field(field_value.field());
-                    let u64_val = field_value
-                        .value()
-                        .u64_value()
-                        .ok_or_else(make_schema_error)?;
-                    term_buffer.set_u64(u64_val);
-                    multifield_postings.subscribe(doc_id, &term_buffer);
+            FieldType::U64(ref int_option) => {
+                if int_option.is_indexed() {
+                    for field_value in field_values {
+                        term_buffer.set_field(field_value.field());
+                        let u64_val = field_value
+                            .value()
+                            .u64_value()
+                            .ok_or_else(make_schema_error)?;
+                        term_buffer.set_u64(u64_val);
+                        multifield_postings.subscribe(doc_id, &term_buffer);
+                    }
                 }
             }
-            FieldType::Date(int_option) if int_option.is_indexed() => {
-                for field_value in field_values {
-                    term_buffer.set_field(field_value.field());
-                    let date_val = field_value
-                        .value()
-                        .date_value()
-                        .ok_or_else(make_schema_error)?;
-                    term_buffer.set_i64(date_val.timestamp());
-                    multifield_postings.subscribe(doc_id, &term_buffer);
+            FieldType::Date(ref int_option) => {
+                if int_option.is_indexed() {
+                    for field_value in field_values {
+                        term_buffer.set_field(field_value.field());
+                        let date_val = field_value
+                            .value()
+                            .date_value()
+                            .ok_or_else(make_schema_error)?;
+                        term_buffer.set_i64(date_val.timestamp());
+                        multifield_postings.subscribe(doc_id, &term_buffer);
+                    }
                 }
             }
-            FieldType::I64(int_option) if int_option.is_indexed() => {
-                for field_value in field_values {
-                    term_buffer.set_field(field_value.field());
-                    let i64_val = field_value
-                        .value()
-                        .i64_value()
-                        .ok_or_else(make_schema_error)?;
-                    term_buffer.set_i64(i64_val);
-                    multifield_postings.subscribe(doc_id, &term_buffer);
+            FieldType::I64(ref int_option) => {
+                if int_option.is_indexed() {
+                    for field_value in field_values {
+                        term_buffer.set_field(field_value.field());
+                        let i64_val = field_value
+                            .value()
+                            .i64_value()
+                            .ok_or_else(make_schema_error)?;
+                        term_buffer.set_i64(i64_val);
+                        multifield_postings.subscribe(doc_id, &term_buffer);
+                    }
                 }
             }
-            FieldType::F64(int_option) if int_option.is_indexed() => {
-                for field_value in field_values {
-                    term_buffer.set_field(field_value.field());
-                    let f64_val = field_value
-                        .value()
-                        .f64_value()
-                        .ok_or_else(make_schema_error)?;
-                    term_buffer.set_f64(f64_val);
-                    multifield_postings.subscribe(doc_id, &term_buffer);
+            FieldType::F64(ref int_option) => {
+                if int_option.is_indexed() {
+                    for field_value in field_values {
+                        term_buffer.set_field(field_value.field());
+                        let f64_val = field_value
+                            .value()
+                            .f64_value()
+                            .ok_or_else(make_schema_error)?;
+                        term_buffer.set_f64(f64_val);
+                        multifield_postings.subscribe(doc_id, &term_buffer);
+                    }
                 }
             }
-            FieldType::Bytes(option) if option.is_indexed() => {
-                for field_value in field_values {
-                    term_buffer.set_field(field_value.field());
-                    let bytes = field_value
-                        .value()
-                        .bytes_value()
-                        .ok_or_else(make_schema_error)?;
-                    term_buffer.set_bytes(bytes);
-                    self.multifield_postings.subscribe(doc_id, &term_buffer);
+            FieldType::Bytes(ref option) => {
+                if option.is_indexed() {
+                    for field_value in field_values {
+                        term_buffer.set_field(field_value.field());
+                        let bytes = field_value
+                            .value()
+                            .bytes_value()
+                            .ok_or_else(make_schema_error)?;
+                        term_buffer.set_bytes(bytes);
+                        self.multifield_postings.subscribe(doc_id, &term_buffer);
+                    }
                 }
             }
-            _ => {}
         }
     }
     doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
````
````diff
@@ -132,7 +132,7 @@ impl PositionReader {
             "offset arguments should be increasing."
         );
         let delta_to_block_offset = offset as i64 - self.block_offset as i64;
-        if !(0..128).contains(&delta_to_block_offset) {
+        if delta_to_block_offset < 0 || delta_to_block_offset >= 128 {
             // The first position is not within the first block.
             // We need to decompress the first block.
             let delta_to_anchor_offset = offset - self.anchor_offset;
````
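The two predicates are equivalent for any `i64` delta; the commit just spells the range test out. A quick standalone check:

```rust
fn main() {
    for delta in [-1i64, 0, 1, 64, 127, 128, 4096] {
        let range_form = !(0..128).contains(&delta);
        let explicit_form = delta < 0 || delta >= 128;
        assert_eq!(range_form, explicit_form);
    }
}
```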
````diff
@@ -109,9 +109,9 @@ impl BlockSearcher {
     /// The results should be equivalent to
     /// ```compile_fail
     /// block[..]
-    ///     .iter()
-    ///     .take_while(|&&val| val < target)
-    ///     .count()
+    //     .iter()
+    //     .take_while(|&&val| val < target)
+    //     .count()
     /// ```
     ///
     /// The `start` argument is just used to hint that the response is
````
````diff
@@ -9,6 +9,7 @@ use crate::postings::{FieldSerializer, InvertedIndexSerializer};
 use crate::schema::IndexRecordOption;
 use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
 use crate::termdict::TermOrdinal;
+use crate::tokenizer::TokenStream;
 use crate::tokenizer::{Token, MAX_TOKEN_LEN};
 use crate::DocId;
 use fnv::FnvHashMap;
@@ -99,10 +100,12 @@ impl MultiFieldPostingsWriter {
         &mut self,
         doc: DocId,
         field: Field,
-        token_stream: &mut dyn Iterator<Item = Token>,
+        token_stream: &mut dyn TokenStream,
         term_buffer: &mut Term,
     ) -> u32 {
-        self.per_field_postings_writers[field.field_id() as usize].index_text(
+        let postings_writer =
+            self.per_field_postings_writers[field.field_id() as usize].deref_mut();
+        postings_writer.index_text(
             &mut self.term_index,
             doc,
             field,
@@ -214,7 +217,7 @@ pub trait PostingsWriter {
         term_index: &mut TermHashMap,
         doc_id: DocId,
         field: Field,
-        token_stream: &mut dyn Iterator<Item = Token>,
+        token_stream: &mut dyn TokenStream,
         heap: &mut MemoryArena,
         term_buffer: &mut Term,
     ) -> u32 {
````
````diff
@@ -239,7 +242,7 @@ pub trait PostingsWriter {
                 );
             }
         };
-        token_stream.map(|tok| sink(&tok)).count() as u32
+        token_stream.process(&mut sink)
     }
 
     fn total_num_tokens(&self) -> u64;
````
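`process` drives the stream itself, calling the sink once per token and returning the token count — the callback replacement for the old `Iterator::map(..).count()` chain. A hedged standalone sketch (crate paths and the `process` return type are taken from these hunks, not from a published release):

```rust
use tantivy::tokenizer::{SimpleTokenizer, Token, Tokenizer};

fn count_and_collect(text: &str) -> (u32, Vec<String>) {
    let mut texts = Vec::new();
    let mut sink = |token: &Token| texts.push(token.text.clone());
    let mut token_stream = SimpleTokenizer.token_stream(text);
    // process() feeds every token to the sink and returns how many it saw.
    let num_tokens = token_stream.process(&mut sink);
    (num_tokens, texts)
}

fn main() {
    let (n, texts) = count_and_collect("hello tantivy world");
    assert_eq!(n, 3);
    assert_eq!(texts, vec!["hello", "tantivy", "world"]);
}
```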
````diff
@@ -289,7 +289,7 @@ impl QueryParser {
             let field_name = field_entry.name().to_string();
             return Err(QueryParserError::FieldNotIndexed(field_name));
         }
-        match field_type {
+        match *field_type {
             FieldType::I64(_) => {
                 let val: i64 = i64::from_str(phrase)?;
                 let term = Term::from_field_i64(field, val);
@@ -312,7 +312,7 @@ impl QueryParser {
                 let term = Term::from_field_u64(field, val);
                 Ok(vec![(0, term)])
             }
-            FieldType::Str(str_options) => {
+            FieldType::Str(ref str_options) => {
                 if let Some(option) = str_options.get_indexing_options() {
                     let tokenizer =
                         self.tokenizer_manager
@@ -323,14 +323,15 @@ impl QueryParser {
                                 option.tokenizer().to_string(),
                             )
                         })?;
-                    let token_stream = tokenizer.token_stream(phrase);
-                    let terms: Vec<_> = token_stream
-                        .map(|token| {
+                    let mut terms: Vec<(usize, Term)> = Vec::new();
+                    let mut token_stream = tokenizer.token_stream(phrase);
+                    token_stream.process(&mut |token| {
                         let term = Term::from_field_text(field, &token.text);
-                            (token.position, term)
-                        })
-                        .collect();
-                    if terms.len() <= 1 {
+                        terms.push((token.position, term));
+                    });
+                    if terms.is_empty() {
+                        Ok(vec![])
+                    } else if terms.len() == 1 {
                         Ok(terms)
                     } else {
                         let field_entry = self.schema.get_field_entry(field);
@@ -413,7 +414,7 @@ impl QueryParser {
         &self,
         given_field: &Option<String>,
     ) -> Result<Cow<'_, [Field]>, QueryParserError> {
-        match given_field {
+        match *given_field {
             None => {
                 if self.default_fields.is_empty() {
                     Err(QueryParserError::NoDefaultFieldDeclared)
@@ -421,7 +422,7 @@ impl QueryParser {
                     Ok(Cow::from(&self.default_fields[..]))
                 }
             }
-            Some(field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
+            Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
         }
     }
 
@@ -573,12 +574,15 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
 #[cfg(test)]
 mod test {
     use super::super::logical_ast::*;
-    use super::*;
+    use super::QueryParser;
+    use super::QueryParserError;
     use crate::query::Query;
     use crate::schema::Field;
     use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
     use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
-    use crate::tokenizer::{analyzer_builder, LowerCaser, SimpleTokenizer, StopWordFilter};
+    use crate::tokenizer::{
+        LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
+    };
     use crate::Index;
     use matches::assert_matches;
 
@@ -616,10 +620,9 @@ mod test {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
             "en_with_stop_words",
-            analyzer_builder(SimpleTokenizer)
-                .filter(LowerCaser::new())
-                .filter(StopWordFilter::remove(vec!["the".to_string()]))
-                .build(),
+            TextAnalyzer::from(SimpleTokenizer)
+                .filter(LowerCaser)
+                .filter(StopWordFilter::remove(vec!["the".to_string()])),
         );
         QueryParser::new(schema, default_fields, tokenizer_manager)
     }
````
````diff
@@ -1,7 +1,7 @@
 use crate::query::Query;
 use crate::schema::Field;
 use crate::schema::Value;
-use crate::tokenizer::{TextAnalyzerT, Token};
+use crate::tokenizer::{TextAnalyzer, Token};
 use crate::Searcher;
 use crate::{Document, Score};
 use htmlescape::encode_minimal;
@@ -139,9 +139,9 @@ impl Snippet {
 ///
 /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
 /// has to be a valid string.
-fn search_fragments(
-    tokenizer: &dyn TextAnalyzerT,
-    text: &str,
+fn search_fragments<'a>(
+    tokenizer: &TextAnalyzer,
+    text: &'a str,
     terms: &BTreeMap<String, Score>,
     max_num_chars: usize,
 ) -> Vec<FragmentCandidate> {
@@ -155,7 +155,7 @@ fn search_fragments(
             };
             fragment = FragmentCandidate::new(next.offset_from);
         }
-        fragment.try_add_token(&next, &terms);
+        fragment.try_add_token(next, &terms);
     }
     if fragment.score > 0.0 {
         fragments.push(fragment)
@@ -249,7 +249,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
 /// ```
 pub struct SnippetGenerator {
     terms_text: BTreeMap<String, Score>,
-    tokenizer: Box<dyn TextAnalyzerT>,
+    tokenizer: TextAnalyzer,
     field: Field,
     max_num_chars: usize,
 }
@@ -297,37 +297,33 @@ impl SnippetGenerator {
     ///
     /// This method extract the text associated to the `SnippetGenerator`'s field
     /// and computes a snippet.
-    pub fn snippet_from_doc(&mut self, doc: &Document) -> Snippet {
+    pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
         let text: String = doc
             .get_all(self.field)
             .flat_map(Value::text)
             .collect::<Vec<&str>>()
             .join(" ");
-        self.snippet(text.as_ref())
+        self.snippet(&text)
     }
 
     /// Generates a snippet for the given text.
-    pub fn snippet(&mut self, text: &str) -> Snippet {
-        let fragment_candidates = search_fragments(
-            &mut *self.tokenizer,
-            text,
-            &self.terms_text,
-            self.max_num_chars,
-        );
-        select_best_fragment_combination(&fragment_candidates[..], text)
+    pub fn snippet(&self, text: &str) -> Snippet {
+        let fragment_candidates =
+            search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars);
+        select_best_fragment_combination(&fragment_candidates[..], &text)
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use super::*;
+    use super::{search_fragments, select_best_fragment_combination};
     use crate::query::QueryParser;
     use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
     use crate::tokenizer::SimpleTokenizer;
-    use crate::tokenizer::TextAnalyzer;
     use crate::Index;
     use crate::SnippetGenerator;
     use maplit::btreemap;
+    use std::collections::BTreeMap;
     use std::iter::Iterator;
 
     const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by
@@ -350,13 +346,7 @@ Survey in 2016, 2017, and 2018."#;
             String::from("rust") => 1.0,
             String::from("language") => 0.9
         };
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            TEST_TEXT,
-            &terms,
-            100,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
         assert_eq!(fragments.len(), 7);
         {
             let first = &fragments[0];
@@ -383,12 +373,7 @@ Survey in 2016, 2017, and 2018."#;
             String::from("rust") =>1.0,
             String::from("language") => 0.9
         };
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            TEST_TEXT,
-            &terms,
-            20,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
         {
             let first = &fragments[0];
             assert_eq!(first.score, 1.0);
@@ -402,12 +387,7 @@ Survey in 2016, 2017, and 2018."#;
             String::from("rust") =>0.9,
             String::from("language") => 1.0
         };
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            TEST_TEXT,
-            &terms,
-            20,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
         //assert_eq!(fragments.len(), 7);
         {
             let first = &fragments[0];
@@ -426,12 +406,7 @@ Survey in 2016, 2017, and 2018."#;
         let mut terms = BTreeMap::new();
         terms.insert(String::from("c"), 1.0);
 
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            &text,
-            &terms,
-            3,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
 
         assert_eq!(fragments.len(), 1);
         {
@@ -453,12 +428,7 @@ Survey in 2016, 2017, and 2018."#;
         let mut terms = BTreeMap::new();
         terms.insert(String::from("f"), 1.0);
 
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            &text,
-            &terms,
-            3,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
 
         assert_eq!(fragments.len(), 2);
         {
@@ -481,12 +451,7 @@ Survey in 2016, 2017, and 2018."#;
         terms.insert(String::from("f"), 1.0);
         terms.insert(String::from("a"), 0.9);
 
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            &text,
-            &terms,
-            7,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7);
 
         assert_eq!(fragments.len(), 2);
         {
@@ -508,12 +473,7 @@ Survey in 2016, 2017, and 2018."#;
         let mut terms = BTreeMap::new();
         terms.insert(String::from("z"), 1.0);
 
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            &text,
-            &terms,
-            3,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
 
         assert_eq!(fragments.len(), 0);
 
@@ -527,12 +487,7 @@ Survey in 2016, 2017, and 2018."#;
         let text = "a b c d";
 
         let terms = BTreeMap::new();
-        let fragments = search_fragments(
-            &Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
-            &text,
-            &terms,
-            3,
-        );
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
         assert_eq!(fragments.len(), 0);
 
         let snippet = select_best_fragment_combination(&fragments[..], &text);
@@ -617,12 +572,12 @@ Survey in 2016, 2017, and 2018."#;
         let mut snippet_generator =
             SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
         {
-            let snippet = snippet_generator.snippet(TEST_TEXT.into());
+            let snippet = snippet_generator.snippet(TEST_TEXT);
             assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
         }
         {
             snippet_generator.set_max_num_chars(90);
-            let snippet = snippet_generator.snippet(TEST_TEXT.into());
+            let snippet = snippet_generator.snippet(TEST_TEXT);
             assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
         }
     }
````
````diff
@@ -43,6 +43,9 @@ impl CheckpointBlock {
 
     /// Adding another checkpoint in the block.
     pub fn push(&mut self, checkpoint: Checkpoint) {
+        if let Some(prev_checkpoint) = self.checkpoints.last() {
+            assert!(checkpoint.follows(prev_checkpoint));
+        }
         self.checkpoints.push(checkpoint);
     }
 
````
````diff
@@ -1,4 +1,4 @@
-const CHECKPOINT_PERIOD: usize = 8;
+const CHECKPOINT_PERIOD: usize = 2;
 
 use std::fmt;
 mod block;
````
````diff
@@ -26,6 +26,13 @@ pub struct Checkpoint {
     pub end_offset: u64,
 }
 
+impl Checkpoint {
+    pub(crate) fn follows(&self, other: &Checkpoint) -> bool {
+        (self.start_doc == other.end_doc) &&
+            (self.start_offset == other.end_offset)
+    }
+}
+
 impl fmt::Debug for Checkpoint {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
````
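`follows` encodes the invariant that checkpoints tile the store with no gaps or overlaps, in doc ids and byte offsets alike, and `CheckpointBlock::push` above now asserts it on every insertion. A small worked example — field names match this hunk, but the struct is redefined locally and all types are simplified to `u64` for the sketch:

```rust
struct Checkpoint {
    start_doc: u64,
    end_doc: u64,
    start_offset: u64,
    end_offset: u64,
}

impl Checkpoint {
    fn follows(&self, other: &Checkpoint) -> bool {
        (self.start_doc == other.end_doc) && (self.start_offset == other.end_offset)
    }
}

fn main() {
    let a = Checkpoint { start_doc: 0, end_doc: 10, start_offset: 0, end_offset: 128 };
    let b = Checkpoint { start_doc: 10, end_doc: 20, start_offset: 128, end_offset: 301 };
    let c = Checkpoint { start_doc: 21, end_doc: 30, start_offset: 301, end_offset: 420 };
    assert!(b.follows(&a));  // contiguous in docs and offsets
    assert!(!c.follows(&b)); // the doc 20..21 gap breaks the invariant
}
```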
````diff
@@ -39,13 +46,16 @@ impl fmt::Debug for Checkpoint {
 #[cfg(test)]
 mod tests {
 
-    use std::io;
+    use std::{io, iter};
 
+    use futures::executor::block_on;
     use proptest::strategy::{BoxedStrategy, Strategy};
 
     use crate::directory::OwnedBytes;
+    use crate::indexer::NoMergePolicy;
+    use crate::schema::{SchemaBuilder, STORED, STRING};
     use crate::store::index::Checkpoint;
-    use crate::DocId;
+    use crate::{DocAddress, DocId, Index, Term};
 
     use super::{SkipIndex, SkipIndexBuilder};
 
@@ -54,7 +64,7 @@ mod tests {
         let mut output: Vec<u8> = Vec::new();
         let skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
         skip_index_builder.write(&mut output)?;
-        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
+        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
         let mut skip_cursor = skip_index.checkpoints();
         assert!(skip_cursor.next().is_none());
         Ok(())
@@ -72,7 +82,7 @@ mod tests {
         };
         skip_index_builder.insert(checkpoint);
         skip_index_builder.write(&mut output)?;
-        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
+        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
         let mut skip_cursor = skip_index.checkpoints();
         assert_eq!(skip_cursor.next(), Some(checkpoint));
         assert_eq!(skip_cursor.next(), None);
@@ -121,7 +131,7 @@ mod tests {
         }
         skip_index_builder.write(&mut output)?;
 
-        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
+        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
         assert_eq!(
             &skip_index.checkpoints().collect::<Vec<_>>()[..],
             &checkpoints[..]
@@ -133,6 +143,40 @@ mod tests {
         (doc as u64) * (doc as u64)
     }
 
+    #[test]
+    fn test_merge_store_with_stacking_reproducing_issue969() -> crate::Result<()> {
+        let mut schema_builder = SchemaBuilder::default();
+        let text = schema_builder.add_text_field("text", STORED | STRING);
+        let body = schema_builder.add_text_field("body", STORED);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests()?;
+        index_writer.set_merge_policy(Box::new(NoMergePolicy));
+        let long_text: String = iter::repeat("abcdefghijklmnopqrstuvwxyz")
+            .take(1_000)
+            .collect();
+        for _ in 0..20 {
+            index_writer.add_document(doc!(body=>long_text.clone()));
+        }
+        index_writer.commit()?;
+        index_writer.add_document(doc!(text=>"testb"));
+        for _ in 0..10 {
+            index_writer.add_document(doc!(text=>"testd", body=>long_text.clone()));
+        }
+        index_writer.commit()?;
+        index_writer.delete_term(Term::from_field_text(text, "testb"));
+        index_writer.commit()?;
+        let segment_ids = index.searchable_segment_ids()?;
+        block_on(index_writer.merge(&segment_ids))?;
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+        assert_eq!(searcher.num_docs(), 30);
+        for i in 0..searcher.num_docs() as u32 {
+            let _doc = searcher.doc(DocAddress(0u32, i))?;
+        }
+        Ok(())
+    }
+
     #[test]
     fn test_skip_index_long() -> io::Result<()> {
         let mut output: Vec<u8> = Vec::new();
````
````diff
@@ -150,7 +194,7 @@ mod tests {
         }
         skip_index_builder.write(&mut output)?;
         assert_eq!(output.len(), 4035);
-        let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::from(OwnedBytes::new(output))
+        let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::open(OwnedBytes::new(output))
             .checkpoints()
             .collect();
         assert_eq!(&resulting_checkpoints, &checkpoints);
@@ -221,7 +265,7 @@ mod tests {
         }
         let mut buffer = Vec::new();
         skip_index_builder.write(&mut buffer).unwrap();
-        let skip_index = SkipIndex::from(OwnedBytes::new(buffer));
+        let skip_index = SkipIndex::open(OwnedBytes::new(buffer));
         let iter_checkpoints: Vec<Checkpoint> = skip_index.checkpoints().collect();
         assert_eq!(&checkpoints[..], &iter_checkpoints[..]);
         test_skip_index_aux(skip_index, &checkpoints[..]);
````
````diff
@@ -35,11 +35,11 @@ struct Layer {
 }
 
 impl Layer {
-    fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
+    fn cursor<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
         self.cursor_at_offset(0u64)
     }
 
-    fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
+    fn cursor_at_offset<'a>(&'a self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + 'a {
         let data = &self.data.as_slice();
         LayerCursor {
             remaining: &data[start_offset as usize..],
@@ -59,7 +59,25 @@ pub struct SkipIndex {
 }
 
 impl SkipIndex {
-    pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
+    pub fn open(mut data: OwnedBytes) -> SkipIndex {
+        let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
+            .unwrap()
+            .into_iter()
+            .map(|el| el.0)
+            .collect();
+        let mut start_offset = 0;
+        let mut layers = Vec::new();
+        for end_offset in offsets {
+            let layer = Layer {
+                data: data.slice(start_offset as usize, end_offset as usize),
+            };
+            layers.push(layer);
+            start_offset = end_offset;
+        }
+        SkipIndex { layers }
+    }
+
+    pub(crate) fn checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
         self.layers
             .last()
             .into_iter()
````
````diff
@@ -90,22 +108,3 @@ impl SkipIndex {
         Some(cur_checkpoint)
     }
 }
-
-impl From<OwnedBytes> for SkipIndex {
-    fn from(mut data: OwnedBytes) -> SkipIndex {
-        let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
-            .unwrap()
-            .into_iter()
-            .map(|el| el.0)
-            .collect();
-        let mut start_offset = 0;
-        let mut layers = Vec::new();
-        for end_offset in offsets {
-            layers.push(Layer {
-                data: data.slice(start_offset as usize, end_offset as usize),
-            });
-            start_offset = end_offset;
-        }
-        SkipIndex { layers }
-    }
-}
````
````diff
@@ -28,18 +28,20 @@ impl LayerBuilder {
     ///
     /// If the block was empty to begin with, simply return None.
     fn flush_block(&mut self) -> Option<Checkpoint> {
-        self.block.doc_interval().map(|(start_doc, end_doc)| {
+        if let Some((start_doc, end_doc)) = self.block.doc_interval() {
             let start_offset = self.buffer.len() as u64;
             self.block.serialize(&mut self.buffer);
             let end_offset = self.buffer.len() as u64;
             self.block.clear();
-            Checkpoint {
+            Some(Checkpoint {
                 start_doc,
                 end_doc,
                 start_offset,
                 end_offset,
-            }
-        })
+            })
+        } else {
+            None
+        }
     }
 
     fn push(&mut self, checkpoint: Checkpoint) {
````
````diff
@@ -48,7 +50,7 @@ impl LayerBuilder {
 
     fn insert(&mut self, checkpoint: Checkpoint) -> Option<Checkpoint> {
         self.push(checkpoint);
-        let emit_skip_info = (self.block.len() % CHECKPOINT_PERIOD) == 0;
+        let emit_skip_info = self.block.len() >= CHECKPOINT_PERIOD;
         if emit_skip_info {
             self.flush_block()
         } else {
````
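Together with `CHECKPOINT_PERIOD` dropping from 8 to 2 above, the flush predicate changes from firing only at exact multiples to firing whenever the block has reached the period — which also recovers if a flush was ever skipped. A standalone comparison of the two predicates:

```rust
const CHECKPOINT_PERIOD: usize = 2;

fn main() {
    // A block that somehow grew to 3 checkpoints (e.g. an earlier flush
    // returned without emitting):
    let len = 3;
    let modulo_form = (len % CHECKPOINT_PERIOD) == 0; // false: waits until 4
    let threshold_form = len >= CHECKPOINT_PERIOD;    // true: flushes now
    assert!(!modulo_form);
    assert!(threshold_form);
}
```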
````diff
@@ -35,7 +35,7 @@ impl StoreReader {
         let (data_file, offset_index_file) = split_file(store_file)?;
         let index_data = offset_index_file.read_bytes()?;
         let space_usage = StoreSpaceUsage::new(data_file.len(), offset_index_file.len());
-        let skip_index = SkipIndex::from(index_data);
+        let skip_index = SkipIndex::open(index_data);
         Ok(StoreReader {
             data: data_file,
             cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))),
@@ -46,7 +46,7 @@ impl StoreReader {
         })
     }
 
-    pub(crate) fn block_checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
+    pub(crate) fn block_checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
         self.skip_index.checkpoints()
     }
 
````
src/store/tests_store.rs (new file, 50 lines)

````diff
@@ -0,0 +1,50 @@
+use std::path::Path;
+
+use crate::HasLen;
+use crate::directory::{Directory, ManagedDirectory, MmapDirectory, RAMDirectory};
+use crate::fastfield::DeleteBitSet;
+
+use super::{StoreReader, StoreWriter};
+
+#[test]
+fn test_toto2() -> crate::Result<()> {
+    let directory = ManagedDirectory::wrap(MmapDirectory::open("src/store/broken_seg")?)?;
+    let path = Path::new("b6029ade1b954ea1acad15b432eaacb9.store");
+    assert!(directory.validate_checksum(path)?);
+    let store_file = directory.open_read(path)?;
+    let store = StoreReader::open(store_file)?;
+    let documents = store.documents();
+    // for doc in documents {
+    //     println!("{:?}", doc);
+    // }
+    let doc = store.get(15_086)?;
+    Ok(())
+}
+
+#[test]
+fn test_toto() -> crate::Result<()> {
+    let directory = ManagedDirectory::wrap(MmapDirectory::open("src/store/broken_seg")?)?;
+    assert!(directory.validate_checksum(Path::new("e6ece22e5bca4e0dbe7ce3e4dcbd5bbf.store"))?);
+    let store_file = directory.open_read(Path::new("e6ece22e5bca4e0dbe7ce3e4dcbd5bbf.store.patched"))?;
+    let store = StoreReader::open(store_file)?;
+    let doc = store.get(53)?;
+    println!("{:?}", doc);
+    // let documents = store.documents();
+    // let ram_directory = RAMDirectory::create();
+    // let path = Path::new("store");
+
+    // let store_wrt = ram_directory.open_write(path)?;
+    // let mut store_writer = StoreWriter::new(store_wrt);
+    // for doc in &documents {
+    //     store_writer.store(doc)?;
+    // }
+    // store_writer.close()?;
+    // let store_data = ram_directory.open_read(path)?;
+    // let new_store = StoreReader::open(store_data)?;
+    // for doc in 0..59 {
+    //     println!("{}", doc);
+    //     let doc = new_store.get(doc)?;
+    //     println!("{:?}", doc);
+    // }
+    Ok(())
+}
````
@@ -10,7 +10,7 @@ use crate::store::index::Checkpoint;
 use crate::DocId;
 use std::io::{self, Write};

-const BLOCK_SIZE: usize = 16_384;
+const BLOCK_SIZE: usize = 30;

 /// Write tantivy's [`Store`](./index.html)
 ///
@@ -72,6 +72,7 @@ impl StoreWriter
         if !self.current_block.is_empty() {
             self.write_and_compress_block()?;
         }
+        assert_eq!(self.first_doc_in_block, self.doc);
         let doc_shift = self.doc;
         let start_shift = self.writer.written_bytes() as u64;

@@ -86,12 +87,17 @@ impl StoreWriter
             checkpoint.end_doc += doc_shift;
             checkpoint.start_offset += start_shift;
             checkpoint.end_offset += start_shift;
-            self.offset_index_writer.insert(checkpoint);
-            self.doc = checkpoint.end_doc;
+            self.register_checkpoint(checkpoint);
         }
         Ok(())
     }

+    fn register_checkpoint(&mut self, checkpoint: Checkpoint) {
+        self.offset_index_writer.insert(checkpoint);
+        self.first_doc_in_block = checkpoint.end_doc;
+        self.doc = checkpoint.end_doc;
+    }
+
     fn write_and_compress_block(&mut self) -> io::Result<()> {
         assert!(self.doc > 0);
         self.intermediary_buffer.clear();
@@ -100,14 +106,13 @@ impl StoreWriter
         self.writer.write_all(&self.intermediary_buffer)?;
         let end_offset = self.writer.written_bytes();
         let end_doc = self.doc;
-        self.offset_index_writer.insert(Checkpoint {
+        self.register_checkpoint(Checkpoint {
             start_doc: self.first_doc_in_block,
             end_doc,
             start_offset,
            end_offset,
         });
         self.current_block.clear();
-        self.first_doc_in_block = self.doc;
         Ok(())
     }
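The new `register_checkpoint` helper gives the writer a single place where the doc counters move past a finished block, so `first_doc_in_block` and `doc` can no longer drift apart; that is exactly the invariant the added `assert_eq!` checks. A hedged sketch of the invariant, again with toy types rather than the crate's own:

#[derive(Clone, Copy)]
struct Checkpoint {
    end_doc: u32, // simplified: the real Checkpoint also carries start_doc and offsets
}

struct WriterSketch {
    first_doc_in_block: u32,
    doc: u32,
}

impl WriterSketch {
    /// The single place where the counters advance past a finished block.
    fn register_checkpoint(&mut self, checkpoint: Checkpoint) {
        self.first_doc_in_block = checkpoint.end_doc;
        self.doc = checkpoint.end_doc;
    }
}

fn main() {
    let mut w = WriterSketch { first_doc_in_block: 0, doc: 10 };
    w.register_checkpoint(Checkpoint { end_doc: 10 });
    // The invariant the diff's `assert_eq!` relies on:
    assert_eq!(w.first_doc_in_block, w.doc);
}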
@@ -2,16 +2,16 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = analyzer_builder(RawTokenizer)
-//!   .filter(AlphaNumOnlyFilter).build();
+//! let tokenizer = TextAnalyzer::from(RawTokenizer)
+//!   .filter(AlphaNumOnlyFilter);
 //!
 //! let mut stream = tokenizer.token_stream("hello there");
 //! // is none because the raw filter emits one token that
 //! // contains a space
 //! assert!(stream.next().is_none());
 //!
-//! let tokenizer = analyzer_builder(SimpleTokenizer)
-//!   .filter(AlphaNumOnlyFilter).build();
+//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+//!   .filter(AlphaNumOnlyFilter);
 //!
 //! let mut stream = tokenizer.token_stream("hello there 💣");
 //! assert!(stream.next().is_some());
@@ -19,18 +19,45 @@
 //! // the "emoji" is dropped because its not an alphanum
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter};
+use super::{BoxTokenStream, Token, TokenFilter, TokenStream};

 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
-#[derive(Clone, Debug, Default)]
+#[derive(Clone)]
 pub struct AlphaNumOnlyFilter;

-impl TokenFilter for AlphaNumOnlyFilter {
-    fn transform(&mut self, token: Token) -> Option<Token> {
-        if token.text.chars().all(|c| c.is_ascii_alphanumeric()) {
-            return Some(token);
-        }
-        None
+pub struct AlphaNumOnlyFilterStream<'a> {
+    tail: BoxTokenStream<'a>,
+}
+
+impl<'a> AlphaNumOnlyFilterStream<'a> {
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+}
+
+impl TokenFilter for AlphaNumOnlyFilter {
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
+    }
+}
+
+impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
+    fn advance(&mut self) -> bool {
+        while self.tail.advance() {
+            if self.predicate(self.tail.token()) {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
     }
 }
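Under the new trait shape a filter no longer maps `Token -> Option<Token>`; it wraps the upstream `BoxTokenStream` and skips rejected tokens inside `advance`. Assuming the API as this branch defines it, a caller drains the filtered stream like this (an illustrative sketch, not a test from the repo):

use tantivy::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter);
    let mut stream = tokenizer.token_stream("hello there 💣");
    let mut kept = vec![];
    // `advance` skips rejected tokens internally and returns false at the end.
    while stream.advance() {
        kept.push(stream.token().text.clone());
    }
    assert_eq!(kept, vec!["hello", "there"]);
}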
@@ -1,31 +1,45 @@
-use super::{Token, TokenFilter};
+use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 use std::mem;

 /// This class converts alphabetic, numeric, and symbolic Unicode characters
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
-#[derive(Clone, Debug, Default)]
-pub struct AsciiFolding {
-    buffer: String,
-}
+#[derive(Clone)]
+pub struct AsciiFoldingFilter;

-impl AsciiFolding {
-    /// Construct a new `AsciiFolding` filter.
-    pub fn new() -> Self {
-        Self {
-            buffer: String::with_capacity(100),
-        }
-    }
-}
-
-impl TokenFilter for AsciiFolding {
-    fn transform(&mut self, mut token: Token) -> Option<Token> {
-        if !token.text.is_ascii() {
-            // ignore its already ascii
-            to_ascii(&token.text, &mut self.buffer);
-            mem::swap(&mut token.text, &mut self.buffer);
-        }
-        Some(token)
+impl TokenFilter for AsciiFoldingFilter {
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        From::from(AsciiFoldingFilterTokenStream {
+            tail: token_stream,
+            buffer: String::with_capacity(100),
+        })
+    }
+}
+
+pub struct AsciiFoldingFilterTokenStream<'a> {
+    buffer: String,
+    tail: BoxTokenStream<'a>,
+}
+
+impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        if !self.tail.advance() {
+            return false;
+        }
+        if !self.token_mut().text.is_ascii() {
+            // ignore its already ascii
+            to_ascii(&mut self.tail.token_mut().text, &mut self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+        }
+        true
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
     }
 }
@@ -1512,7 +1526,7 @@ fn fold_non_ascii_char(c: char) -> Option<&'static str> {
 }

 // https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java#L187
-fn to_ascii(text: &String, output: &mut String) {
+fn to_ascii(text: &mut String, output: &mut String) {
     output.clear();

     for c in text.chars() {
@@ -1526,8 +1540,11 @@ fn to_ascii(text: &String, output: &mut String) {

 #[cfg(test)]
 mod tests {
-    use super::super::*;
-    use super::*;
+    use super::to_ascii;
+    use crate::tokenizer::AsciiFoldingFilter;
+    use crate::tokenizer::RawTokenizer;
+    use crate::tokenizer::SimpleTokenizer;
+    use crate::tokenizer::TextAnalyzer;
     use std::iter;

     #[test]
@@ -1543,22 +1560,22 @@ mod tests {
     }

     fn folding_helper(text: &str) -> Vec<String> {
-        let tokens = analyzer_builder(SimpleTokenizer)
-            .filter(AsciiFolding::new())
-            .build()
-            .token_stream(text)
-            .map(|token| token.text.clone())
-            .collect();
+        let mut tokens = Vec::new();
+        TextAnalyzer::from(SimpleTokenizer)
+            .filter(AsciiFoldingFilter)
+            .token_stream(text)
+            .process(&mut |token| {
+                tokens.push(token.text.clone());
+            });
         tokens
     }

     fn folding_using_raw_tokenizer_helper(text: &str) -> String {
-        let mut token_stream = analyzer_builder(RawTokenizer)
-            .filter(AsciiFolding::new())
-            .build()
+        let mut token_stream = TextAnalyzer::from(RawTokenizer)
+            .filter(AsciiFoldingFilter)
             .token_stream(text);
-        let Token { text, .. } = token_stream.next().unwrap();
-        text
+        token_stream.advance();
+        token_stream.token().text.clone()
     }

     #[test]
@@ -1609,9 +1626,9 @@ mod tests {

     #[test]
     fn test_to_ascii() {
-        let input = "Rámon".to_string();
+        let mut input = "Rámon".to_string();
         let mut buffer = String::new();
-        to_ascii(&input, &mut buffer);
+        to_ascii(&mut input, &mut buffer);
         assert_eq!("Ramon", buffer);
     }
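The per-stream scratch `buffer` plus `mem::swap` keeps the fold allocation-free per token: the folded text is built in the buffer and the two `String`s trade places, each keeping its capacity. The same pattern in isolation, with a toy fold standing in for `to_ascii`:

use std::mem;

/// Toy "fold": uppercase ASCII only, standing in for the real `to_ascii`.
fn fold_into(text: &str, output: &mut String) {
    output.clear();
    output.extend(text.chars().map(|c| c.to_ascii_uppercase()));
}

fn main() {
    let mut token_text = String::from("rámon");
    let mut buffer = String::with_capacity(100);
    fold_into(&token_text, &mut buffer);
    // Swap instead of assigning: both Strings keep their capacity.
    mem::swap(&mut token_text, &mut buffer);
    assert_eq!(token_text, "RáMON"); // 'á' has no ASCII uppercase, so it is left alone
}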
@@ -1,4 +1,4 @@
-use super::{Token, Tokenizer};
+use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
 use crate::schema::FACET_SEP_BYTE;

 /// The `FacetTokenizer` process a `Facet` binary representation
@@ -9,63 +9,72 @@ use crate::schema::FACET_SEP_BYTE;
 /// - `/america/north_america/canada`
 /// - `/america/north_america`
 /// - `/america`
-#[derive(Clone, Debug, Default)]
+#[derive(Clone)]
 pub struct FacetTokenizer;

-#[derive(Clone, Debug)]
+#[derive(Debug)]
 enum State {
     RootFacetNotEmitted,
     UpToPosition(usize), //< we already emitted facet prefix up to &text[..cursor]
     Terminated,
 }

-#[derive(Clone, Debug)]
-pub struct FacetTokenStream {
-    text: String,
+pub struct FacetTokenStream<'a> {
+    text: &'a str,
     state: State,
     token: Token,
 }

 impl Tokenizer for FacetTokenizer {
-    type Iter = FacetTokenStream;
-    fn token_stream(&self, text: &str) -> Self::Iter {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
         FacetTokenStream {
-            text: text.to_string(),
+            text,
             state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
             token: Token::default(),
         }
+        .into()
     }
 }

-impl Iterator for FacetTokenStream {
-    type Item = Token;
-    fn next(&mut self) -> Option<Self::Item> {
-        self.state = match self.state {
+impl<'a> TokenStream for FacetTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        match self.state {
             State::RootFacetNotEmitted => {
-                if self.text.is_empty() {
+                self.state = if self.text.is_empty() {
                     State::Terminated
                 } else {
                     State::UpToPosition(0)
-                }
+                };
+                true
             }
             State::UpToPosition(cursor) => {
-                if let Some(next_sep_pos) = self.text.as_bytes()[cursor + 1..]
+                let bytes: &[u8] = self.text.as_bytes();
+                if let Some(next_sep_pos) = bytes[cursor + 1..]
                     .iter()
-                    .position(|&b| b == FACET_SEP_BYTE)
+                    .cloned()
+                    .position(|b| b == FACET_SEP_BYTE)
                     .map(|pos| cursor + 1 + pos)
                 {
                     let facet_part = &self.text[cursor..next_sep_pos];
                     self.token.text.push_str(facet_part);
-                    State::UpToPosition(next_sep_pos)
+                    self.state = State::UpToPosition(next_sep_pos);
                 } else {
                     let facet_part = &self.text[cursor..];
                     self.token.text.push_str(facet_part);
-                    State::Terminated
+                    self.state = State::Terminated;
                 }
+                true
             }
-            State::Terminated => return None,
-        };
-        Some(self.token.clone())
+            State::Terminated => false,
+        }
+    }
+
+    fn token(&self) -> &Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
     }
 }
@@ -74,19 +83,21 @@ mod tests {

     use super::FacetTokenizer;
     use crate::schema::Facet;
-    use crate::tokenizer::Tokenizer;
+    use crate::tokenizer::{Token, Tokenizer};

     #[test]
     fn test_facet_tokenizer() {
         let facet = Facet::from_path(vec!["top", "a", "b"]);
-        let tokens: Vec<_> = FacetTokenizer
-            .token_stream(facet.encoded_str())
-            .map(|token| {
-                Facet::from_encoded(token.text.as_bytes().to_owned())
-                    .unwrap()
-                    .to_string()
-            })
-            .collect();
+        let mut tokens = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
+                tokens.push(format!("{}", facet));
+            };
+            FacetTokenizer
+                .token_stream(facet.encoded_str())
+                .process(&mut add_token);
+        }
         assert_eq!(tokens.len(), 4);
         assert_eq!(tokens[0], "/");
         assert_eq!(tokens[1], "/top");
@@ -97,14 +108,16 @@ mod tests {
     #[test]
     fn test_facet_tokenizer_root_facets() {
         let facet = Facet::root();
-        let tokens: Vec<_> = FacetTokenizer
-            .token_stream(facet.encoded_str())
-            .map(|token| {
-                Facet::from_encoded(token.text.as_bytes().to_owned())
-                    .unwrap()
-                    .to_string()
-            })
-            .collect();
+        let mut tokens = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
+                tokens.push(format!("{}", facet));
+            };
+            FacetTokenizer
+                .token_stream(facet.encoded_str()) // ok test
+                .process(&mut add_token);
+        }
         assert_eq!(tokens.len(), 1);
         assert_eq!(tokens[0], "/");
     }
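`FacetTokenStream` keeps appending to `token.text`, so each `advance` emits one more path prefix; for `/top/a/b` the test expects `/`, `/top`, `/top/a`, `/top/a/b`. The emission order can be reproduced with plain string handling (a sketch; the real tokenizer walks the `FACET_SEP_BYTE`-encoded form):

/// Emit every prefix of a `/`-separated facet path, root first.
fn facet_prefixes(path: &str) -> Vec<String> {
    let mut prefixes = vec!["/".to_string()];
    let mut current = String::new();
    for segment in path.split('/').filter(|s| !s.is_empty()) {
        current.push('/');
        current.push_str(segment);
        prefixes.push(current.clone());
    }
    prefixes
}

fn main() {
    assert_eq!(
        facet_prefixes("/top/a/b"),
        vec!["/", "/top", "/top/a", "/top/a/b"]
    );
}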
@@ -1,36 +1,27 @@
-use super::{Token, TokenFilter};
+use super::{Token, TokenFilter, TokenStream};
+use crate::tokenizer::BoxTokenStream;
 use std::mem;

 impl TokenFilter for LowerCaser {
-    fn transform(&mut self, mut token: Token) -> Option<Token> {
-        if token.text.is_ascii() {
-            // fast track for ascii.
-            token.text.make_ascii_lowercase();
-        } else {
-            to_lowercase_unicode(&token.text, &mut self.buffer);
-            mem::swap(&mut token.text, &mut self.buffer);
-        }
-        Some(token)
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(LowerCaserTokenStream {
+            tail: token_stream,
+            buffer: String::with_capacity(100),
+        })
     }
 }

 /// Token filter that lowercase terms.
-#[derive(Clone, Debug, Default)]
-pub struct LowerCaser {
-    buffer: String,
-}
+#[derive(Clone)]
+pub struct LowerCaser;

-impl LowerCaser {
-    /// Initialize the `LowerCaser`
-    pub fn new() -> Self {
-        LowerCaser {
-            buffer: String::with_capacity(100),
-        }
-    }
+pub struct LowerCaserTokenStream<'a> {
+    buffer: String,
+    tail: BoxTokenStream<'a>,
 }

 // writes a lowercased version of text into output.
-fn to_lowercase_unicode(text: &String, output: &mut String) {
+fn to_lowercase_unicode(text: &mut String, output: &mut String) {
     output.clear();
     for c in text.chars() {
         // Contrary to the std, we do not take care of sigma special case.
@@ -39,31 +30,57 @@ fn to_lowercase_unicode(text: &String, output: &mut String) {
     }
 }

+impl<'a> TokenStream for LowerCaserTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        if !self.tail.advance() {
+            return false;
+        }
+        if self.token_mut().text.is_ascii() {
+            // fast track for ascii.
+            self.token_mut().text.make_ascii_lowercase();
+        } else {
+            to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+        }
+        true
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+}
+
 #[cfg(test)]
 mod tests {
-    use super::*;
-    use crate::tokenizer::{analyzer_builder, LowerCaser, SimpleTokenizer, TextAnalyzerT};
+    use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

     #[test]
     fn test_to_lower_case() {
-        assert_eq!(lowercase_helper("Русский текст"), vec!["русский", "текст"]);
+        assert_eq!(
+            lowercase_helper("Русский текст"),
+            vec!["русский".to_string(), "текст".to_string()]
+        );
     }

     fn lowercase_helper(text: &str) -> Vec<String> {
-        analyzer_builder(SimpleTokenizer)
-            .filter(LowerCaser::new())
-            .build()
-            .token_stream(text)
-            .map(|token| {
-                let Token { text, .. } = token;
-                text
-            })
-            .collect()
+        let mut tokens = vec![];
+        let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
+            .filter(LowerCaser)
+            .token_stream(text);
+        while token_stream.advance() {
+            let token_text = token_stream.token().text.clone();
+            tokens.push(token_text);
+        }
+        tokens
    }

     #[test]
     fn test_lowercaser() {
-        assert_eq!(lowercase_helper("Tree"), vec!["tree"]);
-        assert_eq!(lowercase_helper("Русский"), vec!["русский"]);
+        assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
+        assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
     }
 }
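`LowerCaser` becomes a unit struct again because the scratch buffer now lives on the per-call token stream rather than on the filter. Collecting the lowered tokens follows the same `advance` loop as the updated `lowercase_helper`; a minimal usage sketch, assuming this branch's API:

use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};

fn lowered(text: &str) -> Vec<String> {
    let mut tokens = vec![];
    let mut stream = TextAnalyzer::from(SimpleTokenizer)
        .filter(LowerCaser)
        .token_stream(text);
    while stream.advance() {
        tokens.push(stream.token().text.clone());
    }
    tokens
}

fn main() {
    assert_eq!(lowered("Hello World"), vec!["hello", "world"]);
}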
@@ -64,10 +64,10 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let en_stem = analyzer_builder(SimpleTokenizer)
+//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
 //!   .filter(RemoveLongFilter::limit(40))
-//!   .filter(LowerCaser::new())
-//!   .filter(Stemmer::new(Language::English)).build();
+//!   .filter(LowerCaser)
+//!   .filter(Stemmer::new(Language::English));
 //! ```
 //!
 //! Once your tokenizer is defined, you need to
@@ -109,9 +109,9 @@
 //! let index = Index::create_in_ram(schema);
 //!
 //! // We need to register our tokenizer :
-//! let custom_en_tokenizer = analyzer_builder(SimpleTokenizer)
+//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
 //!   .filter(RemoveLongFilter::limit(40))
-//!   .filter(LowerCaser::new()).build();
+//!   .filter(LowerCaser);
 //! index
 //!   .tokenizers()
 //!   .register("custom_en", custom_en_tokenizer);
@@ -133,7 +133,7 @@ mod tokenizer;
 mod tokenizer_manager;

 pub use self::alphanum_only::AlphaNumOnlyFilter;
-pub use self::ascii_folding_filter::AsciiFolding;
+pub use self::ascii_folding_filter::AsciiFoldingFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
 pub use self::lower_caser::LowerCaser;
 pub use self::ngram_tokenizer::NgramTokenizer;
@@ -142,11 +142,11 @@ pub use self::remove_long::RemoveLongFilter;
 pub use self::simple_tokenizer::SimpleTokenizer;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
-pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain};
+pub(crate) use self::token_stream_chain::TokenStreamChain;

 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 pub use self::tokenizer::{
-    analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, Tokenizer,
+    BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
 };

 pub use self::tokenizer_manager::TokenizerManager;
@@ -160,7 +160,10 @@ pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;

 #[cfg(test)]
 pub mod tests {
-    use super::*;
+    use super::{
+        Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
+    };
+    use crate::tokenizer::TextAnalyzer;

     /// This is a function that can be used in tests and doc tests
     /// to assert a token's correctness.
@@ -187,9 +190,15 @@ pub mod tests {
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("raw").unwrap();
-        let tokens: Vec<Token> = en_tokenizer
-            .token_stream("Hello, happy tax payer!")
-            .collect();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            en_tokenizer
+                .token_stream("Hello, happy tax payer!")
+                .process(&mut add_token);
+        }
         assert_eq!(tokens.len(), 1);
         assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
     }
@@ -199,9 +208,15 @@ pub mod tests {
         let tokenizer_manager = TokenizerManager::default();
         assert!(tokenizer_manager.get("en_doesnotexist").is_none());
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
-        let tokens: Vec<Token> = en_tokenizer
-            .token_stream("Hello, happy tax payer!")
-            .collect();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            en_tokenizer
+                .token_stream("Hello, happy tax payer!")
+                .process(&mut add_token);
+        }

         assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "hello", 0, 5);
@@ -215,16 +230,21 @@ pub mod tests {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
             "el_stem",
-            analyzer_builder(SimpleTokenizer)
+            TextAnalyzer::from(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser::new())
-                .filter(Stemmer::new(Language::Greek))
-                .build(),
+                .filter(LowerCaser)
+                .filter(Stemmer::new(Language::Greek)),
         );
         let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
-        let tokens: Vec<Token> = en_tokenizer
-            .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
-            .collect();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            en_tokenizer
+                .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
+                .process(&mut add_token);
+        }

         assert_eq!(tokens.len(), 3);
         assert_token(&tokens[0], 0, "καλημερ", 0, 16);
@@ -236,9 +256,25 @@ pub mod tests {
     fn test_tokenizer_empty() {
         let tokenizer_manager = TokenizerManager::default();
         let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
-        let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
-        assert!(tokens.is_empty());
-        let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
-        assert!(tokens.is_empty());
+        {
+            let mut tokens: Vec<Token> = vec![];
+            {
+                let mut add_token = |token: &Token| {
+                    tokens.push(token.clone());
+                };
+                en_tokenizer.token_stream(" ").process(&mut add_token);
+            }
+            assert!(tokens.is_empty());
+        }
+        {
+            let mut tokens: Vec<Token> = vec![];
+            {
+                let mut add_token = |token: &Token| {
+                    tokens.push(token.clone());
+                };
+                en_tokenizer.token_stream(" ").process(&mut add_token);
+            }
+            assert!(tokens.is_empty());
+        }
     }
 }
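The tests above funnel tokens through `process`, which drives the stream to exhaustion and hands each token to a callback by reference; the extra scope around `add_token` is what lets the closure mutably borrow `tokens`. The same pattern condensed (a sketch against this branch's API):

use tantivy::tokenizer::{SimpleTokenizer, TextAnalyzer, Token, TokenStream};

fn main() {
    let mut texts: Vec<String> = vec![];
    TextAnalyzer::from(SimpleTokenizer)
        .token_stream("happy tax payer")
        .process(&mut |token: &Token| texts.push(token.text.clone()));
    assert_eq!(texts, vec!["happy", "tax", "payer"]);
}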
@@ -1,4 +1,5 @@
-use super::{Token, Tokenizer};
+use super::{Token, TokenStream, Tokenizer};
+use crate::tokenizer::BoxTokenStream;

 /// Tokenize the text by splitting words into n-grams of the given size(s)
 ///
@@ -78,7 +79,7 @@ use super::{Token, Tokenizer};
 /// }
 /// assert!(stream.next().is_none());
 /// ```
-#[derive(Clone, Debug, Default)]
+#[derive(Clone)]
 pub struct NgramTokenizer {
     /// min size of the n-gram
     min_gram: usize,
@@ -118,48 +119,54 @@ impl NgramTokenizer {
 }

 /// TokenStream associate to the `NgramTokenizer`
-pub struct NgramTokenStream {
+pub struct NgramTokenStream<'a> {
     /// parameters
-    ngram_charidx_iterator: StutteringIterator<CodepointFrontiers>,
+    ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
     /// true if the NgramTokenStream is in prefix mode.
     prefix_only: bool,
     /// input
-    text: String,
+    text: &'a str,
     /// output
     token: Token,
 }

 impl Tokenizer for NgramTokenizer {
-    type Iter = NgramTokenStream;
-    fn token_stream(&self, text: &str) -> Self::Iter {
-        NgramTokenStream {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        From::from(NgramTokenStream {
             ngram_charidx_iterator: StutteringIterator::new(
                 CodepointFrontiers::for_str(text),
                 self.min_gram,
                 self.max_gram,
             ),
             prefix_only: self.prefix_only,
-            text: text.to_string(),
+            text,
             token: Token::default(),
-        }
+        })
     }
 }

-impl Iterator for NgramTokenStream {
-    type Item = Token;
-    fn next(&mut self) -> Option<Self::Item> {
+impl<'a> TokenStream for NgramTokenStream<'a> {
+    fn advance(&mut self) -> bool {
         if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
             if self.prefix_only && offset_from > 0 {
-                return None;
+                return false;
             }
             self.token.position = 0;
             self.token.offset_from = offset_from;
             self.token.offset_to = offset_to;
             self.token.text.clear();
             self.token.text.push_str(&self.text[offset_from..offset_to]);
-            return Some(self.token.clone());
-        };
-        None
+            true
+        } else {
+            false
+        }
+    }
+
+    fn token(&self) -> &Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
+    }
 }
@@ -250,21 +257,21 @@ where
 /// or a codepoint ends.
 ///
 /// By convention, we emit [0] for the empty string.
-struct CodepointFrontiers {
-    s: String,
+struct CodepointFrontiers<'a> {
+    s: &'a str,
     next_el: Option<usize>,
 }

-impl CodepointFrontiers {
-    fn for_str(s: &str) -> Self {
+impl<'a> CodepointFrontiers<'a> {
+    fn for_str(s: &'a str) -> Self {
         CodepointFrontiers {
-            s: s.to_string(),
+            s,
             next_el: Some(0),
         }
     }
 }

-impl<'a> Iterator for CodepointFrontiers {
+impl<'a> Iterator for CodepointFrontiers<'a> {
     type Item = usize;

     fn next(&mut self) -> Option<usize> {
@@ -273,7 +280,7 @@ impl<'a> Iterator for CodepointFrontiers {
             self.next_el = None;
         } else {
             let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
-            self.s = (&self.s[first_codepoint_width..]).to_string();
+            self.s = &self.s[first_codepoint_width..];
             self.next_el = Some(offset + first_codepoint_width);
         }
         offset
@@ -294,8 +301,20 @@ fn utf8_codepoint_width(b: u8) -> usize {

 #[cfg(test)]
 mod tests {
-    use super::*;
+    use super::utf8_codepoint_width;
+    use super::CodepointFrontiers;
+    use super::NgramTokenizer;
+    use super::StutteringIterator;
     use crate::tokenizer::tests::assert_token;
+    use crate::tokenizer::tokenizer::Tokenizer;
+    use crate::tokenizer::{BoxTokenStream, Token};
+
+    fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
+        let mut tokens: Vec<Token> = vec![];
+        tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
+        tokens
+    }

     #[test]
     fn test_utf8_codepoint_width() {
@@ -332,9 +351,7 @@ mod tests {

     #[test]
     fn test_ngram_tokenizer_1_2_false() {
-        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
-            .token_stream("hello")
-            .collect();
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
         assert_eq!(tokens.len(), 9);
         assert_token(&tokens[0], 0, "h", 0, 1);
         assert_token(&tokens[1], 0, "he", 0, 2);
@@ -349,9 +366,7 @@ mod tests {

     #[test]
     fn test_ngram_tokenizer_min_max_equal() {
-        let tokens: Vec<_> = NgramTokenizer::all_ngrams(3, 3)
-            .token_stream("hello")
-            .collect();
+        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
         assert_eq!(tokens.len(), 3);
         assert_token(&tokens[0], 0, "hel", 0, 3);
         assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -360,9 +375,7 @@ mod tests {

     #[test]
     fn test_ngram_tokenizer_2_5_prefix() {
-        let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
-            .token_stream("frankenstein")
-            .collect();
+        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
         assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "fr", 0, 2);
         assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -372,9 +385,7 @@ mod tests {

     #[test]
     fn test_ngram_non_ascii_1_2() {
-        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
-            .token_stream("hεllo")
-            .collect();
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
         assert_eq!(tokens.len(), 9);
         assert_token(&tokens[0], 0, "h", 0, 1);
         assert_token(&tokens[1], 0, "hε", 0, 3);
@@ -389,9 +400,7 @@ mod tests {

     #[test]
     fn test_ngram_non_ascii_2_5_prefix() {
-        let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
-            .token_stream("hεllo")
-            .collect();
+        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
         assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "hε", 0, 3);
         assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -401,16 +410,16 @@ mod tests {

     #[test]
     fn test_ngram_empty() {
-        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 5).token_stream("").collect();
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
         assert!(tokens.is_empty());
-        let tokens: Vec<_> = NgramTokenizer::all_ngrams(2, 5).token_stream("").collect();
+        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
         assert!(tokens.is_empty());
     }

     #[test]
     #[should_panic(expected = "min_gram must be greater than 0")]
     fn test_ngram_min_max_interval_empty() {
-        NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss");
+        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
     }

     #[test]
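Borrowing `text` as `&'a str` works because `CodepointFrontiers` only ever re-slices the same input; the emitted windows are unchanged. For `all_ngrams(1, 2)` on an ASCII string, the nine byte ranges the test expects can be reproduced standalone (ASCII only; the real iterator is codepoint-aware):

/// Enumerate (from, to) byte ranges of all n-grams of an ASCII string.
fn ngram_ranges(text: &str, min: usize, max: usize) -> Vec<(usize, usize)> {
    let mut ranges = vec![];
    for from in 0..text.len() {
        for len in min..=max {
            if from + len <= text.len() {
                ranges.push((from, from + len));
            }
        }
    }
    ranges
}

fn main() {
    let ranges = ngram_ranges("hello", 1, 2);
    assert_eq!(ranges.len(), 9);
    assert_eq!(ranges[0], (0, 1)); // "h"
    assert_eq!(ranges[1], (0, 2)); // "he"
}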
@@ -1,17 +1,17 @@
-use super::{Token, Tokenizer};
+use super::{Token, TokenStream, Tokenizer};
+use crate::tokenizer::BoxTokenStream;

 /// For each value of the field, emit a single unprocessed token.
-#[derive(Clone, Debug, Default)]
+#[derive(Clone)]
 pub struct RawTokenizer;

-#[derive(Clone, Debug)]
 pub struct RawTokenStream {
-    token: Option<Token>,
+    token: Token,
+    has_token: bool,
 }

 impl Tokenizer for RawTokenizer {
-    type Iter = RawTokenStream;
-    fn token_stream(&self, text: &str) -> Self::Iter {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
         let token = Token {
             offset_from: 0,
             offset_to: text.len(),
@@ -19,13 +19,26 @@ impl Tokenizer for RawTokenizer {
             text: text.to_string(),
             position_length: 1,
         };
-        RawTokenStream { token: Some(token) }
+        RawTokenStream {
+            token,
+            has_token: true,
+        }
+        .into()
     }
 }

-impl Iterator for RawTokenStream {
-    type Item = Token;
-    fn next(&mut self) -> Option<Token> {
-        self.token.take()
+impl TokenStream for RawTokenStream {
+    fn advance(&mut self) -> bool {
+        let result = self.has_token;
+        self.has_token = false;
+        result
+    }
+
+    fn token(&self) -> &Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
+    }
 }
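`Option<Token>::take` is replaced by a `has_token` flag because the `TokenStream` trait must keep handing out `&Token` even after exhaustion; an `Option` would have nothing left to borrow once taken. The one-shot behavior in isolation:

struct OneShot {
    value: String,
    has_value: bool,
}

impl OneShot {
    /// Mirrors `RawTokenStream::advance`: returns true exactly once.
    fn advance(&mut self) -> bool {
        let result = self.has_value;
        self.has_value = false;
        result
    }
}

fn main() {
    let mut s = OneShot { value: "whole field".to_string(), has_value: true };
    assert!(s.advance());
    assert!(!s.advance());
    // The value is still accessible after exhaustion, unlike `Option::take`.
    assert_eq!(s.value, "whole field");
}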
@@ -2,8 +2,8 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = analyzer_builder(SimpleTokenizer)
-//!   .filter(RemoveLongFilter::limit(5)).build();
+//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+//!   .filter(RemoveLongFilter::limit(5));
 //!
 //! let mut stream = tokenizer.token_stream("toolong nice");
 //! // because `toolong` is more than 5 characters, it is filtered
@@ -12,30 +12,61 @@
 //! assert!(stream.next().is_none());
 //! ```
 //!
-use super::{Token, TokenFilter};
+use super::{Token, TokenFilter, TokenStream};
+use crate::tokenizer::BoxTokenStream;

 /// `RemoveLongFilter` removes tokens that are longer
 /// than a given number of bytes (in UTF-8 representation).
 ///
 /// It is especially useful when indexing unconstrained content.
 /// e.g. Mail containing base-64 encoded pictures etc.
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub struct RemoveLongFilter {
-    limit: usize,
+    length_limit: usize,
 }

 impl RemoveLongFilter {
     /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
-    pub fn limit(limit: usize) -> RemoveLongFilter {
-        RemoveLongFilter { limit }
+    pub fn limit(length_limit: usize) -> RemoveLongFilter {
+        RemoveLongFilter { length_limit }
+    }
+}
+
+impl<'a> RemoveLongFilterStream<'a> {
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.len() < self.token_length_limit
     }
 }

 impl TokenFilter for RemoveLongFilter {
-    fn transform(&mut self, token: Token) -> Option<Token> {
-        if token.text.len() >= self.limit {
-            return None;
-        }
-        Some(token)
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(RemoveLongFilterStream {
+            token_length_limit: self.length_limit,
+            tail: token_stream,
+        })
+    }
+}
+
+pub struct RemoveLongFilterStream<'a> {
+    token_length_limit: usize,
+    tail: BoxTokenStream<'a>,
+}
+
+impl<'a> TokenStream for RemoveLongFilterStream<'a> {
+    fn advance(&mut self) -> bool {
+        while self.tail.advance() {
+            if self.predicate(self.tail.token()) {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
     }
 }
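Renaming `limit` to `length_limit` and splitting the predicate out makes the threshold explicit: a token survives only if its UTF-8 byte length is strictly below the limit. A usage sketch matching the module's doc example, assuming this branch's API:

use tantivy::tokenizer::{RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    // The limit is expressed in bytes of the UTF-8 representation.
    let tokenizer = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(5));
    let mut stream = tokenizer.token_stream("toolong nice");
    let mut kept = vec![];
    while stream.advance() {
        kept.push(stream.token().text.clone());
    }
    assert_eq!(kept, vec!["nice"]); // "toolong" is 7 bytes, so it is dropped
}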
@@ -1,74 +1,59 @@
-use super::{Token, Tokenizer};
+use super::BoxTokenStream;
+use super::{Token, TokenStream, Tokenizer};
+use std::str::CharIndices;

 /// Tokenize the text by splitting on whitespaces and punctuation.
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub struct SimpleTokenizer;

+pub struct SimpleTokenStream<'a> {
+    text: &'a str,
+    chars: CharIndices<'a>,
+    token: Token,
+}
+
 impl Tokenizer for SimpleTokenizer {
-    type Iter = SimpleTokenizerStream;
-    fn token_stream(&self, text: &str) -> Self::Iter {
-        let vec: Vec<_> = text.char_indices().collect();
-        SimpleTokenizerStream {
-            text: text.to_string(),
-            chars: vec.into_iter(),
-            position: usize::max_value(),
-        }
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(SimpleTokenStream {
+            text,
+            chars: text.char_indices(),
+            token: Token::default(),
+        })
     }
 }

-#[derive(Clone, Debug)]
-pub struct SimpleTokenizerStream {
-    text: String,
-    chars: std::vec::IntoIter<(usize, char)>,
-    position: usize,
-}
-
-impl SimpleTokenizerStream {
+impl<'a> SimpleTokenStream<'a> {
     // search for the end of the current token.
     fn search_token_end(&mut self) -> usize {
         (&mut self.chars)
-            .filter(|&(_, c)| !c.is_alphanumeric())
+            .filter(|&(_, ref c)| !c.is_alphanumeric())
             .map(|(offset, _)| offset)
             .next()
             .unwrap_or_else(|| self.text.len())
     }
 }

-impl Iterator for SimpleTokenizerStream {
-    type Item = Token;
-    fn next(&mut self) -> Option<Self::Item> {
-        self.position = self.position.wrapping_add(1);
+impl<'a> TokenStream for SimpleTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        self.token.text.clear();
+        self.token.position = self.token.position.wrapping_add(1);
         while let Some((offset_from, c)) = self.chars.next() {
             if c.is_alphanumeric() {
                 let offset_to = self.search_token_end();
-                let token = Token {
-                    text: self.text[offset_from..offset_to].into(),
-                    offset_from,
-                    offset_to,
-                    position: self.position,
-                    ..Default::default()
-                };
-                return Some(token);
+                self.token.offset_from = offset_from;
+                self.token.offset_to = offset_to;
+                self.token.text.push_str(&self.text[offset_from..offset_to]);
+                return true;
             }
         }
-        None
+        false
     }
-}

-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_empty() {
-        let mut empty = SimpleTokenizer.token_stream("");
-        assert_eq!(empty.next(), None);
-    }
+    fn token(&self) -> &Token {
+        &self.token
+    }

-    #[test]
-    fn simple_tokenizer() {
-        let mut simple = SimpleTokenizer.token_stream("tokenizer hello world");
-        assert_eq!(simple.next().unwrap().text, "tokenizer");
-        assert_eq!(simple.next().unwrap().text, "hello");
-        assert_eq!(simple.next().unwrap().text, "world");
-    }
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
+    }
 }
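Switching from a collected `Vec<(usize, char)>` to the borrowed `CharIndices` iterator removes an allocation per `token_stream` call; `search_token_end` then consumes the shared iterator up to the first non-alphanumeric character. The scanning logic on its own:

use std::str::CharIndices;

fn next_word_bounds(text: &str, chars: &mut CharIndices<'_>) -> Option<(usize, usize)> {
    while let Some((offset_from, c)) = chars.next() {
        if c.is_alphanumeric() {
            // Consume up to the first non-alphanumeric char (or end of text).
            let offset_to = chars
                .by_ref()
                .find(|&(_, c)| !c.is_alphanumeric())
                .map(|(offset, _)| offset)
                .unwrap_or_else(|| text.len());
            return Some((offset_from, offset_to));
        }
    }
    None
}

fn main() {
    let text = "tokenizer hello world";
    let mut chars = text.char_indices();
    let mut words = vec![];
    while let Some((from, to)) = next_word_bounds(text, &mut chars) {
        words.push(&text[from..to]);
    }
    assert_eq!(words, vec!["tokenizer", "hello", "world"]);
}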
@@ -1,6 +1,5 @@
-use std::sync::Arc;
-
-use super::{Token, TokenFilter};
+use super::{Token, TokenFilter, TokenStream};
+use crate::tokenizer::BoxTokenStream;
 use rust_stemmers::{self, Algorithm};
 use serde::{Deserialize, Serialize};

@@ -59,15 +58,14 @@ impl Language {
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
-    stemmer: Arc<rust_stemmers::Stemmer>,
+    stemmer_algorithm: Algorithm,
 }

 impl Stemmer {
     /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
     pub fn new(language: Language) -> Stemmer {
-        let stemmer = rust_stemmers::Stemmer::create(language.algorithm());
         Stemmer {
-            stemmer: Arc::new(stemmer),
+            stemmer_algorithm: language.algorithm(),
         }
     }
 }
@@ -80,12 +78,37 @@ impl Default for Stemmer {
 }

 impl TokenFilter for Stemmer {
-    fn transform(&mut self, mut token: Token) -> Option<Token> {
-        // TODO remove allocation
-        let stemmed_str: String = self.stemmer.stem(&token.text).into_owned();
-        // TODO remove clear
-        token.text.clear();
-        token.text.push_str(&stemmed_str);
-        Some(token)
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
+        BoxTokenStream::from(StemmerTokenStream {
+            tail: token_stream,
+            stemmer: inner_stemmer,
+        })
+    }
+}
+
+pub struct StemmerTokenStream<'a> {
+    tail: BoxTokenStream<'a>,
+    stemmer: rust_stemmers::Stemmer,
+}
+
+impl<'a> TokenStream for StemmerTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        if !self.tail.advance() {
+            return false;
+        }
+        // TODO remove allocation
+        let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
+        self.token_mut().text.clear();
+        self.token_mut().text.push_str(&stemmed_str);
+        true
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
     }
 }
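Storing the `Copy` `Algorithm` instead of an `Arc<rust_stemmers::Stemmer>` keeps `Stemmer: Clone` trivial, at the cost of re-creating the inner stemmer once per token stream in `transform`. For reference, the underlying crate call the stream makes (a direct `rust_stemmers` sketch):

use rust_stemmers::{Algorithm, Stemmer};

fn main() {
    // One stemmer per stream, as `transform` now does.
    let stemmer = Stemmer::create(Algorithm::English);
    // `stem` returns a Cow<str>; the token stream copies it back into the token.
    assert_eq!(stemmer.stem("payers"), "payer");
    assert_eq!(stemmer.stem("happy"), "happi");
}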
@@ -2,15 +2,16 @@
|
|||||||
//! ```rust
|
//! ```rust
|
||||||
//! use tantivy::tokenizer::*;
|
//! use tantivy::tokenizer::*;
|
||||||
//!
|
//!
|
||||||
//! let tokenizer = analyzer_builder(SimpleTokenizer)
|
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||||
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])).build();
|
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
|
||||||
//!
|
//!
|
||||||
//! let mut stream = tokenizer.token_stream("the fox is crafty");
|
//! let mut stream = tokenizer.token_stream("the fox is crafty");
|
||||||
//! assert_eq!(stream.next().unwrap().text, "fox");
|
//! assert_eq!(stream.next().unwrap().text, "fox");
|
||||||
//! assert_eq!(stream.next().unwrap().text, "crafty");
|
//! assert_eq!(stream.next().unwrap().text, "crafty");
|
||||||
//! assert!(stream.next().is_none());
|
//! assert!(stream.next().is_none());
|
||||||
//! ```
|
//! ```
|
||||||
use super::{Token, TokenFilter};
|
use super::{Token, TokenFilter, TokenStream};
|
||||||
|
use crate::tokenizer::BoxTokenStream;
|
||||||
use fnv::FnvHasher;
|
use fnv::FnvHasher;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::hash::BuildHasherDefault;
|
use std::hash::BuildHasherDefault;
|
||||||
@@ -48,12 +49,42 @@ impl StopWordFilter {
     }
 }
 
+pub struct StopWordFilterStream<'a> {
+    words: StopWordHashSet,
+    tail: BoxTokenStream<'a>,
+}
+
 impl TokenFilter for StopWordFilter {
-    fn transform(&mut self, token: Token) -> Option<Token> {
-        if self.words.contains(&token.text) {
-            return None;
-        }
-        Some(token)
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(StopWordFilterStream {
+            words: self.words.clone(),
+            tail: token_stream,
+        })
+    }
+}
+
+impl<'a> StopWordFilterStream<'a> {
+    fn predicate(&self, token: &Token) -> bool {
+        !self.words.contains(&token.text)
+    }
+}
+
+impl<'a> TokenStream for StopWordFilterStream<'a> {
+    fn advance(&mut self) -> bool {
+        while self.tail.advance() {
+            if self.predicate(self.tail.token()) {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
     }
 }
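The stop word filter above shows the general pattern a `TokenFilter` follows on the new side of this diff: `transform` wraps the incoming `BoxTokenStream` in a new stream that filters or rewrites tokens inside `advance()`. As a hedged sketch of that pattern, here is a hypothetical length-based filter (`RemoveShortFilter` is not part of this diff; it only mirrors the structure of `StopWordFilterStream`), assuming these traits are re-exported from `tantivy::tokenizer`:

```rust
use tantivy::tokenizer::*;

// Hypothetical filter dropping tokens shorter than `min_len`.
#[derive(Clone)]
struct RemoveShortFilter {
    min_len: usize,
}

struct RemoveShortFilterStream<'a> {
    min_len: usize,
    tail: BoxTokenStream<'a>,
}

impl TokenFilter for RemoveShortFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(RemoveShortFilterStream {
            min_len: self.min_len,
            tail: token_stream,
        })
    }
}

impl<'a> TokenStream for RemoveShortFilterStream<'a> {
    fn advance(&mut self) -> bool {
        // Skip tokens until one passes the length predicate, like StopWordFilterStream does.
        while self.tail.advance() {
            if self.tail.token().text.len() >= self.min_len {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

fn main() {
    let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(RemoveShortFilter { min_len: 4 });
    let mut stream = analyzer.token_stream("a tiny example sentence");
    while let Some(token) = stream.next() {
        // Keeps "tiny", "example", "sentence"; drops "a".
        println!("{}", token.text);
    }
}
```

Because the blanket `TokenFilterClone` impl covers any `Clone` filter, deriving `Clone` is enough for such a filter to be usable in a `TextAnalyzer` chain.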
@@ -1,121 +1,95 @@
-use crate::tokenizer::Token;
+use crate::tokenizer::{BoxTokenStream, Token, TokenStream};
+use std::ops::DerefMut;
 
 const POSITION_GAP: usize = 2;
 
-pub(crate) struct TokenStreamChain<Inner, Outer> {
-    streams_with_offsets: Outer,
-    current: Option<(Inner, usize)>,
-    position: usize,
+pub(crate) struct TokenStreamChain<'a> {
+    offsets: Vec<usize>,
+    token_streams: Vec<BoxTokenStream<'a>>,
     position_shift: usize,
+    stream_idx: usize,
+    token: Token,
 }
 
-impl<'a, Inner, Outer> TokenStreamChain<Inner, Outer>
-where
-    Inner: Iterator<Item = Token>,
-    Outer: Iterator<Item = (Inner, usize)>,
-{
-    pub fn new(mut streams_with_offsets: Outer) -> TokenStreamChain<Inner, Outer> {
-        let current = streams_with_offsets.next();
+impl<'a> TokenStreamChain<'a> {
+    pub fn new(
+        offsets: Vec<usize>,
+        token_streams: Vec<BoxTokenStream<'a>>,
+    ) -> TokenStreamChain<'a> {
         TokenStreamChain {
-            streams_with_offsets: streams_with_offsets,
-            current,
-            position: usize::max_value(),
+            offsets,
+            stream_idx: 0,
+            token_streams,
             position_shift: 0,
+            token: Token::default(),
         }
     }
 }
 
-impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
-where
-    Inner: Iterator<Item = Token>,
-    Outer: Iterator<Item = (Inner, usize)>,
-{
-    type Item = Token;
-    fn next(&mut self) -> Option<Token> {
-        while let Some((ref mut token_stream, offset_offset)) = self.current {
-            if let Some(mut token) = token_stream.next() {
-                token.offset_from += offset_offset;
-                token.offset_to += offset_offset;
-                token.position += self.position_shift;
-                self.position = token.position;
-                return Some(token);
+impl<'a> TokenStream for TokenStreamChain<'a> {
+    fn advance(&mut self) -> bool {
+        while self.stream_idx < self.token_streams.len() {
+            let token_stream = self.token_streams[self.stream_idx].deref_mut();
+            if token_stream.advance() {
+                let token = token_stream.token();
+                let offset_offset = self.offsets[self.stream_idx];
+                self.token.offset_from = token.offset_from + offset_offset;
+                self.token.offset_to = token.offset_to + offset_offset;
+                self.token.position = token.position + self.position_shift;
+                self.token.text.clear();
+                self.token.text.push_str(token.text.as_str());
+                return true;
+            } else {
+                self.stream_idx += 1;
+                self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
             }
-            self.position_shift = self.position.wrapping_add(POSITION_GAP);
-            self.current = self.streams_with_offsets.next();
         }
-        None
+        false
     }
-}
 
-impl DynTokenStreamChain {
-    pub fn from_vec(
-        streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
-    ) -> impl Iterator<Item = Token> {
-        DynTokenStreamChain {
-            streams_with_offsets,
-            idx: 0,
-            position: usize::max_value(),
-            position_shift: 0,
-        }
+    fn token(&self) -> &Token {
+        assert!(
+            self.stream_idx <= self.token_streams.len(),
+            "You called .token(), after the end of the token stream has been reached"
+        );
+        &self.token
     }
-}
 
-pub(crate) struct DynTokenStreamChain {
-    streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
-    idx: usize,
-    position: usize,
-    position_shift: usize,
-}
-
-impl Iterator for DynTokenStreamChain {
-    type Item = Token;
-    fn next(&mut self) -> Option<Token> {
-        while let Some((token_stream, offset_offset)) = self.streams_with_offsets.get_mut(self.idx)
-        {
-            if let Some(mut token) = token_stream.next() {
-                token.offset_from += *offset_offset;
-                token.offset_to += *offset_offset;
-                token.position += self.position_shift;
-                self.position = token.position;
-                return Some(token);
-            }
-            self.idx += 1;
-            self.position_shift = self.position.wrapping_add(POSITION_GAP);
-        }
-        None
+    fn token_mut(&mut self) -> &mut Token {
+        assert!(
+            self.stream_idx <= self.token_streams.len(),
+            "You called .token(), after the end of the token stream has been reached"
+        );
+        &mut self.token
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use super::super::tokenizer::Tokenizer;
-    use super::super::SimpleTokenizer;
-    use super::*;
+    use super::super::{SimpleTokenizer, TokenStream, Tokenizer};
+    use super::TokenStreamChain;
+    use super::POSITION_GAP;
 
     #[test]
     fn test_chain_first_emits_no_tokens() {
         let token_streams = vec![
-            (SimpleTokenizer.token_stream(""), 0),
-            (SimpleTokenizer.token_stream("hello world"), 0),
+            SimpleTokenizer.token_stream(""),
+            SimpleTokenizer.token_stream("hello world"),
         ];
-        let mut token_chain = TokenStreamChain::new(token_streams.into_iter());
-        let token = token_chain.next();
+        let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);
 
-        let expect = Token {
-            offset_from: 0,
-            offset_to: 5,
-            position: POSITION_GAP - 1,
-            text: "hello".into(),
-            ..Token::default()
-        };
-        assert_eq!(token.unwrap(), expect);
+        assert!(token_chain.advance());
+        assert_eq!(token_chain.token().text, "hello");
+        assert_eq!(token_chain.token().offset_from, 0);
+        assert_eq!(token_chain.token().offset_to, 5);
+        assert_eq!(token_chain.token().position, POSITION_GAP - 1);
 
-        let token = token_chain.next().unwrap();
-        assert_eq!(token.text, "world");
-        assert_eq!(token.offset_from, 6);
-        assert_eq!(token.offset_to, 11);
-        assert_eq!(token.position, POSITION_GAP);
+        assert!(token_chain.advance());
+        assert_eq!(token_chain.token().text, "world");
+        assert_eq!(token_chain.token().offset_from, 6);
+        assert_eq!(token_chain.token().offset_to, 11);
+        assert_eq!(token_chain.token().position, POSITION_GAP);
 
-        assert!(token_chain.next().is_none());
+        assert!(!token_chain.advance());
     }
 }
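`TokenStreamChain` itself stays crate-private, but its behaviour is observable through `TextAnalyzer::token_stream_texts`, defined later in this diff: token positions from consecutive texts are separated by the `POSITION_GAP` of 2. A small sketch, assuming the new-side API is re-exported from `tantivy::tokenizer`:

```rust
use tantivy::tokenizer::*;

fn main() {
    let analyzer = TextAnalyzer::from(SimpleTokenizer);
    // Chain the token streams of two field values.
    let mut stream = analyzer.token_stream_texts(&["happy tax", "payer"]);

    let mut positions = Vec::new();
    while stream.advance() {
        positions.push(stream.token().position);
    }
    // "happy" -> 0, "tax" -> 1, then a gap of 2 before "payer" -> 3.
    assert_eq!(positions, vec![0, 1, 3]);
}
```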
@@ -1,4 +1,4 @@
-use crate::tokenizer::{Token, TokenStreamChain};
+use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
 use serde::{Deserialize, Serialize};
 use std::cmp::Ordering;
 
@@ -26,14 +26,14 @@ impl PartialOrd for PreTokenizedString {
 /// TokenStream implementation which wraps PreTokenizedString
 pub struct PreTokenizedStream {
     tokenized_string: PreTokenizedString,
-    current_token: usize,
+    current_token: i64,
 }
 
 impl From<PreTokenizedString> for PreTokenizedStream {
     fn from(s: PreTokenizedString) -> PreTokenizedStream {
         PreTokenizedStream {
             tokenized_string: s,
-            current_token: 0,
+            current_token: -1,
         }
     }
 }
@@ -41,28 +41,49 @@ impl From<PreTokenizedString> for PreTokenizedStream {
 impl PreTokenizedStream {
     /// Creates a TokenStream from PreTokenizedString array
     pub fn chain_tokenized_strings<'a>(
-        tok_strings: &'a [&PreTokenizedString],
-    ) -> impl Iterator<Item = Token> + 'a {
-        let streams_with_offsets = tok_strings.iter().scan(0, |total_offset, tok_string| {
-            let next = Some((
-                PreTokenizedStream::from((*tok_string).to_owned()),
-                *total_offset,
-            ));
-            if let Some(last_token) = tok_string.tokens.last() {
-                *total_offset += last_token.offset_to;
+        tok_strings: &'a [&'a PreTokenizedString],
+    ) -> BoxTokenStream {
+        if tok_strings.len() == 1 {
+            PreTokenizedStream::from((*tok_strings[0]).clone()).into()
+        } else {
+            let mut offsets = vec![];
+            let mut total_offset = 0;
+            for &tok_string in tok_strings {
+                offsets.push(total_offset);
+                if let Some(last_token) = tok_string.tokens.last() {
+                    total_offset += last_token.offset_to;
+                }
             }
-            next
-        });
-        TokenStreamChain::new(streams_with_offsets)
+            // TODO remove the string cloning.
+            let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
+                .iter()
+                .map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
+                .collect();
+            TokenStreamChain::new(offsets, token_streams).into()
+        }
     }
 }
 
-impl Iterator for PreTokenizedStream {
-    type Item = Token;
-    fn next(&mut self) -> Option<Token> {
-        let token = self.tokenized_string.tokens.get(self.current_token)?;
+impl TokenStream for PreTokenizedStream {
+    fn advance(&mut self) -> bool {
         self.current_token += 1;
-        Some(token.clone())
+        self.current_token < self.tokenized_string.tokens.len() as i64
+    }
+
+    fn token(&self) -> &Token {
+        assert!(
+            self.current_token >= 0,
+            "TokenStream not initialized. You should call advance() at least once."
+        );
+        &self.tokenized_string.tokens[self.current_token as usize]
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        assert!(
+            self.current_token >= 0,
+            "TokenStream not initialized. You should call advance() at least once."
+        );
+        &mut self.tokenized_string.tokens[self.current_token as usize]
     }
 }
@@ -98,9 +119,10 @@ mod tests {
         let mut token_stream = PreTokenizedStream::from(tok_text.clone());
 
         for expected_token in tok_text.tokens {
-            assert_eq!(token_stream.next().unwrap(), expected_token);
+            assert!(token_stream.advance());
+            assert_eq!(token_stream.token(), &expected_token);
         }
-        assert!(token_stream.next().is_none());
+        assert!(!token_stream.advance());
     }
 
     #[test]
@@ -161,8 +183,9 @@ mod tests {
         ];
 
         for expected_token in expected_tokens {
-            assert_eq!(token_stream.next().unwrap(), expected_token);
+            assert!(token_stream.advance());
+            assert_eq!(token_stream.token(), &expected_token);
        }
-        assert!(token_stream.next().is_none());
+        assert!(!token_stream.advance());
     }
 }
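For reference, a short sketch of how a pre-tokenized field would be consumed with the reworked `advance()`/`token()` API. It assumes `PreTokenizedString`, `PreTokenizedStream` and the `Token` fields shown in this diff are re-exported from `tantivy::tokenizer`:

```rust
use tantivy::tokenizer::*;

fn main() {
    let tok_text = PreTokenizedString {
        text: "hello world".to_string(),
        tokens: vec![
            Token { offset_from: 0, offset_to: 5, position: 0, text: "hello".to_string(), ..Token::default() },
            Token { offset_from: 6, offset_to: 11, position: 1, text: "world".to_string(), ..Token::default() },
        ],
    };

    let mut stream = PreTokenizedStream::from(tok_text);
    while stream.advance() {
        let token = stream.token();
        // Prints "hello @ 0..5" then "world @ 6..11".
        println!("{} @ {}..{}", token.text, token.offset_from, token.offset_to);
    }
}
```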
@@ -2,23 +2,8 @@ use crate::tokenizer::TokenStreamChain;
 use serde::{Deserialize, Serialize};
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-
-pub trait TextAnalyzerClone {
-    fn box_clone(&self) -> Box<dyn TextAnalyzerT>;
-}
-
-/// 'Top-level' trait hiding concrete types, below which static dispatch occurs.
-pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
-    /// 'Top-level' dynamic dispatch function hiding concrete types of the staticly
-    /// dispatched `token_stream` from the `Tokenizer` trait.
-    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>>;
-}
-
-impl Clone for Box<dyn TextAnalyzerT> {
-    fn clone(&self) -> Self {
-        (**self).box_clone()
-    }
-}
+use std::borrow::{Borrow, BorrowMut};
+use std::ops::{Deref, DerefMut};
 
 /// Token
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
@@ -50,116 +35,35 @@ impl Default for Token {
     }
 }
 
-/// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + Clone {
-    /// Take a `Token` and transform it or return `None` if it's to be removed
-    /// from the output stream.
-    fn transform(&mut self, token: Token) -> Option<Token>;
-}
-
-/// `Tokenizer` are in charge of splitting text into a stream of token
-/// before indexing.
-///
-/// See the [module documentation](./index.html) for more detail.
-pub trait Tokenizer: 'static + Send + Sync + Clone {
-    /// An iteratable type is returned.
-    type Iter: Iterator<Item = Token>;
-    /// Creates a token stream for a given `str`.
-    fn token_stream(&self, text: &str) -> Self::Iter;
-    /// Tokenize an array`&str`
-    ///
-    /// The resulting `Token` stream is equivalent to what would be obtained if the &str were
-    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
-    /// to prevent accidental `PhraseQuery` to match accross two terms.
-    fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn Iterator<Item = Token> + 'a> {
-        let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
-            let temp = *total_offset;
-            *total_offset += text.len();
-            Some((self.token_stream(text), temp))
-        });
-        Box::new(TokenStreamChain::new(streams_with_offsets))
-    }
-}
-
-/// `TextAnalyzer` wraps the tokenization of an input text and its modification by any filters applied onto it.
+/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
-#[derive(Clone, Debug, Default)]
-pub struct TextAnalyzer<T>(T);
+pub struct TextAnalyzer {
+    tokenizer: Box<dyn Tokenizer>,
+    token_filters: Vec<BoxTokenFilter>,
+}
 
-impl<T: Tokenizer> From<T> for TextAnalyzer<T> {
-    fn from(src: T) -> TextAnalyzer<T> {
-        TextAnalyzer(src)
+impl<T: Tokenizer> From<T> for TextAnalyzer {
+    fn from(tokenizer: T) -> Self {
+        TextAnalyzer::new(tokenizer, Vec::new())
     }
 }
 
-impl<T: Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
-    fn box_clone(&self) -> Box<dyn TextAnalyzerT> {
-        Box::new(TextAnalyzer(self.0.clone()))
-    }
-}
-
-impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
-    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>> {
-        Box::new(self.0.token_stream(text))
-    }
-}
-
-/// Identity `TokenFilter`
-#[derive(Clone, Debug, Default)]
-pub struct Identity;
-
-impl TokenFilter for Identity {
-    fn transform(&mut self, token: Token) -> Option<Token> {
-        Some(token)
-    }
-}
-
-/// `Filter` is a wrapper around a `Token` stream and a `TokenFilter` which modifies it.
-#[derive(Clone, Default, Debug)]
-pub struct Filter<I, F> {
-    iter: I,
-    f: F,
-}
-
-impl<I, F> Iterator for Filter<I, F>
-where
-    I: Iterator<Item = Token>,
-    F: TokenFilter,
-{
-    type Item = Token;
-    fn next(&mut self) -> Option<Token> {
-        while let Some(token) = self.iter.next() {
-            if let Some(tok) = self.f.transform(token) {
-                return Some(tok);
-            }
-        }
-        None
-    }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct AnalyzerBuilder<T, F> {
-    tokenizer: T,
-    f: F,
-}
-
-/// Construct an `AnalyzerBuilder` on which to apply `TokenFilter`.
-pub fn analyzer_builder<T: Tokenizer>(tokenizer: T) -> AnalyzerBuilder<T, Identity> {
-    AnalyzerBuilder {
-        tokenizer,
-        f: Identity,
-    }
-}
-
-impl<T, F> AnalyzerBuilder<T, F>
-where
-    T: Tokenizer,
-    F: TokenFilter,
-{
+impl TextAnalyzer {
+    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
+    ///
+    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
+    /// `TextAnalyzer::from(tokenizer)`.
+    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
+        TextAnalyzer {
+            tokenizer: Box::new(tokenizer),
+            token_filters,
+        }
+    }
+
     /// Appends a token filter to the current tokenizer.
     ///
-    /// The method consumes the current `Token` and returns a
+    /// The method consumes the current `TokenStream` and returns a
     /// new one.
     ///
     /// # Example
@@ -167,35 +71,248 @@ where
     /// ```rust
     /// use tantivy::tokenizer::*;
     ///
-    /// let en_stem = analyzer_builder(SimpleTokenizer)
+    /// let en_stem = TextAnalyzer::from(SimpleTokenizer)
     ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser::new())
-    ///     .filter(Stemmer::default()).build();
+    ///     .filter(LowerCaser)
+    ///     .filter(Stemmer::default());
     /// ```
     ///
-    pub fn filter<G: TokenFilter>(self, f: G) -> AnalyzerBuilder<AnalyzerBuilder<T, F>, G> {
-        AnalyzerBuilder { tokenizer: self, f }
+    pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
+        self.token_filters.push(token_filter.into());
+        self
     }
-    /// Finalize the build process.
-    pub fn build(self) -> TextAnalyzer<AnalyzerBuilder<T, F>> {
-        TextAnalyzer(self)
+
+    /// Tokenize an array`&str`
+    ///
+    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
+    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
+    /// to prevent accidental `PhraseQuery` to match accross two terms.
+    pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> {
+        assert!(!texts.is_empty());
+        if texts.len() == 1 {
+            self.token_stream(texts[0])
+        } else {
+            let mut offsets = vec![];
+            let mut total_offset = 0;
+            for &text in texts {
+                offsets.push(total_offset);
+                total_offset += text.len();
+            }
+            let token_streams: Vec<BoxTokenStream<'a>> = texts
+                .iter()
+                .cloned()
+                .map(|text| self.token_stream(text))
+                .collect();
+            From::from(TokenStreamChain::new(offsets, token_streams))
+        }
+    }
+
+    /// Creates a token stream for a given `str`.
+    pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        let mut token_stream = self.tokenizer.token_stream(text);
+        for token_filter in &self.token_filters {
+            token_stream = token_filter.transform(token_stream);
+        }
+        token_stream
     }
 }
 
-impl<T: Tokenizer, F: TokenFilter> Tokenizer for AnalyzerBuilder<T, F> {
-    type Iter = Filter<T::Iter, F>;
-    fn token_stream(&self, text: &str) -> Self::Iter {
-        Filter {
-            iter: self.tokenizer.token_stream(text),
-            f: self.f.clone(),
+impl Clone for TextAnalyzer {
+    fn clone(&self) -> Self {
+        TextAnalyzer {
+            tokenizer: self.tokenizer.box_clone(),
+            token_filters: self
+                .token_filters
+                .iter()
+                .map(|token_filter| token_filter.box_clone())
+                .collect(),
         }
     }
 }
 
+/// `Tokenizer` are in charge of splitting text into a stream of token
+/// before indexing.
+///
+/// See the [module documentation](./index.html) for more detail.
+///
+/// # Warning
+///
+/// This API may change to use associated types.
+pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+    /// Creates a token stream for a given `str`.
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+}
+
+pub trait TokenizerClone {
+    fn box_clone(&self) -> Box<dyn Tokenizer>;
+}
+
+impl<T: Tokenizer + Clone> TokenizerClone for T {
+    fn box_clone(&self) -> Box<dyn Tokenizer> {
+        Box::new(self.clone())
+    }
+}
+
+impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
+    fn advance(&mut self) -> bool {
+        let token_stream: &mut dyn TokenStream = self.borrow_mut();
+        token_stream.advance()
+    }
+
+    fn token<'b>(&'b self) -> &'b Token {
+        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
+        token_stream.token()
+    }
+
+    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
+        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
+        token_stream.token_mut()
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
+///
+/// See `TokenStream` for more information.
+pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
+
+impl<'a, T> From<T> for BoxTokenStream<'a>
+where
+    T: TokenStream + 'a,
+{
+    fn from(token_stream: T) -> BoxTokenStream<'a> {
+        BoxTokenStream(Box::new(token_stream))
+    }
+}
+
+impl<'a> Deref for BoxTokenStream<'a> {
+    type Target = dyn TokenStream + 'a;
+
+    fn deref(&self) -> &Self::Target {
+        &*self.0
+    }
+}
+impl<'a> DerefMut for BoxTokenStream<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut *self.0
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
+///
+/// See `TokenStream` for more information.
+pub struct BoxTokenFilter(Box<dyn TokenFilter>);
+
+impl Deref for BoxTokenFilter {
+    type Target = dyn TokenFilter;
+
+    fn deref(&self) -> &dyn TokenFilter {
+        &*self.0
+    }
+}
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
+
+/// `TokenStream` is the result of the tokenization.
+///
+/// It consists consumable stream of `Token`s.
+///
+/// # Example
+///
+/// ```
+/// use tantivy::tokenizer::*;
+///
+/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+///     .filter(RemoveLongFilter::limit(40))
+///     .filter(LowerCaser);
+/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
+/// {
+///     let token = token_stream.next().unwrap();
+///     assert_eq!(&token.text, "hello");
+///     assert_eq!(token.offset_from, 0);
+///     assert_eq!(token.offset_to, 5);
+///     assert_eq!(token.position, 0);
+/// }
+/// {
+///     let token = token_stream.next().unwrap();
+///     assert_eq!(&token.text, "happy");
+///     assert_eq!(token.offset_from, 7);
+///     assert_eq!(token.offset_to, 12);
+///     assert_eq!(token.position, 1);
+/// }
+/// ```
+///
+pub trait TokenStream {
+    /// Advance to the next token
+    ///
+    /// Returns false if there are no other tokens.
+    fn advance(&mut self) -> bool;
+
+    /// Returns a reference to the current token.
+    fn token(&self) -> &Token;
+
+    /// Returns a mutable reference to the current token.
+    fn token_mut(&mut self) -> &mut Token;
+
+    /// Helper to iterate over tokens. It
+    /// simply combines a call to `.advance()`
+    /// and `.token()`.
+    ///
+    /// ```
+    /// use tantivy::tokenizer::*;
+    ///
+    /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+    ///     .filter(RemoveLongFilter::limit(40))
+    ///     .filter(LowerCaser);
+    /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
+    /// while let Some(token) = token_stream.next() {
+    ///     println!("Token {:?}", token.text);
+    /// }
+    /// ```
+    fn next(&mut self) -> Option<&Token> {
+        if self.advance() {
+            Some(self.token())
+        } else {
+            None
+        }
+    }
+
+    /// Helper function to consume the entire `TokenStream`
+    /// and push the tokens to a sink function.
+    ///
+    /// Remove this.
+    fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
+        let mut num_tokens_pushed = 0u32;
+        while self.advance() {
+            sink(self.token());
+            num_tokens_pushed += 1u32;
+        }
+        num_tokens_pushed
+    }
+}
+
+pub trait TokenFilterClone {
+    fn box_clone(&self) -> BoxTokenFilter;
+}
+
+/// Trait for the pluggable components of `Tokenizer`s.
+pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
+    /// Wraps a token stream and returns the modified one.
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
+}
+
+impl<T: TokenFilter + Clone> TokenFilterClone for T {
+    fn box_clone(&self) -> BoxTokenFilter {
+        BoxTokenFilter::from(self.clone())
+    }
+}
+
 #[cfg(test)]
 mod test {
-    use super::*;
-    use crate::tokenizer::SimpleTokenizer;
+    use super::Token;
 
     #[test]
     fn clone() {
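Beyond the doc-tests above, the new `TokenStream` trait also ships a `process` helper that drives `advance()`/`token()` and feeds every token into a sink closure. A minimal sketch, assuming the new-side API is re-exported from `tantivy::tokenizer`:

```rust
use tantivy::tokenizer::*;

fn main() {
    let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser);
    let mut stream = analyzer.token_stream("Hello, happy tax payer");

    // Collect the token texts through the sink.
    let mut texts = Vec::new();
    let count = stream.process(&mut |token| texts.push(token.text.clone()));

    assert_eq!(count, 4);
    assert_eq!(texts, vec!["hello", "happy", "tax", "payer"]);
}
```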
@@ -213,15 +330,4 @@ mod test {
         assert_eq!(t1.offset_to, t2.offset_to);
         assert_eq!(t1.text, t2.text);
     }
-
-    #[test]
-    fn text_analyzer() {
-        let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
-        dbg!(stream.next());
-        dbg!(stream.next());
-        dbg!(stream.next());
-        dbg!(stream.next());
-        dbg!(stream.next());
-        dbg!(stream.next());
-    }
 }
@@ -1,5 +1,5 @@
 use crate::tokenizer::stemmer::Language;
-use crate::tokenizer::tokenizer::{analyzer_builder, TextAnalyzer, TextAnalyzerT, Tokenizer};
+use crate::tokenizer::tokenizer::TextAnalyzer;
 use crate::tokenizer::LowerCaser;
 use crate::tokenizer::RawTokenizer;
 use crate::tokenizer::RemoveLongFilter;
@@ -22,23 +22,24 @@ use std::sync::{Arc, RwLock};
 /// search engine.
 #[derive(Clone)]
 pub struct TokenizerManager {
-    tokenizers: Arc<RwLock<HashMap<String, Box<dyn TextAnalyzerT>>>>,
+    tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
 }
 
 impl TokenizerManager {
     /// Registers a new tokenizer associated with a given name.
-    pub fn register<U: Tokenizer, T>(&self, tokenizer_name: &str, tokenizer: T)
+    pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
     where
-        T: Into<TextAnalyzer<U>>,
+        TextAnalyzer: From<T>,
     {
+        let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
         self.tokenizers
             .write()
             .expect("Acquiring the lock should never fail")
-            .insert(tokenizer_name.to_string(), Box::new(tokenizer.into()));
+            .insert(tokenizer_name.to_string(), boxed_tokenizer);
     }
 
     /// Accessing a tokenizer given its name.
-    pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn TextAnalyzerT>> {
+    pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
         self.tokenizers
             .read()
             .expect("Acquiring the lock should never fail")
@@ -53,25 +54,23 @@ impl Default for TokenizerManager {
     /// - simple
     /// - en_stem
     /// - ja
-    fn default() -> Self {
+    fn default() -> TokenizerManager {
         let manager = TokenizerManager {
             tokenizers: Arc::new(RwLock::new(HashMap::new())),
         };
         manager.register("raw", RawTokenizer);
         manager.register(
             "default",
-            analyzer_builder(SimpleTokenizer)
+            TextAnalyzer::from(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser::new())
-                .build(),
+                .filter(LowerCaser),
         );
         manager.register(
             "en_stem",
-            analyzer_builder(SimpleTokenizer)
+            TextAnalyzer::from(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
-                .filter(LowerCaser::new())
-                .filter(Stemmer::new(Language::English))
-                .build(),
+                .filter(LowerCaser)
+                .filter(Stemmer::new(Language::English)),
         );
         manager
     }
 }
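Finally, a sketch of how the manager would be used with the reworked types: registration now takes anything convertible into a `TextAnalyzer`, and `get` hands back a `TextAnalyzer` rather than a boxed trait object. The registered key "lowercase_only" is an arbitrary example name, not part of this diff.

```rust
use tantivy::tokenizer::*;

fn main() {
    let manager = TokenizerManager::default();
    manager.register(
        "lowercase_only",
        TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser),
    );

    let analyzer = manager.get("lowercase_only").expect("registered above");
    let mut stream = analyzer.token_stream("HELLO World");
    assert_eq!(stream.next().unwrap().text, "hello");
    assert_eq!(stream.next().unwrap().text, "world");
}
```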