Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-01 23:12:54 +00:00

Compare commits: 7 commits on branch debugging-...intermedia
| Author | SHA1 | Date |
|---|---|---|
| | 1a29e06bd2 | |
| | 8e39265dbe | |
| | bf7ac960b3 | |
| | 783df1b15c | |
| | 253d207103 | |
| | 03148e86c9 | |
| | a6a903d8a1 | |
@@ -47,18 +47,16 @@ murmurhash32 = "0.2"
chrono = "0.4"
smallvec = "1"
rayon = "1"
env_logger = "0.8"
lru = "0.6"

[target.'cfg(windows)'.dependencies]
winapi = "0.3"

[dev-dependencies]
rand = "0.8"
rand = "0.7"
maplit = "1"
matches = "0.1.8"
proptest = "0.10"
criterion = "0.3"

[dev-dependencies.fail]
version = "0.4"

@@ -99,7 +97,3 @@ travis-ci = { repository = "tantivy-search/tantivy" }
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["fail/failpoints"]

[[bench]]
name = "analyzer"
harness = false
benches/alice.txt: 3774 lines changed (file diff suppressed because it is too large).
@@ -1,22 +0,0 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;

const ALICE_TXT: &'static str = include_str!("alice.txt");

pub fn criterion_benchmark(c: &mut Criterion) {
    let tokenizer_manager = TokenizerManager::default();
    let tokenizer = tokenizer_manager.get("default").unwrap();
    c.bench_function("default-tokenize-alice", |b| {
        b.iter(|| {
            let mut word_count = 0;
            let mut token_stream = tokenizer.token_stream(ALICE_TXT);
            while token_stream.advance() {
                word_count += 1;
            }
            assert_eq!(word_count, 30_731);
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
@@ -35,18 +35,12 @@ fn load_metas(
    inventory: &SegmentMetaInventory,
) -> crate::Result<IndexMeta> {
    let meta_data = directory.atomic_read(&META_FILEPATH)?;
    let meta_string = String::from_utf8(meta_data)
        .map_err(|utf8_err| {
            DataCorruption::new(
                META_FILEPATH.to_path_buf(),
                format!("Meta file is not valid utf-8. {:?}", utf8_err)
            )
        })?;
    let meta_string = String::from_utf8_lossy(&meta_data);
    IndexMeta::deserialize(&meta_string, &inventory)
        .map_err(|e| {
            DataCorruption::new(
                META_FILEPATH.to_path_buf(),
                format!("Meta file cannot be deserialized. {:?}. content = {}", e, meta_string),
                format!("Meta file cannot be deserialized. {:?}.", e),
            )
        })
        .map_err(From::from)
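For context on the two variants kept side by side above: `String::from_utf8` rejects invalid UTF-8 and surfaces the error (here wrapped into `DataCorruption`), while `String::from_utf8_lossy` always succeeds and substitutes U+FFFD for invalid sequences. A minimal standalone sketch of the difference (plain std, not tantivy code):

```rust
fn main() {
    let valid = b"hello".to_vec();
    // 0xff can never appear in well-formed UTF-8.
    let invalid = vec![0x68, 0x65, 0xff, 0x6c, 0x6f];

    // Strict parsing: Err on invalid UTF-8, so the caller can report corruption.
    assert!(String::from_utf8(valid).is_ok());
    assert!(String::from_utf8(invalid.clone()).is_err());

    // Lossy conversion: always succeeds, replacing bad bytes with U+FFFD.
    assert_eq!(String::from_utf8_lossy(&invalid), "he\u{FFFD}lo");
}
```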
@@ -310,7 +310,7 @@ impl SegmentReader {
    }

    /// Returns an iterator that will iterate over the alive document ids
    pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
    pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
        (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
    }
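The two `doc_ids_alive` signatures above are equivalent; `'_` is the anonymous lifetime and simply elides the explicit `'a` tied to `&self`. A small sketch on a hypothetical type (not the tantivy `SegmentReader`):

```rust
struct Bits {
    max_doc: u32,
}

impl Bits {
    fn is_alive(&self, doc: u32) -> bool {
        doc % 2 == 0
    }

    // Explicitly named lifetime, as in the first signature.
    fn alive_explicit<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
        (0..self.max_doc).filter(move |&doc| self.is_alive(doc))
    }

    // Anonymous lifetime, as in the second signature; the compiler ties it to &self.
    fn alive_elided(&self) -> impl Iterator<Item = u32> + '_ {
        (0..self.max_doc).filter(move |&doc| self.is_alive(doc))
    }
}

fn main() {
    let bits = Bits { max_doc: 6 };
    assert!(bits.alive_explicit().eq(bits.alive_elided()));
}
```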
@@ -1,94 +1,45 @@
use crate::Index;
use crate::Searcher;
use crate::{doc, schema::*};
use rand::thread_rng;
use rand::Rng;
use std::collections::HashSet;

fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
use crate::schema::*;
use crate::Index;
use crate::Searcher;
use rand::Rng;

fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
    assert!(searcher.segment_readers().len() < 20);
    assert_eq!(searcher.num_docs() as usize, vals.len());
    for segment_reader in searcher.segment_readers() {
        let store_reader = segment_reader.get_store_reader()?;
        for doc_id in 0..segment_reader.max_doc() {
            let _doc = store_reader.get(doc_id)?;
        }
    }
    Ok(())
}

#[test]
#[ignore]
fn test_functional_store() -> crate::Result<()> {
    env_logger::init();
    let mut schema_builder = Schema::builder();

    let id_field = schema_builder.add_u64_field("id", INDEXED | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    let reader = index.reader()?;

    let mut rng = thread_rng();

    let mut index_writer = index.writer_with_num_threads(3, 12_000_000)?;

    let mut doc_set: Vec<u64> = Vec::new();

    let mut doc_id = 0u64;
    for iteration in 0.. {
        let num_docs: usize = rng.gen_range(0..4);
        if doc_set.len() >= 1 {
            let doc_to_remove_id = rng.gen_range(0..doc_set.len());
            let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
            index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
        }
        for _ in 0..num_docs {
            doc_set.push(doc_id);
            index_writer.add_document(doc!(id_field=>doc_id));
            doc_id += 1;
        }
        index_writer.commit()?;
        reader.reload()?;
        let searcher = reader.searcher();
        println!("#{} - {}", iteration, searcher.segment_readers().len());
        check_index_content(&searcher, &doc_set)?;
    }
    Ok(())
}

#[test]
#[ignore]
fn test_functional_indexing() -> crate::Result<()> {
fn test_indexing() {
    let mut schema_builder = Schema::builder();

    let id_field = schema_builder.add_u64_field("id", INDEXED);
    let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
    let schema = schema_builder.build();

    let index = Index::create_from_tempdir(schema)?;
    let reader = index.reader()?;
    let index = Index::create_from_tempdir(schema).unwrap();
    let reader = index.reader().unwrap();

    let mut rng = thread_rng();

    let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
    let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();

    let mut committed_docs: HashSet<u64> = HashSet::new();
    let mut uncommitted_docs: HashSet<u64> = HashSet::new();

    for _ in 0..200 {
        let random_val = rng.gen_range(0..20);
        let random_val = rng.gen_range(0, 20);
        if random_val == 0 {
            index_writer.commit()?;
            index_writer.commit().expect("Commit failed");
            committed_docs.extend(&uncommitted_docs);
            uncommitted_docs.clear();
            reader.reload()?;
            reader.reload().unwrap();
            let searcher = reader.searcher();
            // check that everything is correct.
            check_index_content(
                &searcher,
                &committed_docs.iter().cloned().collect::<Vec<u64>>(),
            )?;
            check_index_content(&searcher, &committed_docs);
        } else {
            if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
                let doc_id_term = Term::from_field_u64(id_field, random_val);
@@ -104,5 +55,4 @@ fn test_functional_indexing() -> crate::Result<()> {
            }
        }
    }
    Ok(())
}
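The paired `gen_range` lines above track the rand API change: rand 0.7 took two arguments, `gen_range(low, high)`, while rand 0.8 takes a single range, `gen_range(low..high)`. A minimal sketch assuming rand 0.8 as a dependency:

```rust
use rand::Rng;

fn main() {
    let mut rng = rand::thread_rng();
    // rand 0.8 style: one half-open range argument.
    let val: u64 = rng.gen_range(0..20);
    assert!(val < 20);
    // The rand 0.7 spelling of the same call was: rng.gen_range(0, 20)
}
```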
@@ -8,7 +8,7 @@ const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;

/// `LogMergePolicy` tries to merge segments that have a similar number of
/// `LogMergePolicy` tries tries to merge segments that have a similar number of
/// documents.
#[derive(Debug, Clone)]
pub struct LogMergePolicy {
@@ -11,7 +11,8 @@ use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp;
@@ -141,13 +142,13 @@ impl SegmentWriter {
        }
        let (term_buffer, multifield_postings) =
            (&mut self.term_buffer, &mut self.multifield_postings);
        match *field_entry.field_type() {
        match field_entry.field_type() {
            FieldType::HierarchicalFacet => {
                term_buffer.set_field(field);
                let facets =
                    field_values
                        .iter()
                        .flat_map(|field_value| match *field_value.value() {
                        .flat_map(|field_value| match field_value.value() {
                            Value::Facet(ref facet) => Some(facet.encoded_str()),
                            _ => {
                                panic!("Expected hierarchical facet");
@@ -172,37 +173,38 @@ impl SegmentWriter {
                }
            }
            FieldType::Str(_) => {
                let mut token_streams: Vec<BoxTokenStream> = vec![];
                let mut offsets = vec![];
                let mut streams_with_offsets = vec![];
                let mut total_offset = 0;

                for field_value in field_values {
                    match field_value.value() {
                        Value::PreTokStr(tok_str) => {
                            offsets.push(total_offset);
                            streams_with_offsets.push((
                                Box::new(PreTokenizedStream::from(tok_str.clone()))
                                    as Box<dyn TokenStream>,
                                total_offset,
                            ));
                            if let Some(last_token) = tok_str.tokens.last() {
                                total_offset += last_token.offset_to;
                            }
                            token_streams
                                .push(PreTokenizedStream::from(tok_str.clone()).into());
                        }
                        Value::Str(ref text) => {
                            if let Some(ref mut tokenizer) =
                                self.tokenizers[field.field_id() as usize]
                            {
                                offsets.push(total_offset);
                                streams_with_offsets
                                    .push((tokenizer.token_stream(text), total_offset));
                                total_offset += text.len();
                                token_streams.push(tokenizer.token_stream(text));
                            }
                        }
                        _ => (),
                    }
                }

                let num_tokens = if token_streams.is_empty() {
                let num_tokens = if streams_with_offsets.is_empty() {
                    0
                } else {
                    let mut token_stream = TokenStreamChain::new(offsets, token_streams);
                    let mut token_stream = TokenStreamChain::new(streams_with_offsets);
                    multifield_postings.index_text(
                        doc_id,
                        field,
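The `FieldType::Str` branch above swaps two parallel vectors (`offsets` and `token_streams`) for a single `streams_with_offsets` vector of pairs, so a stream can never drift out of sync with its offset. A minimal sketch of the same refactor on plain data (hypothetical values, not tantivy types):

```rust
fn main() {
    let texts = ["hello world", "second text"];

    // Before: two parallel vectors that must be pushed in lockstep.
    let mut offsets: Vec<usize> = Vec::new();
    let mut streams: Vec<&str> = Vec::new();
    let mut total_offset = 0;
    for text in texts {
        offsets.push(total_offset);
        streams.push(text);
        total_offset += text.len();
    }

    // After: one vector of (stream, offset) pairs, built in a single push.
    let mut streams_with_offsets: Vec<(&str, usize)> = Vec::new();
    let mut total = 0;
    for text in texts {
        streams_with_offsets.push((text, total));
        total += text.len();
    }

    assert_eq!(offsets[1], streams_with_offsets[1].1); // both are 11
}
```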
@@ -132,7 +132,7 @@ impl PositionReader {
            "offset arguments should be increasing."
        );
        let delta_to_block_offset = offset as i64 - self.block_offset as i64;
        if delta_to_block_offset < 0 || delta_to_block_offset >= 128 {
        if !(0..128).contains(&delta_to_block_offset) {
            // The first position is not within the first block.
            // We need to decompress the first block.
            let delta_to_anchor_offset = offset - self.anchor_offset;
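The two conditions above are equivalent for an `i64` delta: `x < 0 || x >= 128` is exactly the negation of `(0..128).contains(&x)`. A quick standalone check:

```rust
fn main() {
    for x in -3i64..131 {
        let explicit = x < 0 || x >= 128;
        let via_contains = !(0..128).contains(&x);
        assert_eq!(explicit, via_contains);
    }
}
```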
@@ -109,9 +109,9 @@ impl BlockSearcher {
    /// The results should be equivalent to
    /// ```compile_fail
    /// block[..]
    //     .iter()
    //     .take_while(|&&val| val < target)
    //     .count()
    ///     .iter()
    ///     .take_while(|&&val| val < target)
    ///     .count()
    /// ```
    ///
    /// The `start` argument is just used to hint that the response is
@@ -302,7 +302,7 @@ mod tests {
        let mut rng = rand::thread_rng();
        writer.set_merge_policy(Box::new(NoMergePolicy));
        for _ in 0..3_000 {
            let term_freq = rng.gen_range(1..10000);
            let term_freq = rng.gen_range(1, 10000);
            let words: Vec<&str> = std::iter::repeat("bbbb").take(term_freq).collect();
            let text = words.join(" ");
            writer.add_document(doc!(text_field=>text));
@@ -1,5 +1,5 @@
use crate::schema::Value;
use serde::{Deserialize, Serialize};
use serde::Serialize;
use std::collections::BTreeMap;

/// Internal representation of a document used for JSON
@@ -8,5 +8,5 @@ use std::collections::BTreeMap;
/// A `NamedFieldDocument` is a simple representation of a document
/// as a `BTreeMap<String, Vec<Value>>`.
///
#[derive(Debug, Deserialize, Serialize)]
#[derive(Serialize)]
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);
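One side of the hunk derives only `Serialize`, the other also derives `Debug` and `Deserialize`; only with `Deserialize` can the JSON be read back into the struct. A minimal standalone sketch with a hypothetical newtype (assumes serde with the derive feature and serde_json as dependencies):

```rust
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

// Hypothetical stand-in for a named-field document: field name -> values.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct NamedDoc(BTreeMap<String, Vec<String>>);

fn main() {
    let mut fields = BTreeMap::new();
    fields.insert("title".to_string(), vec!["hello".to_string()]);
    let doc = NamedDoc(fields);

    // `Serialize` alone is enough for this direction...
    let json = serde_json::to_string(&doc).unwrap();
    // ...but the extra `Deserialize` derive is what enables the way back.
    let back: NamedDoc = serde_json::from_str(&json).unwrap();
    assert_eq!(doc, back);
}
```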
@@ -43,9 +43,6 @@ impl CheckpointBlock {

    /// Adding another checkpoint in the block.
    pub fn push(&mut self, checkpoint: Checkpoint) {
        if let Some(prev_checkpoint) = self.checkpoints.last() {
            assert!(checkpoint.follows(prev_checkpoint));
        }
        self.checkpoints.push(checkpoint);
    }
@@ -1,4 +1,4 @@
const CHECKPOINT_PERIOD: usize = 2;
const CHECKPOINT_PERIOD: usize = 8;

use std::fmt;
mod block;
@@ -26,13 +26,6 @@ pub struct Checkpoint {
    pub end_offset: u64,
}

impl Checkpoint {
    pub(crate) fn follows(&self, other: &Checkpoint) -> bool {
        (self.start_doc == other.end_doc) &&
            (self.start_offset == other.end_offset)
    }
}

impl fmt::Debug for Checkpoint {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
@@ -46,16 +39,13 @@ impl fmt::Debug for Checkpoint {
#[cfg(test)]
mod tests {

    use std::{io, iter};
    use std::io;

    use futures::executor::block_on;
    use proptest::strategy::{BoxedStrategy, Strategy};

    use crate::directory::OwnedBytes;
    use crate::indexer::NoMergePolicy;
    use crate::schema::{SchemaBuilder, STORED, STRING};
    use crate::store::index::Checkpoint;
    use crate::{DocAddress, DocId, Index, Term};
    use crate::DocId;

    use super::{SkipIndex, SkipIndexBuilder};
@@ -64,7 +54,7 @@ mod tests {
        let mut output: Vec<u8> = Vec::new();
        let skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
        skip_index_builder.write(&mut output)?;
        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
        let mut skip_cursor = skip_index.checkpoints();
        assert!(skip_cursor.next().is_none());
        Ok(())
@@ -82,7 +72,7 @@
        };
        skip_index_builder.insert(checkpoint);
        skip_index_builder.write(&mut output)?;
        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
        let mut skip_cursor = skip_index.checkpoints();
        assert_eq!(skip_cursor.next(), Some(checkpoint));
        assert_eq!(skip_cursor.next(), None);
@@ -131,7 +121,7 @@
        }
        skip_index_builder.write(&mut output)?;

        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
        assert_eq!(
            &skip_index.checkpoints().collect::<Vec<_>>()[..],
            &checkpoints[..]
@@ -143,40 +133,6 @@
        (doc as u64) * (doc as u64)
    }

    #[test]
    fn test_merge_store_with_stacking_reproducing_issue969() -> crate::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let text = schema_builder.add_text_field("text", STORED | STRING);
        let body = schema_builder.add_text_field("body", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.set_merge_policy(Box::new(NoMergePolicy));
        let long_text: String = iter::repeat("abcdefghijklmnopqrstuvwxyz")
            .take(1_000)
            .collect();
        for _ in 0..20 {
            index_writer.add_document(doc!(body=>long_text.clone()));
        }
        index_writer.commit()?;
        index_writer.add_document(doc!(text=>"testb"));
        for _ in 0..10 {
            index_writer.add_document(doc!(text=>"testd", body=>long_text.clone()));
        }
        index_writer.commit()?;
        index_writer.delete_term(Term::from_field_text(text, "testb"));
        index_writer.commit()?;
        let segment_ids = index.searchable_segment_ids()?;
        block_on(index_writer.merge(&segment_ids))?;
        let reader = index.reader()?;
        let searcher = reader.searcher();
        assert_eq!(searcher.num_docs(), 30);
        for i in 0..searcher.num_docs() as u32 {
            let _doc = searcher.doc(DocAddress(0u32, i))?;
        }
        Ok(())
    }

    #[test]
    fn test_skip_index_long() -> io::Result<()> {
        let mut output: Vec<u8> = Vec::new();
@@ -194,7 +150,7 @@ mod tests {
        }
        skip_index_builder.write(&mut output)?;
        assert_eq!(output.len(), 4035);
        let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::open(OwnedBytes::new(output))
        let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::from(OwnedBytes::new(output))
            .checkpoints()
            .collect();
        assert_eq!(&resulting_checkpoints, &checkpoints);
@@ -265,7 +221,7 @@ mod tests {
        }
        let mut buffer = Vec::new();
        skip_index_builder.write(&mut buffer).unwrap();
        let skip_index = SkipIndex::open(OwnedBytes::new(buffer));
        let skip_index = SkipIndex::from(OwnedBytes::new(buffer));
        let iter_checkpoints: Vec<Checkpoint> = skip_index.checkpoints().collect();
        assert_eq!(&checkpoints[..], &iter_checkpoints[..]);
        test_skip_index_aux(skip_index, &checkpoints[..]);
@@ -35,11 +35,11 @@ struct Layer {
}

impl Layer {
    fn cursor<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
    fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
        self.cursor_at_offset(0u64)
    }

    fn cursor_at_offset<'a>(&'a self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + 'a {
    fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
        let data = &self.data.as_slice();
        LayerCursor {
            remaining: &data[start_offset as usize..],
@@ -59,25 +59,7 @@ pub struct SkipIndex {
}

impl SkipIndex {
    pub fn open(mut data: OwnedBytes) -> SkipIndex {
        let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
            .unwrap()
            .into_iter()
            .map(|el| el.0)
            .collect();
        let mut start_offset = 0;
        let mut layers = Vec::new();
        for end_offset in offsets {
            let layer = Layer {
                data: data.slice(start_offset as usize, end_offset as usize),
            };
            layers.push(layer);
            start_offset = end_offset;
        }
        SkipIndex { layers }
    }

    pub(crate) fn checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
    pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
        self.layers
            .last()
            .into_iter()
@@ -108,3 +90,22 @@ impl SkipIndex {
        Some(cur_checkpoint)
    }
}

impl From<OwnedBytes> for SkipIndex {
    fn from(mut data: OwnedBytes) -> SkipIndex {
        let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
            .unwrap()
            .into_iter()
            .map(|el| el.0)
            .collect();
        let mut start_offset = 0;
        let mut layers = Vec::new();
        for end_offset in offsets {
            layers.push(Layer {
                data: data.slice(start_offset as usize, end_offset as usize),
            });
            start_offset = end_offset;
        }
        SkipIndex { layers }
    }
}
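Both variants above parse the same byte layout; the difference is only the entry point, an inherent `open` constructor versus a `From<OwnedBytes>` impl that also enables `.into()` at call sites. A minimal sketch of the two styles on hypothetical types (not the tantivy types):

```rust
struct Blob(Vec<u8>);

struct Parsed {
    len: usize,
}

impl Parsed {
    // Inherent-constructor style: Parsed::open(blob).
    fn open(blob: Blob) -> Parsed {
        Parsed { len: blob.0.len() }
    }
}

// Conversion-trait style: Parsed::from(blob) or blob.into().
impl From<Blob> for Parsed {
    fn from(blob: Blob) -> Parsed {
        Parsed { len: blob.0.len() }
    }
}

fn main() {
    let a = Parsed::open(Blob(vec![1, 2, 3]));
    let b: Parsed = Blob(vec![1, 2, 3]).into();
    assert_eq!(a.len, b.len);
}
```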
@@ -28,20 +28,18 @@ impl LayerBuilder {
    ///
    /// If the block was empty to begin with, simply return None.
    fn flush_block(&mut self) -> Option<Checkpoint> {
        if let Some((start_doc, end_doc)) = self.block.doc_interval() {
        self.block.doc_interval().map(|(start_doc, end_doc)| {
            let start_offset = self.buffer.len() as u64;
            self.block.serialize(&mut self.buffer);
            let end_offset = self.buffer.len() as u64;
            self.block.clear();
            Some(Checkpoint {
            Checkpoint {
                start_doc,
                end_doc,
                start_offset,
                end_offset,
            })
        } else {
            None
        }
    }
            }
        })
    }

    fn push(&mut self, checkpoint: Checkpoint) {
@@ -50,7 +48,7 @@ impl LayerBuilder {

    fn insert(&mut self, checkpoint: Checkpoint) -> Option<Checkpoint> {
        self.push(checkpoint);
        let emit_skip_info = self.block.len() >= CHECKPOINT_PERIOD;
        let emit_skip_info = (self.block.len() % CHECKPOINT_PERIOD) == 0;
        if emit_skip_info {
            self.flush_block()
        } else {
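Taken in isolation, the two `emit_skip_info` conditions above fire at different times once a block is allowed to grow past the period: `len >= PERIOD` holds for every length at or beyond the threshold, while `len % PERIOD == 0` holds only on exact multiples. A small comparison with a hypothetical period (not the real constant):

```rust
fn main() {
    const PERIOD: usize = 4; // hypothetical value for illustration
    for len in 1..=12 {
        let threshold = len >= PERIOD;    // true at 4, 5, 6, ...
        let multiple = len % PERIOD == 0; // true at 4, 8, 12, ...
        println!("len={:2}  >=PERIOD: {:5}  %PERIOD==0: {}", len, threshold, multiple);
    }
}
```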
@@ -35,7 +35,7 @@ impl StoreReader {
        let (data_file, offset_index_file) = split_file(store_file)?;
        let index_data = offset_index_file.read_bytes()?;
        let space_usage = StoreSpaceUsage::new(data_file.len(), offset_index_file.len());
        let skip_index = SkipIndex::open(index_data);
        let skip_index = SkipIndex::from(index_data);
        Ok(StoreReader {
            data: data_file,
            cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))),
@@ -46,7 +46,7 @@ impl StoreReader {
        })
    }

    pub(crate) fn block_checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
    pub(crate) fn block_checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
        self.skip_index.checkpoints()
    }
@@ -1,50 +0,0 @@
use std::path::Path;

use crate::HasLen;
use crate::directory::{Directory, ManagedDirectory, MmapDirectory, RAMDirectory};
use crate::fastfield::DeleteBitSet;

use super::{StoreReader, StoreWriter};

#[test]
fn test_toto2() -> crate::Result<()> {
    let directory = ManagedDirectory::wrap(MmapDirectory::open("src/store/broken_seg")?)?;
    let path = Path::new("b6029ade1b954ea1acad15b432eaacb9.store");
    assert!(directory.validate_checksum(path)?);
    let store_file = directory.open_read(path)?;
    let store = StoreReader::open(store_file)?;
    let documents = store.documents();
    // for doc in documents {
    //     println!("{:?}", doc);
    // }
    let doc = store.get(15_086)?;
    Ok(())
}

#[test]
fn test_toto() -> crate::Result<()> {
    let directory = ManagedDirectory::wrap(MmapDirectory::open("src/store/broken_seg")?)?;
    assert!(directory.validate_checksum(Path::new("e6ece22e5bca4e0dbe7ce3e4dcbd5bbf.store"))?);
    let store_file = directory.open_read(Path::new("e6ece22e5bca4e0dbe7ce3e4dcbd5bbf.store.patched"))?;
    let store = StoreReader::open(store_file)?;
    let doc = store.get(53)?;
    println!("{:?}", doc);
    // let documents = store.documents();
    // let ram_directory = RAMDirectory::create();
    // let path = Path::new("store");

    // let store_wrt = ram_directory.open_write(path)?;
    // let mut store_writer = StoreWriter::new(store_wrt);
    // for doc in &documents {
    //     store_writer.store(doc)?;
    // }
    // store_writer.close()?;
    // let store_data = ram_directory.open_read(path)?;
    // let new_store = StoreReader::open(store_data)?;
    // for doc in 0..59 {
    //     println!("{}", doc);
    //     let doc = new_store.get(doc)?;
    //     println!("{:?}", doc);
    // }
    Ok(())
}
@@ -10,7 +10,7 @@ use crate::store::index::Checkpoint;
use crate::DocId;
use std::io::{self, Write};

const BLOCK_SIZE: usize = 30;
const BLOCK_SIZE: usize = 16_384;

/// Write tantivy's [`Store`](./index.html)
///
@@ -72,7 +72,6 @@ impl StoreWriter {
        if !self.current_block.is_empty() {
            self.write_and_compress_block()?;
        }
        assert_eq!(self.first_doc_in_block, self.doc);
        let doc_shift = self.doc;
        let start_shift = self.writer.written_bytes() as u64;

@@ -87,17 +86,12 @@ impl StoreWriter {
            checkpoint.end_doc += doc_shift;
            checkpoint.start_offset += start_shift;
            checkpoint.end_offset += start_shift;
            self.register_checkpoint(checkpoint);
            self.offset_index_writer.insert(checkpoint);
            self.doc = checkpoint.end_doc;
        }
        Ok(())
    }

    fn register_checkpoint(&mut self, checkpoint: Checkpoint) {
        self.offset_index_writer.insert(checkpoint);
        self.first_doc_in_block = checkpoint.end_doc;
        self.doc = checkpoint.end_doc;
    }

    fn write_and_compress_block(&mut self) -> io::Result<()> {
        assert!(self.doc > 0);
        self.intermediary_buffer.clear();
@@ -106,13 +100,14 @@ impl StoreWriter {
        self.writer.write_all(&self.intermediary_buffer)?;
        let end_offset = self.writer.written_bytes();
        let end_doc = self.doc;
        self.register_checkpoint(Checkpoint {
        self.offset_index_writer.insert(Checkpoint {
            start_doc: self.first_doc_in_block,
            end_doc,
            start_offset,
            end_offset,
        });
        self.current_block.clear();
        self.first_doc_in_block = self.doc;
        Ok(())
    }
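In the variant above that adds `register_checkpoint`, the checkpoint bookkeeping (recording the checkpoint, advancing `first_doc_in_block`, advancing `doc`) is grouped into one helper so every call site stays consistent. A minimal sketch of that consolidation on a hypothetical struct (not the tantivy `StoreWriter`):

```rust
#[derive(Clone, Copy)]
struct Checkpoint {
    end_doc: u32,
}

struct Writer {
    checkpoints: Vec<Checkpoint>, // stand-in for the offset index writer
    first_doc_in_block: u32,
    doc: u32,
}

impl Writer {
    // One helper keeps the three updates together instead of repeating them.
    fn register_checkpoint(&mut self, checkpoint: Checkpoint) {
        self.checkpoints.push(checkpoint);
        self.first_doc_in_block = checkpoint.end_doc;
        self.doc = checkpoint.end_doc;
    }
}

fn main() {
    let mut writer = Writer {
        checkpoints: Vec::new(),
        first_doc_in_block: 0,
        doc: 0,
    };
    writer.register_checkpoint(Checkpoint { end_doc: 42 });
    assert_eq!(writer.first_doc_in_block, 42);
    assert_eq!(writer.doc, 42);
    assert_eq!(writer.checkpoints.len(), 1);
}
```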
@@ -19,7 +19,7 @@
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter, TokenStream};

/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.
@@ -27,7 +27,7 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
pub struct AlphaNumOnlyFilter;

pub struct AlphaNumOnlyFilterStream<'a> {
    tail: BoxTokenStream<'a>,
    tail: Box<dyn TokenStream + 'a>,
}

impl<'a> AlphaNumOnlyFilterStream<'a> {
@@ -37,8 +37,8 @@ impl<'a> AlphaNumOnlyFilterStream<'a> {
}

impl TokenFilter for AlphaNumOnlyFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
        Box::new(AlphaNumOnlyFilterStream { tail: token_stream })
    }
}
@@ -1,4 +1,4 @@
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter, TokenStream};
use std::mem;

/// This class converts alphabetic, numeric, and symbolic Unicode characters
@@ -8,8 +8,8 @@ use std::mem;
pub struct AsciiFoldingFilter;

impl TokenFilter for AsciiFoldingFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        From::from(AsciiFoldingFilterTokenStream {
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
        Box::new(AsciiFoldingFilterTokenStream {
            tail: token_stream,
            buffer: String::with_capacity(100),
        })
@@ -18,7 +18,7 @@ impl TokenFilter for AsciiFoldingFilter {

pub struct AsciiFoldingFilterTokenStream<'a> {
    buffer: String,
    tail: BoxTokenStream<'a>,
    tail: Box<dyn TokenStream + 'a>,
}

impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
@@ -1,4 +1,4 @@
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use super::{Token, TokenStream, Tokenizer};
use crate::schema::FACET_SEP_BYTE;

/// The `FacetTokenizer` process a `Facet` binary representation
@@ -26,13 +26,12 @@ pub struct FacetTokenStream<'a> {
}

impl Tokenizer for FacetTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        FacetTokenStream {
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        Box::new(FacetTokenStream {
            text,
            state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
            token: Token::default(),
        }
        .into()
        })
    }
}
@@ -1,10 +1,9 @@
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use std::mem;

impl TokenFilter for LowerCaser {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(LowerCaserTokenStream {
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
        Box::new(LowerCaserTokenStream {
            tail: token_stream,
            buffer: String::with_capacity(100),
        })
@@ -17,7 +16,7 @@ pub struct LowerCaser;

pub struct LowerCaserTokenStream<'a> {
    buffer: String,
    tail: BoxTokenStream<'a>,
    tail: Box<dyn TokenStream + 'a>,
}

// writes a lowercased version of text into output.
@@ -145,9 +145,7 @@ pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;

pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
    BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use self::tokenizer::{TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer};

pub use self::tokenizer_manager::TokenizerManager;
@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;

/// Tokenize the text by splitting words into n-grams of the given size(s)
///
@@ -131,8 +130,8 @@ pub struct NgramTokenStream<'a> {
}

impl Tokenizer for NgramTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        From::from(NgramTokenStream {
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        Box::new(NgramTokenStream {
            ngram_charidx_iterator: StutteringIterator::new(
                CodepointFrontiers::for_str(text),
                self.min_gram,
@@ -308,9 +307,9 @@ mod tests {
    use super::StutteringIterator;
    use crate::tokenizer::tests::assert_token;
    use crate::tokenizer::tokenizer::Tokenizer;
    use crate::tokenizer::{BoxTokenStream, Token};
    use crate::tokenizer::{Token, TokenStream};

    fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
    fn test_helper(mut tokenizer: Box<dyn TokenStream>) -> Vec<Token> {
        let mut tokens: Vec<Token> = vec![];
        tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
        tokens
@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;

/// For each value of the field, emit a single unprocessed token.
#[derive(Clone)]
@@ -11,7 +10,7 @@ pub struct RawTokenStream {
}

impl Tokenizer for RawTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        let token = Token {
            offset_from: 0,
            offset_to: text.len(),
@@ -19,11 +18,10 @@ impl Tokenizer for RawTokenizer {
            text: text.to_string(),
            position_length: 1,
        };
        RawTokenStream {
        Box::new(RawTokenStream {
            token,
            has_token: true,
        }
        .into()
        })
    }
}
@@ -13,7 +13,6 @@
//! ```
//!
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;

/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
@@ -39,8 +38,8 @@ impl<'a> RemoveLongFilterStream<'a> {
}

impl TokenFilter for RemoveLongFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(RemoveLongFilterStream {
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
        Box::new(RemoveLongFilterStream {
            token_length_limit: self.length_limit,
            tail: token_stream,
        })
@@ -49,7 +48,7 @@ impl TokenFilter for RemoveLongFilter {

pub struct RemoveLongFilterStream<'a> {
    token_length_limit: usize,
    tail: BoxTokenStream<'a>,
    tail: Box<dyn TokenStream + 'a>,
}

impl<'a> TokenStream for RemoveLongFilterStream<'a> {
@@ -1,4 +1,3 @@
use super::BoxTokenStream;
use super::{Token, TokenStream, Tokenizer};
use std::str::CharIndices;

@@ -13,8 +12,8 @@ pub struct SimpleTokenStream<'a> {
}

impl Tokenizer for SimpleTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(SimpleTokenStream {
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        Box::new(SimpleTokenStream {
            text,
            chars: text.char_indices(),
            token: Token::default(),
@@ -1,5 +1,4 @@
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};

@@ -78,9 +77,9 @@ impl Default for Stemmer {
}

impl TokenFilter for Stemmer {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
        BoxTokenStream::from(StemmerTokenStream {
        Box::new(StemmerTokenStream {
            tail: token_stream,
            stemmer: inner_stemmer,
        })
@@ -88,7 +87,7 @@ impl TokenFilter for Stemmer {
}

pub struct StemmerTokenStream<'a> {
    tail: BoxTokenStream<'a>,
    tail: Box<dyn TokenStream + 'a>,
    stemmer: rust_stemmers::Stemmer,
}
@@ -11,7 +11,6 @@
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use fnv::FnvHasher;
use std::collections::HashSet;
use std::hash::BuildHasherDefault;
@@ -51,12 +50,12 @@ impl StopWordFilter {

pub struct StopWordFilterStream<'a> {
    words: StopWordHashSet,
    tail: BoxTokenStream<'a>,
    tail: Box<dyn TokenStream + 'a>,
}

impl TokenFilter for StopWordFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(StopWordFilterStream {
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
        Box::new(StopWordFilterStream {
            words: self.words.clone(),
            tail: token_stream,
        })
@@ -1,11 +1,9 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream};
use std::ops::DerefMut;
use crate::tokenizer::{Token, TokenStream};

const POSITION_GAP: usize = 2;

pub(crate) struct TokenStreamChain<'a> {
    offsets: Vec<usize>,
    token_streams: Vec<BoxTokenStream<'a>>,
    streams_with_offsets: Vec<(Box<dyn TokenStream + 'a>, usize)>,
    position_shift: usize,
    stream_idx: usize,
    token: Token,
@@ -13,13 +11,11 @@ pub(crate) struct TokenStreamChain<'a> {

impl<'a> TokenStreamChain<'a> {
    pub fn new(
        offsets: Vec<usize>,
        token_streams: Vec<BoxTokenStream<'a>>,
        streams_with_offsets: Vec<(Box<dyn TokenStream + 'a>, usize)>,
    ) -> TokenStreamChain<'a> {
        TokenStreamChain {
            offsets,
            streams_with_offsets,
            stream_idx: 0,
            token_streams,
            position_shift: 0,
            token: Token::default(),
        }
@@ -28,11 +24,10 @@ impl<'a> TokenStreamChain<'a> {

impl<'a> TokenStream for TokenStreamChain<'a> {
    fn advance(&mut self) -> bool {
        while self.stream_idx < self.token_streams.len() {
            let token_stream = self.token_streams[self.stream_idx].deref_mut();
        while self.stream_idx < self.streams_with_offsets.len() {
            let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.stream_idx];
            if token_stream.advance() {
                let token = token_stream.token();
                let offset_offset = self.offsets[self.stream_idx];
                self.token.offset_from = token.offset_from + offset_offset;
                self.token.offset_to = token.offset_to + offset_offset;
                self.token.position = token.position + self.position_shift;
@@ -49,7 +44,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {

    fn token(&self) -> &Token {
        assert!(
            self.stream_idx <= self.token_streams.len(),
            self.stream_idx <= self.streams_with_offsets.len(),
            "You called .token(), after the end of the token stream has been reached"
        );
        &self.token
@@ -57,7 +52,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {

    fn token_mut(&mut self) -> &mut Token {
        assert!(
            self.stream_idx <= self.token_streams.len(),
            self.stream_idx <= self.streams_with_offsets.len(),
            "You called .token(), after the end of the token stream has been reached"
        );
        &mut self.token
@@ -73,10 +68,10 @@ mod tests {
    #[test]
    fn test_chain_first_emits_no_tokens() {
        let token_streams = vec![
            SimpleTokenizer.token_stream(""),
            SimpleTokenizer.token_stream("hello world"),
            (SimpleTokenizer.token_stream(""), 0),
            (SimpleTokenizer.token_stream("hello world"), 0),
        ];
        let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);
        let mut token_chain = TokenStreamChain::new(token_streams);

        assert!(token_chain.advance());
        assert_eq!(token_chain.token().text, "hello");
@@ -1,4 +1,4 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;

@@ -42,24 +42,23 @@ impl PreTokenizedStream {
    /// Creates a TokenStream from PreTokenizedString array
    pub fn chain_tokenized_strings<'a>(
        tok_strings: &'a [&'a PreTokenizedString],
    ) -> BoxTokenStream {
    ) -> Box<dyn TokenStream> {
        if tok_strings.len() == 1 {
            PreTokenizedStream::from((*tok_strings[0]).clone()).into()
            Box::new(PreTokenizedStream::from(tok_strings[0].to_owned()))
        } else {
            let mut offsets = vec![];
            let mut streams_with_offsets = vec![];
            let mut total_offset = 0;
            for &tok_string in tok_strings {
                offsets.push(total_offset);
                streams_with_offsets.push((
                    Box::new(PreTokenizedStream::from(tok_string.to_owned()))
                        as Box<dyn TokenStream>,
                    total_offset,
                ));
                if let Some(last_token) = tok_string.tokens.last() {
                    total_offset += last_token.offset_to;
                }
            }
            // TODO remove the string cloning.
            let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
                .iter()
                .map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
                .collect();
            TokenStreamChain::new(offsets, token_streams).into()
            Box::new(TokenStreamChain::new(streams_with_offsets))
        }
    }
}
@@ -2,8 +2,6 @@ use crate::tokenizer::TokenStreamChain;
use serde::{Deserialize, Serialize};
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};

/// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
@@ -40,7 +38,7 @@ impl Default for Token {
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer {
    tokenizer: Box<dyn Tokenizer>,
    token_filters: Vec<BoxTokenFilter>,
    token_filters: Vec<Box<dyn TokenFilter>>,
}

impl<T: Tokenizer> From<T> for TextAnalyzer {
@@ -50,11 +48,14 @@ impl<T: Tokenizer> From<T> for TextAnalyzer {
}

impl TextAnalyzer {
    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
    ///
    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
    /// `TextAnalyzer::from(tokenizer)`.
    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
    pub fn new<T: Tokenizer>(
        tokenizer: T,
        token_filters: Vec<Box<dyn TokenFilter>>,
    ) -> TextAnalyzer {
        TextAnalyzer {
            tokenizer: Box::new(tokenizer),
            token_filters,
@@ -77,8 +78,8 @@ impl TextAnalyzer {
    /// .filter(Stemmer::default());
    /// ```
    ///
    pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
        self.token_filters.push(token_filter.into());
    pub fn filter<F: TokenFilter>(mut self, token_filter: F) -> Self {
        self.token_filters.push(Box::new(token_filter));
        self
    }

@@ -87,28 +88,19 @@ impl TextAnalyzer {
    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
    /// to prevent accidental `PhraseQuery` to match accross two terms.
    pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> {
        assert!(!texts.is_empty());
        if texts.len() == 1 {
            self.token_stream(texts[0])
        } else {
            let mut offsets = vec![];
            let mut total_offset = 0;
            for &text in texts {
                offsets.push(total_offset);
                total_offset += text.len();
            }
            let token_streams: Vec<BoxTokenStream<'a>> = texts
                .iter()
                .cloned()
                .map(|text| self.token_stream(text))
                .collect();
            From::from(TokenStreamChain::new(offsets, token_streams))
    pub fn token_stream_texts<'a>(&self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
        debug_assert!(!texts.is_empty());
        let mut streams_with_offsets = vec![];
        let mut total_offset = 0;
        for &text in texts {
            streams_with_offsets.push((self.token_stream(text), total_offset));
            total_offset += text.len();
        }
        Box::new(TokenStreamChain::new(streams_with_offsets))
    }

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
    pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        let mut token_stream = self.tokenizer.token_stream(text);
        for token_filter in &self.token_filters {
            token_stream = token_filter.transform(token_stream);
@@ -140,7 +132,7 @@ impl Clone for TextAnalyzer {
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
    /// Creates a token stream for a given `str`.
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
}

pub trait TokenizerClone {
@@ -153,69 +145,6 @@ impl<T: Tokenizer + Clone> TokenizerClone for T {
    }
}

impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
    fn advance(&mut self) -> bool {
        let token_stream: &mut dyn TokenStream = self.borrow_mut();
        token_stream.advance()
    }

    fn token<'b>(&'b self) -> &'b Token {
        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
        token_stream.token()
    }

    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
        token_stream.token_mut()
    }
}

/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);

impl<'a, T> From<T> for BoxTokenStream<'a>
where
    T: TokenStream + 'a,
{
    fn from(token_stream: T) -> BoxTokenStream<'a> {
        BoxTokenStream(Box::new(token_stream))
    }
}

impl<'a> Deref for BoxTokenStream<'a> {
    type Target = dyn TokenStream + 'a;

    fn deref(&self) -> &Self::Target {
        &*self.0
    }
}
impl<'a> DerefMut for BoxTokenStream<'a> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut *self.0
    }
}

/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);

impl Deref for BoxTokenFilter {
    type Target = dyn TokenFilter;

    fn deref(&self) -> &dyn TokenFilter {
        &*self.0
    }
}

impl<T: TokenFilter> From<T> for BoxTokenFilter {
    fn from(tokenizer: T) -> BoxTokenFilter {
        BoxTokenFilter(Box::new(tokenizer))
    }
}

/// `TokenStream` is the result of the tokenization.
///
/// It consists consumable stream of `Token`s.
@@ -295,18 +224,18 @@ pub trait TokenStream {
}

pub trait TokenFilterClone {
    fn box_clone(&self) -> BoxTokenFilter;
    fn box_clone(&self) -> Box<dyn TokenFilter>;
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
    /// Wraps a token stream and returns the modified one.
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
}

impl<T: TokenFilter + Clone> TokenFilterClone for T {
    fn box_clone(&self) -> BoxTokenFilter {
        BoxTokenFilter::from(self.clone())
    fn box_clone(&self) -> Box<dyn TokenFilter> {
        Box::new(self.clone())
    }
}
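The recurring theme of the tokenizer hunks is one side passing plain `Box<dyn TokenStream + 'a>` trait objects while the other wraps them in a `BoxTokenStream` newtype with `Deref`/`DerefMut` and a blanket `From` impl. A minimal sketch of that newtype pattern on a toy trait (hypothetical names, not the tantivy API):

```rust
use std::ops::{Deref, DerefMut};

trait Stream {
    fn next_value(&mut self) -> Option<u32>;
}

struct Counter {
    remaining: u32,
}

impl Stream for Counter {
    fn next_value(&mut self) -> Option<u32> {
        if self.remaining == 0 {
            None
        } else {
            self.remaining -= 1;
            Some(self.remaining)
        }
    }
}

// Newtype over the boxed trait object, mirroring the BoxTokenStream idea.
struct BoxStream<'a>(Box<dyn Stream + 'a>);

// Blanket From impl: any concrete Stream converts with `.into()` or `from`.
impl<'a, T: Stream + 'a> From<T> for BoxStream<'a> {
    fn from(stream: T) -> BoxStream<'a> {
        BoxStream(Box::new(stream))
    }
}

impl<'a> Deref for BoxStream<'a> {
    type Target = dyn Stream + 'a;
    fn deref(&self) -> &Self::Target {
        &*self.0
    }
}

impl<'a> DerefMut for BoxStream<'a> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut *self.0
    }
}

fn main() {
    // Plain trait object: the caller writes Box::new explicitly.
    let mut plain: Box<dyn Stream> = Box::new(Counter { remaining: 2 });
    // Newtype: From hides the boxing, Deref exposes the trait methods.
    let mut wrapped = BoxStream::from(Counter { remaining: 2 });
    assert_eq!(plain.next_value(), wrapped.next_value());
}
```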