Compare commits

...

10 Commits

Author          SHA1        Message                                                              Date
Paul Masurel    96f24b078e  Added failing unit test.                                             2021-01-07 22:43:28 +09:00
Paul Masurel    332b50a4eb  Merge pull request #970 from tantivy-search/functional-test-store   2021-01-07 14:27:08 +09:00
                            (Added a functional long running test to test store merging.)
Paul Masurel    8ca0954b3b  Added a functional long running test to test store merging.         2021-01-07 14:07:15 +09:00
Paul Masurel    36343e2de8  Merge pull request #968 from tantivy-search/add-bench-analyzer      2021-01-06 21:33:39 +09:00
                            (added a simple bench for the default analyzer)
Paul Masurel    2f14a892ca  added a simple bench for the default analyzer                       2021-01-06 19:11:26 +09:00
Paul Masurel    9c3cabce40  Updated version of the rand crate.                                   2021-01-06 18:09:00 +09:00
Paul Masurel    f8d71c2b10  Merge pull request #964 from mosuka/deserializable                   2021-01-06 17:43:53 +09:00
                            (Make NamedFieldDocument deserializable)
Paul Masurel    394dfb24f1  Merge pull request #965 from lewisdiamond/patch-1                    2021-01-06 13:38:31 +09:00
                            (Fix spelling)
Lewis Diamond   b0549a229d  Fix spelling                                                         2021-01-05 22:34:56 -05:00
Minoru Osuka    670b6eaff6  Make NamedFieldDocument deserializable                               2020-12-21 16:51:31 +09:00
10 changed files with 3931 additions and 45 deletions

Cargo.toml

@@ -53,10 +53,11 @@ lru = "0.6"
 winapi = "0.3"

 [dev-dependencies]
-rand = "0.7"
+rand = "0.8"
 maplit = "1"
 matches = "0.1.8"
 proptest = "0.10"
+criterion = "0.3"

 [dev-dependencies.fail]
 version = "0.4"
@@ -97,3 +98,7 @@ travis-ci = { repository = "tantivy-search/tantivy" }
 name = "failpoints"
 path = "tests/failpoints/mod.rs"
 required-features = ["fail/failpoints"]
+
+[[bench]]
+name = "analyzer"
+harness = false
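
Note: the rand bump from 0.7 to 0.8 is what drives the gen_range call-site edits further down in this compare. rand 0.8 replaced the two-argument Rng::gen_range(low, high) with a single range argument. A minimal migration sketch (standalone, not part of the diff):

    use rand::{thread_rng, Rng};

    fn main() {
        let mut rng = thread_rng();
        // rand 0.7 wrote: rng.gen_range(0, 20)
        // rand 0.8 takes one half-open range instead:
        let val: u64 = rng.gen_range(0..20);
        assert!(val < 20);
    }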

benches/alice.txt (new file, 3774 added lines)

File diff suppressed because it is too large.

benches/analyzer.rs (new file, 22 added lines)

@@ -0,0 +1,22 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use tantivy::tokenizer::TokenizerManager;
+
+const ALICE_TXT: &'static str = include_str!("alice.txt");
+
+pub fn criterion_benchmark(c: &mut Criterion) {
+    let tokenizer_manager = TokenizerManager::default();
+    let tokenizer = tokenizer_manager.get("default").unwrap();
+    c.bench_function("default-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = tokenizer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
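
The bench counts tokens by stepping the stream manually: advance() returns false once the text is exhausted. A minimal standalone sketch of the same tokenizer API outside criterion (the sample text is an invented example; text, offset_from, and offset_to are fields of tantivy's Token struct):

    use tantivy::tokenizer::TokenizerManager;

    fn main() {
        let tokenizer = TokenizerManager::default().get("default").unwrap();
        let mut token_stream = tokenizer.token_stream("Alice was beginning to get very tired");
        // advance() steps to the next token; token() exposes the current one.
        while token_stream.advance() {
            let token = token_stream.token();
            println!("{:?} at bytes {}..{}", token.text, token.offset_from, token.offset_to);
        }
    }

With the [[bench]] target declared in Cargo.toml above (harness = false hands the harness over to criterion), the benchmark should run with cargo bench --bench analyzer.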

src/functional_test.rs

@@ -1,45 +1,93 @@
-use rand::thread_rng;
-use std::collections::HashSet;
-use crate::schema::*;
 use crate::Index;
 use crate::Searcher;
+use crate::{doc, schema::*};
+use rand::thread_rng;
+use rand::Rng;
+use std::collections::HashSet;

-fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
+fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
     assert!(searcher.segment_readers().len() < 20);
     assert_eq!(searcher.num_docs() as usize, vals.len());
+    for segment_reader in searcher.segment_readers() {
+        let store_reader = segment_reader.get_store_reader()?;
+        for doc_id in 0..segment_reader.max_doc() {
+            let _doc = store_reader.get(doc_id)?;
+        }
+    }
+    Ok(())
 }

 #[test]
 #[ignore]
-fn test_indexing() {
+fn test_functional_store() -> crate::Result<()> {
+    let mut schema_builder = Schema::builder();
+    let id_field = schema_builder.add_u64_field("id", INDEXED | STORED);
+    let schema = schema_builder.build();
+    let index = Index::create_in_ram(schema);
+    let reader = index.reader()?;
+    let mut rng = thread_rng();
+    let mut index_writer = index.writer_with_num_threads(3, 12_000_000)?;
+    let mut doc_set: Vec<u64> = Vec::new();
+    let mut doc_id = 0u64;
+    for iteration in 0..500 {
+        dbg!(iteration);
+        let num_docs: usize = rng.gen_range(0..4);
+        if doc_set.len() >= 1 {
+            let doc_to_remove_id = rng.gen_range(0..doc_set.len());
+            let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
+            index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
+        }
+        for _ in 0..num_docs {
+            doc_set.push(doc_id);
+            index_writer.add_document(doc!(id_field=>doc_id));
+            doc_id += 1;
+        }
+        index_writer.commit()?;
+        reader.reload()?;
+        let searcher = reader.searcher();
+        check_index_content(&searcher, &doc_set)?;
+    }
+    Ok(())
+}
+
+#[test]
+#[ignore]
+fn test_functional_indexing() -> crate::Result<()> {
     let mut schema_builder = Schema::builder();
     let id_field = schema_builder.add_u64_field("id", INDEXED);
     let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
     let schema = schema_builder.build();
-    let index = Index::create_from_tempdir(schema).unwrap();
-    let reader = index.reader().unwrap();
+    let index = Index::create_from_tempdir(schema)?;
+    let reader = index.reader()?;
     let mut rng = thread_rng();
-    let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
+    let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
     let mut committed_docs: HashSet<u64> = HashSet::new();
     let mut uncommitted_docs: HashSet<u64> = HashSet::new();
     for _ in 0..200 {
-        let random_val = rng.gen_range(0, 20);
+        let random_val = rng.gen_range(0..20);
         if random_val == 0 {
-            index_writer.commit().expect("Commit failed");
+            index_writer.commit()?;
             committed_docs.extend(&uncommitted_docs);
             uncommitted_docs.clear();
-            reader.reload().unwrap();
+            reader.reload()?;
             let searcher = reader.searcher();
             // check that everything is correct.
-            check_index_content(&searcher, &committed_docs);
+            check_index_content(
+                &searcher,
+                &committed_docs.iter().cloned().collect::<Vec<u64>>(),
+            )?;
         } else {
             if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
                 let doc_id_term = Term::from_field_u64(id_field, random_val);
@@ -55,4 +103,5 @@ fn test_indexing() {
             }
         }
     }
+    Ok(())
 }
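
Both functional tests are marked #[ignore] because they are long-running, randomized stress tests; plain cargo test skips them, and they have to be requested explicitly, e.g. with cargo test -- --ignored.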

src/indexer/log_merge_policy.rs

@@ -8,7 +8,7 @@ const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
 const DEFAULT_MIN_MERGE_SIZE: usize = 8;
 const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;

-/// `LogMergePolicy` tries tries to merge segments that have a similar number of
+/// `LogMergePolicy` tries to merge segments that have a similar number of
 /// documents.
 #[derive(Debug, Clone)]
 pub struct LogMergePolicy {

(another file)

@@ -302,7 +302,7 @@ mod tests {
         let mut rng = rand::thread_rng();
         writer.set_merge_policy(Box::new(NoMergePolicy));
         for _ in 0..3_000 {
-            let term_freq = rng.gen_range(1, 10000);
+            let term_freq = rng.gen_range(1..10000);
             let words: Vec<&str> = std::iter::repeat("bbbb").take(term_freq).collect();
             let text = words.join(" ");
             writer.add_document(doc!(text_field=>text));

src/schema/named_field_document.rs

@@ -1,5 +1,5 @@
 use crate::schema::Value;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;

 /// Internal representation of a document used for JSON
@@ -8,5 +8,5 @@ use std::collections::BTreeMap;
 /// A `NamedFieldDocument` is a simple representation of a document
 /// as a `BTreeMap<String, Vec<Value>>`.
 ///
-#[derive(Serialize)]
+#[derive(Debug, Deserialize, Serialize)]
 pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);
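
With Deserialize derived, a NamedFieldDocument can now be read back from JSON rather than only written. A minimal round-trip sketch, assuming serde_json is available as an extra dependency (the JSON literal is an invented example, not from the diff):

    use tantivy::schema::NamedFieldDocument;

    fn main() -> serde_json::Result<()> {
        let json = r#"{"title": ["The Old Man and the Sea"]}"#;
        // from_str is what the new Deserialize impl enables.
        let doc: NamedFieldDocument = serde_json::from_str(json)?;
        println!("{}", serde_json::to_string(&doc)?);
        Ok(())
    }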

src/store/index/mod.rs

@@ -39,13 +39,16 @@ impl fmt::Debug for Checkpoint {
 #[cfg(test)]
 mod tests {

-    use std::io;
+    use std::{io, iter};

+    use futures::executor::block_on;
     use proptest::strategy::{BoxedStrategy, Strategy};

     use crate::directory::OwnedBytes;
+    use crate::indexer::NoMergePolicy;
+    use crate::schema::{SchemaBuilder, STORED, STRING};
     use crate::store::index::Checkpoint;
-    use crate::DocId;
+    use crate::{DocAddress, DocId, Index, Term};

     use super::{SkipIndex, SkipIndexBuilder};
@@ -54,7 +57,7 @@ mod tests {
         let mut output: Vec<u8> = Vec::new();
         let skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
         skip_index_builder.write(&mut output)?;
-        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
+        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
         let mut skip_cursor = skip_index.checkpoints();
         assert!(skip_cursor.next().is_none());
         Ok(())
@@ -72,7 +75,7 @@ mod tests {
         };
         skip_index_builder.insert(checkpoint);
         skip_index_builder.write(&mut output)?;
-        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
+        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
         let mut skip_cursor = skip_index.checkpoints();
         assert_eq!(skip_cursor.next(), Some(checkpoint));
         assert_eq!(skip_cursor.next(), None);
@@ -121,7 +124,7 @@ mod tests {
         }
         skip_index_builder.write(&mut output)?;
-        let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
+        let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
         assert_eq!(
             &skip_index.checkpoints().collect::<Vec<_>>()[..],
             &checkpoints[..]
@@ -133,6 +136,40 @@ mod tests {
         (doc as u64) * (doc as u64)
     }

+    #[test]
+    fn test_merge_store_with_stacking() -> crate::Result<()> {
+        let mut schema_builder = SchemaBuilder::default();
+        let text = schema_builder.add_text_field("text", STORED | STRING);
+        let body = schema_builder.add_text_field("body", STORED);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests()?;
+        index_writer.set_merge_policy(Box::new(NoMergePolicy));
+        let long_text: String = iter::repeat("abcdefghijklmnopqrstuvwxyz")
+            .take(1_000)
+            .collect();
+        for _ in 0..20 {
+            index_writer.add_document(doc!(body=>long_text.clone()));
+        }
+        index_writer.commit()?;
+        index_writer.add_document(doc!(text=>"testb"));
+        for _ in 0..10 {
+            index_writer.add_document(doc!(text=>"testd", body=>long_text.clone()));
+        }
+        index_writer.commit()?;
+        index_writer.delete_term(Term::from_field_text(text, "testb"));
+        index_writer.commit()?;
+        let segment_ids = index.searchable_segment_ids()?;
+        block_on(index_writer.merge(&segment_ids))?;
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+        assert_eq!(searcher.num_docs(), 30);
+        for i in 0..searcher.num_docs() as u32 {
+            let _doc = searcher.doc(DocAddress(0u32, i))?;
+        }
+        Ok(())
+    }
+
     #[test]
     fn test_skip_index_long() -> io::Result<()> {
         let mut output: Vec<u8> = Vec::new();
@@ -150,7 +187,7 @@ mod tests {
         }
         skip_index_builder.write(&mut output)?;
         assert_eq!(output.len(), 4035);
-        let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::from(OwnedBytes::new(output))
+        let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::open(OwnedBytes::new(output))
             .checkpoints()
             .collect();
         assert_eq!(&resulting_checkpoints, &checkpoints);
@@ -221,7 +258,7 @@ mod tests {
         }
         let mut buffer = Vec::new();
         skip_index_builder.write(&mut buffer).unwrap();
-        let skip_index = SkipIndex::from(OwnedBytes::new(buffer));
+        let skip_index = SkipIndex::open(OwnedBytes::new(buffer));
         let iter_checkpoints: Vec<Checkpoint> = skip_index.checkpoints().collect();
         assert_eq!(&checkpoints[..], &iter_checkpoints[..]);
         test_skip_index_aux(skip_index, &checkpoints[..]);

src/store/index/skip_index.rs

@@ -59,6 +59,24 @@ pub struct SkipIndex {
 }

 impl SkipIndex {
+    pub fn open(mut data: OwnedBytes) -> SkipIndex {
+        let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
+            .unwrap()
+            .into_iter()
+            .map(|el| el.0)
+            .collect();
+        let mut start_offset = 0;
+        let mut layers = Vec::new();
+        for end_offset in offsets {
+            let layer = Layer {
+                data: data.slice(start_offset as usize, end_offset as usize),
+            };
+            layers.push(layer);
+            start_offset = end_offset;
+        }
+        SkipIndex { layers }
+    }
+
     pub(crate) fn checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
         self.layers
             .last()
@@ -90,22 +108,3 @@ impl SkipIndex {
         Some(cur_checkpoint)
     }
 }
-
-impl From<OwnedBytes> for SkipIndex {
-    fn from(mut data: OwnedBytes) -> SkipIndex {
-        let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
-            .unwrap()
-            .into_iter()
-            .map(|el| el.0)
-            .collect();
-        let mut start_offset = 0;
-        let mut layers = Vec::new();
-        for end_offset in offsets {
-            layers.push(Layer {
-                data: data.slice(start_offset as usize, end_offset as usize),
-            });
-            start_offset = end_offset;
-        }
-        SkipIndex { layers }
-    }
-}
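
The From<OwnedBytes> impl moves to an inherent SkipIndex::open constructor with the same body: the VInt-encoded offset table is deserialized from the front of the bytes, and the remainder is sliced into per-layer blocks. A plausible reading of the rename (not stated in the diff) is that From suggests a trivial conversion, while open signals that the bytes are actually parsed. Call sites change mechanically, as the test diffs above show:

    // before: conversion via the From impl
    let skip_index = SkipIndex::from(OwnedBytes::new(buffer));
    // after: inherent constructor
    let skip_index = SkipIndex::open(OwnedBytes::new(buffer));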

src/store/reader.rs

@@ -35,7 +35,7 @@ impl StoreReader {
         let (data_file, offset_index_file) = split_file(store_file)?;
         let index_data = offset_index_file.read_bytes()?;
         let space_usage = StoreSpaceUsage::new(data_file.len(), offset_index_file.len());
-        let skip_index = SkipIndex::from(index_data);
+        let skip_index = SkipIndex::open(index_data);
         Ok(StoreReader {
             data: data_file,
             cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))),