Compare commits

...

7 Commits

Author SHA1 Message Date
Paul Masurel
4a072e3c18 Introducing a column trait 2022-09-02 11:24:04 +09:00
Paul Masurel
84e0c75598 Bench fixing 2022-09-02 11:15:44 +09:00
Paul Masurel
08c4412d73 Adding dragon API to build index without any thread. (#1496)
Closes #1487
2022-09-01 10:32:36 +09:00
Shikhar Bhushan
70e58adff9 OwnedBytes doc clarification (#1498)
It only exposes it with the same lifetime as `&self`, which is what keeps things safe
2022-09-01 10:32:17 +09:00
PSeitz
0d1cd119e9 Merge pull request #1497 from quickwit-oss/improve_proptest
custom num strategy, faster test
2022-08-31 06:25:25 -07:00
Pascal Seitz
d3dd620048 fix clippy 2022-08-31 13:13:56 +02:00
Pascal Seitz
e89c220b56 custom num strategy, faster test
closes #1486
faster test with rand values
2022-08-31 12:08:44 +02:00
13 changed files with 206 additions and 131 deletions

View File

@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
pub struct AntiCallToken(());
/// Trait used to indicate that no more writes need to be done on a writer.
pub trait TerminatingWrite: Write + Send {
pub trait TerminatingWrite: Write + Send + Sync {
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
fn terminate(mut self) -> io::Result<()>
where Self: Sized {

View File

@@ -289,6 +289,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima are for the deviation of the calculated value, and
/// the offset is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
return None;

View File

@@ -92,6 +92,7 @@ pub struct FastFieldStats {
struct VecColum<'a>(&'a [u64]);
impl<'a> Column for VecColum<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
@@ -121,8 +122,9 @@ impl<'a> From<&'a [u64]> for VecColum<'a> {
#[cfg(test)]
mod tests {
use proptest::arbitrary::any;
use proptest::proptest;
use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
@@ -153,21 +155,32 @@ mod tests {
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
#[test]
fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
/// Strategy producing the `u64` values used by the proptests of this module.
///
/// Three alternatives are combined with weights 1 / 1 / 20:
/// * a value of the form `u64::MAX - (num % 10)`, i.e. within 9 of `u64::MAX`,
/// * a value of the form `num % 10`, i.e. in `0..=9`,
/// * the unmodified output of `prop::num::u64::ANY`.
fn num_strategy() -> impl Strategy<Value = u64> {
prop_oneof![
// Values close to the upper bound: `u64::MAX - 9 ..= u64::MAX`.
1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
// Values close to the lower bound: `0..=9`.
1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
// Everything else: the generated value is used as is.
20 => prop::num::u64::ANY,
]
}
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];

View File

@@ -193,6 +193,7 @@ impl FastFieldCodec for LinearCodec {
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima for the deviation of the calculated value are, and
/// the offset to shift all values to >=0 is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 3 {
return None; // disable compressor for this case
@@ -258,6 +259,8 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)]
mod tests {
use rand::RngCore;
use super::*;
use crate::tests::get_codec_test_datasets;
@@ -340,10 +343,9 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
for _ in 0..5000 {
let mut data = (0..10_000)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
let mut rng = rand::thread_rng();
for _ in 0..50 {
let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
create_and_validate(&data, "random");

View File

@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
use stable_deref_trait::StableDeref;
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
/// this data as a slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]

View File

@@ -7,6 +7,7 @@ use std::sync::Arc;
use super::segment::Segment;
use super::IndexSettings;
use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
use crate::core::{
Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
};
@@ -163,6 +164,25 @@ impl IndexBuilder {
self.create(mmap_directory)
}
/// Dragons ahead!!!
///
/// Creates the index in `dir` and returns a writer that lets users build a
/// simple index made of a single segment, without starting any thread.
///
/// Do not use this method if you are not sure what you are doing.
///
/// The directory is expected to be empty to begin with, and no GC operation
/// will be run on it.
#[doc(hidden)]
pub fn single_segment_index_writer(
    self,
    dir: impl Into<Box<dyn Directory>>,
    mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter> {
    SingleSegmentIndexWriter::new(self.create(dir)?, mem_budget)
}
/// Creates a new index in a temp directory.
///
/// The index will use the `MMapDirectory` in a newly created directory.
@@ -608,10 +628,12 @@ impl fmt::Debug for Index {
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback};
use crate::schema::{Field, Schema, INDEXED, TEXT};
use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};
#[test]
fn test_indexer_for_field() {
@@ -877,4 +899,28 @@ mod tests {
);
Ok(())
}
#[test]
fn test_single_segment_index_writer() -> crate::Result<()> {
    // Schema with one text field.
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let mut writer = Index::builder()
        .schema(schema_builder.build())
        .single_segment_index_writer(RamDirectory::default(), 10_000_000)?;
    // Add the same document ten times.
    for _ in 0..10 {
        writer.add_document(doc!(text_field=>"hello"))?;
    }
    // Finalizing hands back the index, which must expose all ten documents.
    let index = writer.finalize()?;
    let searcher = index.reader()?.searcher();
    let hello_query = TermQuery::new(
        Term::from_field_text(text_field, "hello"),
        IndexRecordOption::Basic,
    );
    let num_hits = searcher.search(&hello_query, &Count)?;
    assert_eq!(num_hits, 10);
    Ok(())
}
}

View File

@@ -7,6 +7,7 @@ mod segment;
mod segment_component;
mod segment_id;
mod segment_reader;
mod single_segment_index_writer;
use std::path::Path;
@@ -23,6 +24,7 @@ pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
/// The meta file contains all the information about the list of segments and the schema
/// of the index.

View File

@@ -0,0 +1,47 @@
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
/// Index writer that puts every added document into one single segment.
///
/// Documents are added with `add_document`; `finalize` consumes the writer
/// and returns the resulting `Index`.
#[doc(hidden)]
pub struct SingleSegmentIndexWriter {
/// Segment writer receiving the added documents.
segment_writer: SegmentWriter,
/// The segment being written; created from the index in `new`.
segment: Segment,
/// Opstamp given to the next added document; starts at 0 and grows by one per document.
opstamp: Opstamp,
}
impl SingleSegmentIndexWriter {
    /// Creates a new segment in `index` and a `SegmentWriter` for it.
    ///
    /// `mem_budget` is passed as is to `SegmentWriter::for_segment`.
    pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
        let segment = index.new_segment();
        Ok(Self {
            segment_writer: SegmentWriter::for_segment(mem_budget, segment.clone())?,
            segment,
            opstamp: 0,
        })
    }

    /// Adds `document` to the segment, stamping it with the current opstamp.
    ///
    /// The internal opstamp counter is incremented before the document is
    /// handed to the segment writer.
    pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
        let add_operation = AddOperation {
            opstamp: self.opstamp,
            document,
        };
        self.opstamp += 1;
        self.segment_writer.add_document(add_operation)
    }

    /// Consumes the writer and returns the index containing the written segment.
    ///
    /// Finalizes the segment writer, saves an `IndexMeta` listing this segment
    /// as the only one (with an opstamp of 0 and no payload), then syncs the
    /// directory.
    pub fn finalize(self) -> crate::Result<Index> {
        let Self {
            segment_writer,
            segment,
            ..
        } = self;
        // `max_doc` has to be read before `finalize` consumes the segment writer.
        let max_doc = segment_writer.max_doc();
        segment_writer.finalize()?;
        let segment = segment.with_max_doc(max_doc);
        let index = segment.index();
        let directory = index.directory();
        save_metas(
            &IndexMeta {
                index_settings: index.settings().clone(),
                segments: vec![segment.meta().clone()],
                schema: index.schema(),
                opstamp: 0,
                payload: None,
            },
            directory,
        )?;
        directory.sync_directory()?;
        Ok(index.clone())
    }
}

View File

@@ -978,37 +978,20 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use std::collections::HashMap;
use std::path::Path;
use fastfield_codecs::Column;
use test::{self, Bencher};
use super::tests::{generate_permutation, FIELD, SCHEMA};
use super::tests::generate_permutation;
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::tests::generate_permutation_gcd;
#[bench]
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
for _ in 0..n {
a = permutation[a as usize];
}
a
@@ -1016,102 +999,83 @@ mod bench {
}
#[bench]
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let path = Path::new("test");
fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = column.get_val(a as u64);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|val| val * 7) {
a ^= fast_field_reader.get_val(i as u64);
}
a
});
}
a
});
}
#[bench]
fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
let permutation = generate_permutation();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
let n = permutation.len();
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += permutation[i as usize];
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let mut a = 0u32;
for i in 0u64..permutation.len() as u64 {
a = fast_field_reader.get_val(i) as u32;
}
a
});
}
a
});
}
#[bench]
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
let path = Path::new("test");
fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += column.get_val(i as u64);
}
a
});
}
/// Benchmarks reading every value of a fast field column, in order, through `get_val`.
///
/// The column is built once, outside of the measured closure, from the output
/// of `generate_permutation()`.
#[bench]
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
// The length is taken before `permutation` is moved into the column.
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0u64..n as u64 {
a += column.get_val(i);
}
// Returning the sum hands the result back to the bencher.
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
let permutation = generate_permutation_gcd();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0..n as u64 {
a += column.get_val(i);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
a
});
}
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get_val(i as u64) as u32;
}
a
});
}
/// Benchmarks reading every value of the output of `generate_permutation()`,
/// in order, by direct indexing.
#[bench]
fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let mut a = 0u64;
// Indexed access on purpose: the lookup itself is what gets measured.
for i in 0..permutation.len() {
a += permutation[i as usize] as u64;
}
// Returning the sum hands the result back to the bencher.
a
});
}
}

View File

@@ -64,8 +64,8 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<C: FastFieldCodec, D: Column>(
fastfield_accessor: &D,
fn codec_estimation<C: FastFieldCodec>(
fastfield_accessor: &impl Column,
estimations: &mut Vec<(f32, FastFieldCodecType)>,
) {
if let Some(ratio) = C::estimate(fastfield_accessor) {
@@ -202,13 +202,13 @@ impl CompositeFastFieldSerializer {
let mut estimations = vec![];
if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
codec_estimation::<BitpackedCodec>(&fastfield_accessor, &mut estimations);
}
if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
codec_estimation::<LinearCodec>(&fastfield_accessor, &mut estimations);
}
if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
codec_estimation::<BlockwiseLinearCodec>(&fastfield_accessor, &mut estimations);
}
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{

View File

@@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED};
pub use crate::core::{
Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
SegmentReader,
SegmentReader, SingleSegmentIndexWriter,
};
pub use crate::directory::Directory;
pub use crate::indexer::demuxer::*;

View File

@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `MemoryArena`.
pub(crate) trait PostingsWriter {
pub(crate) trait PostingsWriter: Send + Sync {
/// Record that a document contains a term at a given position.
///
/// * doc - the document id

View File

@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
/// * the document id
/// * the term frequency
/// * the term positions
pub(crate) trait Recorder: Copy + Default + 'static {
pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document