mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-09 02:22:54 +00:00
* use optional index in multivalued index
For mostly empty multivalued indices there was a large overhead during
creation when iterating all docids. This is alleviated by placing an
optional index in the multivalued index to mark documents that have values.
There's some performance overhead when accessing values in a multivalued
index. The accessing cost is now optional index + multivalue index. The
sparse codec performs relatively bad with the binary_search when accessing
data. This is reflected in the benchmarks below.
This changes the format of columnar to v2, but code is added to handle the v1
formats.
```
Running benches/bench_access.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_access-ea323c028db88db4)
multi sparse 1/13
access_values_for_doc Avg: 42.8946ms (+241.80%) Median: 42.8869ms (+244.10%) [42.7484ms .. 43.1074ms]
access_first_vals Avg: 42.8022ms (+421.93%) Median: 42.7553ms (+439.84%) [42.6794ms .. 43.7404ms]
multi 2x
access_values_for_doc Avg: 31.1244ms (+24.17%) Median: 30.8339ms (+23.46%) [30.7192ms .. 33.6059ms]
access_first_vals Avg: 24.3070ms (+70.92%) Median: 24.0966ms (+70.18%) [23.9328ms .. 26.4851ms]
sparse 1/13
access_values_for_doc Avg: 42.2490ms (+0.61%) Median: 42.2346ms (+2.28%) [41.8988ms .. 43.7821ms]
access_first_vals Avg: 43.6272ms (+0.23%) Median: 43.6197ms (+1.78%) [43.4920ms .. 43.9009ms]
dense 1/12
access_values_for_doc Avg: 8.6184ms (+23.18%) Median: 8.6126ms (+23.78%) [8.5843ms .. 8.7527ms]
access_first_vals Avg: 6.8112ms (+4.47%) Median: 6.8002ms (+4.55%) [6.7887ms .. 6.8991ms]
full
access_values_for_doc Avg: 9.4073ms (-5.09%) Median: 9.4023ms (-2.23%) [9.3694ms .. 9.4568ms]
access_first_vals Avg: 4.9531ms (+6.24%) Median: 4.9502ms (+7.85%) [4.9423ms .. 4.9718ms]
```
```
Running benches/bench_merge.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_merge-475697dfceb3639f)
merge_multi 2x_and_multi 2x Avg: 20.2280ms (+34.33%) Median: 20.1829ms (+35.33%) [19.9933ms .. 20.8806ms]
merge_multi sparse 1/13_and_multi sparse 1/13 Avg: 0.8961ms (-78.04%) Median: 0.8943ms (-77.61%) [0.8899ms .. 0.9272ms]
merge_dense 1/12_and_dense 1/12 Avg: 0.6619ms (-1.26%) Median: 0.6616ms (+2.20%) [0.6473ms .. 0.6837ms]
merge_sparse 1/13_and_sparse 1/13 Avg: 0.5508ms (-0.85%) Median: 0.5508ms (+2.80%) [0.5420ms .. 0.5634ms]
merge_sparse 1/13_and_dense 1/12 Avg: 0.6046ms (-4.64%) Median: 0.6038ms (+2.80%) [0.5939ms .. 0.6296ms]
merge_multi sparse 1/13_and_dense 1/12 Avg: 0.9111ms (-83.48%) Median: 0.9063ms (-83.50%) [0.9047ms .. 0.9663ms]
merge_multi sparse 1/13_and_sparse 1/13 Avg: 0.8451ms (-89.49%) Median: 0.8428ms (-89.43%) [0.8411ms .. 0.8563ms]
merge_multi 2x_and_dense 1/12 Avg: 10.6624ms (-4.82%) Median: 10.6568ms (-4.49%) [10.5738ms .. 10.8353ms]
merge_multi 2x_and_sparse 1/13 Avg: 10.6336ms (-22.95%) Median: 10.5925ms (-22.33%) [10.5149ms .. 11.5657ms]
```
* Update columnar/src/columnar/format_version.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
* Update columnar/src/column_index/mod.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
---------
Co-authored-by: Paul Masurel <paul@quickwit.io>
184 lines
5.6 KiB
Rust
184 lines
5.6 KiB
Rust
use std::path::PathBuf;
|
|
|
|
use itertools::Itertools;
|
|
|
|
use crate::{
|
|
merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
|
|
CURRENT_VERSION,
|
|
};
|
|
|
|
const NUM_DOCS: u32 = u16::MAX as u32;
|
|
|
|
fn generate_columnar(num_docs: u32, value_offset: u64) -> Vec<u8> {
|
|
use crate::ColumnarWriter;
|
|
|
|
let mut columnar_writer = ColumnarWriter::default();
|
|
|
|
for i in 0..num_docs {
|
|
if i % 100 == 0 {
|
|
columnar_writer.record_numerical(i, "sparse", value_offset + i as u64);
|
|
}
|
|
if i % 5 == 0 {
|
|
columnar_writer.record_numerical(i, "dense", value_offset + i as u64);
|
|
}
|
|
columnar_writer.record_numerical(i, "full", value_offset + i as u64);
|
|
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
|
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
|
}
|
|
|
|
let mut wrt: Vec<u8> = Vec::new();
|
|
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
|
|
|
wrt
|
|
}
|
|
|
|
#[test]
|
|
/// Writes a columnar for the CURRENT_VERSION to disk.
|
|
fn create_format() {
|
|
let version = CURRENT_VERSION.to_string();
|
|
let file_path = path_for_version(&version);
|
|
if PathBuf::from(file_path.clone()).exists() {
|
|
return;
|
|
}
|
|
let columnar = generate_columnar(NUM_DOCS, 0);
|
|
std::fs::write(file_path, columnar).unwrap();
|
|
}
|
|
|
|
fn path_for_version(version: &str) -> String {
|
|
format!("./compat_tests_data/{}.columnar", version)
|
|
}
|
|
|
|
#[test]
|
|
fn test_format_v1() {
|
|
let path = path_for_version("v1");
|
|
test_format(&path);
|
|
}
|
|
|
|
#[test]
|
|
fn test_format_v2() {
|
|
let path = path_for_version("v2");
|
|
test_format(&path);
|
|
}
|
|
|
|
fn test_format(path: &str) {
|
|
let file_content = std::fs::read(path).unwrap();
|
|
let reader = ColumnarReader::open(file_content).unwrap();
|
|
|
|
check_columns(&reader);
|
|
|
|
// Test merge
|
|
let reader2 = ColumnarReader::open(generate_columnar(NUM_DOCS, NUM_DOCS as u64)).unwrap();
|
|
let columnar_readers = vec![&reader, &reader2];
|
|
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
|
let mut out = Vec::new();
|
|
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
|
let reader = ColumnarReader::open(out).unwrap();
|
|
check_columns(&reader);
|
|
}
|
|
|
|
fn check_columns(reader: &ColumnarReader) {
|
|
let column = open_column(reader, "full");
|
|
check_column(&column, |doc_id| vec![(doc_id, doc_id as u64).into()]);
|
|
assert_eq!(column.get_cardinality(), Cardinality::Full);
|
|
|
|
let column = open_column(reader, "multi");
|
|
check_column(&column, |doc_id| {
|
|
vec![
|
|
(doc_id * 2, doc_id as u64).into(),
|
|
(doc_id * 2 + 1, doc_id as u64).into(),
|
|
]
|
|
});
|
|
assert_eq!(column.get_cardinality(), Cardinality::Multivalued);
|
|
|
|
let column = open_column(reader, "sparse");
|
|
check_column(&column, |doc_id| {
|
|
if doc_id % 100 == 0 {
|
|
vec![(doc_id / 100, doc_id as u64).into()]
|
|
} else {
|
|
vec![]
|
|
}
|
|
});
|
|
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
|
|
|
let column = open_column(reader, "dense");
|
|
check_column(&column, |doc_id| {
|
|
if doc_id % 5 == 0 {
|
|
vec![(doc_id / 5, doc_id as u64).into()]
|
|
} else {
|
|
vec![]
|
|
}
|
|
});
|
|
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
|
}
|
|
|
|
struct RowIdAndValue {
|
|
row_id: u32,
|
|
value: u64,
|
|
}
|
|
impl From<(u32, u64)> for RowIdAndValue {
|
|
fn from((row_id, value): (u32, u64)) -> Self {
|
|
Self { row_id, value }
|
|
}
|
|
}
|
|
|
|
fn check_column<F: Fn(u32) -> Vec<RowIdAndValue>>(column: &Column<u64>, expected: F) {
|
|
let num_docs = column.num_docs();
|
|
let test_doc = |doc: u32| {
|
|
if expected(doc).is_empty() {
|
|
assert_eq!(column.first(doc), None);
|
|
} else {
|
|
assert_eq!(column.first(doc), Some(expected(doc)[0].value));
|
|
}
|
|
let values = column.values_for_doc(doc).collect_vec();
|
|
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
|
|
let mut row_ids = Vec::new();
|
|
column.row_ids_for_docs(&[doc], &mut vec![], &mut row_ids);
|
|
assert_eq!(
|
|
row_ids,
|
|
expected(doc).iter().map(|x| x.row_id).collect_vec()
|
|
);
|
|
let values = column.values_for_doc(doc).collect_vec();
|
|
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
|
|
|
|
// Docid rowid conversion
|
|
let mut row_ids = Vec::new();
|
|
let safe_next_doc = |doc: u32| (doc + 1).min(num_docs - 1);
|
|
column
|
|
.index
|
|
.docids_to_rowids(&[doc, safe_next_doc(doc)], &mut vec![], &mut row_ids);
|
|
let expected_rowids = expected(doc)
|
|
.iter()
|
|
.map(|x| x.row_id)
|
|
.chain(expected(safe_next_doc(doc)).iter().map(|x| x.row_id))
|
|
.collect_vec();
|
|
assert_eq!(row_ids, expected_rowids);
|
|
let rowid_range = column
|
|
.index
|
|
.docid_range_to_rowids(doc..safe_next_doc(doc) + 1);
|
|
if expected_rowids.is_empty() {
|
|
assert!(rowid_range.is_empty());
|
|
} else {
|
|
assert_eq!(
|
|
rowid_range,
|
|
expected_rowids[0]..expected_rowids.last().unwrap() + 1
|
|
);
|
|
}
|
|
};
|
|
test_doc(0);
|
|
test_doc(num_docs - 1);
|
|
test_doc(num_docs - 2);
|
|
test_doc(65000);
|
|
}
|
|
|
|
fn open_column(reader: &ColumnarReader, name: &str) -> Column<u64> {
|
|
let column = reader.read_columns(name).unwrap()[0]
|
|
.open()
|
|
.unwrap()
|
|
.coerce_numerical(crate::NumericalType::U64)
|
|
.unwrap();
|
|
let DynamicColumn::U64(column) = column else {
|
|
panic!();
|
|
};
|
|
column
|
|
}
|