mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
* use optional index in multivalued index
For mostly empty multivalued indices there was a large overhead during
creation when iterating all docids. This is alleviated by placing an
optional index in the multivalued index to mark documents that have values.
There is some performance overhead when accessing values in a multivalued
index: the access cost is now an optional index lookup plus a multivalue index
lookup. The sparse codec performs relatively poorly with binary_search when
accessing data. This is reflected in the benchmarks below.
This changes the columnar format to v2, but code is added to still handle the
v1 format.
```
Running benches/bench_access.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_access-ea323c028db88db4)
multi sparse 1/13
access_values_for_doc Avg: 42.8946ms (+241.80%) Median: 42.8869ms (+244.10%) [42.7484ms .. 43.1074ms]
access_first_vals Avg: 42.8022ms (+421.93%) Median: 42.7553ms (+439.84%) [42.6794ms .. 43.7404ms]
multi 2x
access_values_for_doc Avg: 31.1244ms (+24.17%) Median: 30.8339ms (+23.46%) [30.7192ms .. 33.6059ms]
access_first_vals Avg: 24.3070ms (+70.92%) Median: 24.0966ms (+70.18%) [23.9328ms .. 26.4851ms]
sparse 1/13
access_values_for_doc Avg: 42.2490ms (+0.61%) Median: 42.2346ms (+2.28%) [41.8988ms .. 43.7821ms]
access_first_vals Avg: 43.6272ms (+0.23%) Median: 43.6197ms (+1.78%) [43.4920ms .. 43.9009ms]
dense 1/12
access_values_for_doc Avg: 8.6184ms (+23.18%) Median: 8.6126ms (+23.78%) [8.5843ms .. 8.7527ms]
access_first_vals Avg: 6.8112ms (+4.47%) Median: 6.8002ms (+4.55%) [6.7887ms .. 6.8991ms]
full
access_values_for_doc Avg: 9.4073ms (-5.09%) Median: 9.4023ms (-2.23%) [9.3694ms .. 9.4568ms]
access_first_vals Avg: 4.9531ms (+6.24%) Median: 4.9502ms (+7.85%) [4.9423ms .. 4.9718ms]
```
```
Running benches/bench_merge.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_merge-475697dfceb3639f)
merge_multi 2x_and_multi 2x Avg: 20.2280ms (+34.33%) Median: 20.1829ms (+35.33%) [19.9933ms .. 20.8806ms]
merge_multi sparse 1/13_and_multi sparse 1/13 Avg: 0.8961ms (-78.04%) Median: 0.8943ms (-77.61%) [0.8899ms .. 0.9272ms]
merge_dense 1/12_and_dense 1/12 Avg: 0.6619ms (-1.26%) Median: 0.6616ms (+2.20%) [0.6473ms .. 0.6837ms]
merge_sparse 1/13_and_sparse 1/13 Avg: 0.5508ms (-0.85%) Median: 0.5508ms (+2.80%) [0.5420ms .. 0.5634ms]
merge_sparse 1/13_and_dense 1/12 Avg: 0.6046ms (-4.64%) Median: 0.6038ms (+2.80%) [0.5939ms .. 0.6296ms]
merge_multi sparse 1/13_and_dense 1/12 Avg: 0.9111ms (-83.48%) Median: 0.9063ms (-83.50%) [0.9047ms .. 0.9663ms]
merge_multi sparse 1/13_and_sparse 1/13 Avg: 0.8451ms (-89.49%) Median: 0.8428ms (-89.43%) [0.8411ms .. 0.8563ms]
merge_multi 2x_and_dense 1/12 Avg: 10.6624ms (-4.82%) Median: 10.6568ms (-4.49%) [10.5738ms .. 10.8353ms]
merge_multi 2x_and_sparse 1/13 Avg: 10.6336ms (-22.95%) Median: 10.5925ms (-22.33%) [10.5149ms .. 11.5657ms]
```
* Update columnar/src/columnar/format_version.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
* Update columnar/src/column_index/mod.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
---------
Co-authored-by: Paul Masurel <paul@quickwit.io>
68 lines
1.7 KiB
Rust
68 lines
1.7 KiB
Rust
use binggan::{black_box, InputGroup};
|
|
use common::*;
|
|
use tantivy_columnar::Column;
|
|
|
|
pub mod common;
|
|
|
|
const NUM_DOCS: u32 = 2_000_000;
|
|
|
|
pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
|
|
let reader = generate_columnar_with_name(card, num_docs, "price");
|
|
reader.read_columns("price").unwrap()[0]
|
|
.open_u64_lenient()
|
|
.unwrap()
|
|
.unwrap()
|
|
}
|
|
|
|
fn main() {
|
|
let mut inputs = Vec::new();
|
|
|
|
let mut add_card = |card1: Card| {
|
|
inputs.push((
|
|
format!("{card1}"),
|
|
generate_columnar_and_open(card1, NUM_DOCS),
|
|
));
|
|
};
|
|
|
|
add_card(Card::MultiSparse);
|
|
add_card(Card::Multi);
|
|
add_card(Card::Sparse);
|
|
add_card(Card::Dense);
|
|
add_card(Card::Full);
|
|
|
|
bench_group(InputGroup::new_with_inputs(inputs));
|
|
}
|
|
|
|
fn bench_group(mut runner: InputGroup<Column>) {
|
|
runner.register("access_values_for_doc", |column| {
|
|
let mut sum = 0;
|
|
for i in 0..NUM_DOCS {
|
|
for value in column.values_for_doc(i) {
|
|
sum += value;
|
|
}
|
|
}
|
|
black_box(sum);
|
|
});
|
|
runner.register("access_first_vals", |column| {
|
|
let mut sum = 0;
|
|
const BLOCK_SIZE: usize = 32;
|
|
let mut docs = vec![0; BLOCK_SIZE];
|
|
let mut buffer = vec![None; BLOCK_SIZE];
|
|
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
|
|
// fill docs
|
|
for idx in 0..BLOCK_SIZE {
|
|
docs[idx] = idx as u32 + i;
|
|
}
|
|
|
|
column.first_vals(&docs, &mut buffer);
|
|
for val in buffer.iter() {
|
|
let Some(val) = val else { continue };
|
|
sum += *val;
|
|
}
|
|
}
|
|
|
|
black_box(sum);
|
|
});
|
|
runner.run();
|
|
}
|