mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
* Optimize ExistsQuery for a high number of dynamic columns The previous algorithm checked _each_ doc in _each_ column for existence. This causes huge cost on JSON fields with e.g. 100k columns. Compute a bitset instead if we have more than one column. add `iter_docs` to the multivalued_index * add benchmark subfields=1 exists_json_union Memory: 89.3 KB (+2.01%) Avg: 0.4865ms (-26.03%) Median: 0.4865ms (-26.03%) [0.4865ms .. 0.4865ms] subfields=2 exists_json_union Memory: 68.1 KB Avg: 1.7048ms (-0.46%) Median: 1.7048ms (-0.46%) [1.7048ms .. 1.7048ms] subfields=3 exists_json_union Memory: 61.8 KB Avg: 2.0742ms (-2.22%) Median: 2.0742ms (-2.22%) [2.0742ms .. 2.0742ms] subfields=4 exists_json_union Memory: 119.8 KB (+103.44%) Avg: 3.9500ms (+42.62%) Median: 3.9500ms (+42.62%) [3.9500ms .. 3.9500ms] subfields=5 exists_json_union Memory: 120.4 KB (+107.65%) Avg: 3.9610ms (+20.65%) Median: 3.9610ms (+20.65%) [3.9610ms .. 3.9610ms] subfields=6 exists_json_union Memory: 120.6 KB (+107.49%) Avg: 3.8903ms (+3.11%) Median: 3.8903ms (+3.11%) [3.8903ms .. 3.8903ms] subfields=7 exists_json_union Memory: 120.9 KB (+106.93%) Avg: 3.6220ms (-16.22%) Median: 3.6220ms (-16.22%) [3.6220ms .. 3.6220ms] subfields=8 exists_json_union Memory: 121.3 KB (+106.23%) Avg: 4.0981ms (-15.97%) Median: 4.0981ms (-15.97%) [4.0981ms .. 4.0981ms] subfields=16 exists_json_union Memory: 123.1 KB (+103.09%) Avg: 4.3483ms (-92.26%) Median: 4.3483ms (-92.26%) [4.3483ms .. 4.3483ms] subfields=256 exists_json_union Memory: 204.6 KB (+19.85%) Avg: 3.8874ms (-99.01%) Median: 3.8874ms (-99.01%) [3.8874ms .. 3.8874ms] subfields=4096 exists_json_union Memory: 2.0 MB Avg: 3.5571ms (-99.90%) Median: 3.5571ms (-99.90%) [3.5571ms .. 3.5571ms] subfields=65536 exists_json_union Memory: 28.3 MB Avg: 14.4417ms (-99.97%) Median: 14.4417ms (-99.97%) [14.4417ms .. 14.4417ms] subfields=262144 exists_json_union Memory: 113.3 MB Avg: 66.2860ms (-99.95%) Median: 66.2860ms (-99.95%) [66.2860ms .. 66.2860ms] * rename methods
226 lines
7.7 KiB
Rust
226 lines
7.7 KiB
Rust
use proptest::prelude::*;
|
|
use proptest::{prop_oneof, proptest};
|
|
|
|
use super::*;
|
|
use crate::{ColumnarReader, ColumnarWriter, DynamicColumnHandle};
|
|
|
|
#[test]
fn test_optional_index_bug_2293() {
    // Regression test: `docid_range_to_rowids` must not panic for docid == num_docs.
    // The doc counts straddle `ELEMENTS_PER_BLOCK`: one below, exactly at, and one above.
    for num_docs in [
        ELEMENTS_PER_BLOCK - 1,
        ELEMENTS_PER_BLOCK,
        ELEMENTS_PER_BLOCK + 1,
    ] {
        test_optional_index_with_num_docs(num_docs);
    }
}
|
|
fn test_optional_index_with_num_docs(num_docs: u32) {
|
|
let mut dataframe_writer = ColumnarWriter::default();
|
|
dataframe_writer.record_numerical(100, "score", 80i64);
|
|
let mut buffer: Vec<u8> = Vec::new();
|
|
dataframe_writer.serialize(num_docs, &mut buffer).unwrap();
|
|
let columnar = ColumnarReader::open(buffer).unwrap();
|
|
assert_eq!(columnar.num_columns(), 1);
|
|
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
|
|
assert_eq!(cols.len(), 1);
|
|
|
|
let col = cols[0].open().unwrap();
|
|
col.column_index().docid_range_to_rowids(0..num_docs);
|
|
}
|
|
|
|
#[test]
fn test_dense_block_threshold() {
    // Pins the value of the threshold constant so that a change to it is noticed.
    let expected_threshold = 5_120;
    assert_eq!(super::DENSE_BLOCK_THRESHOLD, expected_threshold);
}
|
|
|
|
fn random_bitvec() -> BoxedStrategy<Vec<bool>> {
|
|
prop_oneof![
|
|
1 => prop::collection::vec(proptest::bool::weighted(1.0), 0..100),
|
|
1 => prop::collection::vec(proptest::bool::weighted(0.00), 0..(ELEMENTS_PER_BLOCK as usize * 3)), // empty blocks
|
|
1 => prop::collection::vec(proptest::bool::weighted(1.00), 0..(ELEMENTS_PER_BLOCK as usize + 10)), // full block
|
|
1 => prop::collection::vec(proptest::bool::weighted(0.01), 0..100),
|
|
1 => prop::collection::vec(proptest::bool::weighted(0.01), 0..u16::MAX as usize),
|
|
8 => vec![any::<bool>()],
|
|
]
|
|
.boxed()
|
|
}
|
|
|
|
proptest! {
    #![proptest_config(ProptestConfig::with_cases(50))]
    // Concatenates three independently generated bit vectors and runs the generic
    // optional-index checks on the combined vector.
    #[test]
    fn test_with_random_bitvecs(bitvec1 in random_bitvec(), bitvec2 in random_bitvec(), bitvec3 in random_bitvec()) {
        let bitvec = [bitvec1, bitvec2, bitvec3].concat();
        test_null_index(&bitvec);
    }
}
|
|
|
|
#[test]
fn test_with_random_sets_simple() {
    // The non-null rows form the contiguous range starting at row 10.
    let vals = 10..ELEMENTS_PER_BLOCK * 2;
    let mut serialized: Vec<u8> = Vec::new();
    serialize_optional_index(&vals, 100, &mut serialized).unwrap();
    let null_index = open_optional_index(OwnedBytes::new(serialized)).unwrap();

    let mut select_cursor = null_index.select_cursor();
    // Because the rows are contiguous from 10, the row at a given rank is `rank + 10`.
    // The range below holds the single rank 65_472.
    for rank in 65_472u32..65_473u32 {
        let expected_row = rank + 10;
        assert_eq!(select_cursor.select(rank), expected_row);
    }
}
|
|
|
|
#[test]
fn test_optional_index_trailing_empty_blocks() {
    // A single row that holds no value.
    let single_null_row = [false];
    test_null_index(&single_null_row);
}
|
|
|
|
#[test]
fn test_optional_index_one_block_false() {
    // `ELEMENTS_PER_BLOCK` rows without a value, followed by one row with a value.
    let mut bits = vec![false; ELEMENTS_PER_BLOCK as usize];
    bits.push(true);
    test_null_index(bits.as_slice());
}
|
|
|
|
#[test]
fn test_optional_index_one_block_true() {
    // `ELEMENTS_PER_BLOCK + 1` rows, every one of them holding a value.
    let bits = vec![true; ELEMENTS_PER_BLOCK as usize + 1];
    test_null_index(bits.as_slice());
}
|
|
|
|
/// Test adapter: exposes a slice of booleans as the ascending sequence of positions
/// whose entry is `true`.
impl<'a> Iterable<RowId> for &'a [bool] {
    fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + 'a> {
        let set_positions = self
            .iter()
            .enumerate()
            .filter_map(|(pos, &is_set)| is_set.then_some(pos as u32));
        Box::new(set_positions)
    }
}
|
|
|
|
fn test_null_index(data: &[bool]) {
|
|
let mut out: Vec<u8> = Vec::new();
|
|
serialize_optional_index(&data, data.len() as RowId, &mut out).unwrap();
|
|
let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
|
|
let orig_idx_with_value: Vec<u32> = data
|
|
.iter()
|
|
.enumerate()
|
|
.filter(|(_pos, val)| **val)
|
|
.map(|(pos, _val)| pos as u32)
|
|
.collect();
|
|
let mut select_iter = null_index.select_cursor();
|
|
for (i, expected) in orig_idx_with_value.iter().enumerate() {
|
|
assert_eq!(select_iter.select(i as u32), *expected);
|
|
}
|
|
|
|
let step_size = (orig_idx_with_value.len() / 100).max(1);
|
|
for (dense_idx, orig_idx) in orig_idx_with_value.iter().enumerate().step_by(step_size) {
|
|
assert_eq!(null_index.rank_if_exists(*orig_idx), Some(dense_idx as u32));
|
|
}
|
|
|
|
// 100 samples
|
|
let step_size = (data.len() / 100).max(1);
|
|
for (pos, value) in data.iter().enumerate().step_by(step_size) {
|
|
assert_eq!(null_index.contains(pos as u32), *value);
|
|
}
|
|
}
|
|
|
|
#[test]
fn test_optional_index_test_translation() {
    let optional_index = OptionalIndex::for_test(4, &[0, 2]);
    let mut cursor = optional_index.select_cursor();
    // (rank, expected row id) pairs, visited in increasing rank order.
    for (rank, expected_row_id) in [(0, 0), (1, 2)] {
        assert_eq!(cursor.select(rank), expected_row_id);
    }
}
|
|
|
|
#[test]
fn test_optional_index_translate() {
    let optional_index = OptionalIndex::for_test(4, &[0, 2]);
    // (row id, expected rank) pairs for the two rows that hold a value.
    for (row_id, expected_rank) in [(0, 0), (2, 1)] {
        assert_eq!(optional_index.rank_if_exists(row_id), Some(expected_rank));
    }
}
|
|
|
|
#[test]
fn test_optional_index_small() {
    let optional_index = OptionalIndex::for_test(4, &[0, 2]);
    // (row id, whether the row is expected to hold a value).
    for (row_id, expected) in [(0, true), (1, false), (2, true), (3, false)] {
        assert_eq!(optional_index.contains(row_id), expected);
    }
}
|
|
|
|
#[test]
fn test_optional_index_large() {
    // The only rows with a value are the two starting at `ELEMENTS_PER_BLOCK`.
    let present_rows = [ELEMENTS_PER_BLOCK, ELEMENTS_PER_BLOCK + 1];
    let optional_index = OptionalIndex::for_test(ELEMENTS_PER_BLOCK + 2, &present_rows);

    // Rows before `ELEMENTS_PER_BLOCK` hold no value.
    for absent_row in [0, 100, ELEMENTS_PER_BLOCK - 1] {
        assert!(!optional_index.contains(absent_row));
    }
    for present_row in present_rows {
        assert!(optional_index.contains(present_row));
    }
}
|
|
|
|
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
|
|
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
|
|
assert_eq!(optional_index.num_docs(), num_rows);
|
|
assert!(
|
|
optional_index
|
|
.iter_non_null_docs()
|
|
.eq(row_ids.iter().copied())
|
|
);
|
|
}
|
|
|
|
#[test]
fn test_optional_index_iter_empty() {
    // No rows at all: zero docs and no non-null row ids.
    let no_row_ids: [RowId; 0] = [];
    test_optional_index_iter_aux(&no_row_ids, 0u32);
}
|
|
|
|
fn test_optional_index_rank_aux(row_ids: &[RowId]) {
|
|
let num_rows = row_ids.last().copied().unwrap_or(0u32) + 1;
|
|
let null_index = OptionalIndex::for_test(num_rows, row_ids);
|
|
assert_eq!(null_index.num_docs(), num_rows);
|
|
for (row_id, row_val) in row_ids.iter().copied().enumerate() {
|
|
assert_eq!(null_index.rank(row_val), row_id as u32);
|
|
assert_eq!(null_index.rank_if_exists(row_val), Some(row_id as u32));
|
|
if row_val > 0 && !null_index.contains(&row_val - 1) {
|
|
assert_eq!(null_index.rank(row_val - 1), row_id as u32);
|
|
}
|
|
assert_eq!(null_index.rank(row_val + 1), row_id as u32 + 1);
|
|
}
|
|
}
|
|
|
|
#[test]
fn test_optional_index_rank() {
    test_optional_index_rank_aux(&[1u32]);
    test_optional_index_rank_aux(&[0u32, 1u32]);

    // Row 3, followed by `ELEMENTS_PER_BLOCK` consecutive rows starting at
    // `ELEMENTS_PER_BLOCK + 1`.
    let mut row_ids = vec![3u32];
    row_ids.extend((0..ELEMENTS_PER_BLOCK).map(|offset| offset + ELEMENTS_PER_BLOCK + 1));
    test_optional_index_rank_aux(&row_ids);
}
|
|
|
|
#[test]
fn test_optional_index_iter_empty_one() {
    // (the single non-null row id, total number of rows).
    for (row_id, num_rows) in [(1, 2u32), (100_000, 200_000u32)] {
        test_optional_index_iter_aux(&[row_id], num_rows);
    }
}
|
|
|
|
#[test]
fn test_optional_index_iter_dense_block() {
    // Row 3, followed by `ELEMENTS_PER_BLOCK` consecutive rows starting at
    // `ELEMENTS_PER_BLOCK + 1`, inside an index of `3 * ELEMENTS_PER_BLOCK` rows.
    let mut row_ids = vec![3u32];
    row_ids.extend((0..ELEMENTS_PER_BLOCK).map(|offset| offset + ELEMENTS_PER_BLOCK + 1));
    test_optional_index_iter_aux(&row_ids, 3 * ELEMENTS_PER_BLOCK);
}
|
|
|
|
#[test]
fn test_optional_index_for_tests() {
    let optional_index = OptionalIndex::for_test(4, &[1, 2]);
    // (row id, whether the row is expected to hold a value).
    for (row_id, expected) in [(0, false), (1, true), (2, true), (3, false)] {
        assert_eq!(optional_index.contains(row_id), expected);
    }
    assert_eq!(optional_index.num_docs(), 4);
}
|