improve range query performance (#1864)

fix RowId vs DocId naming
fixes #1863
This commit is contained in:
PSeitz
2023-02-14 12:25:39 +08:00
committed by GitHub
parent 539ff08a79
commit 1cfb9ce59a
17 changed files with 138 additions and 68 deletions

View File

@@ -58,7 +58,7 @@ fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
column.get_row_ids_for_value_range(
*FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
0..data.len() as u32,
&mut positions,
@@ -74,7 +74,7 @@ fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
column.get_row_ids_for_value_range(
*SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128,
0..data.len() as u32,
&mut positions,
@@ -90,7 +90,7 @@ fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
column.get_row_ids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
positions
});
}

View File

@@ -89,7 +89,7 @@ fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
column.get_row_ids_for_value_range(
FIFTY_PERCENT_RANGE,
0..data.len() as u32,
&mut positions,
@@ -106,7 +106,7 @@ fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) {
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
column.get_row_ids_for_value_range(
ONE_PERCENT_ITEM_RANGE,
0..data.len() as u32,
&mut positions,
@@ -123,7 +123,7 @@ fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(SINGLE_ITEM_RANGE, 0..data.len() as u32, &mut positions);
column.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..data.len() as u32, &mut positions);
positions
});
}
@@ -136,7 +136,7 @@ fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
column.get_row_ids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
positions
});
}

View File

@@ -32,7 +32,7 @@ impl BytesColumn {
/// Returns the number of rows in the column.
pub fn num_rows(&self) -> RowId {
self.term_ord_column.num_rows()
self.term_ord_column.num_docs()
}
pub fn term_ords(&self, row_id: RowId) -> impl Iterator<Item = u64> + '_ {

View File

@@ -3,7 +3,7 @@ mod serialize;
use std::fmt::Debug;
use std::io::Write;
use std::ops::Deref;
use std::ops::{Deref, Range, RangeInclusive};
use std::sync::Arc;
use common::BinarySerializable;
@@ -42,14 +42,14 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
self.idx.get_cardinality()
}
pub fn num_rows(&self) -> RowId {
pub fn num_docs(&self) -> RowId {
match &self.idx {
ColumnIndex::Full => self.values.num_vals() as u32,
ColumnIndex::Optional(optional_index) => optional_index.num_rows(),
ColumnIndex::Optional(optional_index) => optional_index.num_docs(),
ColumnIndex::Multivalued(col_index) => {
// The multivalued index contains all value start row_id,
// and one extra value at the end with the overall number of rows.
col_index.num_rows()
col_index.num_docs()
}
}
}
@@ -71,6 +71,25 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
}
/// Get the docids of values which are in the provided value range.
#[inline]
pub fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
selected_docid_range: Range<u32>,
docids: &mut Vec<u32>,
) {
// convert passed docid range to row id range
let rowid_range = self.idx.docid_range_to_rowids(selected_docid_range.clone());
// Load rows
self.values
.get_row_ids_for_value_range(value_range.clone(), rowid_range, docids);
// Convert rows to docids
self.idx
.select_batch_in_place(docids, selected_docid_range.start);
}
/// Fils the output vector with the (possibly multiple values that are associated_with
/// `row_id`.
///
@@ -132,8 +151,8 @@ impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T>
fn num_vals(&self) -> u32 {
match &self.column.idx {
ColumnIndex::Full => self.column.values.num_vals(),
ColumnIndex::Optional(optional_idx) => optional_idx.num_rows(),
ColumnIndex::Multivalued(multivalue_idx) => multivalue_idx.num_rows(),
ColumnIndex::Optional(optional_idx) => optional_idx.num_docs(),
ColumnIndex::Multivalued(multivalue_idx) => multivalue_idx.num_docs(),
}
}
}

View File

@@ -10,7 +10,7 @@ pub use optional_index::{OptionalIndex, Set};
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::{Cardinality, RowId};
use crate::{Cardinality, DocId, RowId};
#[derive(Clone)]
pub enum ColumnIndex {
@@ -43,31 +43,51 @@ impl ColumnIndex {
}
/// Returns true if and only if there are at least one value associated to the row.
pub fn has_value(&self, row_id: RowId) -> bool {
pub fn has_value(&self, doc_id: DocId) -> bool {
match self {
ColumnIndex::Full => true,
ColumnIndex::Optional(optional_index) => optional_index.contains(row_id),
ColumnIndex::Optional(optional_index) => optional_index.contains(doc_id),
ColumnIndex::Multivalued(multivalued_index) => {
multivalued_index.range(row_id).len() > 0
multivalued_index.range(doc_id).len() > 0
}
}
}
pub fn value_row_ids(&self, row_id: RowId) -> Range<RowId> {
pub fn value_row_ids(&self, doc_id: DocId) -> Range<RowId> {
match self {
ColumnIndex::Full => row_id..row_id + 1,
ColumnIndex::Full => doc_id..doc_id + 1,
ColumnIndex::Optional(optional_index) => {
if let Some(val) = optional_index.rank_if_exists(row_id) {
if let Some(val) = optional_index.rank_if_exists(doc_id) {
val..val + 1
} else {
0..0
}
}
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.range(row_id),
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.range(doc_id),
}
}
pub fn select_batch_in_place(&self, rank_ids: &mut Vec<RowId>) {
pub fn docid_range_to_rowids(&self, doc_id: Range<DocId>) -> Range<RowId> {
match self {
ColumnIndex::Full => doc_id,
ColumnIndex::Optional(optional_index) => {
let row_start = optional_index.rank(doc_id.start);
let row_end = optional_index.rank(doc_id.end);
row_start..row_end
}
ColumnIndex::Multivalued(multivalued_index) => {
let end_docid = doc_id.end.min(multivalued_index.num_docs() - 1) + 1;
let start_docid = doc_id.start.min(end_docid);
let row_start = multivalued_index.start_index_column.get_val(start_docid);
let row_end = multivalued_index.start_index_column.get_val(end_docid);
row_start..row_end
}
}
}
pub fn select_batch_in_place(&self, rank_ids: &mut Vec<RowId>, doc_id_start: DocId) {
match self {
ColumnIndex::Full => {
// No need to do anything:
@@ -77,8 +97,7 @@ impl ColumnIndex {
optional_index.select_batch(&mut rank_ids[..]);
}
ColumnIndex::Multivalued(multivalued_index) => {
// TODO important: avoid using 0u32, and restart from the beginning all of the time.
multivalued_index.select_batch_in_place(0u32, rank_ids)
multivalued_index.select_batch_in_place(doc_id_start, rank_ids)
}
}
}

View File

@@ -8,7 +8,7 @@ use common::OwnedBytes;
use crate::column_values::u64_based::CodecType;
use crate::column_values::ColumnValues;
use crate::iterable::Iterable;
use crate::RowId;
use crate::{DocId, RowId};
pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>,
@@ -52,20 +52,20 @@ impl MultiValueIndex {
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, row_id: RowId) -> Range<RowId> {
let start = self.start_index_column.get_val(row_id);
let end = self.start_index_column.get_val(row_id + 1);
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
let start = self.start_index_column.get_val(doc_id);
let end = self.start_index_column.get_val(doc_id + 1);
start..end
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_rows(&self) -> u32 {
pub fn num_docs(&self) -> u32 {
self.start_index_column.num_vals() - 1
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// row_ids. Positions are converted inplace to docids.
/// docids. Positions are converted inplace to docids.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
/// index.
@@ -76,14 +76,14 @@ impl MultiValueIndex {
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
/// match a docid to its value position.
#[allow(clippy::bool_to_int_with_if)]
pub(crate) fn select_batch_in_place(&self, row_start: RowId, ranks: &mut Vec<u32>) {
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
if ranks.is_empty() {
return;
}
let mut cur_doc = row_start;
let mut cur_doc = docid_start;
let mut last_doc = None;
assert!(self.start_index_column.get_val(row_start) as u32 <= ranks[0]);
assert!(self.start_index_column.get_val(docid_start) as u32 <= ranks[0]);
let mut write_doc_pos = 0;
for i in 0..ranks.len() {
@@ -127,7 +127,7 @@ mod tests {
let offsets: Vec<RowId> = vec![0, 10, 12, 15, 22, 23]; // docid values are [0..10, 10..12, 12..15, etc.]
let column: Arc<dyn ColumnValues<RowId>> = Arc::new(IterColumn::from(offsets.into_iter()));
let index = MultiValueIndex::from(column);
assert_eq!(index.num_rows(), 5);
assert_eq!(index.num_docs(), 5);
let positions = &[10u32, 11, 15, 20, 21, 22];
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);

View File

@@ -11,7 +11,7 @@ use set_block::{
};
use crate::iterable::Iterable;
use crate::{InvalidData, RowId};
use crate::{DocId, InvalidData, RowId};
/// The threshold for for number of elements after which we switch to dense block encoding.
///
@@ -177,11 +177,11 @@ impl Set<RowId> for OptionalIndex {
}
#[inline]
fn rank(&self, row_id: RowId) -> RowId {
fn rank(&self, doc_id: DocId) -> RowId {
let RowAddr {
block_id,
in_block_row_id,
} = row_addr_from_row_id(row_id);
} = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta);
let block_offset_row_id = match block {
@@ -192,11 +192,11 @@ impl Set<RowId> for OptionalIndex {
}
#[inline]
fn rank_if_exists(&self, row_id: RowId) -> Option<RowId> {
fn rank_if_exists(&self, doc_id: DocId) -> Option<RowId> {
let RowAddr {
block_id,
in_block_row_id,
} = row_addr_from_row_id(row_id);
} = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta);
let block_offset_row_id = match block {
@@ -247,7 +247,7 @@ impl OptionalIndex {
open_optional_index(bytes).unwrap()
}
pub fn num_rows(&self) -> RowId {
pub fn num_docs(&self) -> RowId {
self.num_rows
}

View File

@@ -142,7 +142,7 @@ fn test_optional_index_large() {
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
assert_eq!(optional_index.num_rows(), num_rows);
assert_eq!(optional_index.num_docs(), num_rows);
assert!(optional_index.iter_rows().eq(row_ids.iter().copied()));
}
@@ -154,7 +154,7 @@ fn test_optional_index_iter_empty() {
fn test_optional_index_rank_aux(row_ids: &[RowId]) {
let num_rows = row_ids.last().copied().unwrap_or(0u32) + 1;
let null_index = OptionalIndex::for_test(num_rows, row_ids);
assert_eq!(null_index.num_rows(), num_rows);
assert_eq!(null_index.num_docs(), num_rows);
for (row_id, row_val) in row_ids.iter().copied().enumerate() {
assert_eq!(null_index.rank(row_val), row_id as u32);
assert_eq!(null_index.rank_if_exists(row_val), Some(row_id as u32));
@@ -196,7 +196,7 @@ fn test_optional_index_for_tests() {
assert!(optional_index.contains(1));
assert!(optional_index.contains(2));
assert!(!optional_index.contains(3));
assert_eq!(optional_index.num_rows(), 4);
assert_eq!(optional_index.num_docs(), 4);
}
#[cfg(all(test, feature = "unstable"))]

View File

@@ -6,6 +6,7 @@ use std::sync::Arc;
use tantivy_bitpacker::minmax;
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
use crate::RowId;
/// `ColumnValues` provides access to a dense field column.
///
@@ -35,21 +36,21 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
}
}
/// Get the positions of values which are in the provided value range.
/// Get the row ids of values which are in the provided value range.
///
/// Note that position == docid for single value fast fields
#[inline(always)]
fn get_docids_for_value_range(
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
row_id_range: Range<RowId>,
row_id_hits: &mut Vec<RowId>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
for idx in row_id_range.start..row_id_range.end {
let val = self.get_val(idx);
if value_range.contains(&val) {
positions.push(idx);
row_id_hits.push(idx);
}
}
}
@@ -257,13 +258,13 @@ where
)
}
fn get_docids_for_value_range(
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_docids_for_value_range(
self.from_column.get_row_ids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,

View File

@@ -313,7 +313,7 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
}
#[inline]
fn get_docids_for_value_range(
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u128>,
positions_range: Range<u32>,
@@ -709,7 +709,7 @@ mod tests {
doc_id_range: Range<u32>,
) -> Vec<u32> {
let mut positions = Vec::new();
column.get_docids_for_value_range(value_range, doc_id_range, &mut positions);
column.get_row_ids_for_value_range(value_range, doc_id_range, &mut positions);
positions
}

View File

@@ -60,7 +60,7 @@ pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
.map(|(pos, _)| pos as u32)
.collect();
let mut positions = Vec::new();
reader.get_docids_for_value_range(
reader.get_row_ids_for_value_range(
vals[test_rand_idx]..=vals[test_rand_idx],
0..vals.len() as u32,
&mut positions,

View File

@@ -32,6 +32,7 @@ pub use value::{NumericalType, NumericalValue};
pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
pub type RowId = u32;
pub type DocId = u32;
#[derive(Clone, Copy)]
pub struct RowAddr {

View File

@@ -75,7 +75,7 @@ fn test_dataframe_writer_u64_multivalued() {
divisor_col.get_cardinality(),
crate::Cardinality::Multivalued
);
assert_eq!(divisor_col.num_rows(), 7);
assert_eq!(divisor_col.num_docs(), 7);
}
#[test]

View File

@@ -925,7 +925,7 @@ mod tests {
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_docids_for_value_range(range, 0..u32::MAX, &mut vec);
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
};
test_range(50..=50);
@@ -953,7 +953,7 @@ mod tests {
let searcher = index.reader().unwrap().searcher();
let fastfields = searcher.segment_reader(0u32).fast_fields();
let column: Column<Ipv6Addr> = fastfields.column_opt("ip").unwrap().unwrap();
assert_eq!(column.num_rows(), 3);
assert_eq!(column.num_docs(), 3);
assert_eq!(column.first(0), None);
assert_eq!(column.first(1), Some(ip_addr));
assert_eq!(column.first(2), None);
@@ -995,7 +995,7 @@ mod tests {
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_docids_for_value_range(range, 0..u32::MAX, &mut vec);
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
};
let test_range_variant = |start, stop| {

View File

@@ -1870,7 +1870,7 @@ mod tests {
.column_opt::<Ipv6Addr>("ips")
.unwrap()
.unwrap();
ff_reader.num_rows() as usize
ff_reader.num_docs() as usize
})
.sum();
assert_eq!(num_docs, num_docs_expected);
@@ -2089,11 +2089,12 @@ mod tests {
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
// let query = gen_query_inclusive("ip", ip, ip);
// assert_eq!(do_search_ip_field(&query), count);
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
}
@@ -2111,8 +2112,8 @@ mod tests {
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
// let query = gen_query_inclusive("ip", ip, ip);
// assert_eq!(do_search_ip_field(&query), count);
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
@@ -2372,6 +2373,30 @@ mod tests {
test_operation_strategy(&ops[..], false, true).unwrap();
}
#[test]
fn test_range_query_bug_1() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 9 },
AddDoc { id: 0 },
AddDoc { id: 13 },
Commit,
];
test_operation_strategy(&ops[..], false, true).unwrap();
}
#[test]
fn test_range_query_bug_2() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 3 },
AddDoc { id: 6 },
AddDoc { id: 9 },
AddDoc { id: 10 },
];
test_operation_strategy(&ops[..], false, false).unwrap();
}
#[test]
fn test_index_doc_missing_field() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();

View File

@@ -106,7 +106,7 @@ impl<T: MakeZero + Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSe
fn fetch_horizon(&mut self, horizon: u32) -> bool {
let mut finished_to_end = false;
let limit = self.column.values.num_vals();
let limit = self.column.num_docs();
let mut end = self.next_fetch_start + horizon;
if end >= limit {
end = limit;
@@ -115,12 +115,11 @@ impl<T: MakeZero + Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSe
let last_value = self.loaded_docs.last_value();
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
self.column.values.get_docids_for_value_range(
self.column.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
doc_buffer,
);
self.column.idx.select_batch_in_place(doc_buffer);
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();

View File

@@ -154,6 +154,12 @@ pub mod tests {
assert!(test_id_range_for_docs(ops).is_ok());
}
#[test]
fn test_range_regression3() {
let ops = vec![doc_from_id_1(9), doc_from_id_1(0), doc_from_id_1(13)];
assert!(test_id_range_for_docs(ops).is_ok());
}
#[test]
fn test_range_regression_simplified() {
let mut schema_builder = SchemaBuilder::new();