WIP: Add ValueRange.

This commit is contained in:
Stu Hood
2025-12-25 14:53:15 -07:00
parent 53c067d1f3
commit f4252fc184
11 changed files with 71 additions and 35 deletions

View File

@@ -143,7 +143,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
#[inline]
pub fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
value_range: ValueRange<T>,
selected_docid_range: Range<u32>,
doc_ids: &mut Vec<u32>,
) {
@@ -168,6 +168,18 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
}
}
/// A range of values.
///
/// This type is intended to be used in batch APIs, where the cost of unpacking the enum
/// is outweighed by the time spent processing a batch.
///
/// Implementers should pattern match on the variants to use optimized loops for each case.
#[derive(Clone, Debug)]
pub enum ValueRange<T> {
/// A range that includes both start and end.
Inclusive(RangeInclusive<T>),
}
impl BinarySerializable for Cardinality {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
self.to_code().serialize(writer)

View File

@@ -339,7 +339,7 @@ mod tests {
use std::ops::Range;
use super::MultiValueIndex;
use crate::{ColumnarReader, DynamicColumn};
use crate::{ColumnarReader, DynamicColumn, ValueRange};
fn index_to_pos_helper(
index: &MultiValueIndex,
@@ -419,7 +419,7 @@ mod tests {
assert_eq!(row_id_range, 0..4);
let check = |range, expected| {
let full_range = 0..=u64::MAX;
let full_range = ValueRange::Inclusive(0..=u64::MAX);
let mut docids = Vec::new();
column.get_docids_for_value_range(full_range, range, &mut docids);
assert_eq!(docids, expected);

View File

@@ -7,13 +7,15 @@
//! - Monotonically map values to u64/u128
use std::fmt::Debug;
use std::ops::{Range, RangeInclusive};
use std::ops::Range;
use std::sync::Arc;
use downcast_rs::DowncastSync;
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
use crate::column::ValueRange;
mod merge;
pub(crate) mod monotonic_mapping;
pub(crate) mod monotonic_mapping_u128;
@@ -128,15 +130,19 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
/// Note that position == docid for single value fast fields
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<T>,
value_range: ValueRange<T>,
row_id_range: Range<RowId>,
row_id_hits: &mut Vec<RowId>,
) {
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
for idx in row_id_range {
let val = self.get_val(idx);
if value_range.contains(&val) {
row_id_hits.push(idx);
match value_range {
ValueRange::Inclusive(range) => {
for idx in row_id_range {
let val = self.get_val(idx);
if range.contains(&val) {
row_id_hits.push(idx);
}
}
}
}
}
@@ -233,7 +239,7 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
#[inline(always)]
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<T>,
range: ValueRange<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {

View File

@@ -1,8 +1,9 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
use std::ops::Range;
use crate::ColumnValues;
use crate::column::ValueRange;
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
struct MonotonicMappingColumn<C, T, Input> {
@@ -80,13 +81,16 @@ where
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<Output>,
range: ValueRange<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let ValueRange::Inclusive(range) = range;
self.from_column.get_row_ids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
ValueRange::Inclusive(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
),
doc_id_range,
positions,
)

View File

@@ -25,6 +25,7 @@ use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
use tantivy_bitpacker::{BitPacker, BitUnpacker};
use crate::RowId;
use crate::column::ValueRange;
use crate::column_values::ColumnValues;
/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
@@ -338,12 +339,15 @@ impl ColumnValues<u64> for CompactSpaceU64Accessor {
#[inline]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u64>,
value_range: ValueRange<u64>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let value_range = self.0.compact_to_u128(*value_range.start() as u32)
..=self.0.compact_to_u128(*value_range.end() as u32);
let ValueRange::Inclusive(value_range) = value_range;
let value_range = ValueRange::Inclusive(
self.0.compact_to_u128(*value_range.start() as u32)
..=self.0.compact_to_u128(*value_range.end() as u32),
);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
@@ -375,10 +379,11 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
#[inline]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u128>,
value_range: ValueRange<u128>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let ValueRange::Inclusive(value_range) = value_range;
if value_range.start() > value_range.end() {
return;
}
@@ -560,7 +565,7 @@ mod tests {
.collect::<Vec<_>>();
let mut positions = Vec::new();
decompressor.get_row_ids_for_value_range(
range,
ValueRange::Inclusive(range),
0..decompressor.num_vals(),
&mut positions,
);
@@ -604,7 +609,11 @@ mod tests {
let val = *val;
let pos = pos as u32;
let mut positions = Vec::new();
decomp.get_row_ids_for_value_range(val..=val, pos..pos + 1, &mut positions);
decomp.get_row_ids_for_value_range(
ValueRange::Inclusive(val..=val),
pos..pos + 1,
&mut positions,
);
assert_eq!(positions, vec![pos]);
}
@@ -746,7 +755,11 @@ mod tests {
doc_id_range: Range<u32>,
) -> Vec<u32> {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(value_range, doc_id_range, &mut positions);
column.get_row_ids_for_value_range(
ValueRange::Inclusive(value_range),
doc_id_range,
&mut positions,
);
positions
}

View File

@@ -8,6 +8,7 @@ use common::{BinarySerializable, HasLen, OwnedBytes};
use fastdivide::DividerU64;
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};
use crate::column::ValueRange;
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::{ColumnValues, RowId};
@@ -69,8 +70,9 @@ const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
// f(bitpacked) ∈ [min_value, max_value] <=> bitpacked ∈ [min_bitpacked_value, max_bitpacked_value]
fn transform_range_before_linear_transformation(
stats: &ColumnStats,
range: RangeInclusive<u64>,
range: ValueRange<u64>,
) -> Option<RangeInclusive<u64>> {
let ValueRange::Inclusive(range) = range;
if range.is_empty() {
return None;
}
@@ -108,7 +110,7 @@ impl ColumnValues for BitpackedReader {
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<u64>,
range: ValueRange<u64>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {

View File

@@ -132,7 +132,7 @@ pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
.collect();
let mut positions = Vec::new();
reader.get_row_ids_for_value_range(
vals[test_rand_idx]..=vals[test_rand_idx],
crate::column::ValueRange::Inclusive(vals[test_rand_idx]..=vals[test_rand_idx]),
0..vals.len() as u32,
&mut positions,
);

View File

@@ -36,7 +36,7 @@ pub(crate) mod utils;
mod value;
pub use block_accessor::ColumnBlockAccessor;
pub use column::{BytesColumn, Column, StrColumn};
pub use column::{BytesColumn, Column, StrColumn, ValueRange};
pub use column_index::ColumnIndex;
pub use column_values::{
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU64, MonotonicallyMappableToU128,

View File

@@ -79,7 +79,7 @@ mod tests {
use std::ops::{Range, RangeInclusive};
use std::path::Path;
use columnar::StrColumn;
use columnar::{StrColumn, ValueRange};
use common::{ByteCount, DateTimePrecision, HasLen, TerminatingWrite};
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
@@ -944,7 +944,7 @@ mod tests {
let test_range = |range: RangeInclusive<u64>| {
let expected_count = numbers.iter().filter(|num| range.contains(*num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
field.get_row_ids_for_value_range(ValueRange::Inclusive(range), 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expected_count);
};
test_range(50..=50);
@@ -1022,7 +1022,7 @@ mod tests {
let test_range = |range: RangeInclusive<u64>| {
let expected_count = numbers.iter().filter(|num| range.contains(*num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
field.get_row_ids_for_value_range(ValueRange::Inclusive(range), 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expected_count);
};
let test_range_variant = |start, stop| {

View File

@@ -1,7 +1,6 @@
use core::fmt::Debug;
use std::ops::RangeInclusive;
use columnar::Column;
use columnar::{Column, ValueRange};
use crate::{DocId, DocSet, TERMINATED};
@@ -41,7 +40,7 @@ impl VecCursor {
pub(crate) struct RangeDocSet<T> {
/// The range filter on the values.
value_range: RangeInclusive<T>,
value_range: ValueRange<T>,
column: Column<T>,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
@@ -61,7 +60,7 @@ pub(crate) struct RangeDocSet<T> {
const DEFAULT_FETCH_HORIZON: u32 = 128;
impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
pub(crate) fn new(value_range: RangeInclusive<T>, column: Column<T>) -> Self {
pub(crate) fn new(value_range: ValueRange<T>, column: Column<T>) -> Self {
let mut range_docset = Self {
value_range,
column,

View File

@@ -7,7 +7,7 @@ use std::ops::{Bound, RangeInclusive};
use columnar::{
Cardinality, Column, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
NumericalType, StrColumn,
NumericalType, StrColumn, ValueRange,
};
use common::bounds::{BoundsRange, TransformBound};
@@ -154,7 +154,7 @@ impl Weight for FastFieldRangeWeight {
ip_addr_column.min_value(),
ip_addr_column.max_value(),
);
let docset = RangeDocSet::new(value_range, ip_addr_column);
let docset = RangeDocSet::new(ValueRange::Inclusive(value_range), ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
} else if field_type.is_str() {
let Some(str_dict_column): Option<StrColumn> = reader.fast_fields().str(&field_name)?
@@ -426,7 +426,7 @@ fn search_on_u64_ff(
}
}
let docset = RangeDocSet::new(value_range, column);
let docset = RangeDocSet::new(ValueRange::Inclusive(value_range), column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}