WIP: Add ValueRange cases for Comparators.

This commit is contained in:
Stu Hood
2025-12-26 11:02:19 -07:00
parent e8a4adeedd
commit 5ff38e1605
6 changed files with 413 additions and 1 deletions

View File

@@ -118,6 +118,22 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.filter(|val| range.contains(val));
}
}
(ColumnIndex::Optional(optional_index), ValueRange::GreaterThan(threshold, _)) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid))
.filter(|val| *val > threshold);
}
}
(ColumnIndex::Optional(optional_index), ValueRange::LessThan(threshold, _)) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid))
.filter(|val| *val < threshold);
}
}
(ColumnIndex::Multivalued(multivalued_index), ValueRange::All) => {
for (i, docid) in docids.iter().enumerate() {
let range = multivalued_index.range(*docid);
@@ -145,6 +161,41 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
}
}
}
(
ColumnIndex::Multivalued(multivalued_index),
ValueRange::GreaterThan(threshold, _),
) => {
for (i, docid) in docids.iter().enumerate() {
let row_range = multivalued_index.range(*docid);
let is_empty = row_range.start == row_range.end;
if !is_empty {
let val = self.values.get_val(row_range.start);
if val > threshold {
output[i] = Some(val);
} else {
output[i] = None;
}
} else {
output[i] = None;
}
}
}
(ColumnIndex::Multivalued(multivalued_index), ValueRange::LessThan(threshold, _)) => {
for (i, docid) in docids.iter().enumerate() {
let row_range = multivalued_index.range(*docid);
let is_empty = row_range.start == row_range.end;
if !is_empty {
let val = self.values.get_val(row_range.start);
if val < threshold {
output[i] = Some(val);
} else {
output[i] = None;
}
} else {
output[i] = None;
}
}
}
}
}
@@ -214,6 +265,12 @@ pub enum ValueRange<T> {
Inclusive(RangeInclusive<T>),
/// A range that matches all values.
All,
/// A range that matches all values greater than the threshold.
/// The boolean flag indicates if null values should be included.
GreaterThan(T, bool),
/// A range that matches all values less than the threshold.
/// The boolean flag indicates if null values should be included.
LessThan(T, bool),
}
impl BinarySerializable for Cardinality {

View File

@@ -144,6 +144,50 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
*out = if range.contains(&v) { Some(v) } else { None };
}
}
ValueRange::GreaterThan(threshold, _) => {
let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
for (out_x4, idx_x4) in out_and_idx_chunks {
let v0 = self.get_val(idx_x4[0]);
out_x4[0] = if v0 > threshold { Some(v0) } else { None };
let v1 = self.get_val(idx_x4[1]);
out_x4[1] = if v1 > threshold { Some(v1) } else { None };
let v2 = self.get_val(idx_x4[2]);
out_x4[2] = if v2 > threshold { Some(v2) } else { None };
let v3 = self.get_val(idx_x4[3]);
out_x4[3] = if v3 > threshold { Some(v3) } else { None };
}
let out_and_idx_chunks = output
.chunks_exact_mut(4)
.into_remainder()
.iter_mut()
.zip(indexes.chunks_exact(4).remainder());
for (out, idx) in out_and_idx_chunks {
let v = self.get_val(*idx);
*out = if v > threshold { Some(v) } else { None };
}
}
ValueRange::LessThan(threshold, _) => {
let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
for (out_x4, idx_x4) in out_and_idx_chunks {
let v0 = self.get_val(idx_x4[0]);
out_x4[0] = if v0 < threshold { Some(v0) } else { None };
let v1 = self.get_val(idx_x4[1]);
out_x4[1] = if v1 < threshold { Some(v1) } else { None };
let v2 = self.get_val(idx_x4[2]);
out_x4[2] = if v2 < threshold { Some(v2) } else { None };
let v3 = self.get_val(idx_x4[3]);
out_x4[3] = if v3 < threshold { Some(v3) } else { None };
}
let out_and_idx_chunks = output
.chunks_exact_mut(4)
.into_remainder()
.iter_mut()
.zip(indexes.chunks_exact(4).remainder());
for (out, idx) in out_and_idx_chunks {
let v = self.get_val(*idx);
*out = if v < threshold { Some(v) } else { None };
}
}
}
}
@@ -181,6 +225,22 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
}
}
}
ValueRange::GreaterThan(threshold, _) => {
for idx in row_id_range {
let val = self.get_val(idx);
if val > threshold {
row_id_hits.push(idx);
}
}
}
ValueRange::LessThan(threshold, _) => {
for idx in row_id_range {
let val = self.get_val(idx);
if val < threshold {
row_id_hits.push(idx);
}
}
}
ValueRange::All => {
row_id_hits.extend(row_id_range);
}

View File

@@ -99,6 +99,16 @@ where
doc_id_range,
positions,
),
ValueRange::GreaterThan(threshold, _) => self.from_column.get_row_ids_for_value_range(
ValueRange::GreaterThan(self.monotonic_mapping.inverse(threshold), false),
doc_id_range,
positions,
),
ValueRange::LessThan(threshold, _) => self.from_column.get_row_ids_for_value_range(
ValueRange::LessThan(self.monotonic_mapping.inverse(threshold), false),
doc_id_range,
positions,
),
}
}

View File

@@ -356,6 +356,18 @@ impl ColumnValues<u64> for CompactSpaceU64Accessor {
let position_range = position_range.start..position_range.end.min(self.num_vals());
positions.extend(position_range);
}
ValueRange::GreaterThan(threshold, _) => {
let value_range =
ValueRange::GreaterThan(self.0.compact_to_u128(threshold as u32), false);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
ValueRange::LessThan(threshold, _) => {
let value_range =
ValueRange::LessThan(self.0.compact_to_u128(threshold as u32), false);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
}
}
}
@@ -397,6 +409,20 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
positions.extend(position_range);
return;
}
ValueRange::GreaterThan(threshold, _) => {
let max = self.max_value();
if threshold >= max {
return;
}
(threshold + 1)..=max
}
ValueRange::LessThan(threshold, _) => {
let min = self.min_value();
if threshold <= min {
return;
}
min..=(threshold - 1)
}
};
if value_range.start() > value_range.end() {

View File

@@ -135,6 +135,66 @@ impl ColumnValues for BitpackedReader {
}
}
}
ValueRange::GreaterThan(threshold, _) => {
if threshold < self.stats.min_value {
self.get_vals_opt(indexes, output);
} else if threshold >= self.stats.max_value {
for out in output.iter_mut() {
*out = None;
}
} else {
let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get();
for (i, doc) in indexes.iter().enumerate() {
let raw_val = self.unpack_val(*doc);
if raw_val > raw_threshold {
output[i] = Some(self.stats.min_value + self.stats.gcd.get() * raw_val);
} else {
output[i] = None;
}
}
}
}
ValueRange::LessThan(threshold, _) => {
if threshold > self.stats.max_value {
self.get_vals_opt(indexes, output);
} else if threshold <= self.stats.min_value {
for out in output.iter_mut() {
*out = None;
}
} else {
// We need `min + gcd * raw < threshold`, i.e. `raw < (threshold - min) / gcd`
// when the division is taken over the reals. Because `raw` is an integer,
// this is equivalent to `raw < ceil((threshold - min) / gcd)`.
//
// Example with gcd = 10 and min = 0:
//   threshold = 15 => ceil(1.5) = 2, so raw 0 and 1 (values 0 and 10) match.
//   threshold = 10 => ceil(1.0) = 1, so only raw 0 (value 0) matches.
//
// The ceiling is computed below as `diff / gcd`, plus one if there is a remainder.
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
let raw_threshold = if diff % gcd == 0 {
diff / gcd
} else {
diff / gcd + 1
};
for (i, doc) in indexes.iter().enumerate() {
let raw_val = self.unpack_val(*doc);
if raw_val < raw_threshold {
output[i] = Some(self.stats.min_value + self.stats.gcd.get() * raw_val);
} else {
output[i] = None;
}
}
}
}
}
}
@@ -176,6 +236,80 @@ impl ColumnValues for BitpackedReader {
positions,
);
}
ValueRange::GreaterThan(threshold, _) => {
if threshold < self.stats.min_value {
positions.extend(doc_id_range);
return;
}
if threshold >= self.stats.max_value {
return;
}
let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get();
// We want raw > raw_threshold.
// `bit_unpacker.get_ids_for_value_range_from_subset` takes a `RangeInclusive`,
// so the condition is expressed as `(raw_threshold + 1)..=max_raw`, where
// `max_raw = (max_value - min_value) / gcd` is the largest raw value in the column.
let max_raw = (self.stats.max_value - self.stats.min_value) / self.stats.gcd.get();
let transformed_range = (raw_threshold + 1)..=max_raw;
let data_range = self
.bit_unpacker
.block_oblivious_range(doc_id_range.clone(), self.data.len());
let data_offset = data_range.start;
let data_subset = self
.data
.slice(data_range)
.read_bytes()
.expect("Failed to read column values.");
self.bit_unpacker.get_ids_for_value_range_from_subset(
transformed_range,
doc_id_range,
data_offset,
&data_subset,
positions,
);
}
ValueRange::LessThan(threshold, _) => {
if threshold > self.stats.max_value {
positions.extend(doc_id_range);
return;
}
if threshold <= self.stats.min_value {
return;
}
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
// We want raw < raw_threshold_limit
// raw <= raw_threshold_limit - 1
let raw_threshold_limit = if diff % gcd == 0 {
diff / gcd
} else {
diff / gcd + 1
};
if raw_threshold_limit == 0 {
return;
}
let transformed_range = 0..=(raw_threshold_limit - 1);
let data_range = self
.bit_unpacker
.block_oblivious_range(doc_id_range.clone(), self.data.len());
let data_offset = data_range.start;
let data_subset = self
.data
.slice(data_range)
.read_bytes()
.expect("Failed to read column values.");
self.bit_unpacker.get_ids_for_value_range_from_subset(
transformed_range,
doc_id_range,
data_offset,
&data_subset,
positions,
);
}
}
}
}

View File

@@ -1,6 +1,6 @@
use std::cmp::Ordering;
use columnar::MonotonicallyMappableToU64;
use columnar::{MonotonicallyMappableToU64, ValueRange};
use serde::{Deserialize, Serialize};
use crate::collector::{ComparableDoc, SegmentSortKeyComputer, SortKeyComputer};
@@ -86,6 +86,10 @@ pub trait Comparator<T>: Send + Sync + std::fmt::Debug + Default {
lhs.doc.cmp(&rhs.doc)
})
}
/// Return a `ValueRange` that matches all values that are greater than the provided threshold.
///
/// NOTE(review): "greater" is presumably meant in this comparator's own ordering, since the
/// reversed comparators return `ValueRange::LessThan` — confirm. The boolean carried by the
/// returned range indicates whether null values are included.
// NOTE(review): `dead_code` is presumably allowed because nothing calls this method yet —
// confirm, and drop the attribute once a caller exists.
#[allow(dead_code)]
fn threshold_to_valuerange(&self, threshold: T) -> ValueRange<T>;
}
/// Compare values naturally (e.g. 1 < 2).
@@ -103,6 +107,10 @@ impl<T: PartialOrd> Comparator<T> for NaturalComparator {
/// Orders two values by their `PartialOrd` ordering.
///
/// # Panics
///
/// Panics when `partial_cmp` yields `None`, i.e. the two values are not comparable.
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
    let ordering = lhs.partial_cmp(rhs);
    ordering.unwrap()
}
/// Maps `threshold` to the range of values greater than it, with the null flag unset.
fn threshold_to_valuerange(&self, threshold: T) -> ValueRange<T> {
    let include_nulls = false;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
/// A (partial) implementation of comparison for OwnedValue.
@@ -114,6 +122,10 @@ impl Comparator<OwnedValue> for NaturalComparator {
/// Compares two `OwnedValue`s by calling `compare_owned_value` with `NULLS_FIRST = true`.
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
    const NULLS_FIRST: bool = true;
    compare_owned_value::<NULLS_FIRST>(lhs, rhs)
}
/// Builds a `GreaterThan` range over `threshold`; the null flag is left unset.
fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange<OwnedValue> {
    let include_nulls = false;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
/// Compare values in reverse (e.g. 2 < 1).
@@ -138,6 +150,10 @@ where NaturalComparator: Comparator<T>
/// Compares in reverse by handing the operands to `NaturalComparator` in swapped order.
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
    let natural = NaturalComparator;
    natural.compare(rhs, lhs)
}
/// Maps `threshold` to the range of values less than it, with the null flag set.
///
/// NOTE(review): nulls are presumably included because they sort last in reverse order —
/// confirm against the `Option` ordering used by the natural comparator.
fn threshold_to_valuerange(&self, threshold: T) -> ValueRange<T> {
    let include_nulls = true;
    ValueRange::LessThan(threshold, include_nulls)
}
}
/// Compare values in reverse, but treating `None` as lower than `Some`.
@@ -164,6 +180,10 @@ where ReverseComparator: Comparator<T>
(Some(lhs), Some(rhs)) => ReverseComparator.compare(lhs, rhs),
}
}
/// Maps an optional `threshold` to a `LessThan` range whose null flag is unset.
fn threshold_to_valuerange(&self, threshold: Option<T>) -> ValueRange<Option<T>> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
impl Comparator<u32> for ReverseNoneIsLowerComparator {
@@ -171,6 +191,10 @@ impl Comparator<u32> for ReverseNoneIsLowerComparator {
/// Compares two `u32` values by delegating to `ReverseComparator`.
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
    let reversed = ReverseComparator;
    reversed.compare(lhs, rhs)
}
/// Maps a `u32` threshold to a `LessThan` range with the null flag unset.
fn threshold_to_valuerange(&self, threshold: u32) -> ValueRange<u32> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
impl Comparator<u64> for ReverseNoneIsLowerComparator {
@@ -178,6 +202,10 @@ impl Comparator<u64> for ReverseNoneIsLowerComparator {
/// Compares two `u64` values by delegating to `ReverseComparator`.
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
    let reversed = ReverseComparator;
    reversed.compare(lhs, rhs)
}
/// Maps a `u64` threshold to a `LessThan` range with the null flag unset.
fn threshold_to_valuerange(&self, threshold: u64) -> ValueRange<u64> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
impl Comparator<f64> for ReverseNoneIsLowerComparator {
@@ -185,6 +213,10 @@ impl Comparator<f64> for ReverseNoneIsLowerComparator {
/// Compares two `f64` values by delegating to `ReverseComparator`.
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
    let reversed = ReverseComparator;
    reversed.compare(lhs, rhs)
}
/// Maps an `f64` threshold to a `LessThan` range with the null flag unset.
fn threshold_to_valuerange(&self, threshold: f64) -> ValueRange<f64> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
impl Comparator<f32> for ReverseNoneIsLowerComparator {
@@ -192,6 +224,10 @@ impl Comparator<f32> for ReverseNoneIsLowerComparator {
/// Compares two `f32` values by delegating to `ReverseComparator`.
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
    let reversed = ReverseComparator;
    reversed.compare(lhs, rhs)
}
/// Maps an `f32` threshold to a `LessThan` range with the null flag unset.
fn threshold_to_valuerange(&self, threshold: f32) -> ValueRange<f32> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
impl Comparator<i64> for ReverseNoneIsLowerComparator {
@@ -199,6 +235,10 @@ impl Comparator<i64> for ReverseNoneIsLowerComparator {
/// Compares two `i64` values by delegating to `ReverseComparator`.
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
    let reversed = ReverseComparator;
    reversed.compare(lhs, rhs)
}
/// Maps an `i64` threshold to a `LessThan` range with the null flag unset.
fn threshold_to_valuerange(&self, threshold: i64) -> ValueRange<i64> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
impl Comparator<String> for ReverseNoneIsLowerComparator {
@@ -206,6 +246,10 @@ impl Comparator<String> for ReverseNoneIsLowerComparator {
/// Compares two `String` values by delegating to `ReverseComparator`.
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
    let reversed = ReverseComparator;
    reversed.compare(lhs, rhs)
}
/// Maps a `String` threshold to a `LessThan` range with the null flag unset.
fn threshold_to_valuerange(&self, threshold: String) -> ValueRange<String> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
impl Comparator<OwnedValue> for ReverseNoneIsLowerComparator {
@@ -213,6 +257,10 @@ impl Comparator<OwnedValue> for ReverseNoneIsLowerComparator {
/// Compares two `OwnedValue`s in reverse: the operands are swapped before calling
/// `compare_owned_value` with `NULLS_FIRST = false`.
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
    const NULLS_FIRST: bool = false;
    compare_owned_value::<NULLS_FIRST>(rhs, lhs)
}
/// Maps an `OwnedValue` threshold to a `LessThan` range with the null flag unset.
fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange<OwnedValue> {
    let include_nulls = false;
    ValueRange::LessThan(threshold, include_nulls)
}
}
/// Compare values naturally, but treating `None` as higher than `Some`.
@@ -235,6 +283,10 @@ where NaturalComparator: Comparator<T>
(Some(lhs), Some(rhs)) => NaturalComparator.compare(lhs, rhs),
}
}
/// Maps an optional `threshold` to a `GreaterThan` range whose null flag is set.
fn threshold_to_valuerange(&self, threshold: Option<T>) -> ValueRange<Option<T>> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl Comparator<u32> for NaturalNoneIsHigherComparator {
@@ -242,6 +294,10 @@ impl Comparator<u32> for NaturalNoneIsHigherComparator {
/// Compares two `u32` values by delegating to `NaturalComparator`.
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
    let natural = NaturalComparator;
    natural.compare(lhs, rhs)
}
/// Maps a `u32` threshold to a `GreaterThan` range with the null flag set.
fn threshold_to_valuerange(&self, threshold: u32) -> ValueRange<u32> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl Comparator<u64> for NaturalNoneIsHigherComparator {
@@ -249,6 +305,10 @@ impl Comparator<u64> for NaturalNoneIsHigherComparator {
/// Compares two `u64` values by delegating to `NaturalComparator`.
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
    let natural = NaturalComparator;
    natural.compare(lhs, rhs)
}
/// Maps a `u64` threshold to a `GreaterThan` range with the null flag set.
fn threshold_to_valuerange(&self, threshold: u64) -> ValueRange<u64> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl Comparator<f64> for NaturalNoneIsHigherComparator {
@@ -256,6 +316,10 @@ impl Comparator<f64> for NaturalNoneIsHigherComparator {
/// Compares two `f64` values by delegating to `NaturalComparator`.
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
    let natural = NaturalComparator;
    natural.compare(lhs, rhs)
}
/// Maps an `f64` threshold to a `GreaterThan` range with the null flag set.
fn threshold_to_valuerange(&self, threshold: f64) -> ValueRange<f64> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl Comparator<f32> for NaturalNoneIsHigherComparator {
@@ -263,6 +327,10 @@ impl Comparator<f32> for NaturalNoneIsHigherComparator {
/// Compares two `f32` values by delegating to `NaturalComparator`.
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
    let natural = NaturalComparator;
    natural.compare(lhs, rhs)
}
/// Maps an `f32` threshold to a `GreaterThan` range with the null flag set.
fn threshold_to_valuerange(&self, threshold: f32) -> ValueRange<f32> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl Comparator<i64> for NaturalNoneIsHigherComparator {
@@ -270,6 +338,10 @@ impl Comparator<i64> for NaturalNoneIsHigherComparator {
/// Compares two `i64` values by delegating to `NaturalComparator`.
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
    let natural = NaturalComparator;
    natural.compare(lhs, rhs)
}
/// Maps an `i64` threshold to a `GreaterThan` range with the null flag set.
fn threshold_to_valuerange(&self, threshold: i64) -> ValueRange<i64> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl Comparator<String> for NaturalNoneIsHigherComparator {
@@ -277,6 +349,10 @@ impl Comparator<String> for NaturalNoneIsHigherComparator {
/// Compares two `String` values by delegating to `NaturalComparator`.
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
    let natural = NaturalComparator;
    natural.compare(lhs, rhs)
}
/// Maps a `String` threshold to a `GreaterThan` range with the null flag set.
fn threshold_to_valuerange(&self, threshold: String) -> ValueRange<String> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl Comparator<OwnedValue> for NaturalNoneIsHigherComparator {
@@ -284,6 +360,10 @@ impl Comparator<OwnedValue> for NaturalNoneIsHigherComparator {
/// Compares two `OwnedValue`s by calling `compare_owned_value` with `NULLS_FIRST = false`.
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
    const NULLS_FIRST: bool = false;
    compare_owned_value::<NULLS_FIRST>(lhs, rhs)
}
/// Maps an `OwnedValue` threshold to a `GreaterThan` range with the null flag set.
fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange<OwnedValue> {
    let include_nulls = true;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
/// An enum representing the different sort orders.
@@ -325,6 +405,19 @@ where
ComparatorEnum::NaturalNoneHigher => NaturalNoneIsHigherComparator.compare(lhs, rhs),
}
}
/// Forwards `threshold` to the comparator selected by this variant and returns its range.
fn threshold_to_valuerange(&self, threshold: T) -> ValueRange<T> {
    match *self {
        ComparatorEnum::Natural => NaturalComparator.threshold_to_valuerange(threshold),
        ComparatorEnum::Reverse => ReverseComparator.threshold_to_valuerange(threshold),
        ComparatorEnum::ReverseNoneLower => {
            let comparator = ReverseNoneIsLowerComparator;
            comparator.threshold_to_valuerange(threshold)
        }
        ComparatorEnum::NaturalNoneHigher => {
            let comparator = NaturalNoneIsHigherComparator;
            comparator.threshold_to_valuerange(threshold)
        }
    }
}
}
impl<Head, Tail, LeftComparator, RightComparator> Comparator<(Head, Tail)>
@@ -339,6 +432,10 @@ where
.compare(&lhs.0, &rhs.0)
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
}
/// Maps a pair threshold to a `GreaterThan` range with the null flag unset.
///
/// NOTE(review): the component comparators held in `self` are not consulted here, so the
/// result does not depend on whether either component is reversed — confirm this is intended.
fn threshold_to_valuerange(&self, threshold: (Head, Tail)) -> ValueRange<(Head, Tail)> {
    let include_nulls = false;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, (Type2, Type3))>
@@ -355,6 +452,13 @@ where
.then_with(|| self.1.compare(&lhs.1 .0, &rhs.1 .0))
.then_with(|| self.2.compare(&lhs.1 .1, &rhs.1 .1))
}
/// Maps a nested-triple threshold to a `GreaterThan` range with the null flag unset.
///
/// NOTE(review): the component comparators held in `self` are not consulted here — confirm
/// this is intended.
fn threshold_to_valuerange(
    &self,
    threshold: (Type1, (Type2, Type3)),
) -> ValueRange<(Type1, (Type2, Type3))> {
    let include_nulls = false;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, Type2, Type3)>
@@ -371,6 +475,13 @@ where
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
}
/// Maps a flat-triple threshold to a `GreaterThan` range with the null flag unset.
///
/// NOTE(review): the component comparators held in `self` are not consulted here — confirm
/// this is intended.
fn threshold_to_valuerange(
    &self,
    threshold: (Type1, Type2, Type3),
) -> ValueRange<(Type1, Type2, Type3)> {
    let include_nulls = false;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
@@ -394,6 +505,13 @@ where
.then_with(|| self.2.compare(&lhs.1 .1 .0, &rhs.1 .1 .0))
.then_with(|| self.3.compare(&lhs.1 .1 .1, &rhs.1 .1 .1))
}
/// Maps a nested four-element threshold to a `GreaterThan` range with the null flag unset.
///
/// NOTE(review): the component comparators held in `self` are not consulted here — confirm
/// this is intended.
fn threshold_to_valuerange(
    &self,
    threshold: (Type1, (Type2, (Type3, Type4))),
) -> ValueRange<(Type1, (Type2, (Type3, Type4)))> {
    let include_nulls = false;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
@@ -417,6 +535,13 @@ where
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
.then_with(|| self.3.compare(&lhs.3, &rhs.3))
}
/// Maps a flat four-element threshold to a `GreaterThan` range with the null flag unset.
///
/// NOTE(review): the component comparators held in `self` are not consulted here — confirm
/// this is intended.
fn threshold_to_valuerange(
    &self,
    threshold: (Type1, Type2, Type3, Type4),
) -> ValueRange<(Type1, Type2, Type3, Type4)> {
    let include_nulls = false;
    ValueRange::GreaterThan(threshold, include_nulls)
}
}
impl<TSortKeyComputer> SortKeyComputer for (TSortKeyComputer, ComparatorEnum)