diff --git a/columnar/src/column/mod.rs b/columnar/src/column/mod.rs index d12fab3c5..43f496b2d 100644 --- a/columnar/src/column/mod.rs +++ b/columnar/src/column/mod.rs @@ -118,6 +118,22 @@ impl Column { .filter(|val| range.contains(val)); } } + (ColumnIndex::Optional(optional_index), ValueRange::GreaterThan(threshold, _)) => { + for (i, docid) in docids.iter().enumerate() { + output[i] = optional_index + .rank_if_exists(*docid) + .map(|rowid| self.values.get_val(rowid)) + .filter(|val| *val > threshold); + } + } + (ColumnIndex::Optional(optional_index), ValueRange::LessThan(threshold, _)) => { + for (i, docid) in docids.iter().enumerate() { + output[i] = optional_index + .rank_if_exists(*docid) + .map(|rowid| self.values.get_val(rowid)) + .filter(|val| *val < threshold); + } + } (ColumnIndex::Multivalued(multivalued_index), ValueRange::All) => { for (i, docid) in docids.iter().enumerate() { let range = multivalued_index.range(*docid); @@ -145,6 +161,41 @@ impl Column { } } } + ( + ColumnIndex::Multivalued(multivalued_index), + ValueRange::GreaterThan(threshold, _), + ) => { + for (i, docid) in docids.iter().enumerate() { + let row_range = multivalued_index.range(*docid); + let is_empty = row_range.start == row_range.end; + if !is_empty { + let val = self.values.get_val(row_range.start); + if val > threshold { + output[i] = Some(val); + } else { + output[i] = None; + } + } else { + output[i] = None; + } + } + } + (ColumnIndex::Multivalued(multivalued_index), ValueRange::LessThan(threshold, _)) => { + for (i, docid) in docids.iter().enumerate() { + let row_range = multivalued_index.range(*docid); + let is_empty = row_range.start == row_range.end; + if !is_empty { + let val = self.values.get_val(row_range.start); + if val < threshold { + output[i] = Some(val); + } else { + output[i] = None; + } + } else { + output[i] = None; + } + } + } } } @@ -214,6 +265,12 @@ pub enum ValueRange { Inclusive(RangeInclusive), /// A range that matches all values. 
All, + /// Matches every value strictly greater than the threshold (`value > threshold`). + /// The `bool` says whether nulls should match too. NOTE(review): arms in this patch ignore it. + GreaterThan(T, bool), + /// Matches every value strictly less than the threshold (`value < threshold`). + /// The `bool` says whether nulls should match too. NOTE(review): arms in this patch ignore it. + LessThan(T, bool), } impl BinarySerializable for Cardinality { diff --git a/columnar/src/column_values/mod.rs b/columnar/src/column_values/mod.rs index 4dab2578b..7fe21d46e 100644 --- a/columnar/src/column_values/mod.rs +++ b/columnar/src/column_values/mod.rs @@ -144,6 +144,50 @@ pub trait ColumnValues: Send + Sync + DowncastSync { *out = if range.contains(&v) { Some(v) } else { None }; } } + ValueRange::GreaterThan(threshold, _) => { + let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4)); + for (out_x4, idx_x4) in out_and_idx_chunks { + let v0 = self.get_val(idx_x4[0]); + out_x4[0] = if v0 > threshold { Some(v0) } else { None }; + let v1 = self.get_val(idx_x4[1]); + out_x4[1] = if v1 > threshold { Some(v1) } else { None }; + let v2 = self.get_val(idx_x4[2]); + out_x4[2] = if v2 > threshold { Some(v2) } else { None }; + let v3 = self.get_val(idx_x4[3]); + out_x4[3] = if v3 > threshold { Some(v3) } else { None }; + } + let out_and_idx_chunks = output + .chunks_exact_mut(4) + .into_remainder() + .iter_mut() + .zip(indexes.chunks_exact(4).remainder()); + for (out, idx) in out_and_idx_chunks { + let v = self.get_val(*idx); + *out = if v > threshold { Some(v) } else { None }; + } + } + ValueRange::LessThan(threshold, _) => { + let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4)); + for (out_x4, idx_x4) in out_and_idx_chunks { + let v0 = self.get_val(idx_x4[0]); + out_x4[0] = if v0 < threshold { Some(v0) } else { None }; + let v1 = self.get_val(idx_x4[1]); + out_x4[1] = if v1 < threshold { Some(v1) } else { None }; + let v2 = self.get_val(idx_x4[2]); + out_x4[2] = if v2 < threshold { Some(v2) } else { None }; + let v3 = 
self.get_val(idx_x4[3]); + out_x4[3] = if v3 < threshold { Some(v3) } else { None }; + } + let out_and_idx_chunks = output + .chunks_exact_mut(4) + .into_remainder() + .iter_mut() + .zip(indexes.chunks_exact(4).remainder()); + for (out, idx) in out_and_idx_chunks { + let v = self.get_val(*idx); + *out = if v < threshold { Some(v) } else { None }; + } + } } } @@ -181,6 +225,22 @@ pub trait ColumnValues: Send + Sync + DowncastSync { } } } + ValueRange::GreaterThan(threshold, _) => { + for idx in row_id_range { + let val = self.get_val(idx); + if val > threshold { + row_id_hits.push(idx); + } + } + } + ValueRange::LessThan(threshold, _) => { + for idx in row_id_range { + let val = self.get_val(idx); + if val < threshold { + row_id_hits.push(idx); + } + } + } ValueRange::All => { row_id_hits.extend(row_id_range); } diff --git a/columnar/src/column_values/monotonic_column.rs b/columnar/src/column_values/monotonic_column.rs index bd2ed81d8..e8586ad7a 100644 --- a/columnar/src/column_values/monotonic_column.rs +++ b/columnar/src/column_values/monotonic_column.rs @@ -99,6 +99,16 @@ where doc_id_range, positions, ), + ValueRange::GreaterThan(threshold, _) => self.from_column.get_row_ids_for_value_range( + ValueRange::GreaterThan(self.monotonic_mapping.inverse(threshold), false), + doc_id_range, + positions, + ), + ValueRange::LessThan(threshold, _) => self.from_column.get_row_ids_for_value_range( + ValueRange::LessThan(self.monotonic_mapping.inverse(threshold), false), + doc_id_range, + positions, + ), } } diff --git a/columnar/src/column_values/u128_based/compact_space/mod.rs b/columnar/src/column_values/u128_based/compact_space/mod.rs index 9a750ebcc..d745bc905 100644 --- a/columnar/src/column_values/u128_based/compact_space/mod.rs +++ b/columnar/src/column_values/u128_based/compact_space/mod.rs @@ -356,6 +356,18 @@ impl ColumnValues for CompactSpaceU64Accessor { let position_range = position_range.start..position_range.end.min(self.num_vals()); 
positions.extend(position_range); } + ValueRange::GreaterThan(threshold, _) => { + let value_range = + ValueRange::GreaterThan(self.0.compact_to_u128(threshold as u32), false); + self.0 + .get_row_ids_for_value_range(value_range, position_range, positions) + } + ValueRange::LessThan(threshold, _) => { + let value_range = + ValueRange::LessThan(self.0.compact_to_u128(threshold as u32), false); + self.0 + .get_row_ids_for_value_range(value_range, position_range, positions) + } } } } @@ -397,6 +409,20 @@ impl ColumnValues for CompactSpaceDecompressor { positions.extend(position_range); return; } + ValueRange::GreaterThan(threshold, _) => { + let max = self.max_value(); + if threshold >= max { + return; + } + (threshold + 1)..=max + } + ValueRange::LessThan(threshold, _) => { + let min = self.min_value(); + if threshold <= min { + return; + } + min..=(threshold - 1) + } }; if value_range.start() > value_range.end() { diff --git a/columnar/src/column_values/u64_based/bitpacked.rs b/columnar/src/column_values/u64_based/bitpacked.rs index bbff868a9..089cae152 100644 --- a/columnar/src/column_values/u64_based/bitpacked.rs +++ b/columnar/src/column_values/u64_based/bitpacked.rs @@ -135,6 +135,66 @@ impl ColumnValues for BitpackedReader { } } } + ValueRange::GreaterThan(threshold, _) => { + if threshold < self.stats.min_value { + self.get_vals_opt(indexes, output); + } else if threshold >= self.stats.max_value { + for out in output.iter_mut() { + *out = None; + } + } else { + let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get(); + for (i, doc) in indexes.iter().enumerate() { + let raw_val = self.unpack_val(*doc); + if raw_val > raw_threshold { + output[i] = Some(self.stats.min_value + self.stats.gcd.get() * raw_val); + } else { + output[i] = None; + } + } + } + } + ValueRange::LessThan(threshold, _) => { + if threshold > self.stats.max_value { + self.get_vals_opt(indexes, output); + } else if threshold <= self.stats.min_value { + for out in 
output.iter_mut() { + *out = None; + } + } else { + // A value is rebuilt as min_value + gcd * raw, hence: + // val < threshold + // <=> min_value + gcd * raw < threshold + // <=> gcd * raw < threshold - min_value + // <=> raw < (threshold - min_value) / gcd (exact, non-integer quotient) + // <=> raw < ceil((threshold - min_value) / gcd) (raw is an integer) + // + // Worked examples with min_value = 0 and gcd = 10: + // threshold = 15: ceil(15 / 10) = 2, so raw 0 and 1 (values 0 and 10) match. + // threshold = 10: ceil(10 / 10) = 1, so only raw 0 matches; raw 1 is value 10, + // and 10 < 10 is false. + // + // The ceiling is taken with an explicit remainder test below, which gives the + // same result as (diff + gcd - 1) / gcd without computing diff + gcd - 1. + // The preceding branches ensure min_value < threshold <= max_value here. + let diff = threshold - self.stats.min_value; + let gcd = self.stats.gcd.get(); + let raw_threshold = if diff % gcd == 0 { + diff / gcd + } else { + diff / gcd + 1 + }; + + for (i, doc) in indexes.iter().enumerate() { + let raw_val = self.unpack_val(*doc); + if raw_val < raw_threshold { + output[i] = Some(self.stats.min_value + self.stats.gcd.get() * raw_val); + } else { + output[i] = None; + } + } + } + } } } @@ -176,6 +236,80 @@ impl ColumnValues for BitpackedReader { positions, ); } + ValueRange::GreaterThan(threshold, _) => { + if threshold < self.stats.min_value { + positions.extend(doc_id_range); + return; + } + if threshold >= self.stats.max_value { + return; + } + let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get(); + // val > threshold <=> raw > raw_threshold, because raw_threshold is a floored quotient. + // get_ids_for_value_range_from_subset is given an inclusive range below, so the + // strict lower bound is written as (raw_threshold + 1)..=max_raw, where max_raw + // is the raw form of max_value: (max_value - min_value) / gcd. 
+ let max_raw = (self.stats.max_value - self.stats.min_value) / self.stats.gcd.get(); + let transformed_range = (raw_threshold + 1)..=max_raw; + + let data_range = self + .bit_unpacker + .block_oblivious_range(doc_id_range.clone(), self.data.len()); + let data_offset = data_range.start; + let data_subset = self + .data + .slice(data_range) + .read_bytes() + .expect("Failed to read column values."); + self.bit_unpacker.get_ids_for_value_range_from_subset( + transformed_range, + doc_id_range, + data_offset, + &data_subset, + positions, + ); + } + ValueRange::LessThan(threshold, _) => { + if threshold > self.stats.max_value { + positions.extend(doc_id_range); + return; + } + if threshold <= self.stats.min_value { + return; + } + + let diff = threshold - self.stats.min_value; + let gcd = self.stats.gcd.get(); + // We want raw < raw_threshold_limit + // raw <= raw_threshold_limit - 1 + let raw_threshold_limit = if diff % gcd == 0 { + diff / gcd + } else { + diff / gcd + 1 + }; + + if raw_threshold_limit == 0 { + return; + } + let transformed_range = 0..=(raw_threshold_limit - 1); + + let data_range = self + .bit_unpacker + .block_oblivious_range(doc_id_range.clone(), self.data.len()); + let data_offset = data_range.start; + let data_subset = self + .data + .slice(data_range) + .read_bytes() + .expect("Failed to read column values."); + self.bit_unpacker.get_ids_for_value_range_from_subset( + transformed_range, + doc_id_range, + data_offset, + &data_subset, + positions, + ); + } } } } diff --git a/src/collector/sort_key/order.rs b/src/collector/sort_key/order.rs index a330675ab..123aa7c14 100644 --- a/src/collector/sort_key/order.rs +++ b/src/collector/sort_key/order.rs @@ -1,6 +1,6 @@ use std::cmp::Ordering; -use columnar::MonotonicallyMappableToU64; +use columnar::{MonotonicallyMappableToU64, ValueRange}; use serde::{Deserialize, Serialize}; use crate::collector::{ComparableDoc, SegmentSortKeyComputer, SortKeyComputer}; @@ -86,6 +86,10 @@ pub trait Comparator: Send + 
Sync + std::fmt::Debug + Default { lhs.doc.cmp(&rhs.doc) }) } + + /// Return a `ValueRange` matching the values that this comparator orders after `threshold`. + #[allow(dead_code)] // NOTE(review): no caller is added in this patch; drop once one exists. + fn threshold_to_valuerange(&self, threshold: T) -> ValueRange; } /// Compare values naturally (e.g. 1 < 2). @@ -103,6 +107,10 @@ impl Comparator for NaturalComparator { fn compare(&self, lhs: &T, rhs: &T) -> Ordering { lhs.partial_cmp(rhs).unwrap() } + + fn threshold_to_valuerange(&self, threshold: T) -> ValueRange { + ValueRange::GreaterThan(threshold, false) + } } /// A (partial) implementation of comparison for OwnedValue. @@ -114,6 +122,10 @@ impl Comparator for NaturalComparator { fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering { compare_owned_value::(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange { + ValueRange::GreaterThan(threshold, false) + } } /// Compare values in reverse (e.g. 2 < 1). @@ -138,6 +150,10 @@ where NaturalComparator: Comparator fn compare(&self, lhs: &T, rhs: &T) -> Ordering { NaturalComparator.compare(rhs, lhs) } + + fn threshold_to_valuerange(&self, threshold: T) -> ValueRange { + ValueRange::LessThan(threshold, true) + } } /// Compare values in reverse, but treating `None` as lower than `Some`. 
@@ -164,6 +180,10 @@ where ReverseComparator: Comparator (Some(lhs), Some(rhs)) => ReverseComparator.compare(lhs, rhs), } } + + fn threshold_to_valuerange(&self, threshold: Option) -> ValueRange> { + ValueRange::LessThan(threshold, false) + } } impl Comparator for ReverseNoneIsLowerComparator { @@ -171,6 +191,10 @@ impl Comparator for ReverseNoneIsLowerComparator { fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering { ReverseComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: u32) -> ValueRange { + ValueRange::LessThan(threshold, false) + } } impl Comparator for ReverseNoneIsLowerComparator { @@ -178,6 +202,10 @@ impl Comparator for ReverseNoneIsLowerComparator { fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering { ReverseComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: u64) -> ValueRange { + ValueRange::LessThan(threshold, false) + } } impl Comparator for ReverseNoneIsLowerComparator { @@ -185,6 +213,10 @@ impl Comparator for ReverseNoneIsLowerComparator { fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering { ReverseComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: f64) -> ValueRange { + ValueRange::LessThan(threshold, false) + } } impl Comparator for ReverseNoneIsLowerComparator { @@ -192,6 +224,10 @@ impl Comparator for ReverseNoneIsLowerComparator { fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering { ReverseComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: f32) -> ValueRange { + ValueRange::LessThan(threshold, false) + } } impl Comparator for ReverseNoneIsLowerComparator { @@ -199,6 +235,10 @@ impl Comparator for ReverseNoneIsLowerComparator { fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering { ReverseComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: i64) -> ValueRange { + ValueRange::LessThan(threshold, false) + } } impl Comparator for ReverseNoneIsLowerComparator { @@ -206,6 +246,10 @@ impl Comparator for 
ReverseNoneIsLowerComparator { fn compare(&self, lhs: &String, rhs: &String) -> Ordering { ReverseComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: String) -> ValueRange { + ValueRange::LessThan(threshold, false) + } } impl Comparator for ReverseNoneIsLowerComparator { @@ -213,6 +257,10 @@ impl Comparator for ReverseNoneIsLowerComparator { fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering { compare_owned_value::(rhs, lhs) } + + fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange { + ValueRange::LessThan(threshold, false) + } } /// Compare values naturally, but treating `None` as higher than `Some`. @@ -235,6 +283,10 @@ where NaturalComparator: Comparator (Some(lhs), Some(rhs)) => NaturalComparator.compare(lhs, rhs), } } + + fn threshold_to_valuerange(&self, threshold: Option) -> ValueRange> { + ValueRange::GreaterThan(threshold, true) + } } impl Comparator for NaturalNoneIsHigherComparator { @@ -242,6 +294,10 @@ impl Comparator for NaturalNoneIsHigherComparator { fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering { NaturalComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: u32) -> ValueRange { + ValueRange::GreaterThan(threshold, true) + } } impl Comparator for NaturalNoneIsHigherComparator { @@ -249,6 +305,10 @@ impl Comparator for NaturalNoneIsHigherComparator { fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering { NaturalComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: u64) -> ValueRange { + ValueRange::GreaterThan(threshold, true) + } } impl Comparator for NaturalNoneIsHigherComparator { @@ -256,6 +316,10 @@ impl Comparator for NaturalNoneIsHigherComparator { fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering { NaturalComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: f64) -> ValueRange { + ValueRange::GreaterThan(threshold, true) + } } impl Comparator for NaturalNoneIsHigherComparator { @@ -263,6 +327,10 @@ 
impl Comparator for NaturalNoneIsHigherComparator { fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering { NaturalComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: f32) -> ValueRange { + ValueRange::GreaterThan(threshold, true) + } } impl Comparator for NaturalNoneIsHigherComparator { @@ -270,6 +338,10 @@ impl Comparator for NaturalNoneIsHigherComparator { fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering { NaturalComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: i64) -> ValueRange { + ValueRange::GreaterThan(threshold, true) + } } impl Comparator for NaturalNoneIsHigherComparator { @@ -277,6 +349,10 @@ impl Comparator for NaturalNoneIsHigherComparator { fn compare(&self, lhs: &String, rhs: &String) -> Ordering { NaturalComparator.compare(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: String) -> ValueRange { + ValueRange::GreaterThan(threshold, true) + } } impl Comparator for NaturalNoneIsHigherComparator { @@ -284,6 +360,10 @@ impl Comparator for NaturalNoneIsHigherComparator { fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering { compare_owned_value::(lhs, rhs) } + + fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange { + ValueRange::GreaterThan(threshold, true) + } } /// An enum representing the different sort orders. 
@@ -325,6 +405,19 @@ where ComparatorEnum::NaturalNoneHigher => NaturalNoneIsHigherComparator.compare(lhs, rhs), } } + + fn threshold_to_valuerange(&self, threshold: T) -> ValueRange { + match self { + ComparatorEnum::Natural => NaturalComparator.threshold_to_valuerange(threshold), + ComparatorEnum::Reverse => ReverseComparator.threshold_to_valuerange(threshold), + ComparatorEnum::ReverseNoneLower => { + ReverseNoneIsLowerComparator.threshold_to_valuerange(threshold) + } + ComparatorEnum::NaturalNoneHigher => { + NaturalNoneIsHigherComparator.threshold_to_valuerange(threshold) + } + } + } } impl Comparator<(Head, Tail)> @@ -339,6 +432,10 @@ where .compare(&lhs.0, &rhs.0) .then_with(|| self.1.compare(&lhs.1, &rhs.1)) } + + fn threshold_to_valuerange(&self, threshold: (Head, Tail)) -> ValueRange<(Head, Tail)> { + /* NOTE(review): always GreaterThan; self.0 and self.1 are not consulted, so a reversed component comparator is not reflected. Confirm this is intended; the other tuple impls in this patch do the same. */ ValueRange::GreaterThan(threshold, false) + } } impl Comparator<(Type1, (Type2, Type3))> @@ -355,6 +452,13 @@ where .then_with(|| self.1.compare(&lhs.1 .0, &rhs.1 .0)) .then_with(|| self.2.compare(&lhs.1 .1, &rhs.1 .1)) } + + fn threshold_to_valuerange( + &self, + threshold: (Type1, (Type2, Type3)), + ) -> ValueRange<(Type1, (Type2, Type3))> { + ValueRange::GreaterThan(threshold, false) + } } impl Comparator<(Type1, Type2, Type3)> @@ -371,6 +475,13 @@ where .then_with(|| self.1.compare(&lhs.1, &rhs.1)) .then_with(|| self.2.compare(&lhs.2, &rhs.2)) } + + fn threshold_to_valuerange( + &self, + threshold: (Type1, Type2, Type3), + ) -> ValueRange<(Type1, Type2, Type3)> { + ValueRange::GreaterThan(threshold, false) + } } impl @@ -394,6 +505,13 @@ where .then_with(|| self.2.compare(&lhs.1 .1 .0, &rhs.1 .1 .0)) .then_with(|| self.3.compare(&lhs.1 .1 .1, &rhs.1 .1 .1)) } + + fn threshold_to_valuerange( + &self, + threshold: (Type1, (Type2, (Type3, Type4))), + ) -> ValueRange<(Type1, (Type2, (Type3, Type4)))> { + ValueRange::GreaterThan(threshold, false) + } } impl @@ -417,6 +535,13 @@ where .then_with(|| self.2.compare(&lhs.2, &rhs.2)) .then_with(|| 
self.3.compare(&lhs.3, &rhs.3)) } + + fn threshold_to_valuerange( + &self, + threshold: (Type1, Type2, Type3, Type4), + ) -> ValueRange<(Type1, Type2, Type3, Type4)> { + ValueRange::GreaterThan(threshold, false) + } } impl SortKeyComputer for (TSortKeyComputer, ComparatorEnum)