Add null handling to first_vals_in_value_range.

This commit is contained in:
Stu Hood
2025-12-26 11:12:53 -07:00
parent 5ff38e1605
commit 996fc936f6
6 changed files with 167 additions and 60 deletions

View File

@@ -57,7 +57,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
column.first_vals_in_value_range(&docs, &mut buffer, ValueRange::All);
for val in buffer.iter() {
let Some(val) = val else { continue };
let Some(Some(val)) = val else { continue };
sum += *val;
}
}

View File

@@ -94,7 +94,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
pub fn first_vals_in_value_range(
&self,
docids: &[DocId],
output: &mut [Option<T>],
output: &mut [Option<Option<T>>],
value_range: ValueRange<T>,
) {
match (&self.index, value_range) {
@@ -105,33 +105,74 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
}
(ColumnIndex::Optional(optional_index), ValueRange::All) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid));
output[i] = Some(
optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid)),
);
}
}
(ColumnIndex::Optional(optional_index), ValueRange::Inclusive(range)) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid))
.filter(|val| range.contains(val));
output[i] = match optional_index.rank_if_exists(*docid) {
Some(rowid) => {
let val = self.values.get_val(rowid);
if range.contains(&val) {
Some(Some(val))
} else {
None
}
}
None => None, // range does not include NULL
};
}
}
(ColumnIndex::Optional(optional_index), ValueRange::GreaterThan(threshold, _)) => {
(
ColumnIndex::Optional(optional_index),
ValueRange::GreaterThan(threshold, nulls_match),
) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid))
.filter(|val| *val > threshold);
output[i] = match optional_index.rank_if_exists(*docid) {
Some(rowid) => {
let val = self.values.get_val(rowid);
if val > threshold {
Some(Some(val))
} else {
None
}
}
None => {
if nulls_match {
Some(None)
} else {
None
}
}
};
}
}
(ColumnIndex::Optional(optional_index), ValueRange::LessThan(threshold, _)) => {
(
ColumnIndex::Optional(optional_index),
ValueRange::LessThan(threshold, nulls_match),
) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid))
.filter(|val| *val < threshold);
output[i] = match optional_index.rank_if_exists(*docid) {
Some(rowid) => {
let val = self.values.get_val(rowid);
if val < threshold {
Some(Some(val))
} else {
None
}
}
None => {
if nulls_match {
Some(None)
} else {
None
}
}
};
}
}
(ColumnIndex::Multivalued(multivalued_index), ValueRange::All) => {
@@ -139,9 +180,9 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
let range = multivalued_index.range(*docid);
let is_empty = range.start == range.end;
if !is_empty {
output[i] = Some(self.values.get_val(range.start));
output[i] = Some(Some(self.values.get_val(range.start)));
} else {
output[i] = None;
output[i] = Some(None);
}
}
}
@@ -152,7 +193,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
if !is_empty {
let val = self.values.get_val(row_range.start);
if range.contains(&val) {
output[i] = Some(val);
output[i] = Some(Some(val));
} else {
output[i] = None;
}
@@ -163,7 +204,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
}
(
ColumnIndex::Multivalued(multivalued_index),
ValueRange::GreaterThan(threshold, _),
ValueRange::GreaterThan(threshold, nulls_match),
) => {
for (i, docid) in docids.iter().enumerate() {
let row_range = multivalued_index.range(*docid);
@@ -171,28 +212,39 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
if !is_empty {
let val = self.values.get_val(row_range.start);
if val > threshold {
output[i] = Some(val);
output[i] = Some(Some(val));
} else {
output[i] = None;
}
} else {
output[i] = None;
if nulls_match {
output[i] = Some(None);
} else {
output[i] = None;
}
}
}
}
(ColumnIndex::Multivalued(multivalued_index), ValueRange::LessThan(threshold, _)) => {
(
ColumnIndex::Multivalued(multivalued_index),
ValueRange::LessThan(threshold, nulls_match),
) => {
for (i, docid) in docids.iter().enumerate() {
let row_range = multivalued_index.range(*docid);
let is_empty = row_range.start == row_range.end;
if !is_empty {
let val = self.values.get_val(row_range.start);
if val < threshold {
output[i] = Some(val);
output[i] = Some(Some(val));
} else {
output[i] = None;
}
} else {
output[i] = None;
if nulls_match {
output[i] = Some(None);
} else {
output[i] = None;
}
}
}
}

View File

@@ -116,23 +116,43 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
fn get_vals_in_value_range(
&self,
indexes: &[u32],
output: &mut [Option<T>],
output: &mut [Option<Option<T>>],
value_range: ValueRange<T>,
) {
assert!(indexes.len() == output.len());
match value_range {
ValueRange::All => self.get_vals_opt(indexes, output),
ValueRange::All => {
for (out, idx) in output.iter_mut().zip(indexes) {
*out = Some(Some(self.get_val(*idx)));
}
}
ValueRange::Inclusive(range) => {
let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
for (out_x4, idx_x4) in out_and_idx_chunks {
let v0 = self.get_val(idx_x4[0]);
out_x4[0] = if range.contains(&v0) { Some(v0) } else { None };
out_x4[0] = if range.contains(&v0) {
Some(Some(v0))
} else {
None
};
let v1 = self.get_val(idx_x4[1]);
out_x4[1] = if range.contains(&v1) { Some(v1) } else { None };
out_x4[1] = if range.contains(&v1) {
Some(Some(v1))
} else {
None
};
let v2 = self.get_val(idx_x4[2]);
out_x4[2] = if range.contains(&v2) { Some(v2) } else { None };
out_x4[2] = if range.contains(&v2) {
Some(Some(v2))
} else {
None
};
let v3 = self.get_val(idx_x4[3]);
out_x4[3] = if range.contains(&v3) { Some(v3) } else { None };
out_x4[3] = if range.contains(&v3) {
Some(Some(v3))
} else {
None
};
}
let out_and_idx_chunks = output
.chunks_exact_mut(4)
@@ -141,20 +161,24 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
.zip(indexes.chunks_exact(4).remainder());
for (out, idx) in out_and_idx_chunks {
let v = self.get_val(*idx);
*out = if range.contains(&v) { Some(v) } else { None };
*out = if range.contains(&v) {
Some(Some(v))
} else {
None
};
}
}
ValueRange::GreaterThan(threshold, _) => {
let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
for (out_x4, idx_x4) in out_and_idx_chunks {
let v0 = self.get_val(idx_x4[0]);
out_x4[0] = if v0 > threshold { Some(v0) } else { None };
out_x4[0] = if v0 > threshold { Some(Some(v0)) } else { None };
let v1 = self.get_val(idx_x4[1]);
out_x4[1] = if v1 > threshold { Some(v1) } else { None };
out_x4[1] = if v1 > threshold { Some(Some(v1)) } else { None };
let v2 = self.get_val(idx_x4[2]);
out_x4[2] = if v2 > threshold { Some(v2) } else { None };
out_x4[2] = if v2 > threshold { Some(Some(v2)) } else { None };
let v3 = self.get_val(idx_x4[3]);
out_x4[3] = if v3 > threshold { Some(v3) } else { None };
out_x4[3] = if v3 > threshold { Some(Some(v3)) } else { None };
}
let out_and_idx_chunks = output
.chunks_exact_mut(4)
@@ -163,20 +187,20 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
.zip(indexes.chunks_exact(4).remainder());
for (out, idx) in out_and_idx_chunks {
let v = self.get_val(*idx);
*out = if v > threshold { Some(v) } else { None };
*out = if v > threshold { Some(Some(v)) } else { None };
}
}
ValueRange::LessThan(threshold, _) => {
let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
for (out_x4, idx_x4) in out_and_idx_chunks {
let v0 = self.get_val(idx_x4[0]);
out_x4[0] = if v0 < threshold { Some(v0) } else { None };
out_x4[0] = if v0 < threshold { Some(Some(v0)) } else { None };
let v1 = self.get_val(idx_x4[1]);
out_x4[1] = if v1 < threshold { Some(v1) } else { None };
out_x4[1] = if v1 < threshold { Some(Some(v1)) } else { None };
let v2 = self.get_val(idx_x4[2]);
out_x4[2] = if v2 < threshold { Some(v2) } else { None };
out_x4[2] = if v2 < threshold { Some(Some(v2)) } else { None };
let v3 = self.get_val(idx_x4[3]);
out_x4[3] = if v3 < threshold { Some(v3) } else { None };
out_x4[3] = if v3 < threshold { Some(Some(v3)) } else { None };
}
let out_and_idx_chunks = output
.chunks_exact_mut(4)
@@ -185,7 +209,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
.zip(indexes.chunks_exact(4).remainder());
for (out, idx) in out_and_idx_chunks {
let v = self.get_val(*idx);
*out = if v < threshold { Some(v) } else { None };
*out = if v < threshold { Some(Some(v)) } else { None };
}
}
}
@@ -298,6 +322,16 @@ impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
/// Returns the number of values stored in this column, which is always `0`
/// for the empty column.
fn num_vals(&self) -> u32 {
0
}
/// Value-range lookup for the empty column.
///
/// This implementation never writes to `output`; calling it is treated as an
/// internal error.
///
/// # Panics
///
/// Always panics with an "Internal Error" message, regardless of the
/// arguments passed.
fn get_vals_in_value_range(
&self,
indexes: &[u32],
output: &mut [Option<Option<T>>],
value_range: ValueRange<T>,
) {
// Bind the arguments to `_` so they count as used; none of them is read
// before the panic below.
let _ = (indexes, output, value_range);
panic!("Internal Error: Called get_vals_in_value_range of empty column.")
}
}
impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnValues<T>> {
@@ -315,7 +349,7 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
fn get_vals_in_value_range(
&self,
indexes: &[u32],
output: &mut [Option<T>],
output: &mut [Option<Option<T>>],
value_range: ValueRange<T>,
) {
self.as_ref()

View File

@@ -110,12 +110,14 @@ impl ColumnValues for BitpackedReader {
fn get_vals_in_value_range(
&self,
indexes: &[u32],
output: &mut [Option<u64>],
output: &mut [Option<Option<u64>>],
value_range: ValueRange<u64>,
) {
match value_range {
ValueRange::All => {
self.get_vals_opt(indexes, output);
for (out, idx) in output.iter_mut().zip(indexes) {
*out = Some(Some(self.get_val(*idx)));
}
}
ValueRange::Inclusive(range) => {
if let Some(transformed_range) =
@@ -124,7 +126,8 @@ impl ColumnValues for BitpackedReader {
for (i, doc) in indexes.iter().enumerate() {
let raw_val = self.unpack_val(*doc);
if transformed_range.contains(&raw_val) {
output[i] = Some(self.stats.min_value + self.stats.gcd.get() * raw_val);
output[i] =
Some(Some(self.stats.min_value + self.stats.gcd.get() * raw_val));
} else {
output[i] = None;
}
@@ -137,7 +140,9 @@ impl ColumnValues for BitpackedReader {
}
ValueRange::GreaterThan(threshold, _) => {
if threshold < self.stats.min_value {
self.get_vals_opt(indexes, output);
for (out, idx) in output.iter_mut().zip(indexes) {
*out = Some(Some(self.get_val(*idx)));
}
} else if threshold >= self.stats.max_value {
for out in output.iter_mut() {
*out = None;
@@ -147,7 +152,8 @@ impl ColumnValues for BitpackedReader {
for (i, doc) in indexes.iter().enumerate() {
let raw_val = self.unpack_val(*doc);
if raw_val > raw_threshold {
output[i] = Some(self.stats.min_value + self.stats.gcd.get() * raw_val);
output[i] =
Some(Some(self.stats.min_value + self.stats.gcd.get() * raw_val));
} else {
output[i] = None;
}
@@ -156,7 +162,9 @@ impl ColumnValues for BitpackedReader {
}
ValueRange::LessThan(threshold, _) => {
if threshold > self.stats.max_value {
self.get_vals_opt(indexes, output);
for (out, idx) in output.iter_mut().zip(indexes) {
*out = Some(Some(self.get_val(*idx)));
}
} else if threshold <= self.stats.min_value {
for out in output.iter_mut() {
*out = None;
@@ -188,7 +196,8 @@ impl ColumnValues for BitpackedReader {
for (i, doc) in indexes.iter().enumerate() {
let raw_val = self.unpack_val(*doc);
if raw_val < raw_threshold {
output[i] = Some(self.stats.min_value + self.stats.gcd.get() * raw_val);
output[i] =
Some(Some(self.stats.min_value + self.stats.gcd.get() * raw_val));
} else {
output[i] = None;
}
@@ -197,7 +206,6 @@ impl ColumnValues for BitpackedReader {
}
}
}
fn get_row_ids_for_value_range(
&self,
range: ValueRange<u64>,

View File

@@ -72,6 +72,7 @@ impl<T: FastValue> SortKeyComputer for SortByStaticFastValue<T> {
sort_column,
typ: PhantomData,
buffer: Vec::new(),
fetch_buffer: Vec::new(),
})
}
}
@@ -80,6 +81,7 @@ pub struct SortByFastValueSegmentSortKeyComputer<T> {
sort_column: Column<u64>,
typ: PhantomData<T>,
buffer: Vec<Option<u64>>,
fetch_buffer: Vec<Option<Option<u64>>>,
}
impl<T: FastValue> SegmentSortKeyComputer for SortByFastValueSegmentSortKeyComputer<T> {
@@ -93,9 +95,13 @@ impl<T: FastValue> SegmentSortKeyComputer for SortByFastValueSegmentSortKeyCompu
}
fn segment_sort_keys(&mut self, docs: &[DocId]) -> &mut Vec<Self::SegmentSortKey> {
self.buffer.resize(docs.len(), None);
self.fetch_buffer.resize(docs.len(), None);
self.sort_column
.first_vals_in_value_range(docs, &mut self.buffer, ValueRange::All);
.first_vals_in_value_range(docs, &mut self.fetch_buffer, ValueRange::All);
self.buffer.clear();
self.buffer
.extend(self.fetch_buffer.iter().map(|val| val.flatten()));
&mut self.buffer
}

View File

@@ -41,6 +41,7 @@ impl SortKeyComputer for SortByString {
Ok(ByStringColumnSegmentSortKeyComputer {
str_column_opt,
buffer: Vec::new(),
fetch_buffer: Vec::new(),
})
}
}
@@ -48,6 +49,7 @@ impl SortKeyComputer for SortByString {
pub struct ByStringColumnSegmentSortKeyComputer {
str_column_opt: Option<StrColumn>,
buffer: Vec<Option<TermOrdinal>>,
fetch_buffer: Vec<Option<Option<TermOrdinal>>>,
}
impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
@@ -62,12 +64,17 @@ impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
}
fn segment_sort_keys(&mut self, docs: &[DocId]) -> &mut Vec<Self::SegmentSortKey> {
self.buffer.resize(docs.len(), None);
self.fetch_buffer.resize(docs.len(), None);
if let Some(str_column) = &self.str_column_opt {
str_column
.ords()
.first_vals_in_value_range(docs, &mut self.buffer, ValueRange::All);
str_column.ords().first_vals_in_value_range(
docs,
&mut self.fetch_buffer,
ValueRange::All,
);
}
self.buffer.clear();
self.buffer
.extend(self.fetch_buffer.iter().map(|val| val.flatten()));
&mut self.buffer
}