Simpler implementation of first_vals_in_value_range.

This commit is contained in:
Stu Hood
2025-12-28 12:19:47 -07:00
parent 00110312c9
commit 865a12f4bb
3 changed files with 115 additions and 97 deletions

View File

@@ -1,6 +1,7 @@
mod dictionary_encoded;
mod serialize;
use std::cell::RefCell;
use std::fmt::{self, Debug};
use std::io::Write;
use std::ops::{Range, RangeInclusive};
@@ -19,6 +20,11 @@ use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
use crate::column_values::{ColumnValues, monotonic_map_column};
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
thread_local! {
static ROWS: RefCell<Vec<RowId>> = const { RefCell::new(Vec::new()) };
static DOCS: RefCell<Vec<DocId>> = const { RefCell::new(Vec::new()) };
}
#[derive(Clone)]
pub struct Column<T = u64> {
pub index: ColumnIndex,
@@ -156,9 +162,6 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
output: &mut Vec<crate::ComparableDoc<Option<T>, DocId>>,
value_range: ValueRange<T>,
) {
// TODO: Move `COLLECT_BLOCK_BUFFER_LEN` to allow for use here, or use a different constant
// in this context.
const BLOCK_LEN: usize = 64; // Corresponds to COLLECT_BLOCK_BUFFER_LEN in tantivy's docset
match (&self.index, value_range) {
(ColumnIndex::Empty { .. }, value_range) => {
let nulls_match = match &value_range {
@@ -178,47 +181,9 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
}
(ColumnIndex::Full, value_range) => {
self.values
.get_vals_in_value_range(input_docs, output, value_range);
.get_vals_in_value_range(input_docs, input_docs, output, value_range);
}
(ColumnIndex::Optional(optional_index), value_range) => {
let len = input_docs.len();
// Ensure the input docids length does not exceed BLOCK_LEN for stack allocation
// safety. If it does, we might need to handle this with multiple
// chunks or fallback to heap. For now, an assert is used to confirm
// expected usage within batch processing limits.
assert!(
len <= BLOCK_LEN,
"Input docids length ({}) exceeds BLOCK_LEN ({})",
len,
BLOCK_LEN
);
let mut input_docs_buffer = [0u32; BLOCK_LEN];
input_docs_buffer[..len].copy_from_slice(input_docs);
let mut dense_row_ids_buffer = [0u32; BLOCK_LEN];
let mut dense_values_buffer = [T::default(); BLOCK_LEN];
let mut presence_mask: u64 = 0; // Bitmask to track which input_docs have a value
let mut num_present = 0;
// Phase 1: Identify existing RowIds and build dense_row_ids_buffer
for (i, &doc_id) in input_docs_buffer[..len].iter().enumerate() {
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
dense_row_ids_buffer[num_present] = row_id;
presence_mask |= 1u64 << i; // Set bit for present docid
num_present += 1;
}
}
// Phase 2: Batch fetch values for present docs
if num_present > 0 {
self.values.get_vals(
&dense_row_ids_buffer[..num_present],
&mut dense_values_buffer[..num_present],
);
}
// Determine if nulls match the value range
let nulls_match = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(_) => false,
@@ -226,35 +191,63 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
ValueRange::LessThan(_, nulls_match) => *nulls_match,
};
// Phase 3: Filter and merge results, reconstructing docids and values
let mut dense_values_cursor = 0;
for i in 0..len {
let original_doc_id = input_docs_buffer[i];
if (presence_mask & (1u64 << i)) != 0 {
// This doc_id was present in the optional index and has a value
let val = dense_values_buffer[dense_values_cursor];
dense_values_cursor += 1;
let fallback_needed = ROWS.with(|rows_cell| {
DOCS.with(|docs_cell| {
let mut rows = rows_cell.borrow_mut();
let mut docs = docs_cell.borrow_mut();
rows.clear();
docs.clear();
// Check if the value matches the value range
let value_matches = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(r) => r.contains(&val),
ValueRange::GreaterThan(t, _) => val > *t,
ValueRange::LessThan(t, _) => val < *t,
};
let mut has_nulls = false;
if value_matches {
for &doc_id in input_docs {
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
rows.push(row_id);
docs.push(doc_id);
} else {
has_nulls = true;
if nulls_match {
break;
}
}
}
if !has_nulls || !nulls_match {
self.values.get_vals_in_value_range(
&rows,
&docs,
output,
value_range.clone(),
);
return false;
}
true
})
});
if fallback_needed {
for &doc_id in input_docs {
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
let val = self.values.get_val(row_id);
let value_matches = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(r) => r.contains(&val),
ValueRange::GreaterThan(t, _) => val > *t,
ValueRange::LessThan(t, _) => val < *t,
};
if value_matches {
output.push(crate::ComparableDoc {
doc: doc_id,
sort_key: Some(val),
});
}
} else if nulls_match {
output.push(crate::ComparableDoc {
doc: original_doc_id,
sort_key: Some(val),
doc: doc_id,
sort_key: None,
});
}
} else if nulls_match {
// This doc_id was not present in the optional index (null) and nulls match
output.push(crate::ComparableDoc {
doc: original_doc_id,
sort_key: None,
});
}
}
}

View File

@@ -116,6 +116,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
value_range: ValueRange<T>,
) {
@@ -130,25 +131,30 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
let val3 = self.get_val(idx3);
output.push(crate::ComparableDoc {
doc: idx0,
doc: doc0,
sort_key: Some(val0),
});
output.push(crate::ComparableDoc {
doc: idx1,
doc: doc1,
sort_key: Some(val1),
});
output.push(crate::ComparableDoc {
doc: idx2,
doc: doc2,
sort_key: Some(val2),
});
output.push(crate::ComparableDoc {
doc: idx3,
doc: doc3,
sort_key: Some(val3),
});
@@ -162,6 +168,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
@@ -169,25 +180,25 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
if range.contains(&val0) {
output.push(crate::ComparableDoc {
doc: idx0,
doc: doc0,
sort_key: Some(val0),
});
}
if range.contains(&val1) {
output.push(crate::ComparableDoc {
doc: idx1,
doc: doc1,
sort_key: Some(val1),
});
}
if range.contains(&val2) {
output.push(crate::ComparableDoc {
doc: idx2,
doc: doc2,
sort_key: Some(val2),
});
}
if range.contains(&val3) {
output.push(crate::ComparableDoc {
doc: idx3,
doc: doc3,
sort_key: Some(val3),
});
}
@@ -202,6 +213,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
@@ -209,25 +225,25 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
if val0 > *threshold {
output.push(crate::ComparableDoc {
doc: idx0,
doc: doc0,
sort_key: Some(val0),
});
}
if val1 > *threshold {
output.push(crate::ComparableDoc {
doc: idx1,
doc: doc1,
sort_key: Some(val1),
});
}
if val2 > *threshold {
output.push(crate::ComparableDoc {
doc: idx2,
doc: doc2,
sort_key: Some(val2),
});
}
if val3 > *threshold {
output.push(crate::ComparableDoc {
doc: idx3,
doc: doc3,
sort_key: Some(val3),
});
}
@@ -242,6 +258,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
@@ -249,25 +270,25 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
if val0 < *threshold {
output.push(crate::ComparableDoc {
doc: idx0,
doc: doc0,
sort_key: Some(val0),
});
}
if val1 < *threshold {
output.push(crate::ComparableDoc {
doc: idx1,
doc: doc1,
sort_key: Some(val1),
});
}
if val2 < *threshold {
output.push(crate::ComparableDoc {
doc: idx2,
doc: doc2,
sort_key: Some(val2),
});
}
if val3 < *threshold {
output.push(crate::ComparableDoc {
doc: idx3,
doc: doc3,
sort_key: Some(val3),
});
}
@@ -279,6 +300,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
// Process remaining elements (0 to 3)
while read_head < len {
let idx = input_indexes[read_head];
let doc = input_doc_ids[read_head];
let val = self.get_val(idx);
let matches = match value_range {
// 'value_range' is still moved here. This is the outer `value_range`
@@ -289,7 +311,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
};
if matches {
output.push(crate::ComparableDoc {
doc: idx,
doc,
sort_key: Some(val),
});
}
@@ -408,10 +430,11 @@ impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
value_range: ValueRange<T>,
) {
let _ = (input_indexes, output, value_range);
let _ = (input_indexes, input_doc_ids, output, value_range);
panic!("Internal Error: Called get_vals_in_value_range of empty column.")
}
}
@@ -431,11 +454,12 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
value_range: ValueRange<T>,
) {
self.as_ref()
.get_vals_in_value_range(input_indexes, output, value_range)
.get_vals_in_value_range(input_indexes, input_doc_ids, output, value_range)
}
#[inline(always)]

View File

@@ -110,14 +110,15 @@ impl ColumnValues for BitpackedReader {
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<u64>, crate::DocId>>,
value_range: ValueRange<u64>,
) {
match value_range {
ValueRange::All => {
for &idx in input_indexes {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc: idx,
doc,
sort_key: Some(self.get_val(idx)),
});
}
@@ -126,8 +127,8 @@ impl ColumnValues for BitpackedReader {
if let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
{
for &doc in input_indexes {
let raw_val = self.unpack_val(doc);
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.unpack_val(idx);
if transformed_range.contains(&raw_val) {
output.push(crate::ComparableDoc {
doc,
@@ -141,9 +142,9 @@ impl ColumnValues for BitpackedReader {
}
ValueRange::GreaterThan(threshold, _) => {
if threshold < self.stats.min_value {
for &idx in input_indexes {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc: idx,
doc,
sort_key: Some(self.get_val(idx)),
});
}
@@ -151,8 +152,8 @@ impl ColumnValues for BitpackedReader {
// All filtered out
} else {
let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get();
for &doc in input_indexes {
let raw_val = self.unpack_val(doc);
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.unpack_val(idx);
if raw_val > raw_threshold {
output.push(crate::ComparableDoc {
doc,
@@ -166,9 +167,9 @@ impl ColumnValues for BitpackedReader {
}
ValueRange::LessThan(threshold, _) => {
if threshold > self.stats.max_value {
for &idx in input_indexes {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc: idx,
doc,
sort_key: Some(self.get_val(idx)),
});
}
@@ -183,8 +184,8 @@ impl ColumnValues for BitpackedReader {
diff / gcd + 1
};
for &doc in input_indexes {
let raw_val = self.unpack_val(doc);
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.unpack_val(idx);
if raw_val < raw_threshold {
output.push(crate::ComparableDoc {
doc,