mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 15:10:40 +00:00
Simpler implementation of first_vals_in_value_range.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
mod dictionary_encoded;
|
||||
mod serialize;
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::fmt::{self, Debug};
|
||||
use std::io::Write;
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
@@ -19,6 +20,11 @@ use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
|
||||
use crate::column_values::{ColumnValues, monotonic_map_column};
|
||||
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
|
||||
|
||||
thread_local! {
|
||||
static ROWS: RefCell<Vec<RowId>> = const { RefCell::new(Vec::new()) };
|
||||
static DOCS: RefCell<Vec<DocId>> = const { RefCell::new(Vec::new()) };
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Column<T = u64> {
|
||||
pub index: ColumnIndex,
|
||||
@@ -156,9 +162,6 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
|
||||
output: &mut Vec<crate::ComparableDoc<Option<T>, DocId>>,
|
||||
value_range: ValueRange<T>,
|
||||
) {
|
||||
// TODO: Move `COLLECT_BLOCK_BUFFER_LEN` to allow for use here, or use a different constant
|
||||
// in this context.
|
||||
const BLOCK_LEN: usize = 64; // Corresponds to COLLECT_BLOCK_BUFFER_LEN in tantivy's docset
|
||||
match (&self.index, value_range) {
|
||||
(ColumnIndex::Empty { .. }, value_range) => {
|
||||
let nulls_match = match &value_range {
|
||||
@@ -178,47 +181,9 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
|
||||
}
|
||||
(ColumnIndex::Full, value_range) => {
|
||||
self.values
|
||||
.get_vals_in_value_range(input_docs, output, value_range);
|
||||
.get_vals_in_value_range(input_docs, input_docs, output, value_range);
|
||||
}
|
||||
(ColumnIndex::Optional(optional_index), value_range) => {
|
||||
let len = input_docs.len();
|
||||
// Ensure the input docids length does not exceed BLOCK_LEN for stack allocation
|
||||
// safety. If it does, we might need to handle this with multiple
|
||||
// chunks or fallback to heap. For now, an assert is used to confirm
|
||||
// expected usage within batch processing limits.
|
||||
assert!(
|
||||
len <= BLOCK_LEN,
|
||||
"Input docids length ({}) exceeds BLOCK_LEN ({})",
|
||||
len,
|
||||
BLOCK_LEN
|
||||
);
|
||||
|
||||
let mut input_docs_buffer = [0u32; BLOCK_LEN];
|
||||
input_docs_buffer[..len].copy_from_slice(input_docs);
|
||||
|
||||
let mut dense_row_ids_buffer = [0u32; BLOCK_LEN];
|
||||
let mut dense_values_buffer = [T::default(); BLOCK_LEN];
|
||||
let mut presence_mask: u64 = 0; // Bitmask to track which input_docs have a value
|
||||
let mut num_present = 0;
|
||||
|
||||
// Phase 1: Identify existing RowIds and build dense_row_ids_buffer
|
||||
for (i, &doc_id) in input_docs_buffer[..len].iter().enumerate() {
|
||||
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
|
||||
dense_row_ids_buffer[num_present] = row_id;
|
||||
presence_mask |= 1u64 << i; // Set bit for present docid
|
||||
num_present += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Batch fetch values for present docs
|
||||
if num_present > 0 {
|
||||
self.values.get_vals(
|
||||
&dense_row_ids_buffer[..num_present],
|
||||
&mut dense_values_buffer[..num_present],
|
||||
);
|
||||
}
|
||||
|
||||
// Determine if nulls match the value range
|
||||
let nulls_match = match &value_range {
|
||||
ValueRange::All => true,
|
||||
ValueRange::Inclusive(_) => false,
|
||||
@@ -226,35 +191,63 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
|
||||
ValueRange::LessThan(_, nulls_match) => *nulls_match,
|
||||
};
|
||||
|
||||
// Phase 3: Filter and merge results, reconstructing docids and values
|
||||
let mut dense_values_cursor = 0;
|
||||
for i in 0..len {
|
||||
let original_doc_id = input_docs_buffer[i];
|
||||
if (presence_mask & (1u64 << i)) != 0 {
|
||||
// This doc_id was present in the optional index and has a value
|
||||
let val = dense_values_buffer[dense_values_cursor];
|
||||
dense_values_cursor += 1;
|
||||
let fallback_needed = ROWS.with(|rows_cell| {
|
||||
DOCS.with(|docs_cell| {
|
||||
let mut rows = rows_cell.borrow_mut();
|
||||
let mut docs = docs_cell.borrow_mut();
|
||||
rows.clear();
|
||||
docs.clear();
|
||||
|
||||
// Check if the value matches the value range
|
||||
let value_matches = match &value_range {
|
||||
ValueRange::All => true,
|
||||
ValueRange::Inclusive(r) => r.contains(&val),
|
||||
ValueRange::GreaterThan(t, _) => val > *t,
|
||||
ValueRange::LessThan(t, _) => val < *t,
|
||||
};
|
||||
let mut has_nulls = false;
|
||||
|
||||
if value_matches {
|
||||
for &doc_id in input_docs {
|
||||
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
|
||||
rows.push(row_id);
|
||||
docs.push(doc_id);
|
||||
} else {
|
||||
has_nulls = true;
|
||||
if nulls_match {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !has_nulls || !nulls_match {
|
||||
self.values.get_vals_in_value_range(
|
||||
&rows,
|
||||
&docs,
|
||||
output,
|
||||
value_range.clone(),
|
||||
);
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})
|
||||
});
|
||||
|
||||
if fallback_needed {
|
||||
for &doc_id in input_docs {
|
||||
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
|
||||
let val = self.values.get_val(row_id);
|
||||
let value_matches = match &value_range {
|
||||
ValueRange::All => true,
|
||||
ValueRange::Inclusive(r) => r.contains(&val),
|
||||
ValueRange::GreaterThan(t, _) => val > *t,
|
||||
ValueRange::LessThan(t, _) => val < *t,
|
||||
};
|
||||
|
||||
if value_matches {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: doc_id,
|
||||
sort_key: Some(val),
|
||||
});
|
||||
}
|
||||
} else if nulls_match {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: original_doc_id,
|
||||
sort_key: Some(val),
|
||||
doc: doc_id,
|
||||
sort_key: None,
|
||||
});
|
||||
}
|
||||
} else if nulls_match {
|
||||
// This doc_id was not present in the optional index (null) and nulls match
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: original_doc_id,
|
||||
sort_key: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,6 +116,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
fn get_vals_in_value_range(
|
||||
&self,
|
||||
input_indexes: &[u32],
|
||||
input_doc_ids: &[u32],
|
||||
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
|
||||
value_range: ValueRange<T>,
|
||||
) {
|
||||
@@ -130,25 +131,30 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
let idx2 = input_indexes[read_head + 2];
|
||||
let idx3 = input_indexes[read_head + 3];
|
||||
|
||||
let doc0 = input_doc_ids[read_head];
|
||||
let doc1 = input_doc_ids[read_head + 1];
|
||||
let doc2 = input_doc_ids[read_head + 2];
|
||||
let doc3 = input_doc_ids[read_head + 3];
|
||||
|
||||
let val0 = self.get_val(idx0);
|
||||
let val1 = self.get_val(idx1);
|
||||
let val2 = self.get_val(idx2);
|
||||
let val3 = self.get_val(idx3);
|
||||
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx0,
|
||||
doc: doc0,
|
||||
sort_key: Some(val0),
|
||||
});
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx1,
|
||||
doc: doc1,
|
||||
sort_key: Some(val1),
|
||||
});
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx2,
|
||||
doc: doc2,
|
||||
sort_key: Some(val2),
|
||||
});
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx3,
|
||||
doc: doc3,
|
||||
sort_key: Some(val3),
|
||||
});
|
||||
|
||||
@@ -162,6 +168,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
let idx2 = input_indexes[read_head + 2];
|
||||
let idx3 = input_indexes[read_head + 3];
|
||||
|
||||
let doc0 = input_doc_ids[read_head];
|
||||
let doc1 = input_doc_ids[read_head + 1];
|
||||
let doc2 = input_doc_ids[read_head + 2];
|
||||
let doc3 = input_doc_ids[read_head + 3];
|
||||
|
||||
let val0 = self.get_val(idx0);
|
||||
let val1 = self.get_val(idx1);
|
||||
let val2 = self.get_val(idx2);
|
||||
@@ -169,25 +180,25 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
|
||||
if range.contains(&val0) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx0,
|
||||
doc: doc0,
|
||||
sort_key: Some(val0),
|
||||
});
|
||||
}
|
||||
if range.contains(&val1) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx1,
|
||||
doc: doc1,
|
||||
sort_key: Some(val1),
|
||||
});
|
||||
}
|
||||
if range.contains(&val2) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx2,
|
||||
doc: doc2,
|
||||
sort_key: Some(val2),
|
||||
});
|
||||
}
|
||||
if range.contains(&val3) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx3,
|
||||
doc: doc3,
|
||||
sort_key: Some(val3),
|
||||
});
|
||||
}
|
||||
@@ -202,6 +213,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
let idx2 = input_indexes[read_head + 2];
|
||||
let idx3 = input_indexes[read_head + 3];
|
||||
|
||||
let doc0 = input_doc_ids[read_head];
|
||||
let doc1 = input_doc_ids[read_head + 1];
|
||||
let doc2 = input_doc_ids[read_head + 2];
|
||||
let doc3 = input_doc_ids[read_head + 3];
|
||||
|
||||
let val0 = self.get_val(idx0);
|
||||
let val1 = self.get_val(idx1);
|
||||
let val2 = self.get_val(idx2);
|
||||
@@ -209,25 +225,25 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
|
||||
if val0 > *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx0,
|
||||
doc: doc0,
|
||||
sort_key: Some(val0),
|
||||
});
|
||||
}
|
||||
if val1 > *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx1,
|
||||
doc: doc1,
|
||||
sort_key: Some(val1),
|
||||
});
|
||||
}
|
||||
if val2 > *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx2,
|
||||
doc: doc2,
|
||||
sort_key: Some(val2),
|
||||
});
|
||||
}
|
||||
if val3 > *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx3,
|
||||
doc: doc3,
|
||||
sort_key: Some(val3),
|
||||
});
|
||||
}
|
||||
@@ -242,6 +258,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
let idx2 = input_indexes[read_head + 2];
|
||||
let idx3 = input_indexes[read_head + 3];
|
||||
|
||||
let doc0 = input_doc_ids[read_head];
|
||||
let doc1 = input_doc_ids[read_head + 1];
|
||||
let doc2 = input_doc_ids[read_head + 2];
|
||||
let doc3 = input_doc_ids[read_head + 3];
|
||||
|
||||
let val0 = self.get_val(idx0);
|
||||
let val1 = self.get_val(idx1);
|
||||
let val2 = self.get_val(idx2);
|
||||
@@ -249,25 +270,25 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
|
||||
if val0 < *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx0,
|
||||
doc: doc0,
|
||||
sort_key: Some(val0),
|
||||
});
|
||||
}
|
||||
if val1 < *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx1,
|
||||
doc: doc1,
|
||||
sort_key: Some(val1),
|
||||
});
|
||||
}
|
||||
if val2 < *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx2,
|
||||
doc: doc2,
|
||||
sort_key: Some(val2),
|
||||
});
|
||||
}
|
||||
if val3 < *threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx3,
|
||||
doc: doc3,
|
||||
sort_key: Some(val3),
|
||||
});
|
||||
}
|
||||
@@ -279,6 +300,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
// Process remaining elements (0 to 3)
|
||||
while read_head < len {
|
||||
let idx = input_indexes[read_head];
|
||||
let doc = input_doc_ids[read_head];
|
||||
let val = self.get_val(idx);
|
||||
let matches = match value_range {
|
||||
// 'value_range' is still moved here. This is the outer `value_range`
|
||||
@@ -289,7 +311,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
};
|
||||
if matches {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx,
|
||||
doc,
|
||||
sort_key: Some(val),
|
||||
});
|
||||
}
|
||||
@@ -408,10 +430,11 @@ impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
|
||||
fn get_vals_in_value_range(
|
||||
&self,
|
||||
input_indexes: &[u32],
|
||||
input_doc_ids: &[u32],
|
||||
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
|
||||
value_range: ValueRange<T>,
|
||||
) {
|
||||
let _ = (input_indexes, output, value_range);
|
||||
let _ = (input_indexes, input_doc_ids, output, value_range);
|
||||
panic!("Internal Error: Called get_vals_in_value_range of empty column.")
|
||||
}
|
||||
}
|
||||
@@ -431,11 +454,12 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
|
||||
fn get_vals_in_value_range(
|
||||
&self,
|
||||
input_indexes: &[u32],
|
||||
input_doc_ids: &[u32],
|
||||
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
|
||||
value_range: ValueRange<T>,
|
||||
) {
|
||||
self.as_ref()
|
||||
.get_vals_in_value_range(input_indexes, output, value_range)
|
||||
.get_vals_in_value_range(input_indexes, input_doc_ids, output, value_range)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
|
||||
@@ -110,14 +110,15 @@ impl ColumnValues for BitpackedReader {
|
||||
fn get_vals_in_value_range(
|
||||
&self,
|
||||
input_indexes: &[u32],
|
||||
input_doc_ids: &[u32],
|
||||
output: &mut Vec<crate::ComparableDoc<Option<u64>, crate::DocId>>,
|
||||
value_range: ValueRange<u64>,
|
||||
) {
|
||||
match value_range {
|
||||
ValueRange::All => {
|
||||
for &idx in input_indexes {
|
||||
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx,
|
||||
doc,
|
||||
sort_key: Some(self.get_val(idx)),
|
||||
});
|
||||
}
|
||||
@@ -126,8 +127,8 @@ impl ColumnValues for BitpackedReader {
|
||||
if let Some(transformed_range) =
|
||||
transform_range_before_linear_transformation(&self.stats, range)
|
||||
{
|
||||
for &doc in input_indexes {
|
||||
let raw_val = self.unpack_val(doc);
|
||||
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
|
||||
let raw_val = self.unpack_val(idx);
|
||||
if transformed_range.contains(&raw_val) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc,
|
||||
@@ -141,9 +142,9 @@ impl ColumnValues for BitpackedReader {
|
||||
}
|
||||
ValueRange::GreaterThan(threshold, _) => {
|
||||
if threshold < self.stats.min_value {
|
||||
for &idx in input_indexes {
|
||||
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx,
|
||||
doc,
|
||||
sort_key: Some(self.get_val(idx)),
|
||||
});
|
||||
}
|
||||
@@ -151,8 +152,8 @@ impl ColumnValues for BitpackedReader {
|
||||
// All filtered out
|
||||
} else {
|
||||
let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get();
|
||||
for &doc in input_indexes {
|
||||
let raw_val = self.unpack_val(doc);
|
||||
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
|
||||
let raw_val = self.unpack_val(idx);
|
||||
if raw_val > raw_threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc,
|
||||
@@ -166,9 +167,9 @@ impl ColumnValues for BitpackedReader {
|
||||
}
|
||||
ValueRange::LessThan(threshold, _) => {
|
||||
if threshold > self.stats.max_value {
|
||||
for &idx in input_indexes {
|
||||
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc: idx,
|
||||
doc,
|
||||
sort_key: Some(self.get_val(idx)),
|
||||
});
|
||||
}
|
||||
@@ -183,8 +184,8 @@ impl ColumnValues for BitpackedReader {
|
||||
diff / gcd + 1
|
||||
};
|
||||
|
||||
for &doc in input_indexes {
|
||||
let raw_val = self.unpack_val(doc);
|
||||
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
|
||||
let raw_val = self.unpack_val(idx);
|
||||
if raw_val < raw_threshold {
|
||||
output.push(crate::ComparableDoc {
|
||||
doc,
|
||||
|
||||
Reference in New Issue
Block a user