mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 15:10:40 +00:00
WIP: Add ValueRange.
This commit is contained in:
@@ -143,7 +143,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
|
||||
#[inline]
|
||||
pub fn get_docids_for_value_range(
|
||||
&self,
|
||||
value_range: RangeInclusive<T>,
|
||||
value_range: ValueRange<T>,
|
||||
selected_docid_range: Range<u32>,
|
||||
doc_ids: &mut Vec<u32>,
|
||||
) {
|
||||
@@ -168,6 +168,18 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A range of values.
|
||||
///
|
||||
/// This type is intended to be used in batch APIs, where the cost of unpacking the enum
|
||||
/// is outweighed by the time spent processing a batch.
|
||||
///
|
||||
/// Implementers should pattern match on the variants to use optimized loops for each case.
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ValueRange<T> {
|
||||
/// A range that includes both start and end.
|
||||
Inclusive(RangeInclusive<T>),
|
||||
}
|
||||
|
||||
impl BinarySerializable for Cardinality {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
|
||||
self.to_code().serialize(writer)
|
||||
|
||||
@@ -339,7 +339,7 @@ mod tests {
|
||||
use std::ops::Range;
|
||||
|
||||
use super::MultiValueIndex;
|
||||
use crate::{ColumnarReader, DynamicColumn};
|
||||
use crate::{ColumnarReader, DynamicColumn, ValueRange};
|
||||
|
||||
fn index_to_pos_helper(
|
||||
index: &MultiValueIndex,
|
||||
@@ -419,7 +419,7 @@ mod tests {
|
||||
assert_eq!(row_id_range, 0..4);
|
||||
|
||||
let check = |range, expected| {
|
||||
let full_range = 0..=u64::MAX;
|
||||
let full_range = ValueRange::Inclusive(0..=u64::MAX);
|
||||
let mut docids = Vec::new();
|
||||
column.get_docids_for_value_range(full_range, range, &mut docids);
|
||||
assert_eq!(docids, expected);
|
||||
|
||||
@@ -7,13 +7,15 @@
|
||||
//! - Monotonically map values to u64/u128
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use downcast_rs::DowncastSync;
|
||||
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
|
||||
pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
|
||||
use crate::column::ValueRange;
|
||||
|
||||
mod merge;
|
||||
pub(crate) mod monotonic_mapping;
|
||||
pub(crate) mod monotonic_mapping_u128;
|
||||
@@ -128,15 +130,19 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
/// Note that position == docid for single value fast fields
|
||||
fn get_row_ids_for_value_range(
|
||||
&self,
|
||||
value_range: RangeInclusive<T>,
|
||||
value_range: ValueRange<T>,
|
||||
row_id_range: Range<RowId>,
|
||||
row_id_hits: &mut Vec<RowId>,
|
||||
) {
|
||||
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
|
||||
for idx in row_id_range {
|
||||
let val = self.get_val(idx);
|
||||
if value_range.contains(&val) {
|
||||
row_id_hits.push(idx);
|
||||
match value_range {
|
||||
ValueRange::Inclusive(range) => {
|
||||
for idx in row_id_range {
|
||||
let val = self.get_val(idx);
|
||||
if range.contains(&val) {
|
||||
row_id_hits.push(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -233,7 +239,7 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
|
||||
#[inline(always)]
|
||||
fn get_row_ids_for_value_range(
|
||||
&self,
|
||||
range: RangeInclusive<T>,
|
||||
range: ValueRange<T>,
|
||||
doc_id_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::ColumnValues;
|
||||
use crate::column::ValueRange;
|
||||
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
|
||||
|
||||
struct MonotonicMappingColumn<C, T, Input> {
|
||||
@@ -80,13 +81,16 @@ where
|
||||
|
||||
fn get_row_ids_for_value_range(
|
||||
&self,
|
||||
range: RangeInclusive<Output>,
|
||||
range: ValueRange<Output>,
|
||||
doc_id_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
let ValueRange::Inclusive(range) = range;
|
||||
self.from_column.get_row_ids_for_value_range(
|
||||
self.monotonic_mapping.inverse(range.start().clone())
|
||||
..=self.monotonic_mapping.inverse(range.end().clone()),
|
||||
ValueRange::Inclusive(
|
||||
self.monotonic_mapping.inverse(range.start().clone())
|
||||
..=self.monotonic_mapping.inverse(range.end().clone()),
|
||||
),
|
||||
doc_id_range,
|
||||
positions,
|
||||
)
|
||||
|
||||
@@ -25,6 +25,7 @@ use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker};
|
||||
|
||||
use crate::RowId;
|
||||
use crate::column::ValueRange;
|
||||
use crate::column_values::ColumnValues;
|
||||
|
||||
/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
|
||||
@@ -338,12 +339,15 @@ impl ColumnValues<u64> for CompactSpaceU64Accessor {
|
||||
#[inline]
|
||||
fn get_row_ids_for_value_range(
|
||||
&self,
|
||||
value_range: RangeInclusive<u64>,
|
||||
value_range: ValueRange<u64>,
|
||||
position_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
let value_range = self.0.compact_to_u128(*value_range.start() as u32)
|
||||
..=self.0.compact_to_u128(*value_range.end() as u32);
|
||||
let ValueRange::Inclusive(value_range) = value_range;
|
||||
let value_range = ValueRange::Inclusive(
|
||||
self.0.compact_to_u128(*value_range.start() as u32)
|
||||
..=self.0.compact_to_u128(*value_range.end() as u32),
|
||||
);
|
||||
self.0
|
||||
.get_row_ids_for_value_range(value_range, position_range, positions)
|
||||
}
|
||||
@@ -375,10 +379,11 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
|
||||
#[inline]
|
||||
fn get_row_ids_for_value_range(
|
||||
&self,
|
||||
value_range: RangeInclusive<u128>,
|
||||
value_range: ValueRange<u128>,
|
||||
position_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
let ValueRange::Inclusive(value_range) = value_range;
|
||||
if value_range.start() > value_range.end() {
|
||||
return;
|
||||
}
|
||||
@@ -560,7 +565,7 @@ mod tests {
|
||||
.collect::<Vec<_>>();
|
||||
let mut positions = Vec::new();
|
||||
decompressor.get_row_ids_for_value_range(
|
||||
range,
|
||||
ValueRange::Inclusive(range),
|
||||
0..decompressor.num_vals(),
|
||||
&mut positions,
|
||||
);
|
||||
@@ -604,7 +609,11 @@ mod tests {
|
||||
let val = *val;
|
||||
let pos = pos as u32;
|
||||
let mut positions = Vec::new();
|
||||
decomp.get_row_ids_for_value_range(val..=val, pos..pos + 1, &mut positions);
|
||||
decomp.get_row_ids_for_value_range(
|
||||
ValueRange::Inclusive(val..=val),
|
||||
pos..pos + 1,
|
||||
&mut positions,
|
||||
);
|
||||
assert_eq!(positions, vec![pos]);
|
||||
}
|
||||
|
||||
@@ -746,7 +755,11 @@ mod tests {
|
||||
doc_id_range: Range<u32>,
|
||||
) -> Vec<u32> {
|
||||
let mut positions = Vec::new();
|
||||
column.get_row_ids_for_value_range(value_range, doc_id_range, &mut positions);
|
||||
column.get_row_ids_for_value_range(
|
||||
ValueRange::Inclusive(value_range),
|
||||
doc_id_range,
|
||||
&mut positions,
|
||||
);
|
||||
positions
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ use common::{BinarySerializable, HasLen, OwnedBytes};
|
||||
use fastdivide::DividerU64;
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};
|
||||
|
||||
use crate::column::ValueRange;
|
||||
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
|
||||
use crate::{ColumnValues, RowId};
|
||||
|
||||
@@ -69,8 +70,9 @@ const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
|
||||
// f(bitpacked) ∈ [min_value, max_value] <=> bitpacked ∈ [min_bitpacked_value, max_bitpacked_value]
|
||||
fn transform_range_before_linear_transformation(
|
||||
stats: &ColumnStats,
|
||||
range: RangeInclusive<u64>,
|
||||
range: ValueRange<u64>,
|
||||
) -> Option<RangeInclusive<u64>> {
|
||||
let ValueRange::Inclusive(range) = range;
|
||||
if range.is_empty() {
|
||||
return None;
|
||||
}
|
||||
@@ -108,7 +110,7 @@ impl ColumnValues for BitpackedReader {
|
||||
|
||||
fn get_row_ids_for_value_range(
|
||||
&self,
|
||||
range: RangeInclusive<u64>,
|
||||
range: ValueRange<u64>,
|
||||
doc_id_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
|
||||
@@ -132,7 +132,7 @@ pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
|
||||
.collect();
|
||||
let mut positions = Vec::new();
|
||||
reader.get_row_ids_for_value_range(
|
||||
vals[test_rand_idx]..=vals[test_rand_idx],
|
||||
crate::column::ValueRange::Inclusive(vals[test_rand_idx]..=vals[test_rand_idx]),
|
||||
0..vals.len() as u32,
|
||||
&mut positions,
|
||||
);
|
||||
|
||||
@@ -36,7 +36,7 @@ pub(crate) mod utils;
|
||||
mod value;
|
||||
|
||||
pub use block_accessor::ColumnBlockAccessor;
|
||||
pub use column::{BytesColumn, Column, StrColumn};
|
||||
pub use column::{BytesColumn, Column, StrColumn, ValueRange};
|
||||
pub use column_index::ColumnIndex;
|
||||
pub use column_values::{
|
||||
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU64, MonotonicallyMappableToU128,
|
||||
|
||||
@@ -79,7 +79,7 @@ mod tests {
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
use std::path::Path;
|
||||
|
||||
use columnar::StrColumn;
|
||||
use columnar::{StrColumn, ValueRange};
|
||||
use common::{ByteCount, DateTimePrecision, HasLen, TerminatingWrite};
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
@@ -944,7 +944,7 @@ mod tests {
|
||||
let test_range = |range: RangeInclusive<u64>| {
|
||||
let expected_count = numbers.iter().filter(|num| range.contains(*num)).count();
|
||||
let mut vec = vec![];
|
||||
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
|
||||
field.get_row_ids_for_value_range(ValueRange::Inclusive(range), 0..u32::MAX, &mut vec);
|
||||
assert_eq!(vec.len(), expected_count);
|
||||
};
|
||||
test_range(50..=50);
|
||||
@@ -1022,7 +1022,7 @@ mod tests {
|
||||
let test_range = |range: RangeInclusive<u64>| {
|
||||
let expected_count = numbers.iter().filter(|num| range.contains(*num)).count();
|
||||
let mut vec = vec![];
|
||||
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
|
||||
field.get_row_ids_for_value_range(ValueRange::Inclusive(range), 0..u32::MAX, &mut vec);
|
||||
assert_eq!(vec.len(), expected_count);
|
||||
};
|
||||
let test_range_variant = |start, stop| {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use core::fmt::Debug;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use columnar::Column;
|
||||
use columnar::{Column, ValueRange};
|
||||
|
||||
use crate::{DocId, DocSet, TERMINATED};
|
||||
|
||||
@@ -41,7 +40,7 @@ impl VecCursor {
|
||||
|
||||
pub(crate) struct RangeDocSet<T> {
|
||||
/// The range filter on the values.
|
||||
value_range: RangeInclusive<T>,
|
||||
value_range: ValueRange<T>,
|
||||
column: Column<T>,
|
||||
/// The next docid start range to fetch (inclusive).
|
||||
next_fetch_start: u32,
|
||||
@@ -61,7 +60,7 @@ pub(crate) struct RangeDocSet<T> {
|
||||
|
||||
const DEFAULT_FETCH_HORIZON: u32 = 128;
|
||||
impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
|
||||
pub(crate) fn new(value_range: RangeInclusive<T>, column: Column<T>) -> Self {
|
||||
pub(crate) fn new(value_range: ValueRange<T>, column: Column<T>) -> Self {
|
||||
let mut range_docset = Self {
|
||||
value_range,
|
||||
column,
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::ops::{Bound, RangeInclusive};
|
||||
|
||||
use columnar::{
|
||||
Cardinality, Column, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
|
||||
NumericalType, StrColumn,
|
||||
NumericalType, StrColumn, ValueRange,
|
||||
};
|
||||
use common::bounds::{BoundsRange, TransformBound};
|
||||
|
||||
@@ -154,7 +154,7 @@ impl Weight for FastFieldRangeWeight {
|
||||
ip_addr_column.min_value(),
|
||||
ip_addr_column.max_value(),
|
||||
);
|
||||
let docset = RangeDocSet::new(value_range, ip_addr_column);
|
||||
let docset = RangeDocSet::new(ValueRange::Inclusive(value_range), ip_addr_column);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
} else if field_type.is_str() {
|
||||
let Some(str_dict_column): Option<StrColumn> = reader.fast_fields().str(&field_name)?
|
||||
@@ -426,7 +426,7 @@ fn search_on_u64_ff(
|
||||
}
|
||||
}
|
||||
|
||||
let docset = RangeDocSet::new(value_range, column);
|
||||
let docset = RangeDocSet::new(ValueRange::Inclusive(value_range), column);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user