Compare commits

...

5 Commits

25 changed files with 2375 additions and 246 deletions

View File

@@ -16,7 +16,7 @@ stacker = { version= "0.6", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.6", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.10", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.9", path = "../bitpacker/" }
serde = "1.0.152"
serde = { version = "1.0.152", features = ["derive"] }
downcast-rs = "2.0.1"
[dev-dependencies]

View File

@@ -1,6 +1,6 @@
use binggan::{InputGroup, black_box};
use common::*;
use tantivy_columnar::Column;
use tantivy_columnar::{Column, ValueRange};
pub mod common;
@@ -46,16 +46,16 @@ fn bench_group(mut runner: InputGroup<Column>) {
runner.register("access_first_vals", |column| {
let mut sum = 0;
const BLOCK_SIZE: usize = 32;
let mut docs = vec![0; BLOCK_SIZE];
let mut buffer = vec![None; BLOCK_SIZE];
let mut docs = Vec::with_capacity(BLOCK_SIZE);
let mut buffer = Vec::with_capacity(BLOCK_SIZE);
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
// fill docs
#[allow(clippy::needless_range_loop)]
docs.clear();
for idx in 0..BLOCK_SIZE {
docs[idx] = idx as u32 + i;
docs.push(idx as u32 + i);
}
column.first_vals(&docs, &mut buffer);
buffer.clear();
column.first_vals_in_value_range(&mut docs, &mut buffer, ValueRange::All);
for val in buffer.iter() {
let Some(val) = val else { continue };
sum += *val;

View File

@@ -1,6 +1,7 @@
mod dictionary_encoded;
mod serialize;
use std::cell::RefCell;
use std::fmt::{self, Debug};
use std::io::Write;
use std::ops::{Range, RangeInclusive};
@@ -19,6 +20,11 @@ use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
use crate::column_values::{ColumnValues, monotonic_map_column};
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
thread_local! {
static ROWS: RefCell<Vec<RowId>> = const { RefCell::new(Vec::new()) };
static DOCS: RefCell<Vec<DocId>> = const { RefCell::new(Vec::new()) };
}
#[derive(Clone)]
pub struct Column<T = u64> {
pub index: ColumnIndex,
@@ -89,31 +95,6 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
self.values_for_doc(row_id).next()
}
/// Load the first value for each docid in the provided slice.
#[inline]
pub fn first_vals(&self, docids: &[DocId], output: &mut [Option<T>]) {
match &self.index {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => self.values.get_vals_opt(docids, output),
ColumnIndex::Optional(optional_index) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid));
}
}
ColumnIndex::Multivalued(multivalued_index) => {
for (i, docid) in docids.iter().enumerate() {
let range = multivalued_index.range(*docid);
let is_empty = range.start == range.end;
if !is_empty {
output[i] = Some(self.values.get_val(range.start));
}
}
}
}
}
/// Translates a block of docids to row_ids.
///
/// returns the row_ids and the matching docids on the same index
@@ -143,7 +124,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
#[inline]
pub fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
value_range: ValueRange<T>,
selected_docid_range: Range<u32>,
doc_ids: &mut Vec<u32>,
) {
@@ -168,6 +149,194 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
}
}
// Separate impl block for methods requiring `Default` for `T`.
impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
/// Load the first value for each docid in the provided slice.
///
/// The `docids` vector is mutated: documents that do not match the `value_range` are removed.
/// The `values` vector is populated with the values of the remaining documents.
#[inline]
pub fn first_vals_in_value_range(
&self,
input_docs: &[DocId],
output: &mut Vec<crate::ComparableDoc<Option<T>, DocId>>,
value_range: ValueRange<T>,
) {
match (&self.index, value_range) {
(ColumnIndex::Empty { .. }, value_range) => {
let nulls_match = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(_) => false,
ValueRange::GreaterThan(_, nulls_match) => *nulls_match,
ValueRange::GreaterThanOrEqual(_, nulls_match) => *nulls_match,
ValueRange::LessThan(_, nulls_match) => *nulls_match,
ValueRange::LessThanOrEqual(_, nulls_match) => *nulls_match,
};
if nulls_match {
for &doc in input_docs {
output.push(crate::ComparableDoc {
doc,
sort_key: None,
});
}
}
}
(ColumnIndex::Full, value_range) => {
self.values
.get_vals_in_value_range(input_docs, input_docs, output, value_range);
}
(ColumnIndex::Optional(optional_index), value_range) => {
let nulls_match = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(_) => false,
ValueRange::GreaterThan(_, nulls_match) => *nulls_match,
ValueRange::GreaterThanOrEqual(_, nulls_match) => *nulls_match,
ValueRange::LessThan(_, nulls_match) => *nulls_match,
ValueRange::LessThanOrEqual(_, nulls_match) => *nulls_match,
};
let fallback_needed = ROWS.with(|rows_cell| {
DOCS.with(|docs_cell| {
let mut rows = rows_cell.borrow_mut();
let mut docs = docs_cell.borrow_mut();
rows.clear();
docs.clear();
let mut has_nulls = false;
for &doc_id in input_docs {
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
rows.push(row_id);
docs.push(doc_id);
} else {
has_nulls = true;
if nulls_match {
break;
}
}
}
if !has_nulls || !nulls_match {
self.values.get_vals_in_value_range(
&rows,
&docs,
output,
value_range.clone(),
);
return false;
}
true
})
});
if fallback_needed {
for &doc_id in input_docs {
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
let val = self.values.get_val(row_id);
let value_matches = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(r) => r.contains(&val),
ValueRange::GreaterThan(t, _) => val > *t,
ValueRange::GreaterThanOrEqual(t, _) => val >= *t,
ValueRange::LessThan(t, _) => val < *t,
ValueRange::LessThanOrEqual(t, _) => val <= *t,
};
if value_matches {
output.push(crate::ComparableDoc {
doc: doc_id,
sort_key: Some(val),
});
}
} else if nulls_match {
output.push(crate::ComparableDoc {
doc: doc_id,
sort_key: None,
});
}
}
}
}
(ColumnIndex::Multivalued(multivalued_index), value_range) => {
let nulls_match = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(_) => false,
ValueRange::GreaterThan(_, nulls_match) => *nulls_match,
ValueRange::GreaterThanOrEqual(_, nulls_match) => *nulls_match,
ValueRange::LessThan(_, nulls_match) => *nulls_match,
ValueRange::LessThanOrEqual(_, nulls_match) => *nulls_match,
};
for i in 0..input_docs.len() {
let docid = input_docs[i];
let row_range = multivalued_index.range(docid);
let is_empty = row_range.start == row_range.end;
if !is_empty {
let val = self.values.get_val(row_range.start);
let matches = match &value_range {
ValueRange::All => true,
ValueRange::Inclusive(r) => r.contains(&val),
ValueRange::GreaterThan(t, _) => val > *t,
ValueRange::GreaterThanOrEqual(t, _) => val >= *t,
ValueRange::LessThan(t, _) => val < *t,
ValueRange::LessThanOrEqual(t, _) => val <= *t,
};
if matches {
output.push(crate::ComparableDoc {
doc: docid,
sort_key: Some(val),
});
}
} else if nulls_match {
output.push(crate::ComparableDoc {
doc: docid,
sort_key: None,
});
}
}
}
}
}
}
/// A range of values.
///
/// This type is intended to be used in batch APIs, where the cost of unpacking the enum
/// is outweighed by the time spent processing a batch.
///
/// Implementers should pattern match on the variants to use optimized loops for each case.
#[derive(Clone, Debug)]
pub enum ValueRange<T> {
/// A range that includes both start and end.
Inclusive(RangeInclusive<T>),
/// A range that matches all values.
All,
/// A range that matches all values greater than the threshold.
/// The boolean flag indicates if null values should be included.
GreaterThan(T, bool),
/// A range that matches all values greater than or equal to the threshold.
/// The boolean flag indicates if null values should be included.
GreaterThanOrEqual(T, bool),
/// A range that matches all values less than the threshold.
/// The boolean flag indicates if null values should be included.
LessThan(T, bool),
/// A range that matches all values less than or equal to the threshold.
/// The boolean flag indicates if null values should be included.
LessThanOrEqual(T, bool),
}
impl<T: PartialOrd> ValueRange<T> {
pub fn intersects(&self, min: T, max: T) -> bool {
match self {
ValueRange::Inclusive(range) => *range.start() <= max && *range.end() >= min,
ValueRange::All => true,
ValueRange::GreaterThan(val, _) => max > *val,
ValueRange::GreaterThanOrEqual(val, _) => max >= *val,
ValueRange::LessThan(val, _) => min < *val,
ValueRange::LessThanOrEqual(val, _) => min <= *val,
}
}
}
impl BinarySerializable for Cardinality {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
self.to_code().serialize(writer)

View File

@@ -333,7 +333,7 @@ mod tests {
use std::ops::Range;
use super::MultiValueIndex;
use crate::{ColumnarReader, DynamicColumn};
use crate::{ColumnarReader, DynamicColumn, ValueRange};
fn index_to_pos_helper(
index: &MultiValueIndex,
@@ -413,7 +413,7 @@ mod tests {
assert_eq!(row_id_range, 0..4);
let check = |range, expected| {
let full_range = 0..=u64::MAX;
let full_range = ValueRange::All;
let mut docids = Vec::new();
column.get_docids_for_value_range(full_range, range, &mut docids);
assert_eq!(docids, expected);

View File

@@ -7,13 +7,15 @@
//! - Monotonically map values to u64/u128
use std::fmt::Debug;
use std::ops::{Range, RangeInclusive};
use std::ops::Range;
use std::sync::Arc;
use downcast_rs::DowncastSync;
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
use crate::column::ValueRange;
mod merge;
pub(crate) mod monotonic_mapping;
pub(crate) mod monotonic_mapping_u128;
@@ -109,6 +111,307 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
}
}
/// Load the values for the provided docids.
///
/// The values are filtered by the provided value range.
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
value_range: ValueRange<T>,
) {
let len = input_indexes.len();
let mut read_head = 0;
match value_range {
ValueRange::All => {
while read_head + 3 < len {
let idx0 = input_indexes[read_head];
let idx1 = input_indexes[read_head + 1];
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
let val3 = self.get_val(idx3);
output.push(crate::ComparableDoc {
doc: doc0,
sort_key: Some(val0),
});
output.push(crate::ComparableDoc {
doc: doc1,
sort_key: Some(val1),
});
output.push(crate::ComparableDoc {
doc: doc2,
sort_key: Some(val2),
});
output.push(crate::ComparableDoc {
doc: doc3,
sort_key: Some(val3),
});
read_head += 4;
}
}
ValueRange::Inclusive(ref range) => {
while read_head + 3 < len {
let idx0 = input_indexes[read_head];
let idx1 = input_indexes[read_head + 1];
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
let val3 = self.get_val(idx3);
if range.contains(&val0) {
output.push(crate::ComparableDoc {
doc: doc0,
sort_key: Some(val0),
});
}
if range.contains(&val1) {
output.push(crate::ComparableDoc {
doc: doc1,
sort_key: Some(val1),
});
}
if range.contains(&val2) {
output.push(crate::ComparableDoc {
doc: doc2,
sort_key: Some(val2),
});
}
if range.contains(&val3) {
output.push(crate::ComparableDoc {
doc: doc3,
sort_key: Some(val3),
});
}
read_head += 4;
}
}
ValueRange::GreaterThan(ref threshold, _) => {
while read_head + 3 < len {
let idx0 = input_indexes[read_head];
let idx1 = input_indexes[read_head + 1];
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
let val3 = self.get_val(idx3);
if val0 > *threshold {
output.push(crate::ComparableDoc {
doc: doc0,
sort_key: Some(val0),
});
}
if val1 > *threshold {
output.push(crate::ComparableDoc {
doc: doc1,
sort_key: Some(val1),
});
}
if val2 > *threshold {
output.push(crate::ComparableDoc {
doc: doc2,
sort_key: Some(val2),
});
}
if val3 > *threshold {
output.push(crate::ComparableDoc {
doc: doc3,
sort_key: Some(val3),
});
}
read_head += 4;
}
}
ValueRange::GreaterThanOrEqual(ref threshold, _) => {
while read_head + 3 < len {
let idx0 = input_indexes[read_head];
let idx1 = input_indexes[read_head + 1];
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
let val3 = self.get_val(idx3);
if val0 >= *threshold {
output.push(crate::ComparableDoc {
doc: doc0,
sort_key: Some(val0),
});
}
if val1 >= *threshold {
output.push(crate::ComparableDoc {
doc: doc1,
sort_key: Some(val1),
});
}
if val2 >= *threshold {
output.push(crate::ComparableDoc {
doc: doc2,
sort_key: Some(val2),
});
}
if val3 >= *threshold {
output.push(crate::ComparableDoc {
doc: doc3,
sort_key: Some(val3),
});
}
read_head += 4;
}
}
ValueRange::LessThan(ref threshold, _) => {
while read_head + 3 < len {
let idx0 = input_indexes[read_head];
let idx1 = input_indexes[read_head + 1];
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
let val3 = self.get_val(idx3);
if val0 < *threshold {
output.push(crate::ComparableDoc {
doc: doc0,
sort_key: Some(val0),
});
}
if val1 < *threshold {
output.push(crate::ComparableDoc {
doc: doc1,
sort_key: Some(val1),
});
}
if val2 < *threshold {
output.push(crate::ComparableDoc {
doc: doc2,
sort_key: Some(val2),
});
}
if val3 < *threshold {
output.push(crate::ComparableDoc {
doc: doc3,
sort_key: Some(val3),
});
}
read_head += 4;
}
}
ValueRange::LessThanOrEqual(ref threshold, _) => {
while read_head + 3 < len {
let idx0 = input_indexes[read_head];
let idx1 = input_indexes[read_head + 1];
let idx2 = input_indexes[read_head + 2];
let idx3 = input_indexes[read_head + 3];
let doc0 = input_doc_ids[read_head];
let doc1 = input_doc_ids[read_head + 1];
let doc2 = input_doc_ids[read_head + 2];
let doc3 = input_doc_ids[read_head + 3];
let val0 = self.get_val(idx0);
let val1 = self.get_val(idx1);
let val2 = self.get_val(idx2);
let val3 = self.get_val(idx3);
if val0 <= *threshold {
output.push(crate::ComparableDoc {
doc: doc0,
sort_key: Some(val0),
});
}
if val1 <= *threshold {
output.push(crate::ComparableDoc {
doc: doc1,
sort_key: Some(val1),
});
}
if val2 <= *threshold {
output.push(crate::ComparableDoc {
doc: doc2,
sort_key: Some(val2),
});
}
if val3 <= *threshold {
output.push(crate::ComparableDoc {
doc: doc3,
sort_key: Some(val3),
});
}
read_head += 4;
}
}
}
// Process remaining elements (0 to 3)
while read_head < len {
let idx = input_indexes[read_head];
let doc = input_doc_ids[read_head];
let val = self.get_val(idx);
let matches = match value_range {
// 'value_range' is still moved here. This is the outer `value_range`
ValueRange::All => true,
ValueRange::Inclusive(ref r) => r.contains(&val),
ValueRange::GreaterThan(ref t, _) => val > *t,
ValueRange::GreaterThanOrEqual(ref t, _) => val >= *t,
ValueRange::LessThan(ref t, _) => val < *t,
ValueRange::LessThanOrEqual(ref t, _) => val <= *t,
};
if matches {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(val),
});
}
read_head += 1;
}
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
@@ -129,15 +432,54 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
/// Note that position == docid for single value fast fields
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<T>,
value_range: ValueRange<T>,
row_id_range: Range<RowId>,
row_id_hits: &mut Vec<RowId>,
) {
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
for idx in row_id_range {
let val = self.get_val(idx);
if value_range.contains(&val) {
row_id_hits.push(idx);
match value_range {
ValueRange::Inclusive(range) => {
for idx in row_id_range {
let val = self.get_val(idx);
if range.contains(&val) {
row_id_hits.push(idx);
}
}
}
ValueRange::GreaterThan(threshold, _) => {
for idx in row_id_range {
let val = self.get_val(idx);
if val > threshold {
row_id_hits.push(idx);
}
}
}
ValueRange::GreaterThanOrEqual(threshold, _) => {
for idx in row_id_range {
let val = self.get_val(idx);
if val >= threshold {
row_id_hits.push(idx);
}
}
}
ValueRange::LessThan(threshold, _) => {
for idx in row_id_range {
let val = self.get_val(idx);
if val < threshold {
row_id_hits.push(idx);
}
}
}
ValueRange::LessThanOrEqual(threshold, _) => {
for idx in row_id_range {
let val = self.get_val(idx);
if val <= threshold {
row_id_hits.push(idx);
}
}
}
ValueRange::All => {
row_id_hits.extend(row_id_range);
}
}
}
@@ -193,6 +535,17 @@ impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
fn num_vals(&self) -> u32 {
0
}
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
value_range: ValueRange<T>,
) {
let _ = (input_indexes, input_doc_ids, output, value_range);
panic!("Internal Error: Called get_vals_in_value_range of empty column.")
}
}
impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnValues<T>> {
@@ -206,6 +559,18 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
self.as_ref().get_vals_opt(indexes, output)
}
#[inline(always)]
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<T>, crate::DocId>>,
value_range: ValueRange<T>,
) {
self.as_ref()
.get_vals_in_value_range(input_indexes, input_doc_ids, output, value_range)
}
#[inline(always)]
fn min_value(&self) -> T {
self.as_ref().min_value()
@@ -234,7 +599,7 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
#[inline(always)]
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<T>,
range: ValueRange<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {

View File

@@ -1,8 +1,9 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
use std::ops::Range;
use crate::ColumnValues;
use crate::column::ValueRange;
use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
struct MonotonicMappingColumn<C, T, Input> {
@@ -80,16 +81,52 @@ where
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<Output>,
range: ValueRange<Output>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_row_ids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
)
match range {
ValueRange::Inclusive(range) => self.from_column.get_row_ids_for_value_range(
ValueRange::Inclusive(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
),
doc_id_range,
positions,
),
ValueRange::All => self.from_column.get_row_ids_for_value_range(
ValueRange::All,
doc_id_range,
positions,
),
ValueRange::GreaterThan(threshold, _) => self.from_column.get_row_ids_for_value_range(
ValueRange::GreaterThan(self.monotonic_mapping.inverse(threshold), false),
doc_id_range,
positions,
),
ValueRange::GreaterThanOrEqual(threshold, _) => {
self.from_column.get_row_ids_for_value_range(
ValueRange::GreaterThanOrEqual(
self.monotonic_mapping.inverse(threshold),
false,
),
doc_id_range,
positions,
)
}
ValueRange::LessThan(threshold, _) => self.from_column.get_row_ids_for_value_range(
ValueRange::LessThan(self.monotonic_mapping.inverse(threshold), false),
doc_id_range,
positions,
),
ValueRange::LessThanOrEqual(threshold, _) => {
self.from_column.get_row_ids_for_value_range(
ValueRange::LessThanOrEqual(self.monotonic_mapping.inverse(threshold), false),
doc_id_range,
positions,
)
}
}
}
// We voluntarily do not implement get_range as it yields a regression,

View File

@@ -25,6 +25,7 @@ use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
use tantivy_bitpacker::{BitPacker, BitUnpacker};
use crate::RowId;
use crate::column::ValueRange;
use crate::column_values::ColumnValues;
/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
@@ -338,14 +339,48 @@ impl ColumnValues<u64> for CompactSpaceU64Accessor {
#[inline]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u64>,
value_range: ValueRange<u64>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let value_range = self.0.compact_to_u128(*value_range.start() as u32)
..=self.0.compact_to_u128(*value_range.end() as u32);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
match value_range {
ValueRange::Inclusive(value_range) => {
let value_range = ValueRange::Inclusive(
self.0.compact_to_u128(*value_range.start() as u32)
..=self.0.compact_to_u128(*value_range.end() as u32),
);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
ValueRange::All => {
let position_range = position_range.start..position_range.end.min(self.num_vals());
positions.extend(position_range);
}
ValueRange::GreaterThan(threshold, _) => {
let value_range =
ValueRange::GreaterThan(self.0.compact_to_u128(threshold as u32), false);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
ValueRange::GreaterThanOrEqual(threshold, _) => {
let value_range =
ValueRange::GreaterThanOrEqual(self.0.compact_to_u128(threshold as u32), false);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
ValueRange::LessThan(threshold, _) => {
let value_range =
ValueRange::LessThan(self.0.compact_to_u128(threshold as u32), false);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
ValueRange::LessThanOrEqual(threshold, _) => {
let value_range =
ValueRange::LessThanOrEqual(self.0.compact_to_u128(threshold as u32), false);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
}
}
}
@@ -375,10 +410,47 @@ impl ColumnValues<u128> for CompactSpaceDecompressor {
#[inline]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u128>,
value_range: ValueRange<u128>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let value_range = match value_range {
ValueRange::Inclusive(value_range) => value_range,
ValueRange::All => {
let position_range = position_range.start..position_range.end.min(self.num_vals());
positions.extend(position_range);
return;
}
ValueRange::GreaterThan(threshold, _) => {
let max = self.max_value();
if threshold >= max {
return;
}
(threshold + 1)..=max
}
ValueRange::GreaterThanOrEqual(threshold, _) => {
let max = self.max_value();
if threshold > max {
return;
}
threshold..=max
}
ValueRange::LessThan(threshold, _) => {
let min = self.min_value();
if threshold <= min {
return;
}
min..=(threshold - 1)
}
ValueRange::LessThanOrEqual(threshold, _) => {
let min = self.min_value();
if threshold < min {
return;
}
min..=threshold
}
};
if value_range.start() > value_range.end() {
return;
}
@@ -560,7 +632,7 @@ mod tests {
.collect::<Vec<_>>();
let mut positions = Vec::new();
decompressor.get_row_ids_for_value_range(
range,
ValueRange::Inclusive(range),
0..decompressor.num_vals(),
&mut positions,
);
@@ -604,7 +676,11 @@ mod tests {
let val = *val;
let pos = pos as u32;
let mut positions = Vec::new();
decomp.get_row_ids_for_value_range(val..=val, pos..pos + 1, &mut positions);
decomp.get_row_ids_for_value_range(
ValueRange::Inclusive(val..=val),
pos..pos + 1,
&mut positions,
);
assert_eq!(positions, vec![pos]);
}
@@ -746,7 +822,11 @@ mod tests {
doc_id_range: Range<u32>,
) -> Vec<u32> {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(value_range, doc_id_range, &mut positions);
column.get_row_ids_for_value_range(
ValueRange::Inclusive(value_range),
doc_id_range,
&mut positions,
);
positions
}

View File

@@ -6,6 +6,7 @@ use common::{BinarySerializable, OwnedBytes};
use fastdivide::DividerU64;
use tantivy_bitpacker::{BitPacker, BitUnpacker, compute_num_bits};
use crate::column::ValueRange;
use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
use crate::{ColumnValues, RowId};
@@ -66,24 +67,273 @@ impl ColumnValues for BitpackedReader {
self.stats.num_rows
}
fn get_vals_in_value_range(
&self,
input_indexes: &[u32],
input_doc_ids: &[u32],
output: &mut Vec<crate::ComparableDoc<Option<u64>, crate::DocId>>,
value_range: ValueRange<u64>,
) {
match value_range {
ValueRange::All => {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(self.get_val(idx)),
});
}
}
ValueRange::Inclusive(range) => {
if let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
{
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.get_val(idx);
if transformed_range.contains(&raw_val) {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(
self.stats.min_value + self.stats.gcd.get() * raw_val,
),
});
}
}
}
}
ValueRange::GreaterThan(threshold, _) => {
if threshold < self.stats.min_value {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(self.get_val(idx)),
});
}
} else if threshold >= self.stats.max_value {
// All filtered out
} else {
let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get();
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.get_val(idx);
if raw_val > raw_threshold {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(
self.stats.min_value + self.stats.gcd.get() * raw_val,
),
});
}
}
}
}
ValueRange::GreaterThanOrEqual(threshold, _) => {
if threshold <= self.stats.min_value {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(self.get_val(idx)),
});
}
} else if threshold > self.stats.max_value {
// All filtered out
} else {
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
let raw_threshold = (diff + gcd - 1) / gcd;
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.get_val(idx);
if raw_val >= raw_threshold {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(
self.stats.min_value + self.stats.gcd.get() * raw_val,
),
});
}
}
}
}
ValueRange::LessThan(threshold, _) => {
if threshold > self.stats.max_value {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(self.get_val(idx)),
});
}
} else if threshold <= self.stats.min_value {
// All filtered out
} else {
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
let raw_threshold = if diff % gcd == 0 {
diff / gcd
} else {
diff / gcd + 1
};
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.get_val(idx);
if raw_val < raw_threshold {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(
self.stats.min_value + self.stats.gcd.get() * raw_val,
),
});
}
}
}
}
ValueRange::LessThanOrEqual(threshold, _) => {
if threshold >= self.stats.max_value {
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(self.get_val(idx)),
});
}
} else if threshold < self.stats.min_value {
// All filtered out
} else {
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
let raw_threshold = diff / gcd;
for (&idx, &doc) in input_indexes.iter().zip(input_doc_ids.iter()) {
let raw_val = self.get_val(idx);
if raw_val <= raw_threshold {
output.push(crate::ComparableDoc {
doc,
sort_key: Some(
self.stats.min_value + self.stats.gcd.get() * raw_val,
),
});
}
}
}
}
}
}
fn get_row_ids_for_value_range(
&self,
range: RangeInclusive<u64>,
range: ValueRange<u64>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
else {
positions.clear();
return;
};
self.bit_unpacker.get_ids_for_value_range(
transformed_range,
doc_id_range,
&self.data,
positions,
);
match range {
ValueRange::All => {
positions.extend(doc_id_range);
return;
}
ValueRange::Inclusive(range) => {
let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
else {
positions.clear();
return;
};
self.bit_unpacker.get_ids_for_value_range(
transformed_range,
doc_id_range,
&self.data,
positions,
);
}
ValueRange::GreaterThan(threshold, _) => {
if threshold < self.stats.min_value {
positions.extend(doc_id_range);
return;
}
if threshold >= self.stats.max_value {
return;
}
let raw_threshold = (threshold - self.stats.min_value) / self.stats.gcd.get();
let max_raw = (self.stats.max_value - self.stats.min_value) / self.stats.gcd.get();
let transformed_range = (raw_threshold + 1)..=max_raw;
self.bit_unpacker.get_ids_for_value_range(
transformed_range,
doc_id_range,
&self.data,
positions,
);
}
ValueRange::GreaterThanOrEqual(threshold, _) => {
if threshold <= self.stats.min_value {
positions.extend(doc_id_range);
return;
}
if threshold > self.stats.max_value {
return;
}
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
let raw_threshold = (diff + gcd - 1) / gcd;
// We want raw >= raw_threshold.
let max_raw = (self.stats.max_value - self.stats.min_value) / self.stats.gcd.get();
let transformed_range = raw_threshold..=max_raw;
self.bit_unpacker.get_ids_for_value_range(
transformed_range,
doc_id_range,
&self.data,
positions,
);
}
ValueRange::LessThan(threshold, _) => {
if threshold > self.stats.max_value {
positions.extend(doc_id_range);
return;
}
if threshold <= self.stats.min_value {
return;
}
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
// We want raw < raw_threshold_limit
// raw <= raw_threshold_limit - 1
let raw_threshold_limit = if diff % gcd == 0 {
diff / gcd
} else {
diff / gcd + 1
};
if raw_threshold_limit == 0 {
return;
}
let transformed_range = 0..=(raw_threshold_limit - 1);
self.bit_unpacker.get_ids_for_value_range(
transformed_range,
doc_id_range,
&self.data,
positions,
);
}
ValueRange::LessThanOrEqual(threshold, _) => {
if threshold >= self.stats.max_value {
positions.extend(doc_id_range);
return;
}
if threshold < self.stats.min_value {
return;
}
let diff = threshold - self.stats.min_value;
let gcd = self.stats.gcd.get();
// We want raw <= raw_threshold.
let raw_threshold = diff / gcd;
let transformed_range = 0..=raw_threshold;
self.bit_unpacker.get_ids_for_value_range(
transformed_range,
doc_id_range,
&self.data,
positions,
);
}
}
}
}

View File

@@ -131,7 +131,7 @@ pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
.collect();
let mut positions = Vec::new();
reader.get_row_ids_for_value_range(
vals[test_rand_idx]..=vals[test_rand_idx],
crate::column::ValueRange::Inclusive(vals[test_rand_idx]..=vals[test_rand_idx]),
0..vals.len() as u32,
&mut positions,
);

View File

@@ -29,6 +29,7 @@ mod column;
pub mod column_index;
pub mod column_values;
mod columnar;
mod comparable_doc;
mod dictionary;
mod dynamic_column;
mod iterable;
@@ -36,7 +37,7 @@ pub(crate) mod utils;
mod value;
pub use block_accessor::ColumnBlockAccessor;
pub use column::{BytesColumn, Column, StrColumn};
pub use column::{BytesColumn, Column, StrColumn, ValueRange};
pub use column_index::ColumnIndex;
pub use column_values::{
ColumnValues, EmptyColumnValues, MonotonicallyMappableToU64, MonotonicallyMappableToU128,
@@ -45,6 +46,7 @@ pub use columnar::{
CURRENT_VERSION, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, merge_columnar,
};
pub use comparable_doc::ComparableDoc;
use sstable::VoidSSTable;
pub use value::{NumericalType, NumericalValue};

View File

@@ -1,7 +1,8 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::net::Ipv6Addr;
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn, ValueRange};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
use common::DateTime;
use regex::Regex;
@@ -16,7 +17,7 @@ use crate::aggregation::intermediate_agg_result::{
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::AggregationError;
use crate::collector::sort_key::ReverseComparator;
use crate::collector::sort_key::{Comparator, ReverseComparator};
use crate::collector::TopNComputer;
use crate::schema::OwnedValue;
use crate::{DocAddress, DocId, SegmentOrdinal};
@@ -383,7 +384,7 @@ impl From<FastFieldValue> for OwnedValue {
/// Holds a fast field value in its u64 representation, and the order in which it should be sorted.
#[derive(Clone, Serialize, Deserialize, Debug)]
struct DocValueAndOrder {
pub(crate) struct DocValueAndOrder {
/// A fast field value in its u64 representation.
value: Option<u64>,
/// Sort order for the value
@@ -455,6 +456,37 @@ impl PartialEq for DocSortValuesAndFields {
impl Eq for DocSortValuesAndFields {}
impl Comparator<DocSortValuesAndFields> for ReverseComparator {
#[inline(always)]
fn compare(&self, lhs: &DocSortValuesAndFields, rhs: &DocSortValuesAndFields) -> Ordering {
rhs.cmp(lhs)
}
fn threshold_to_valuerange(
&self,
threshold: DocSortValuesAndFields,
) -> ValueRange<DocSortValuesAndFields> {
ValueRange::LessThan(threshold, true)
}
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct TopHitsSegmentSortKey(pub Vec<DocValueAndOrder>);
impl Comparator<TopHitsSegmentSortKey> for ReverseComparator {
#[inline(always)]
fn compare(&self, lhs: &TopHitsSegmentSortKey, rhs: &TopHitsSegmentSortKey) -> Ordering {
rhs.cmp(lhs)
}
fn threshold_to_valuerange(
&self,
threshold: TopHitsSegmentSortKey,
) -> ValueRange<TopHitsSegmentSortKey> {
ValueRange::LessThan(threshold, true)
}
}
/// The TopHitsCollector used for collecting over segments and merging results.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct TopHitsTopNComputer {
@@ -518,7 +550,7 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
accessor_idx: usize,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, ReverseComparator>,
top_n: TopNComputer<TopHitsSegmentSortKey, DocAddress, ReverseComparator>,
}
impl TopHitsSegmentCollector {
@@ -539,13 +571,15 @@ impl TopHitsSegmentCollector {
req: &TopHitsAggregationReq,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(req);
// Map TopHitsSegmentSortKey back to Vec<DocValueAndOrder> if needed or use directly
// The TopNComputer here stores TopHitsSegmentSortKey.
let top_results = self.top_n.into_vec();
for res in top_results {
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect(
DocSortValuesAndFields {
sorts: res.sort_key,
sorts: res.sort_key.0,
doc_value_fields,
},
res.doc,
@@ -579,7 +613,7 @@ impl TopHitsSegmentCollector {
.collect();
self.top_n.push(
sorts,
TopHitsSegmentSortKey(sorts),
DocAddress {
segment_ord: self.segment_ordinal,
doc_id,

View File

@@ -96,10 +96,9 @@ mod histogram_collector;
pub use histogram_collector::HistogramCollector;
mod multi_collector;
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
pub use columnar::ComparableDoc;
mod top_collector;
pub use self::top_collector::ComparableDoc;
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
mod top_score_collector;
pub use self::top_score_collector::{TopDocs, TopNComputer};

View File

@@ -39,6 +39,7 @@ pub(crate) mod tests {
use crate::collector::sort_key::{
SortByErasedType, SortBySimilarityScore, SortByStaticFastValue, SortByString,
};
use crate::collector::top_score_collector::compare_for_top_k;
use crate::collector::{ComparableDoc, DocSetCollector, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, QueryParser};
@@ -389,6 +390,52 @@ pub(crate) mod tests {
Ok(())
}
#[test]
fn test_order_by_compound_fast_fields() -> crate::Result<()> {
let index = make_index()?;
type CompoundSortKey = (Option<String>, Option<f64>);
fn assert_query(
index: &Index,
city_order: Order,
altitude_order: Order,
expected: Vec<(CompoundSortKey, u64)>,
) -> crate::Result<()> {
let searcher = index.reader()?.searcher();
let ids = id_mapping(&searcher);
let top_collector = TopDocs::with_limit(4).order_by((
(SortByString::for_field("city"), city_order),
(
SortByStaticFastValue::<f64>::for_field("altitude"),
altitude_order,
),
));
let actual = searcher
.search(&AllQuery, &top_collector)?
.into_iter()
.map(|(key, doc)| (key, ids[&doc]))
.collect::<Vec<_>>();
assert_eq!(actual, expected);
Ok(())
}
assert_query(
&index,
Order::Asc,
Order::Desc,
vec![
((Some("austin".to_owned()), Some(149.0)), 0),
((Some("greenville".to_owned()), Some(27.0)), 1),
((Some("tokyo".to_owned()), Some(40.0)), 2),
((None, Some(0.0)), 3),
],
)?;
Ok(())
}
use proptest::prelude::*;
proptest! {
@@ -451,4 +498,197 @@ pub(crate) mod tests {
);
}
}
proptest! {
#[test]
fn test_order_by_compound_prop(
city_order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
altitude_order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
limit in 1..20_usize,
offset in 0..20_usize,
segments_data in proptest::collection::vec(
proptest::collection::vec(
(proptest::option::of("[a-c]"), proptest::option::of(0..50u64)),
1..10_usize // segment size
),
1..4_usize // num segments
)
) {
use crate::collector::sort_key::ComparatorEnum;
use crate::TantivyDocument;
let mut schema_builder = Schema::builder();
let city = schema_builder.add_text_field("city", TEXT | FAST);
let altitude = schema_builder.add_u64_field("altitude", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
for segment_data in segments_data.into_iter() {
for (city_val, altitude_val) in segment_data.into_iter() {
let mut doc = TantivyDocument::default();
if let Some(c) = city_val {
doc.add_text(city, c);
}
if let Some(a) = altitude_val {
doc.add_u64(altitude, a);
}
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(limit)
.and_offset(offset)
.order_by((
(SortByString::for_field("city"), city_order),
(
SortByStaticFastValue::<u64>::for_field("altitude"),
altitude_order,
),
));
let actual_results = searcher.search(&AllQuery, &top_collector).unwrap();
let actual_doc_ids: Vec<DocAddress> =
actual_results.into_iter().map(|(_, doc)| doc).collect();
// Verification logic
let all_docs_collector = DocSetCollector;
let all_docs = searcher.search(&AllQuery, &all_docs_collector).unwrap();
let docs_with_keys: Vec<((Option<String>, Option<u64>), DocAddress)> = all_docs
.into_iter()
.map(|doc_addr| {
let reader = searcher.segment_reader(doc_addr.segment_ord);
let city_val = if let Some(col) = reader.fast_fields().str("city").unwrap() {
let ord = col.ords().first(doc_addr.doc_id);
if let Some(ord) = ord {
let mut out = Vec::new();
col.dictionary().ord_to_term(ord, &mut out).unwrap();
String::from_utf8(out).ok()
} else {
None
}
} else {
None
};
let alt_val = if let Some((col, _)) = reader.fast_fields().u64_lenient("altitude").unwrap() {
col.first(doc_addr.doc_id)
} else {
None
};
((city_val, alt_val), doc_addr)
})
.collect();
let city_comparator = ComparatorEnum::from(city_order);
let alt_comparator = ComparatorEnum::from(altitude_order);
let comparator = (city_comparator, alt_comparator);
let mut comparable_docs: Vec<ComparableDoc<_, _>> = docs_with_keys
.into_iter()
.map(|(sort_key, doc)| ComparableDoc { sort_key, doc })
.collect();
comparable_docs.sort_by(|l, r| compare_for_top_k(&comparator, l, r));
let expected_results = comparable_docs
.into_iter()
.skip(offset)
.take(limit)
.collect::<Vec<_>>();
let expected_doc_ids: Vec<DocAddress> =
expected_results.into_iter().map(|cd| cd.doc).collect();
prop_assert_eq!(actual_doc_ids, expected_doc_ids);
}
}
proptest! {
#[test]
fn test_order_by_u64_prop(
order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
limit in 1..20_usize,
offset in 0..20_usize,
segments_data in proptest::collection::vec(
proptest::collection::vec(
proptest::option::of(0..100u64),
1..1000_usize // segment size
),
1..4_usize // num segments
)
) {
use crate::collector::sort_key::ComparatorEnum;
use crate::TantivyDocument;
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
for segment_data in segments_data.into_iter() {
for val in segment_data.into_iter() {
let mut doc = TantivyDocument::default();
if let Some(v) = val {
doc.add_u64(field, v);
}
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(limit)
.and_offset(offset)
.order_by((SortByStaticFastValue::<u64>::for_field("field"), order));
let actual_results = searcher.search(&AllQuery, &top_collector).unwrap();
let actual_doc_ids: Vec<DocAddress> =
actual_results.into_iter().map(|(_, doc)| doc).collect();
// Verification logic
let all_docs_collector = DocSetCollector;
let all_docs = searcher.search(&AllQuery, &all_docs_collector).unwrap();
let docs_with_keys: Vec<(Option<u64>, DocAddress)> = all_docs
.into_iter()
.map(|doc_addr| {
let reader = searcher.segment_reader(doc_addr.segment_ord);
let val = if let Some((col, _)) = reader.fast_fields().u64_lenient("field").unwrap() {
col.first(doc_addr.doc_id)
} else {
None
};
(val, doc_addr)
})
.collect();
let comparator = ComparatorEnum::from(order);
let mut comparable_docs: Vec<ComparableDoc<_, _>> = docs_with_keys
.into_iter()
.map(|(sort_key, doc)| ComparableDoc { sort_key, doc })
.collect();
comparable_docs.sort_by(|l, r| compare_for_top_k(&comparator, l, r));
let expected_results = comparable_docs
.into_iter()
.skip(offset)
.take(limit)
.collect::<Vec<_>>();
let expected_doc_ids: Vec<DocAddress> =
expected_results.into_iter().map(|cd| cd.doc).collect();
prop_assert_eq!(actual_doc_ids, expected_doc_ids);
}
}
}

View File

@@ -1,6 +1,6 @@
use std::cmp::Ordering;
use columnar::MonotonicallyMappableToU64;
use columnar::{ComparableDoc, MonotonicallyMappableToU64, ValueRange};
use serde::{Deserialize, Serialize};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
@@ -69,6 +69,9 @@ fn compare_owned_value<const NULLS_FIRST: bool>(lhs: &OwnedValue, rhs: &OwnedVal
pub trait Comparator<T>: Send + Sync + std::fmt::Debug + Default {
/// Return the order between two values.
fn compare(&self, lhs: &T, rhs: &T) -> Ordering;
/// Return a `ValueRange` that matches all values that are greater than the provided threshold.
fn threshold_to_valuerange(&self, threshold: T) -> ValueRange<T>;
}
/// Compare values naturally (e.g. 1 < 2).
@@ -86,6 +89,10 @@ impl<T: PartialOrd> Comparator<T> for NaturalComparator {
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
lhs.partial_cmp(rhs).unwrap_or(Ordering::Equal)
}
fn threshold_to_valuerange(&self, threshold: T) -> ValueRange<T> {
ValueRange::GreaterThan(threshold, false)
}
}
/// A (partial) implementation of comparison for OwnedValue.
@@ -97,6 +104,10 @@ impl Comparator<OwnedValue> for NaturalComparator {
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ true>(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange<OwnedValue> {
ValueRange::GreaterThan(threshold, false)
}
}
/// Compare values in reverse (e.g. 2 < 1).
@@ -114,13 +125,69 @@ impl Comparator<OwnedValue> for NaturalComparator {
#[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
pub struct ReverseComparator;
impl<T> Comparator<T> for ReverseComparator
where NaturalComparator: Comparator<T>
macro_rules! impl_reverse_comparator_primitive {
($($t:ty),*) => {
$(
impl Comparator<$t> for ReverseComparator {
#[inline(always)]
fn compare(&self, lhs: &$t, rhs: &$t) -> Ordering {
NaturalComparator.compare(rhs, lhs)
}
fn threshold_to_valuerange(&self, threshold: $t) -> ValueRange<$t> {
ValueRange::LessThan(threshold, true)
}
}
)*
}
}
impl_reverse_comparator_primitive!(
bool,
u8,
u16,
u32,
u64,
u128,
usize,
i8,
i16,
i32,
i64,
i128,
isize,
f32,
f64,
String,
crate::DateTime,
Vec<u8>,
crate::schema::Facet
);
impl<T: PartialOrd + Send + Sync + std::fmt::Debug + Clone + 'static> Comparator<Option<T>>
for ReverseComparator
{
#[inline(always)]
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
fn compare(&self, lhs: &Option<T>, rhs: &Option<T>) -> Ordering {
NaturalComparator.compare(rhs, lhs)
}
fn threshold_to_valuerange(&self, threshold: Option<T>) -> ValueRange<Option<T>> {
let is_some = threshold.is_some();
ValueRange::LessThan(threshold, is_some)
}
}
impl Comparator<OwnedValue> for ReverseComparator {
#[inline(always)]
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
NaturalComparator.compare(rhs, lhs)
}
fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange<OwnedValue> {
let is_not_null = !matches!(threshold, OwnedValue::Null);
ValueRange::LessThan(threshold, is_not_null)
}
}
/// Compare values in reverse, but treating `None` as lower than `Some`.
@@ -147,6 +214,14 @@ where ReverseComparator: Comparator<T>
(Some(lhs), Some(rhs)) => ReverseComparator.compare(lhs, rhs),
}
}
fn threshold_to_valuerange(&self, threshold: Option<T>) -> ValueRange<Option<T>> {
if threshold.is_some() {
ValueRange::LessThan(threshold, false)
} else {
ValueRange::GreaterThan(threshold, false)
}
}
}
impl Comparator<u32> for ReverseNoneIsLowerComparator {
@@ -154,6 +229,10 @@ impl Comparator<u32> for ReverseNoneIsLowerComparator {
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: u32) -> ValueRange<u32> {
ValueRange::LessThan(threshold, false)
}
}
impl Comparator<u64> for ReverseNoneIsLowerComparator {
@@ -161,6 +240,10 @@ impl Comparator<u64> for ReverseNoneIsLowerComparator {
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: u64) -> ValueRange<u64> {
ValueRange::LessThan(threshold, false)
}
}
impl Comparator<f64> for ReverseNoneIsLowerComparator {
@@ -168,6 +251,10 @@ impl Comparator<f64> for ReverseNoneIsLowerComparator {
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: f64) -> ValueRange<f64> {
ValueRange::LessThan(threshold, false)
}
}
impl Comparator<f32> for ReverseNoneIsLowerComparator {
@@ -175,6 +262,10 @@ impl Comparator<f32> for ReverseNoneIsLowerComparator {
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: f32) -> ValueRange<f32> {
ValueRange::LessThan(threshold, false)
}
}
impl Comparator<i64> for ReverseNoneIsLowerComparator {
@@ -182,6 +273,10 @@ impl Comparator<i64> for ReverseNoneIsLowerComparator {
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: i64) -> ValueRange<i64> {
ValueRange::LessThan(threshold, false)
}
}
impl Comparator<String> for ReverseNoneIsLowerComparator {
@@ -189,6 +284,10 @@ impl Comparator<String> for ReverseNoneIsLowerComparator {
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: String) -> ValueRange<String> {
ValueRange::LessThan(threshold, false)
}
}
impl Comparator<OwnedValue> for ReverseNoneIsLowerComparator {
@@ -196,6 +295,10 @@ impl Comparator<OwnedValue> for ReverseNoneIsLowerComparator {
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ false>(rhs, lhs)
}
fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange<OwnedValue> {
ValueRange::LessThan(threshold, false)
}
}
/// Compare values naturally, but treating `None` as higher than `Some`.
@@ -218,6 +321,15 @@ where NaturalComparator: Comparator<T>
(Some(lhs), Some(rhs)) => NaturalComparator.compare(lhs, rhs),
}
}
fn threshold_to_valuerange(&self, threshold: Option<T>) -> ValueRange<Option<T>> {
if threshold.is_some() {
let is_some = threshold.is_some();
ValueRange::GreaterThan(threshold, is_some)
} else {
ValueRange::LessThan(threshold, false)
}
}
}
impl Comparator<u32> for NaturalNoneIsHigherComparator {
@@ -225,6 +337,10 @@ impl Comparator<u32> for NaturalNoneIsHigherComparator {
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: u32) -> ValueRange<u32> {
ValueRange::GreaterThan(threshold, true)
}
}
impl Comparator<u64> for NaturalNoneIsHigherComparator {
@@ -232,6 +348,10 @@ impl Comparator<u64> for NaturalNoneIsHigherComparator {
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: u64) -> ValueRange<u64> {
ValueRange::GreaterThan(threshold, true)
}
}
impl Comparator<f64> for NaturalNoneIsHigherComparator {
@@ -239,6 +359,10 @@ impl Comparator<f64> for NaturalNoneIsHigherComparator {
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: f64) -> ValueRange<f64> {
ValueRange::GreaterThan(threshold, true)
}
}
impl Comparator<f32> for NaturalNoneIsHigherComparator {
@@ -246,6 +370,10 @@ impl Comparator<f32> for NaturalNoneIsHigherComparator {
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: f32) -> ValueRange<f32> {
ValueRange::GreaterThan(threshold, true)
}
}
impl Comparator<i64> for NaturalNoneIsHigherComparator {
@@ -253,6 +381,10 @@ impl Comparator<i64> for NaturalNoneIsHigherComparator {
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: i64) -> ValueRange<i64> {
ValueRange::GreaterThan(threshold, true)
}
}
impl Comparator<String> for NaturalNoneIsHigherComparator {
@@ -260,6 +392,10 @@ impl Comparator<String> for NaturalNoneIsHigherComparator {
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: String) -> ValueRange<String> {
ValueRange::GreaterThan(threshold, true)
}
}
impl Comparator<OwnedValue> for NaturalNoneIsHigherComparator {
@@ -267,6 +403,10 @@ impl Comparator<OwnedValue> for NaturalNoneIsHigherComparator {
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ false>(lhs, rhs)
}
fn threshold_to_valuerange(&self, threshold: OwnedValue) -> ValueRange<OwnedValue> {
ValueRange::GreaterThan(threshold, true)
}
}
/// An enum representing the different sort orders.
@@ -308,6 +448,19 @@ where
ComparatorEnum::NaturalNoneHigher => NaturalNoneIsHigherComparator.compare(lhs, rhs),
}
}
fn threshold_to_valuerange(&self, threshold: T) -> ValueRange<T> {
match self {
ComparatorEnum::Natural => NaturalComparator.threshold_to_valuerange(threshold),
ComparatorEnum::Reverse => ReverseComparator.threshold_to_valuerange(threshold),
ComparatorEnum::ReverseNoneLower => {
ReverseNoneIsLowerComparator.threshold_to_valuerange(threshold)
}
ComparatorEnum::NaturalNoneHigher => {
NaturalNoneIsHigherComparator.threshold_to_valuerange(threshold)
}
}
}
}
impl<Head, Tail, LeftComparator, RightComparator> Comparator<(Head, Tail)>
@@ -322,6 +475,10 @@ where
.compare(&lhs.0, &rhs.0)
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
}
fn threshold_to_valuerange(&self, threshold: (Head, Tail)) -> ValueRange<(Head, Tail)> {
ValueRange::GreaterThan(threshold, false)
}
}
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, (Type2, Type3))>
@@ -338,6 +495,13 @@ where
.then_with(|| self.1.compare(&lhs.1 .0, &rhs.1 .0))
.then_with(|| self.2.compare(&lhs.1 .1, &rhs.1 .1))
}
fn threshold_to_valuerange(
&self,
threshold: (Type1, (Type2, Type3)),
) -> ValueRange<(Type1, (Type2, Type3))> {
ValueRange::GreaterThan(threshold, false)
}
}
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, Type2, Type3)>
@@ -354,6 +518,13 @@ where
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
}
fn threshold_to_valuerange(
&self,
threshold: (Type1, Type2, Type3),
) -> ValueRange<(Type1, Type2, Type3)> {
ValueRange::GreaterThan(threshold, false)
}
}
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
@@ -377,6 +548,13 @@ where
.then_with(|| self.2.compare(&lhs.1 .1 .0, &rhs.1 .1 .0))
.then_with(|| self.3.compare(&lhs.1 .1 .1, &rhs.1 .1 .1))
}
fn threshold_to_valuerange(
&self,
threshold: (Type1, (Type2, (Type3, Type4))),
) -> ValueRange<(Type1, (Type2, (Type3, Type4)))> {
ValueRange::GreaterThan(threshold, false)
}
}
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
@@ -400,6 +578,13 @@ where
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
.then_with(|| self.3.compare(&lhs.3, &rhs.3))
}
fn threshold_to_valuerange(
&self,
threshold: (Type1, Type2, Type3, Type4),
) -> ValueRange<(Type1, Type2, Type3, Type4)> {
ValueRange::GreaterThan(threshold, false)
}
}
impl<TSortKeyComputer> SortKeyComputer for (TSortKeyComputer, ComparatorEnum)
@@ -489,16 +674,32 @@ impl<TSegmentSortKeyComputer, TSegmentSortKey, TComparator> SegmentSortKeyComput
where
TSegmentSortKeyComputer: SegmentSortKeyComputer<SegmentSortKey = TSegmentSortKey>,
TSegmentSortKey: Clone + 'static + Sync + Send,
TComparator: Comparator<TSegmentSortKey> + 'static + Sync + Send,
TComparator: Comparator<TSegmentSortKey> + Clone + 'static + Sync + Send,
{
type SortKey = TSegmentSortKeyComputer::SortKey;
type SegmentSortKey = TSegmentSortKey;
type SegmentComparator = TComparator;
type Buffer = TSegmentSortKeyComputer::Buffer;
fn segment_comparator(&self) -> Self::SegmentComparator {
self.comparator.clone()
}
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
self.segment_sort_key_computer.segment_sort_key(doc, score)
}
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
buffer: &mut Self::Buffer,
filter: ValueRange<Self::SegmentSortKey>,
) {
self.segment_sort_key_computer
.segment_sort_keys(input_docs, output, buffer, filter)
}
#[inline(always)]
fn compare_segment_sort_key(
&self,

View File

@@ -1,9 +1,10 @@
use columnar::{ColumnType, MonotonicallyMappableToU64};
use columnar::{ColumnType, MonotonicallyMappableToU64, ValueRange};
use crate::collector::sort_key::sort_by_score::SortBySimilarityScoreSegmentComputer;
use crate::collector::sort_key::{
NaturalComparator, SortBySimilarityScore, SortByStaticFastValue, SortByString,
};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::collector::{ComparableDoc, SegmentSortKeyComputer, SortKeyComputer};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::OwnedValue;
use crate::{DateTime, DocId, Score};
@@ -36,12 +37,23 @@ impl SortByErasedType {
trait ErasedSegmentSortKeyComputer: Send + Sync {
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64>;
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Option<u64>, DocId>>,
filter: ValueRange<Option<u64>>,
);
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue;
}
struct ErasedSegmentSortKeyComputerWrapper<C, F> {
struct ErasedSegmentSortKeyComputerWrapper<C, F>
where
C: SegmentSortKeyComputer<SegmentSortKey = Option<u64>> + Send + Sync,
F: Fn(C::SortKey) -> OwnedValue + Send + Sync + 'static,
{
inner: C,
converter: F,
buffer: C::Buffer,
}
impl<C, F> ErasedSegmentSortKeyComputer for ErasedSegmentSortKeyComputerWrapper<C, F>
@@ -53,6 +65,16 @@ where
self.inner.segment_sort_key(doc, score)
}
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Option<u64>, DocId>>,
filter: ValueRange<Option<u64>>,
) {
self.inner
.segment_sort_keys(input_docs, output, &mut self.buffer, filter)
}
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
let val = self.inner.convert_segment_sort_key(sort_key);
(self.converter)(val)
@@ -60,7 +82,7 @@ where
}
struct ScoreSegmentSortKeyComputer {
segment_computer: SortBySimilarityScore,
segment_computer: SortBySimilarityScoreSegmentComputer,
}
impl ErasedSegmentSortKeyComputer for ScoreSegmentSortKeyComputer {
@@ -69,6 +91,15 @@ impl ErasedSegmentSortKeyComputer for ScoreSegmentSortKeyComputer {
Some(score_value.to_u64())
}
fn segment_sort_keys(
&mut self,
_input_docs: &[DocId],
_output: &mut Vec<ComparableDoc<Option<u64>, DocId>>,
_filter: ValueRange<Option<u64>>,
) {
unimplemented!("Batch computation not supported for score sorting")
}
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
let score_value: u64 = sort_key.expect("This implementation always produces a score.");
OwnedValue::F64(f64::from_u64(score_value))
@@ -112,6 +143,7 @@ impl SortKeyComputer for SortByErasedType {
converter: |val: Option<String>| {
val.map(OwnedValue::Str).unwrap_or(OwnedValue::Null)
},
buffer: Default::default(),
})
}
ColumnType::U64 => {
@@ -122,6 +154,7 @@ impl SortKeyComputer for SortByErasedType {
converter: |val: Option<u64>| {
val.map(OwnedValue::U64).unwrap_or(OwnedValue::Null)
},
buffer: Default::default(),
})
}
ColumnType::I64 => {
@@ -132,6 +165,7 @@ impl SortKeyComputer for SortByErasedType {
converter: |val: Option<i64>| {
val.map(OwnedValue::I64).unwrap_or(OwnedValue::Null)
},
buffer: Default::default(),
})
}
ColumnType::F64 => {
@@ -142,6 +176,7 @@ impl SortKeyComputer for SortByErasedType {
converter: |val: Option<f64>| {
val.map(OwnedValue::F64).unwrap_or(OwnedValue::Null)
},
buffer: Default::default(),
})
}
ColumnType::Bool => {
@@ -152,6 +187,7 @@ impl SortKeyComputer for SortByErasedType {
converter: |val: Option<bool>| {
val.map(OwnedValue::Bool).unwrap_or(OwnedValue::Null)
},
buffer: Default::default(),
})
}
ColumnType::DateTime => {
@@ -162,6 +198,7 @@ impl SortKeyComputer for SortByErasedType {
converter: |val: Option<DateTime>| {
val.map(OwnedValue::Date).unwrap_or(OwnedValue::Null)
},
buffer: Default::default(),
})
}
column_type => {
@@ -174,7 +211,8 @@ impl SortKeyComputer for SortByErasedType {
}
}
Self::Score => Box::new(ScoreSegmentSortKeyComputer {
segment_computer: SortBySimilarityScore,
segment_computer: SortBySimilarityScore
.segment_sort_key_computer(segment_reader)?,
}),
};
Ok(ErasedColumnSegmentSortKeyComputer { inner })
@@ -189,12 +227,23 @@ impl SegmentSortKeyComputer for ErasedColumnSegmentSortKeyComputer {
type SortKey = OwnedValue;
type SegmentSortKey = Option<u64>;
type SegmentComparator = NaturalComparator;
type Buffer = ();
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
self.inner.segment_sort_key(doc, score)
}
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
_buffer: &mut Self::Buffer,
filter: ValueRange<Self::SegmentSortKey>,
) {
self.inner.segment_sort_keys(input_docs, output, filter)
}
fn convert_segment_sort_key(&self, segment_sort_key: Self::SegmentSortKey) -> OwnedValue {
self.inner.convert_segment_sort_key(segment_sort_key)
}

View File

@@ -1,5 +1,7 @@
use columnar::ValueRange;
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer};
use crate::collector::{ComparableDoc, SegmentSortKeyComputer, SortKeyComputer, TopNComputer};
use crate::{DocAddress, DocId, Score};
/// Sort by similarity score.
@@ -9,7 +11,7 @@ pub struct SortBySimilarityScore;
impl SortKeyComputer for SortBySimilarityScore {
type SortKey = Score;
type Child = SortBySimilarityScore;
type Child = SortBySimilarityScoreSegmentComputer;
type Comparator = NaturalComparator;
@@ -21,7 +23,7 @@ impl SortKeyComputer for SortBySimilarityScore {
&self,
_segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
Ok(SortBySimilarityScore)
Ok(SortBySimilarityScoreSegmentComputer)
}
// Sorting by score is special in that it allows for the Block-Wand optimization.
@@ -61,16 +63,29 @@ impl SortKeyComputer for SortBySimilarityScore {
}
}
impl SegmentSortKeyComputer for SortBySimilarityScore {
pub struct SortBySimilarityScoreSegmentComputer;
impl SegmentSortKeyComputer for SortBySimilarityScoreSegmentComputer {
type SortKey = Score;
type SegmentSortKey = Score;
type SegmentComparator = NaturalComparator;
type Buffer = ();
#[inline(always)]
fn segment_sort_key(&mut self, _doc: DocId, score: Score) -> Score {
score
}
fn segment_sort_keys(
&mut self,
_input_docs: &[DocId],
_output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
_buffer: &mut Self::Buffer,
_filter: ValueRange<Self::SegmentSortKey>,
) {
unimplemented!("Batch computation not supported for score sorting")
}
fn convert_segment_sort_key(&self, score: Score) -> Score {
score
}

View File

@@ -1,9 +1,10 @@
use std::marker::PhantomData;
use columnar::Column;
use columnar::{Column, ValueRange};
use crate::collector::sort_key::sort_key_computer::convert_optional_u64_range_to_u64_range;
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::collector::{ComparableDoc, SegmentSortKeyComputer, SortKeyComputer};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::{DocId, Score, SegmentReader};
@@ -84,13 +85,110 @@ impl<T: FastValue> SegmentSortKeyComputer for SortByFastValueSegmentSortKeyCompu
type SortKey = Option<T>;
type SegmentSortKey = Option<u64>;
type SegmentComparator = NaturalComparator;
type Buffer = ();
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Self::SegmentSortKey {
self.sort_column.first(doc)
}
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
_buffer: &mut Self::Buffer,
filter: ValueRange<Self::SegmentSortKey>,
) {
let u64_filter = convert_optional_u64_range_to_u64_range(filter);
self.sort_column
.first_vals_in_value_range(input_docs, output, u64_filter);
}
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
sort_key.map(T::from_u64)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
fn test_sort_by_fast_value_batch() {
let mut schema_builder = Schema::builder();
let field_col = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(crate::doc!(field_col => 10u64))
.unwrap();
index_writer
.add_document(crate::doc!(field_col => 20u64))
.unwrap();
index_writer.add_document(crate::doc!()).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let sorter = SortByStaticFastValue::<u64>::for_field("field");
let mut computer = sorter.segment_sort_key_computer(segment_reader).unwrap();
let mut docs = vec![0, 1, 2];
let mut output = Vec::new();
let mut buffer = ();
computer.segment_sort_keys(&mut docs, &mut output, &mut buffer, ValueRange::All);
assert_eq!(
output.iter().map(|c| c.sort_key).collect::<Vec<_>>(),
&[Some(10), Some(20), None]
);
assert_eq!(output.iter().map(|c| c.doc).collect::<Vec<_>>(), &[0, 1, 2]);
}
#[test]
fn test_sort_by_fast_value_batch_with_filter() {
let mut schema_builder = Schema::builder();
let field_col = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(crate::doc!(field_col => 10u64))
.unwrap();
index_writer
.add_document(crate::doc!(field_col => 20u64))
.unwrap();
index_writer.add_document(crate::doc!()).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let sorter = SortByStaticFastValue::<u64>::for_field("field");
let mut computer = sorter.segment_sort_key_computer(segment_reader).unwrap();
let mut docs = vec![0, 1, 2];
let mut output = Vec::new();
let mut buffer = ();
computer.segment_sort_keys(
&mut docs,
&mut output,
&mut buffer,
ValueRange::GreaterThan(Some(15u64), false /* inclusive */),
);
assert_eq!(
output.iter().map(|c| c.sort_key).collect::<Vec<_>>(),
&[Some(20)]
);
assert_eq!(output.iter().map(|c| c.doc).collect::<Vec<_>>(), &[1]);
}
}

View File

@@ -1,7 +1,10 @@
use columnar::StrColumn;
use columnar::{StrColumn, ValueRange};
use crate::collector::sort_key::sort_key_computer::{
convert_optional_u64_range_to_u64_range, range_contains_none,
};
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::collector::{ComparableDoc, SegmentSortKeyComputer, SortKeyComputer};
use crate::termdict::TermOrdinal;
use crate::{DocId, Score};
@@ -50,6 +53,7 @@ impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
type SortKey = Option<String>;
type SegmentSortKey = Option<TermOrdinal>;
type SegmentComparator = NaturalComparator;
type Buffer = ();
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Option<TermOrdinal> {
@@ -57,6 +61,28 @@ impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
str_column.ords().first(doc)
}
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
_buffer: &mut Self::Buffer,
filter: ValueRange<Self::SegmentSortKey>,
) {
if let Some(str_column) = &self.str_column_opt {
let u64_filter = convert_optional_u64_range_to_u64_range(filter);
str_column
.ords()
.first_vals_in_value_range(input_docs, output, u64_filter);
} else if range_contains_none(&filter) {
for &doc in input_docs {
output.push(ComparableDoc {
doc,
sort_key: None,
});
}
}
}
fn convert_segment_sort_key(&self, term_ord_opt: Option<TermOrdinal>) -> Option<String> {
// TODO: Individual lookups to the dictionary like this are very likely to repeatedly
// decompress the same blocks. See https://github.com/quickwit-oss/tantivy/issues/2776
@@ -70,3 +96,90 @@ impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
String::try_from(bytes).ok()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::{Schema, FAST, TEXT};
use crate::Index;
#[test]
fn test_sort_by_string_batch() {
let mut schema_builder = Schema::builder();
let field_col = schema_builder.add_text_field("field", FAST | TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(crate::doc!(field_col => "a"))
.unwrap();
index_writer
.add_document(crate::doc!(field_col => "c"))
.unwrap();
index_writer.add_document(crate::doc!()).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let sorter = SortByString::for_field("field");
let mut computer = sorter.segment_sort_key_computer(segment_reader).unwrap();
let mut docs = vec![0, 1, 2];
let mut output = Vec::new();
let mut buffer = ();
computer.segment_sort_keys(&mut docs, &mut output, &mut buffer, ValueRange::All);
assert_eq!(
output.iter().map(|c| c.sort_key).collect::<Vec<_>>(),
&[Some(0), Some(1), None]
);
assert_eq!(output.iter().map(|c| c.doc).collect::<Vec<_>>(), &[0, 1, 2]);
}
#[test]
fn test_sort_by_string_batch_with_filter() {
let mut schema_builder = Schema::builder();
let field_col = schema_builder.add_text_field("field", FAST | TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(crate::doc!(field_col => "a"))
.unwrap();
index_writer
.add_document(crate::doc!(field_col => "c"))
.unwrap();
index_writer.add_document(crate::doc!()).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let sorter = SortByString::for_field("field");
let mut computer = sorter.segment_sort_key_computer(segment_reader).unwrap();
let mut docs = vec![0, 1, 2];
let mut output = Vec::new();
// Filter: > "b". "a" is 0, "c" is 1.
// We want > "a" (ord 0). So we filter > ord 0.
// 0 is "a", 1 is "c".
let mut buffer = ();
computer.segment_sort_keys(
&mut docs,
&mut output,
&mut buffer,
ValueRange::GreaterThan(Some(0), false /* inclusive */),
);
assert_eq!(
output.iter().map(|c| c.sort_key).collect::<Vec<_>>(),
&[Some(1)]
);
assert_eq!(output.iter().map(|c| c.doc).collect::<Vec<_>>(), &[1]);
}
}

View File

@@ -1,8 +1,12 @@
use std::cmp::Ordering;
use columnar::ValueRange;
use crate::collector::sort_key::{Comparator, NaturalComparator};
use crate::collector::sort_key_top_collector::TopBySortKeySegmentCollector;
use crate::collector::{default_collect_segment_impl, SegmentCollector as _, TopNComputer};
use crate::collector::{
default_collect_segment_impl, ComparableDoc, SegmentCollector as _, TopNComputer,
};
use crate::schema::Schema;
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
@@ -21,7 +25,10 @@ pub trait SegmentSortKeyComputer: 'static {
type SegmentSortKey: 'static + Clone + Send + Sync + Clone;
/// Comparator type.
type SegmentComparator: Comparator<Self::SegmentSortKey> + 'static;
type SegmentComparator: Comparator<Self::SegmentSortKey> + Clone + 'static;
/// Buffer type used for scratch space.
type Buffer: Default + Send + Sync + 'static;
/// Returns the segment sort key comparator.
fn segment_comparator(&self) -> Self::SegmentComparator {
@@ -31,6 +38,18 @@ pub trait SegmentSortKeyComputer: 'static {
/// Computes the sort key for the given document and score.
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey;
/// Computes the sort keys for a batch of documents.
///
/// The computed sort keys and document IDs are pushed into the `output` vector.
/// The `buffer` is used for scratch space.
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
buffer: &mut Self::Buffer,
filter: ValueRange<Self::SegmentSortKey>,
);
/// Computes the sort key and pushes the document in a TopN Computer.
///
/// When using a tuple as the sorting key, the sort key is evaluated in a lazy manner.
@@ -39,12 +58,32 @@ pub trait SegmentSortKeyComputer: 'static {
&mut self,
doc: DocId,
score: Score,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C, Self::Buffer>,
) {
let sort_key = self.segment_sort_key(doc, score);
top_n_computer.push(sort_key, doc);
}
fn compute_sort_keys_and_collect<C: Comparator<Self::SegmentSortKey>>(
&mut self,
docs: &[DocId],
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C, Self::Buffer>,
) {
// The capacity of a TopNComputer is larger than 2*n + COLLECT_BLOCK_BUFFER_LEN, so we
// should always be able to `reserve` space for the entire block.
top_n_computer.reserve(docs.len());
let comparator = self.segment_comparator();
let value_range = if let Some(threshold) = &top_n_computer.threshold {
comparator.threshold_to_valuerange(threshold.clone())
} else {
ValueRange::All
};
let (buffer, scratch) = top_n_computer.buffer_and_scratch();
self.segment_sort_keys(docs, buffer, scratch, value_range);
}
/// A SegmentSortKeyComputer maps to a SegmentSortKey, but it can also decide on
/// its ordering.
///
@@ -58,26 +97,6 @@ pub trait SegmentSortKeyComputer: 'static {
self.segment_comparator().compare(left, right)
}
/// Implementing this method makes it possible to avoid computing
/// a sort_key entirely if we can assess that it won't pass a threshold
/// with a partial computation.
///
/// This is currently used for lexicographic sorting.
fn accept_sort_key_lazy(
&mut self,
doc_id: DocId,
score: Score,
threshold: &Self::SegmentSortKey,
) -> Option<(Ordering, Self::SegmentSortKey)> {
let sort_key = self.segment_sort_key(doc_id, score);
let cmp = self.compare_segment_sort_key(&sort_key, threshold);
if cmp == Ordering::Less {
None
} else {
Some((cmp, sort_key))
}
}
/// Convert a segment level sort key into the global sort key.
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey;
}
@@ -145,7 +164,8 @@ where
TailSortKeyComputer: SortKeyComputer,
{
type SortKey = (HeadSortKeyComputer::SortKey, TailSortKeyComputer::SortKey);
type Child = (HeadSortKeyComputer::Child, TailSortKeyComputer::Child);
type Child =
ChainSegmentSortKeyComputer<HeadSortKeyComputer::Child, TailSortKeyComputer::Child>;
type Comparator = (
HeadSortKeyComputer::Comparator,
@@ -157,10 +177,10 @@ where
}
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
Ok((
self.0.segment_sort_key_computer(segment_reader)?,
self.1.segment_sort_key_computer(segment_reader)?,
))
Ok(ChainSegmentSortKeyComputer {
head: self.0.segment_sort_key_computer(segment_reader)?,
tail: self.1.segment_sort_key_computer(segment_reader)?,
})
}
/// Checks whether the schema is compatible with the sort key computer.
@@ -178,25 +198,91 @@ where
}
}
impl<HeadSegmentSortKeyComputer, TailSegmentSortKeyComputer> SegmentSortKeyComputer
for (HeadSegmentSortKeyComputer, TailSegmentSortKeyComputer)
pub struct ChainSegmentSortKeyComputer<Head, Tail>
where
HeadSegmentSortKeyComputer: SegmentSortKeyComputer,
TailSegmentSortKeyComputer: SegmentSortKeyComputer,
Head: SegmentSortKeyComputer,
Tail: SegmentSortKeyComputer,
{
type SortKey = (
HeadSegmentSortKeyComputer::SortKey,
TailSegmentSortKeyComputer::SortKey,
);
type SegmentSortKey = (
HeadSegmentSortKeyComputer::SegmentSortKey,
TailSegmentSortKeyComputer::SegmentSortKey,
);
head: Head,
tail: Tail,
}
type SegmentComparator = (
HeadSegmentSortKeyComputer::SegmentComparator,
TailSegmentSortKeyComputer::SegmentComparator,
);
pub struct ChainBuffer<HeadBuffer, TailBuffer, HeadKey, TailKey> {
pub head: HeadBuffer,
pub tail: TailBuffer,
pub head_output: Vec<ComparableDoc<HeadKey, DocId>>,
pub tail_output: Vec<ComparableDoc<TailKey, DocId>>,
pub tail_input_docs: Vec<DocId>,
}
impl<HeadBuffer: Default, TailBuffer: Default, HeadKey, TailKey> Default
for ChainBuffer<HeadBuffer, TailBuffer, HeadKey, TailKey>
{
fn default() -> Self {
ChainBuffer {
head: HeadBuffer::default(),
tail: TailBuffer::default(),
head_output: Vec::new(),
tail_output: Vec::new(),
tail_input_docs: Vec::new(),
}
}
}
impl<Head, Tail> ChainSegmentSortKeyComputer<Head, Tail>
where
Head: SegmentSortKeyComputer,
Tail: SegmentSortKeyComputer,
{
fn accept_sort_key_lazy(
&mut self,
doc_id: DocId,
score: Score,
threshold: &<Self as SegmentSortKeyComputer>::SegmentSortKey,
) -> Option<(Ordering, <Self as SegmentSortKeyComputer>::SegmentSortKey)> {
let (head_threshold, tail_threshold) = threshold;
let head_sort_key = self.head.segment_sort_key(doc_id, score);
let head_cmp = self
.head
.compare_segment_sort_key(&head_sort_key, head_threshold);
if head_cmp == Ordering::Less {
None
} else if head_cmp == Ordering::Equal {
let tail_sort_key = self.tail.segment_sort_key(doc_id, score);
let tail_cmp = self
.tail
.compare_segment_sort_key(&tail_sort_key, tail_threshold);
if tail_cmp == Ordering::Less {
None
} else {
Some((tail_cmp, (head_sort_key, tail_sort_key)))
}
} else {
let tail_sort_key = self.tail.segment_sort_key(doc_id, score);
Some((head_cmp, (head_sort_key, tail_sort_key)))
}
}
}
impl<Head, Tail> SegmentSortKeyComputer for ChainSegmentSortKeyComputer<Head, Tail>
where
Head: SegmentSortKeyComputer,
Tail: SegmentSortKeyComputer,
{
type SortKey = (Head::SortKey, Tail::SortKey);
type SegmentSortKey = (Head::SegmentSortKey, Tail::SegmentSortKey);
type SegmentComparator = (Head::SegmentComparator, Tail::SegmentComparator);
type Buffer =
ChainBuffer<Head::Buffer, Tail::Buffer, Head::SegmentSortKey, Tail::SegmentSortKey>;
fn segment_comparator(&self) -> Self::SegmentComparator {
(
self.head.segment_comparator(),
self.tail.segment_comparator(),
)
}
/// A SegmentSortKeyComputer maps to a SegmentSortKey, but it can also decide on
/// its ordering.
@@ -208,9 +294,90 @@ where
left: &Self::SegmentSortKey,
right: &Self::SegmentSortKey,
) -> Ordering {
self.0
self.head
.compare_segment_sort_key(&left.0, &right.0)
.then_with(|| self.1.compare_segment_sort_key(&left.1, &right.1))
.then_with(|| self.tail.compare_segment_sort_key(&left.1, &right.1))
}
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
buffer: &mut Self::Buffer,
filter: ValueRange<Self::SegmentSortKey>,
) {
let (head_filter, threshold) = match filter {
ValueRange::GreaterThan((head_threshold, tail_threshold), _)
| ValueRange::LessThan((head_threshold, tail_threshold), _) => {
let head_cmp = self.head.segment_comparator();
let strict_head_filter = head_cmp.threshold_to_valuerange(head_threshold.clone());
let head_filter = match strict_head_filter {
ValueRange::GreaterThan(t, m) => ValueRange::GreaterThanOrEqual(t, m),
ValueRange::LessThan(t, m) => ValueRange::LessThanOrEqual(t, m),
other => other,
};
(head_filter, Some((head_threshold, tail_threshold)))
}
_ => (ValueRange::All, None),
};
buffer.head_output.clear();
self.head.segment_sort_keys(
input_docs,
&mut buffer.head_output,
&mut buffer.head,
head_filter,
);
if buffer.head_output.is_empty() {
return;
}
buffer.tail_output.clear();
buffer.tail_input_docs.clear();
for cd in &buffer.head_output {
buffer.tail_input_docs.push(cd.doc);
}
self.tail.segment_sort_keys(
&buffer.tail_input_docs,
&mut buffer.tail_output,
&mut buffer.tail,
ValueRange::All,
);
let head_cmp = self.head.segment_comparator();
let tail_cmp = self.tail.segment_comparator();
for (head_doc, tail_doc) in buffer
.head_output
.drain(..)
.zip(buffer.tail_output.drain(..))
{
debug_assert_eq!(head_doc.doc, tail_doc.doc);
let doc = head_doc.doc;
let head_key = head_doc.sort_key;
let tail_key = tail_doc.sort_key;
let accept = if let Some((head_threshold, tail_threshold)) = &threshold {
let head_ord = head_cmp.compare(&head_key, head_threshold);
let ord = if head_ord == Ordering::Equal {
tail_cmp.compare(&tail_key, tail_threshold)
} else {
head_ord
};
ord == Ordering::Greater
} else {
true
};
if accept {
output.push(ComparableDoc {
sort_key: (head_key, tail_key),
doc,
});
}
}
}
#[inline(always)]
@@ -218,7 +385,7 @@ where
&mut self,
doc: DocId,
score: Score,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C, Self::Buffer>,
) {
let sort_key: Self::SegmentSortKey;
if let Some(threshold) = &top_n_computer.threshold {
@@ -235,48 +402,29 @@ where
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
let head_sort_key = self.0.segment_sort_key(doc, score);
let tail_sort_key = self.1.segment_sort_key(doc, score);
let head_sort_key = self.head.segment_sort_key(doc, score);
let tail_sort_key = self.tail.segment_sort_key(doc, score);
(head_sort_key, tail_sort_key)
}
fn accept_sort_key_lazy(
&mut self,
doc_id: DocId,
score: Score,
threshold: &Self::SegmentSortKey,
) -> Option<(Ordering, Self::SegmentSortKey)> {
let (head_threshold, tail_threshold) = threshold;
let (head_cmp, head_sort_key) =
self.0.accept_sort_key_lazy(doc_id, score, head_threshold)?;
if head_cmp == Ordering::Equal {
let (tail_cmp, tail_sort_key) =
self.1.accept_sort_key_lazy(doc_id, score, tail_threshold)?;
Some((tail_cmp, (head_sort_key, tail_sort_key)))
} else {
let tail_sort_key = self.1.segment_sort_key(doc_id, score);
Some((head_cmp, (head_sort_key, tail_sort_key)))
}
}
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
let (head_sort_key, tail_sort_key) = sort_key;
(
self.0.convert_segment_sort_key(head_sort_key),
self.1.convert_segment_sort_key(tail_sort_key),
self.head.convert_segment_sort_key(head_sort_key),
self.tail.convert_segment_sort_key(tail_sort_key),
)
}
}
/// This struct is used as an adapter to take a sort key computer and map its score to another
/// new sort key.
pub struct MappedSegmentSortKeyComputer<T, PreviousSortKey, NewSortKey> {
pub struct MappedSegmentSortKeyComputer<T: SegmentSortKeyComputer, NewSortKey> {
sort_key_computer: T,
map: fn(PreviousSortKey) -> NewSortKey,
map: fn(T::SortKey) -> NewSortKey,
}
impl<T, PreviousScore, NewScore> SegmentSortKeyComputer
for MappedSegmentSortKeyComputer<T, PreviousScore, NewScore>
for MappedSegmentSortKeyComputer<T, NewScore>
where
T: SegmentSortKeyComputer<SortKey = PreviousScore>,
PreviousScore: 'static + Clone + Send + Sync,
@@ -285,19 +433,25 @@ where
type SortKey = NewScore;
type SegmentSortKey = T::SegmentSortKey;
type SegmentComparator = T::SegmentComparator;
type Buffer = T::Buffer;
fn segment_comparator(&self) -> Self::SegmentComparator {
self.sort_key_computer.segment_comparator()
}
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
self.sort_key_computer.segment_sort_key(doc, score)
}
fn accept_sort_key_lazy(
fn segment_sort_keys(
&mut self,
doc_id: DocId,
score: Score,
threshold: &Self::SegmentSortKey,
) -> Option<(Ordering, Self::SegmentSortKey)> {
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
buffer: &mut Self::Buffer,
filter: ValueRange<Self::SegmentSortKey>,
) {
self.sort_key_computer
.accept_sort_key_lazy(doc_id, score, threshold)
.segment_sort_keys(input_docs, output, buffer, filter)
}
#[inline(always)]
@@ -305,12 +459,21 @@ where
&mut self,
doc: DocId,
score: Score,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C, Self::Buffer>,
) {
self.sort_key_computer
.compute_sort_key_and_collect(doc, score, top_n_computer);
}
fn compute_sort_keys_and_collect<C: Comparator<Self::SegmentSortKey>>(
&mut self,
docs: &[DocId],
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C, Self::Buffer>,
) {
self.sort_key_computer
.compute_sort_keys_and_collect(docs, top_n_computer);
}
fn convert_segment_sort_key(&self, segment_sort_key: Self::SegmentSortKey) -> Self::SortKey {
(self.map)(
self.sort_key_computer
@@ -336,10 +499,6 @@ where
);
type Child = MappedSegmentSortKeyComputer<
<(SortKeyComputer1, (SortKeyComputer2, SortKeyComputer3)) as SortKeyComputer>::Child,
(
SortKeyComputer1::SortKey,
(SortKeyComputer2::SortKey, SortKeyComputer3::SortKey),
),
Self::SortKey,
>;
@@ -363,7 +522,13 @@ where
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
let map = |(sort_key1, (sort_key2, sort_key3))| (sort_key1, sort_key2, sort_key3);
Ok(MappedSegmentSortKeyComputer {
sort_key_computer: (sort_key_computer1, (sort_key_computer2, sort_key_computer3)),
sort_key_computer: ChainSegmentSortKeyComputer {
head: sort_key_computer1,
tail: ChainSegmentSortKeyComputer {
head: sort_key_computer2,
tail: sort_key_computer3,
},
},
map,
})
}
@@ -398,13 +563,6 @@ where
SortKeyComputer1,
(SortKeyComputer2, (SortKeyComputer3, SortKeyComputer4)),
) as SortKeyComputer>::Child,
(
SortKeyComputer1::SortKey,
(
SortKeyComputer2::SortKey,
(SortKeyComputer3::SortKey, SortKeyComputer4::SortKey),
),
),
Self::SortKey,
>;
type SortKey = (
@@ -426,10 +584,16 @@ where
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
let sort_key_computer4 = self.3.segment_sort_key_computer(segment_reader)?;
Ok(MappedSegmentSortKeyComputer {
sort_key_computer: (
sort_key_computer1,
(sort_key_computer2, (sort_key_computer3, sort_key_computer4)),
),
sort_key_computer: ChainSegmentSortKeyComputer {
head: sort_key_computer1,
tail: ChainSegmentSortKeyComputer {
head: sort_key_computer2,
tail: ChainSegmentSortKeyComputer {
head: sort_key_computer3,
tail: sort_key_computer4,
},
},
},
map: |(sort_key1, (sort_key2, (sort_key3, sort_key4)))| {
(sort_key1, sort_key2, sort_key3, sort_key4)
},
@@ -452,6 +616,13 @@ where
}
}
use std::marker::PhantomData;
pub struct FuncSegmentSortKeyComputer<F, TSortKey> {
func: F,
_phantom: PhantomData<TSortKey>,
}
impl<F, SegmentF, TSortKey> SortKeyComputer for F
where
F: 'static + Send + Sync + Fn(&SegmentReader) -> SegmentF,
@@ -459,15 +630,18 @@ where
TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug,
{
type SortKey = TSortKey;
type Child = SegmentF;
type Child = FuncSegmentSortKeyComputer<SegmentF, TSortKey>;
type Comparator = NaturalComparator;
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
Ok((self)(segment_reader))
Ok(FuncSegmentSortKeyComputer {
func: (self)(segment_reader),
_phantom: PhantomData,
})
}
}
impl<F, TSortKey> SegmentSortKeyComputer for F
impl<F, TSortKey> SegmentSortKeyComputer for FuncSegmentSortKeyComputer<F, TSortKey>
where
F: 'static + FnMut(DocId) -> TSortKey,
TSortKey: 'static + PartialOrd + Clone + Send + Sync,
@@ -475,9 +649,25 @@ where
type SortKey = TSortKey;
type SegmentSortKey = TSortKey;
type SegmentComparator = NaturalComparator;
type Buffer = ();
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> TSortKey {
(self)(doc)
(self.func)(doc)
}
fn segment_sort_keys(
&mut self,
input_docs: &[DocId],
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
_buffer: &mut Self::Buffer,
_filter: ValueRange<Self::SegmentSortKey>,
) {
for &doc in input_docs {
output.push(ComparableDoc {
sort_key: (self.func)(doc),
doc,
});
}
}
/// Convert a segment level score into the global level score.
@@ -486,13 +676,75 @@ where
}
}
pub(crate) fn range_contains_none(range: &ValueRange<Option<u64>>) -> bool {
match range {
ValueRange::All => true,
ValueRange::Inclusive(r) => r.contains(&None),
ValueRange::GreaterThan(_threshold, match_nulls) => *match_nulls,
ValueRange::GreaterThanOrEqual(_threshold, match_nulls) => *match_nulls,
ValueRange::LessThan(_threshold, match_nulls) => *match_nulls,
ValueRange::LessThanOrEqual(_threshold, match_nulls) => *match_nulls,
}
}
pub(crate) fn convert_optional_u64_range_to_u64_range(
range: ValueRange<Option<u64>>,
) -> ValueRange<u64> {
match range {
ValueRange::Inclusive(r) => {
let start = r.start().unwrap_or(0);
let end = r.end().unwrap_or(u64::MAX);
ValueRange::Inclusive(start..=end)
}
ValueRange::GreaterThan(Some(val), match_nulls) => {
ValueRange::GreaterThan(val, match_nulls)
}
ValueRange::GreaterThan(None, match_nulls) => {
if match_nulls {
ValueRange::All
} else {
ValueRange::Inclusive(u64::MIN..=u64::MAX)
}
}
ValueRange::GreaterThanOrEqual(Some(val), match_nulls) => {
ValueRange::GreaterThanOrEqual(val, match_nulls)
}
ValueRange::GreaterThanOrEqual(None, match_nulls) => {
if match_nulls {
ValueRange::All
} else {
ValueRange::Inclusive(u64::MIN..=u64::MAX)
}
}
ValueRange::LessThan(None, match_nulls) => {
if match_nulls {
ValueRange::LessThan(u64::MIN, true)
} else {
ValueRange::Inclusive(1..=0)
}
}
ValueRange::LessThan(Some(val), match_nulls) => ValueRange::LessThan(val, match_nulls),
ValueRange::LessThanOrEqual(None, match_nulls) => {
if match_nulls {
ValueRange::LessThan(u64::MIN, true)
} else {
ValueRange::Inclusive(1..=0)
}
}
ValueRange::LessThanOrEqual(Some(val), match_nulls) => {
ValueRange::LessThanOrEqual(val, match_nulls)
}
ValueRange::All => ValueRange::All,
}
}
#[cfg(test)]
mod tests {
use std::cmp::Ordering;
use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
use std::sync::Arc;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer};
use crate::schema::Schema;
use crate::{DocId, Index, Order, SegmentReader};
@@ -640,4 +892,178 @@ mod tests {
(200u32, 2u32)
);
}
#[test]
fn test_batch_score_computer_edge_case() {
let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
let score_computer_secondary = |_segment_reader: &SegmentReader| |_doc: DocId| "b";
let lazy_score_computer = (score_computer_primary, score_computer_secondary);
let index = build_test_index();
let searcher = index.reader().unwrap().searcher();
let mut segment_sort_key_computer = lazy_score_computer
.segment_sort_key_computer(searcher.segment_reader(0))
.unwrap();
let mut top_n_computer =
TopNComputer::new_with_comparator(10, lazy_score_computer.comparator());
// Threshold (200, "a"). Doc is (200, "b"). 200 == 200, "b" > "a". Should be accepted.
top_n_computer.threshold = Some((200, "a"));
let docs = vec![0];
segment_sort_key_computer.compute_sort_keys_and_collect(&docs, &mut top_n_computer);
let results = top_n_computer.into_sorted_vec();
assert_eq!(results.len(), 1);
let result = &results[0];
assert_eq!(result.doc, 0);
assert_eq!(result.sort_key, (200, "b"));
}
}
#[cfg(test)]
mod proptest_tests {
use proptest::prelude::*;
use super::*;
use crate::collector::sort_key::order::*;
// Re-implement logic to interpret ValueRange<Option<u64>> manually to verify expectations
fn range_contains_opt(range: &ValueRange<Option<u64>>, val: &Option<u64>) -> bool {
match range {
ValueRange::All => true,
ValueRange::Inclusive(r) => r.contains(val),
ValueRange::GreaterThan(t, match_nulls) => {
if val.is_none() {
*match_nulls
} else {
val > t
}
}
ValueRange::GreaterThanOrEqual(t, match_nulls) => {
if val.is_none() {
*match_nulls
} else {
val >= t
}
}
ValueRange::LessThan(t, match_nulls) => {
if val.is_none() {
*match_nulls
} else {
val < t
}
}
ValueRange::LessThanOrEqual(t, match_nulls) => {
if val.is_none() {
*match_nulls
} else {
val <= t
}
}
}
}
fn range_contains_u64(range: &ValueRange<u64>, val: &u64) -> bool {
match range {
ValueRange::All => true,
ValueRange::Inclusive(r) => r.contains(val),
ValueRange::GreaterThan(t, _) => val > t,
ValueRange::GreaterThanOrEqual(t, _) => val >= t,
ValueRange::LessThan(t, _) => val < t,
ValueRange::LessThanOrEqual(t, _) => val <= t,
}
}
proptest! {
#[test]
fn test_comparator_consistency_natural_none_is_lower(
threshold in any::<Option<u64>>(),
val in any::<Option<u64>>()
) {
check_comparator::<NaturalComparator>(threshold, val)?;
}
#[test]
fn test_comparator_consistency_reverse(
threshold in any::<Option<u64>>(),
val in any::<Option<u64>>()
) {
check_comparator::<ReverseComparator>(threshold, val)?;
}
#[test]
fn test_comparator_consistency_reverse_none_is_lower(
threshold in any::<Option<u64>>(),
val in any::<Option<u64>>()
) {
check_comparator::<ReverseNoneIsLowerComparator>(threshold, val)?;
}
#[test]
fn test_comparator_consistency_natural_none_is_higher(
threshold in any::<Option<u64>>(),
val in any::<Option<u64>>()
) {
check_comparator::<NaturalNoneIsHigherComparator>(threshold, val)?;
}
}
fn check_comparator<C: Comparator<Option<u64>>>(
threshold: Option<u64>,
val: Option<u64>,
) -> std::result::Result<(), proptest::test_runner::TestCaseError> {
let comparator = C::default();
let range = comparator.threshold_to_valuerange(threshold);
let ordering = comparator.compare(&val, &threshold);
let should_be_in_range = ordering == Ordering::Greater;
let in_range_opt = range_contains_opt(&range, &val);
prop_assert_eq!(
in_range_opt,
should_be_in_range,
"Comparator consistency failed for {:?}. Threshold: {:?}, Val: {:?}, Range: {:?}, \
Ordering: {:?}. range_contains_opt says {}, but compare says {}",
std::any::type_name::<C>(),
threshold,
val,
range,
ordering,
in_range_opt,
should_be_in_range
);
// Check range_contains_none
let expected_none_in_range = range_contains_opt(&range, &None);
let actual_none_in_range = range_contains_none(&range);
prop_assert_eq!(
actual_none_in_range,
expected_none_in_range,
"range_contains_none failed for {:?}. Range: {:?}. Expected (from \
range_contains_opt): {}, Actual: {}",
std::any::type_name::<C>(),
range,
expected_none_in_range,
actual_none_in_range
);
// Check convert_optional_u64_range_to_u64_range
let u64_range = convert_optional_u64_range_to_u64_range(range.clone());
if let Some(v) = val {
let in_u64_range = range_contains_u64(&u64_range, &v);
let in_opt_range = range_contains_opt(&range, &Some(v));
prop_assert_eq!(
in_u64_range,
in_opt_range,
"convert_optional_u64_range_to_u64_range failed for {:?}. Val: {:?}, OptRange: \
{:?}, U64Range: {:?}. Opt says {}, U64 says {}",
std::any::type_name::<C>(),
v,
range,
u64_range,
in_opt_range,
in_u64_range
);
}
Ok(())
}
}

View File

@@ -99,7 +99,12 @@ where
TSegmentSortKeyComputer: SegmentSortKeyComputer,
C: Comparator<TSegmentSortKeyComputer::SegmentSortKey>,
{
pub(crate) topn_computer: TopNComputer<TSegmentSortKeyComputer::SegmentSortKey, DocId, C>,
pub(crate) topn_computer: TopNComputer<
TSegmentSortKeyComputer::SegmentSortKey,
DocId,
C,
TSegmentSortKeyComputer::Buffer,
>,
pub(crate) segment_ord: u32,
pub(crate) segment_sort_key_computer: TSegmentSortKeyComputer,
}
@@ -120,6 +125,11 @@ where
);
}
fn collect_block(&mut self, docs: &[DocId]) {
self.segment_sort_key_computer
.compute_sort_keys_and_collect(docs, &mut self.topn_computer);
}
fn harvest(self) -> Self::Fruit {
let segment_ord = self.segment_ord;
let segment_hits: Vec<(TSegmentSortKeyComputer::SortKey, DocAddress)> = self

View File

@@ -2,6 +2,7 @@ use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use columnar::ValueRange;
use serde::{Deserialize, Serialize};
use super::Collector;
@@ -10,8 +11,7 @@ use crate::collector::sort_key::{
SortByStaticFastValue, SortByString,
};
use crate::collector::sort_key_top_collector::TopBySortKeyCollector;
use crate::collector::top_collector::ComparableDoc;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::collector::{ComparableDoc, SegmentSortKeyComputer, SortKeyComputer};
use crate::fastfield::FastValue;
use crate::{DocAddress, DocId, Order, Score, SegmentReader};
@@ -481,11 +481,22 @@ where
type SortKey = TSortKey;
type SegmentSortKey = TSortKey;
type SegmentComparator = NaturalComparator;
type Buffer = ();
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> TSortKey {
(self.sort_key_fn)(doc, score)
}
fn segment_sort_keys(
&mut self,
_input_docs: &[DocId],
_output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId>>,
_buffer: &mut Self::Buffer,
_filter: ValueRange<Self::SegmentSortKey>,
) {
unimplemented!("Batch computation is not supported for tweak score.")
}
/// Convert a segment level score into the global level score.
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
sort_key
@@ -509,12 +520,14 @@ where
/// the ascending `DocId|DocAddress` tie-breaking behavior without additional comparisons.
#[derive(Serialize, Deserialize)]
#[serde(from = "TopNComputerDeser<Score, D, C>")]
pub struct TopNComputer<Score, D, C> {
pub struct TopNComputer<Score, D, C, Buffer = ()> {
/// The buffer reverses sort order to get top-semantics instead of bottom-semantics
buffer: Vec<ComparableDoc<Score, D>>,
top_n: usize,
pub(crate) threshold: Option<Score>,
comparator: C,
#[serde(skip)]
scratch: Buffer,
}
// Intermediate struct for TopNComputer for deserialization, to keep vec capacity
@@ -526,7 +539,9 @@ struct TopNComputerDeser<Score, D, C> {
comparator: C,
}
impl<Score, D, C> From<TopNComputerDeser<Score, D, C>> for TopNComputer<Score, D, C> {
impl<Score, D, C, Buffer> From<TopNComputerDeser<Score, D, C>> for TopNComputer<Score, D, C, Buffer>
where Buffer: Default
{
fn from(mut value: TopNComputerDeser<Score, D, C>) -> Self {
let expected_cap = value.top_n.max(1) * 2;
let current_cap = value.buffer.capacity();
@@ -541,12 +556,15 @@ impl<Score, D, C> From<TopNComputerDeser<Score, D, C>> for TopNComputer<Score, D
top_n: value.top_n,
threshold: value.threshold,
comparator: value.comparator,
scratch: Buffer::default(),
}
}
}
impl<Score: std::fmt::Debug, D, C> std::fmt::Debug for TopNComputer<Score, D, C>
where C: Comparator<Score>
impl<Score: std::fmt::Debug, D, C, Buffer> std::fmt::Debug for TopNComputer<Score, D, C, Buffer>
where
C: Comparator<Score>,
Buffer: std::fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
f.debug_struct("TopNComputer")
@@ -554,12 +572,13 @@ where C: Comparator<Score>
.field("top_n", &self.top_n)
.field("current_threshold", &self.threshold)
.field("comparator", &self.comparator)
.field("scratch", &self.scratch)
.finish()
}
}
// Custom clone to keep capacity
impl<Score: Clone, D: Clone, C: Clone> Clone for TopNComputer<Score, D, C> {
impl<Score: Clone, D: Clone, C: Clone, Buffer: Clone> Clone for TopNComputer<Score, D, C, Buffer> {
fn clone(&self) -> Self {
let mut buffer_clone = Vec::with_capacity(self.buffer.capacity());
buffer_clone.extend(self.buffer.iter().cloned());
@@ -568,15 +587,17 @@ impl<Score: Clone, D: Clone, C: Clone> Clone for TopNComputer<Score, D, C> {
top_n: self.top_n,
threshold: self.threshold.clone(),
comparator: self.comparator.clone(),
scratch: self.scratch.clone(),
}
}
}
impl<TSortKey, D> TopNComputer<TSortKey, D, ReverseComparator>
impl<TSortKey, D> TopNComputer<TSortKey, D, ReverseComparator, ()>
where
D: Ord,
TSortKey: Clone,
NaturalComparator: Comparator<TSortKey>,
ReverseComparator: Comparator<TSortKey>,
{
/// Create a new `TopNComputer`.
/// Internally it will allocate a buffer of size `2 * top_n`.
@@ -586,7 +607,7 @@ where
}
#[inline(always)]
fn compare_for_top_k<TSortKey, D: Ord, C: Comparator<TSortKey>>(
pub fn compare_for_top_k<TSortKey, D: Ord, C: Comparator<TSortKey>>(
c: &C,
lhs: &ComparableDoc<TSortKey, D>,
rhs: &ComparableDoc<TSortKey, D>,
@@ -597,21 +618,26 @@ fn compare_for_top_k<TSortKey, D: Ord, C: Comparator<TSortKey>>(
// sort by doc id
}
impl<TSortKey, D, C> TopNComputer<TSortKey, D, C>
impl<TSortKey, D, C, Buffer> TopNComputer<TSortKey, D, C, Buffer>
where
D: Ord,
TSortKey: Clone,
C: Comparator<TSortKey>,
Buffer: Default,
{
/// Create a new `TopNComputer`.
/// Internally it will allocate a buffer of size `2 * top_n`.
/// Internally it will allocate a buffer of size `(top_n.max(1) * 2) +
/// COLLECT_BLOCK_BUFFER_LEN`.
pub fn new_with_comparator(top_n: usize, comparator: C) -> Self {
let vec_cap = top_n.max(1) * 2;
// We ensure that there is always enough space to include an entire block in the buffer if
// need be, so that `push_block_lazy` can avoid checking capacity inside its loop.
let vec_cap = (top_n.max(1) * 2) + crate::COLLECT_BLOCK_BUFFER_LEN;
TopNComputer {
buffer: Vec::with_capacity(vec_cap),
top_n,
threshold: None,
comparator,
scratch: Buffer::default(),
}
}
@@ -635,16 +661,28 @@ where
// At this point, we need to have established that the doc is above the threshold.
#[inline(always)]
pub(crate) fn append_doc(&mut self, doc: D, sort_key: TSortKey) {
if self.buffer.len() == self.buffer.capacity() {
let median = self.truncate_top_n();
self.threshold = Some(median);
}
// This cannot panic, because we truncate_median will at least remove one element, since
// the min capacity is 2.
self.reserve(1);
// This cannot panic, because we've reserved room for one element.
let comparable_doc = ComparableDoc { doc, sort_key };
push_assuming_capacity(comparable_doc, &mut self.buffer);
}
// Ensure that there is capacity to push `additional` more elements without resizing.
#[inline(always)]
pub(crate) fn reserve(&mut self, additional: usize) {
if self.buffer.len() + additional > self.buffer.capacity() {
let median = self.truncate_top_n();
debug_assert!(self.buffer.len() + additional <= self.buffer.capacity());
self.threshold = Some(median);
}
}
pub(crate) fn buffer_and_scratch(
&mut self,
) -> (&mut Vec<ComparableDoc<TSortKey, D>>, &mut Buffer) {
(&mut self.buffer, &mut self.scratch)
}
#[inline(never)]
fn truncate_top_n(&mut self) -> TSortKey {
// Use select_nth_unstable to find the top nth score
@@ -684,7 +722,7 @@ where
//
// Panics if there is not enough capacity to add an element.
#[inline(always)]
fn push_assuming_capacity<T>(el: T, buf: &mut Vec<T>) {
pub fn push_assuming_capacity<T>(el: T, buf: &mut Vec<T>) {
let prev_len = buf.len();
assert!(prev_len < buf.capacity());
// This is mimicking the current (non-stabilized) implementation in std.
@@ -702,8 +740,7 @@ mod tests {
use super::{TopDocs, TopNComputer};
use crate::collector::sort_key::{ComparatorEnum, NaturalComparator, ReverseComparator};
use crate::collector::top_collector::ComparableDoc;
use crate::collector::{Collector, DocSetCollector};
use crate::collector::{Collector, ComparableDoc, DocSetCollector};
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
@@ -1408,11 +1445,11 @@ mod tests {
#[test]
fn test_top_field_collect_string_prop(
order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
limit in 1..256_usize,
offset in 0..256_usize,
limit in 1..32_usize,
offset in 0..32_usize,
segments_terms in
proptest::collection::vec(
proptest::collection::vec(0..32_u8, 1..32_usize),
proptest::collection::vec(0..64_u8, 1..256_usize),
0..8_usize,
)
) {
@@ -1733,7 +1770,8 @@ mod tests {
#[test]
fn test_top_n_computer_not_at_capacity() {
let mut top_n_computer = TopNComputer::new_with_comparator(4, NaturalComparator);
let mut top_n_computer: TopNComputer<f32, u32, _, ()> =
TopNComputer::new_with_comparator(4, NaturalComparator);
top_n_computer.append_doc(1, 0.8);
top_n_computer.append_doc(3, 0.2);
top_n_computer.append_doc(5, 0.3);
@@ -1758,7 +1796,8 @@ mod tests {
#[test]
fn test_top_n_computer_at_capacity() {
let mut top_collector = TopNComputer::new_with_comparator(4, NaturalComparator);
let mut top_collector: TopNComputer<f32, u32, _, ()> =
TopNComputer::new_with_comparator(4, NaturalComparator);
top_collector.append_doc(1, 0.8);
top_collector.append_doc(3, 0.2);
top_collector.append_doc(5, 0.3);
@@ -1795,12 +1834,14 @@ mod tests {
let doc_ids_collection = [4, 5, 6];
let score = 3.3f32;
let mut top_collector_limit_2 = TopNComputer::new_with_comparator(2, NaturalComparator);
let mut top_collector_limit_2: TopNComputer<f32, u32, _, ()> =
TopNComputer::new_with_comparator(2, NaturalComparator);
for id in &doc_ids_collection {
top_collector_limit_2.append_doc(*id, score);
}
let mut top_collector_limit_3 = TopNComputer::new_with_comparator(3, NaturalComparator);
let mut top_collector_limit_3: TopNComputer<f32, u32, _, ()> =
TopNComputer::new_with_comparator(3, NaturalComparator);
for id in &doc_ids_collection {
top_collector_limit_3.append_doc(*id, score);
}
@@ -1821,15 +1862,16 @@ mod bench {
#[bench]
fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
let mut top_collector = TopNComputer::new_with_comparator(100, NaturalComparator);
let mut top_collector: TopNComputer<f32, u32, _, ()> =
TopNComputer::new_with_comparator(100, NaturalComparator);
for i in 0..100 {
top_collector.append_doc(i, 0.8);
top_collector.append_doc(i as u32, 0.8);
}
b.iter(|| {
for i in 0..100 {
top_collector.append_doc(i, 0.8);
top_collector.append_doc(i as u32, 0.8);
}
});
}

View File

@@ -79,7 +79,7 @@ mod tests {
use std::ops::{Range, RangeInclusive};
use std::path::Path;
use columnar::StrColumn;
use columnar::{StrColumn, ValueRange};
use common::{ByteCount, DateTimePrecision, HasLen, TerminatingWrite};
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
@@ -944,7 +944,7 @@ mod tests {
let test_range = |range: RangeInclusive<u64>| {
let expected_count = numbers.iter().filter(|num| range.contains(*num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
field.get_row_ids_for_value_range(ValueRange::Inclusive(range), 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expected_count);
};
test_range(50..=50);
@@ -1022,7 +1022,7 @@ mod tests {
let test_range = |range: RangeInclusive<u64>| {
let expected_count = numbers.iter().filter(|num| range.contains(*num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
field.get_row_ids_for_value_range(ValueRange::Inclusive(range), 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expected_count);
};
let test_range_variant = |start, stop| {

View File

@@ -1,7 +1,6 @@
use core::fmt::Debug;
use std::ops::RangeInclusive;
use columnar::Column;
use columnar::{Column, ValueRange};
use crate::{DocId, DocSet, TERMINATED};
@@ -41,7 +40,7 @@ impl VecCursor {
pub(crate) struct RangeDocSet<T> {
/// The range filter on the values.
value_range: RangeInclusive<T>,
value_range: ValueRange<T>,
column: Column<T>,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
@@ -61,8 +60,8 @@ pub(crate) struct RangeDocSet<T> {
const DEFAULT_FETCH_HORIZON: u32 = 128;
impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
pub(crate) fn new(value_range: RangeInclusive<T>, column: Column<T>) -> Self {
if *value_range.start() > column.max_value() || *value_range.end() < column.min_value() {
pub(crate) fn new(value_range: ValueRange<T>, column: Column<T>) -> Self {
if !value_range.intersects(column.min_value(), column.max_value()) {
return Self {
value_range,
column,

View File

@@ -7,7 +7,7 @@ use std::ops::{Bound, RangeInclusive};
use columnar::{
Cardinality, Column, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
NumericalType, StrColumn,
NumericalType, StrColumn, ValueRange,
};
use common::bounds::{BoundsRange, TransformBound};
@@ -154,7 +154,7 @@ impl Weight for FastFieldRangeWeight {
ip_addr_column.min_value(),
ip_addr_column.max_value(),
);
let docset = RangeDocSet::new(value_range, ip_addr_column);
let docset = RangeDocSet::new(ValueRange::Inclusive(value_range), ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
} else if field_type.is_str() {
let Some(str_dict_column): Option<StrColumn> = reader.fast_fields().str(&field_name)?
@@ -426,7 +426,7 @@ fn search_on_u64_ff(
}
}
let docset = RangeDocSet::new(value_range, column);
let docset = RangeDocSet::new(ValueRange::Inclusive(value_range), column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}