Compare commits


3 Commits

Author SHA1 Message Date
Pascal Seitz
2ce485b8cc skip estimate phase for merge multivalue index
Precompute stats for the merged multivalue index and disable the Line encoding
for multivalue indices. That combination allows skipping the first estimation
pass, which yields up to 2x better merge performance on multivalue indices.

This change may decrease compression, as the Line codec compresses very well
for documents that have a fixed number of values per doc. The Line codec
should be replaced.
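
Why Line was attractive here, as a rough standalone illustration (hypothetical numbers and helper, not the tantivy codec code): when every doc has the same number of values, the start offsets form an arithmetic sequence that a linear codec can represent with almost no per-entry cost, while bitpacking still pays log2(max offset) bits per entry.

```
fn main() {
    // Hypothetical: every document has exactly 3 values.
    let num_docs: u64 = 1_000;
    let vals_per_doc: u64 = 3;
    // Start offsets are 0, 3, 6, ..., num_docs * vals_per_doc (num_docs + 1 entries).
    let offsets: Vec<u64> = (0..=num_docs).map(|doc| doc * vals_per_doc).collect();

    // Bitpacking spends ceil(log2(max + 1)) bits on every entry.
    let max = *offsets.last().unwrap();
    let bits_per_entry = 64 - max.leading_zeros();
    let bitpacked_bits = bits_per_entry as u64 * offsets.len() as u64;

    // A linear codec stores slope + intercept and only the residuals,
    // which are all zero when every doc has the same number of values.
    let residuals_all_zero = offsets
        .iter()
        .enumerate()
        .all(|(i, &off)| off == i as u64 * vals_per_doc);

    println!("bitpacked: ~{bitpacked_bits} bits, linear residuals all zero: {residuals_all_zero}");
}
```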

```
merge_multi_and_multi          Avg: 22.7880ms (-47.15%)    Median: 22.5469ms (-47.38%)    [22.3691ms .. 25.8392ms]
merge_dense_and_dense          Avg: 14.4398ms (+2.18%)     Median: 14.2465ms (+0.74%)     [14.1620ms .. 16.1270ms]
merge_sparse_and_sparse        Avg: 10.6559ms (+1.10%)     Median: 10.6318ms (+0.91%)     [10.5527ms .. 11.2848ms]
merge_sparse_and_dense         Avg: 12.4886ms (+1.52%)     Median: 12.4044ms (+0.84%)     [12.3261ms .. 13.9439ms]
merge_multi_and_dense          Avg: 25.6686ms (-45.56%)    Median: 25.4851ms (-45.84%)    [25.1618ms .. 27.6226ms]
merge_multi_and_sparse         Avg: 24.3278ms (-47.00%)    Median: 24.1917ms (-47.34%)    [23.7159ms .. 27.0513ms]
```
2024-06-11 20:22:00 +08:00
PSeitz
c3b92a5412 fix compiler warning, cleanup (#2393)
fix compiler warning for missing feature flag
remove unused variables
clean up unused methods
2024-06-11 16:03:50 +08:00
PSeitz
2f55511064 extend indexwriter proptests (#2342)
* index random values in proptest

* add proptest with multiple docs
2024-06-11 16:02:57 +08:00
20 changed files with 450 additions and 268 deletions

View File

@@ -22,7 +22,7 @@ impl Display for Card {
} }
} }
const NUM_DOCS: u32 = 100_000; const NUM_DOCS: u32 = 1_000_000;
fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader { fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
use tantivy_columnar::ColumnarWriter; use tantivy_columnar::ColumnarWriter;
@@ -88,12 +88,8 @@ fn main() {
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>(); let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]); let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
let _ = black_box(merge_columnar( merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
&columnar_readers, black_box(out);
&[],
merge_row_order.into(),
&mut out,
));
}, },
); );
} }
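
Side note on the bench change above: the merge result is now unwrapped and only the output buffer is passed through black_box. A minimal, self-contained sketch of that pattern using std::hint::black_box (the work function is a stand-in, not the actual merge):

```
use std::hint::black_box;

// Stand-in for the real merge; the point is only where black_box is applied.
fn expensive_work(input: &[u64], out: &mut Vec<u64>) {
    out.extend(input.iter().map(|x| x * 2));
}

fn main() {
    let input: Vec<u64> = (0..1_000).collect();
    let mut out = Vec::new();
    // Run the work normally (the real bench unwraps the io::Result),
    // then hand the output to black_box so the optimizer cannot discard it.
    expensive_work(black_box(&input), &mut out);
    black_box(out);
}
```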

View File

@@ -73,14 +73,18 @@ fn detect_cardinality(
pub fn merge_column_index<'a>( pub fn merge_column_index<'a>(
columns: &'a [ColumnIndex], columns: &'a [ColumnIndex],
merge_row_order: &'a MergeRowOrder, merge_row_order: &'a MergeRowOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> { ) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be // For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes. // downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns, merge_row_order); let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
match merge_row_order { match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => { MergeRowOrder::Stack(stack_merge_order) => merge_column_index_stacked(
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order) columns,
} cardinality_after_merge,
stack_merge_order,
num_values,
),
MergeRowOrder::Shuffled(complex_merge_order) => { MergeRowOrder::Shuffled(complex_merge_order) => {
merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order) merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
} }
@@ -167,8 +171,12 @@ mod tests {
], ],
) )
.into(); .into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order); let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 3);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else { let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index") panic!("Excpected a multivalued index")
}; };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect(); let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
@@ -200,8 +208,12 @@ mod tests {
], ],
) )
.into(); .into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order); let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 6);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else { let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index") panic!("Excpected a multivalued index")
}; };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect(); let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();

View File

@@ -22,7 +22,10 @@ pub fn merge_column_index_shuffled<'a>(
Cardinality::Multivalued => { Cardinality::Multivalued => {
let multivalue_start_index = let multivalue_start_index =
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order); merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(multivalue_start_index) SerializableColumnIndex::Multivalued {
indices: multivalue_start_index,
stats: None,
}
} }
} }
} }

View File

@@ -1,6 +1,8 @@
use std::iter; use std::iter;
use std::num::NonZeroU64;
use crate::column_index::{SerializableColumnIndex, Set}; use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_values::ColumnStats;
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder}; use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -12,6 +14,7 @@ pub fn merge_column_index_stacked<'a>(
columns: &'a [ColumnIndex], columns: &'a [ColumnIndex],
cardinality_after_merge: Cardinality, cardinality_after_merge: Cardinality,
stack_merge_order: &'a StackMergeOrder, stack_merge_order: &'a StackMergeOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> { ) -> SerializableColumnIndex<'a> {
match cardinality_after_merge { match cardinality_after_merge {
Cardinality::Full => SerializableColumnIndex::Full, Cardinality::Full => SerializableColumnIndex::Full,
@@ -27,7 +30,17 @@ pub fn merge_column_index_stacked<'a>(
columns, columns,
stack_merge_order, stack_merge_order,
}; };
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index)) SerializableColumnIndex::Multivalued {
indices: Box::new(stacked_multivalued_index),
stats: Some(ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
// The values in the multivalue index are the positions of the values
min_value: 0,
max_value: num_values as u64,
// This is num docs, but it starts at 0 so we need +1
num_rows: stack_merge_order.num_rows() + 1,
}),
}
} }
} }
} }
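
The stats precomputed above rest on a simple invariant: for a stacked merge, the multivalue index is the prefix sum of per-document value counts, so it starts at 0, ends at the total number of values, has gcd 1, and holds one more row than there are documents. A standalone sketch of that invariant (hypothetical Stats struct, not tantivy's ColumnStats):

```
/// Hypothetical stand-in for the precomputed column stats.
#[derive(Debug)]
struct Stats {
    min_value: u64,
    max_value: u64,
    num_rows: u32,
}

/// Offsets of a stacked multivalue index are the prefix sums of the
/// per-document value counts, starting at 0.
fn offsets(value_counts_per_doc: &[u32]) -> Vec<u64> {
    let mut offsets = Vec::with_capacity(value_counts_per_doc.len() + 1);
    let mut acc = 0u64;
    offsets.push(acc);
    for &count in value_counts_per_doc {
        acc += count as u64;
        offsets.push(acc);
    }
    offsets
}

fn main() {
    let counts = [2u32, 0, 3, 1];
    let offs = offsets(&counts);
    let num_values: u64 = counts.iter().map(|&c| c as u64).sum();
    // The stats can be derived without scanning the offsets themselves.
    let stats = Stats {
        min_value: 0,
        max_value: num_values,
        num_rows: counts.len() as u32 + 1,
    };
    assert_eq!(*offs.first().unwrap(), stats.min_value);
    assert_eq!(*offs.last().unwrap(), stats.max_value);
    assert_eq!(offs.len() as u32, stats.num_rows);
    println!("{stats:?}");
}
```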

View File

@@ -6,20 +6,29 @@ use std::sync::Arc;
use common::OwnedBytes; use common::OwnedBytes;
use crate::column_values::{ use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues, load_u64_based_column_values, serialize_u64_based_column_values,
serialize_u64_with_codec_and_stats, CodecType, ColumnStats, ColumnValues,
}; };
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{DocId, RowId}; use crate::{DocId, RowId};
pub fn serialize_multivalued_index( pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>, multivalued_index: &dyn Iterable<RowId>,
stats: Option<ColumnStats>,
output: &mut impl Write, output: &mut impl Write,
) -> io::Result<()> { ) -> io::Result<()> {
serialize_u64_based_column_values( if let Some(stats) = stats {
multivalued_index, // TODO: Add something with higher compression that doesn't require a full scan upfront
&[CodecType::Bitpacked, CodecType::Linear], let estimator = CodecType::Bitpacked.estimator();
output, assert!(!estimator.requires_full_scan());
)?; serialize_u64_with_codec_and_stats(multivalued_index, estimator, stats, output)?;
} else {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
}
Ok(()) Ok(())
} }
@@ -52,7 +61,7 @@ impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
impl MultiValueIndex { impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex { pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap(); serialize_multivalued_index(&start_offsets, None, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer); let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes).unwrap() open_multivalued_index(bytes).unwrap()
} }

View File

@@ -6,6 +6,7 @@ use common::{CountingWriter, OwnedBytes};
use crate::column_index::multivalued_index::serialize_multivalued_index; use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index; use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex; use crate::column_index::ColumnIndex;
use crate::column_values::ColumnStats;
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{Cardinality, RowId}; use crate::{Cardinality, RowId};
@@ -15,9 +16,12 @@ pub enum SerializableColumnIndex<'a> {
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>, non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
num_rows: RowId, num_rows: RowId,
}, },
// TODO remove the Arc<dyn> apart from serialization this is not Multivalued {
// dynamic at all. /// Iterator emitting the indices for the index
Multivalued(Box<dyn Iterable<RowId> + 'a>), indices: Box<dyn Iterable<RowId> + 'a>,
/// In the merge case we can precompute the column stats
stats: Option<ColumnStats>,
},
} }
impl<'a> SerializableColumnIndex<'a> { impl<'a> SerializableColumnIndex<'a> {
@@ -25,7 +29,7 @@ impl<'a> SerializableColumnIndex<'a> {
match self { match self {
SerializableColumnIndex::Full => Cardinality::Full, SerializableColumnIndex::Full => Cardinality::Full,
SerializableColumnIndex::Optional { .. } => Cardinality::Optional, SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued, SerializableColumnIndex::Multivalued { .. } => Cardinality::Multivalued,
} }
} }
} }
@@ -44,9 +48,10 @@ pub fn serialize_column_index(
non_null_row_ids, non_null_row_ids,
num_rows, num_rows,
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?, } => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
SerializableColumnIndex::Multivalued(multivalued_index) => { SerializableColumnIndex::Multivalued {
serialize_multivalued_index(&*multivalued_index, &mut output)? indices: multivalued_index,
} stats,
} => serialize_multivalued_index(&*multivalued_index, stats, &mut output)?,
} }
let column_index_num_bytes = output.written_bytes() as u32; let column_index_num_bytes = output.written_bytes() as u32;
Ok(column_index_num_bytes) Ok(column_index_num_bytes)

View File

@@ -32,7 +32,8 @@ pub use u128_based::{
}; };
pub use u64_based::{ pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values, load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES, serialize_u64_based_column_values, serialize_u64_with_codec_and_stats, CodecType,
ALL_U64_CODEC_TYPES,
}; };
pub use vec_column::VecColumn; pub use vec_column::VecColumn;

View File

@@ -128,6 +128,9 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
bit_packer.close(wrt)?; bit_packer.close(wrt)?;
Ok(()) Ok(())
} }
fn codec_type(&self) -> super::CodecType {
super::CodecType::Bitpacked
}
} }
pub struct BitpackedCodec; pub struct BitpackedCodec;

View File

@@ -163,6 +163,10 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
Ok(()) Ok(())
} }
fn codec_type(&self) -> super::CodecType {
super::CodecType::BlockwiseLinear
}
} }
pub struct BlockwiseLinearCodec; pub struct BlockwiseLinearCodec;

View File

@@ -153,6 +153,12 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
self.collect_before_line_estimation(value); self.collect_before_line_estimation(value);
} }
} }
fn requires_full_scan(&self) -> bool {
true
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Linear
}
} }
impl LinearCodecEstimator { impl LinearCodecEstimator {

View File

@@ -37,7 +37,11 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
/// This method will be called for each element of the column during /// This method will be called for each element of the column during
/// `estimation`. /// `estimation`.
fn collect(&mut self, value: u64); fn collect(&mut self, value: u64);
/// Finalizes the first pass phase. /// Returns true if the estimator needs a full pass over the column before serialization
fn requires_full_scan(&self) -> bool {
false
}
fn codec_type(&self) -> CodecType;
fn finalize(&mut self) {} fn finalize(&mut self) {}
/// Returns an accurate estimation of the number of bytes that will /// Returns an accurate estimation of the number of bytes that will
/// be used to represent this column. /// be used to represent this column.
@@ -150,34 +154,45 @@ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
wrt: &mut dyn Write, wrt: &mut dyn Write,
) -> io::Result<()> { ) -> io::Result<()> {
let mut stats_collector = StatsCollector::default(); let mut stats_collector = StatsCollector::default();
let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> = let mut estimators: Vec<Box<dyn ColumnCodecEstimator>> = Vec::with_capacity(codec_types.len());
Vec::with_capacity(codec_types.len());
for &codec_type in codec_types { for &codec_type in codec_types {
estimators.push((codec_type, codec_type.estimator())); estimators.push(codec_type.estimator());
} }
for val in vals.boxed_iter() { for val in vals.boxed_iter() {
let val_u64 = val.to_u64(); let val_u64 = val.to_u64();
stats_collector.collect(val_u64); stats_collector.collect(val_u64);
for (_, estimator) in &mut estimators { for estimator in &mut estimators {
estimator.collect(val_u64); estimator.collect(val_u64);
} }
} }
for (_, estimator) in &mut estimators { for estimator in &mut estimators {
estimator.finalize(); estimator.finalize();
} }
let stats = stats_collector.stats(); let stats = stats_collector.stats();
let (_, best_codec, best_codec_estimator) = estimators let (_, best_codec) = estimators
.into_iter() .into_iter()
.flat_map(|(codec_type, estimator)| { .flat_map(|estimator| {
let num_bytes = estimator.estimate(&stats)?; let num_bytes = estimator.estimate(&stats)?;
Some((num_bytes, codec_type, estimator)) Some((num_bytes, estimator))
}) })
.min_by_key(|(num_bytes, _, _)| *num_bytes) .min_by_key(|(num_bytes, _)| *num_bytes)
.ok_or_else(|| { .ok_or_else(|| {
io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.") io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
})?; })?;
best_codec.to_code().serialize(wrt)?; serialize_u64_with_codec_and_stats(vals, best_codec, stats, wrt)?;
best_codec_estimator.serialize( Ok(())
}
/// Serializes a given column of u64-mapped values.
/// The codec estimator needs to be collected fully for the Line codec before calling this.
pub fn serialize_u64_with_codec_and_stats<T: MonotonicallyMappableToU64>(
vals: &dyn Iterable<T>,
codec: Box<dyn ColumnCodecEstimator>,
stats: ColumnStats,
wrt: &mut dyn Write,
) -> io::Result<()> {
codec.codec_type().to_code().serialize(wrt)?;
codec.serialize(
&stats, &stats,
&mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64), &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
wrt, wrt,
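
The refactor above keeps the selection logic itself unchanged: run every estimator over the values, pick the one with the smallest estimated size via min_by_key, then serialize with it. A simplified sketch of that pattern with a hypothetical estimator trait (not the tantivy ColumnCodecEstimator):

```
// Hypothetical trait to illustrate the "estimate, then pick the smallest"
// selection; the real trait also carries stats and serialization.
trait SizeEstimator {
    fn name(&self) -> &'static str;
    /// Returns None if the codec is not applicable to these values.
    fn estimate(&self, vals: &[u64]) -> Option<u64>;
}

struct Raw;
impl SizeEstimator for Raw {
    fn name(&self) -> &'static str { "raw" }
    fn estimate(&self, vals: &[u64]) -> Option<u64> { Some(vals.len() as u64 * 8) }
}

struct Bitpacked;
impl SizeEstimator for Bitpacked {
    fn name(&self) -> &'static str { "bitpacked" }
    fn estimate(&self, vals: &[u64]) -> Option<u64> {
        let max = vals.iter().copied().max()?;
        let bits = 64 - max.leading_zeros() as u64;
        Some((bits * vals.len() as u64 + 7) / 8)
    }
}

fn main() {
    let vals: Vec<u64> = (0..100).map(|i| i * 3).collect();
    let estimators: Vec<Box<dyn SizeEstimator>> = vec![Box::new(Raw), Box::new(Bitpacked)];
    // Estimate each codec and keep the cheapest applicable one.
    let best = estimators
        .iter()
        .filter_map(|e| e.estimate(&vals).map(|bytes| (bytes, e)))
        .min_by_key(|(bytes, _)| *bytes)
        .expect("no applicable codec");
    println!("best codec: {} ({} bytes)", best.1.name(), best.0);
}
```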

View File

@@ -3,7 +3,7 @@ mod merge_mapping;
mod term_merger; mod term_merger;
use std::collections::{BTreeMap, HashSet}; use std::collections::{BTreeMap, HashSet};
use std::io; use std::io::{self};
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use std::sync::Arc; use std::sync::Arc;
@@ -156,8 +156,15 @@ fn merge_column(
column_values.push(None); column_values.push(None);
} }
} }
let merged_column_index = let num_values: u32 = column_values
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order); .iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues { let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..], column_indexes: &column_indexes[..],
column_values: &column_values[..], column_values: &column_values[..],
@@ -183,8 +190,15 @@ fn merge_column(
} }
} }
let merged_column_index = let num_values: u32 = column_values
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order); .iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues { let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..], column_indexes: &column_indexes[..],
column_values: &column_values, column_values: &column_values,
@@ -214,8 +228,19 @@ fn merge_column(
} }
} }
} }
let merged_column_index = let num_values: u32 = bytes_columns
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order); .iter()
.map(|vals| {
vals.as_ref()
.map(|idx| idx.term_ord_column.values.num_vals())
.unwrap_or(0)
})
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?; merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
} }
} }

View File

@@ -644,7 +644,10 @@ fn send_to_serialize_column_mappable_to_u128<
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder(); let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values); consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows); let multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index)) SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: Default::default(), // TODO: implement stats for u128
}
} }
}; };
crate::column::serialize_column_mappable_to_u128( crate::column::serialize_column_mappable_to_u128(
@@ -699,7 +702,10 @@ fn send_to_serialize_column_mappable_to_u64(
if sort_values_within_row { if sort_values_within_row {
sort_values_within_row_in_place(multivalued_index, values); sort_values_within_row_in_place(multivalued_index, values);
} }
SerializableColumnIndex::Multivalued(Box::new(multivalued_index)) SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: None,
}
} }
}; };
crate::column::serialize_column_mappable_to_u64( crate::column::serialize_column_mappable_to_u64(

View File

@@ -738,35 +738,22 @@ proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))] #![proptest_config(ProptestConfig::with_cases(1000))]
#[test] #[test]
fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) { fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter() test_columnar_docs(columnar_docs);
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
} }
#[test] fn test_columnar_docs(columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>) {
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
let columnar_readers: Vec<ColumnarReader> = columnar_docs let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter() .iter()
.map(|docs| build_columnar(&docs[..])) .map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect(); let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new(); let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]); let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar( crate::merge_columnar(
&columnar_readers_arr[..], &columnar_readers_arr[..],
&[], &[],
crate::MergeRowOrder::Stack(stack_merge_order), stack_merge_order,
&mut output, &mut output,
) )
.unwrap(); .unwrap();
@@ -777,6 +764,24 @@ fn test_columnar_merging_empty_columnar() {
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_simple() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
vec![],
vec![vec![
("c1", ColumnValue::Numerical(0u64.into())),
("c1", ColumnValue::Numerical(0u64.into())),
]],
];
test_columnar_docs(columnar_docs);
}
#[test] #[test]
fn test_columnar_merging_number_columns() { fn test_columnar_merging_number_columns() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![ let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
@@ -793,25 +798,7 @@ fn test_columnar_merging_number_columns() {
vec![("c2", ColumnValue::Numerical(u64::MAX.into()))], vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
], ],
]; ];
let columnar_readers: Vec<ColumnarReader> = columnar_docs test_columnar_docs(columnar_docs);
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
// TODO add non trivial remap and merge // TODO add non trivial remap and merge

View File

@@ -22,3 +22,6 @@ serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies] [dev-dependencies]
proptest = "1.0.0" proptest = "1.0.0"
rand = "0.8.4" rand = "0.8.4"
[features]
unstable = [] # useful for benches.
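
A minimal sketch of how a feature flag like this is typically consumed, assuming `unstable` only gates bench-only helpers (hypothetical module, not the actual crate layout):

```
// Compiled only with `cargo bench --features unstable` (or any build that
// enables the flag); regular builds skip this module entirely.
#[cfg(feature = "unstable")]
pub mod bench_support {
    /// Hypothetical helper only needed by benches.
    pub fn generate_values(n: usize) -> Vec<u64> {
        (0..n as u64).collect()
    }
}
```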

View File

@@ -808,7 +808,7 @@ mod tests {
use proptest::prop_oneof; use proptest::prop_oneof;
use super::super::operation::UserOperation; use super::super::operation::UserOperation;
use crate::collector::TopDocs; use crate::collector::{Count, TopDocs};
use crate::directory::error::LockError; use crate::directory::error::LockError;
use crate::error::*; use crate::error::*;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
@@ -1572,20 +1572,74 @@ mod tests {
Ok(()) Ok(())
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone)]
enum IndexingOp { enum IndexingOp {
AddDoc { id: u64 }, AddMultipleDoc {
DeleteDoc { id: u64 }, id: u64,
DeleteDocQuery { id: u64 }, num_docs: u64,
value: IndexValue,
},
AddDoc {
id: u64,
value: IndexValue,
},
DeleteDoc {
id: u64,
},
DeleteDocQuery {
id: u64,
},
Commit, Commit,
Merge, Merge,
} }
impl IndexingOp {
fn add(id: u64) -> Self {
IndexingOp::AddDoc {
id,
value: IndexValue::F64(id as f64),
}
}
}
use serde::Serialize;
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
enum IndexValue {
Str(String),
F64(f64),
U64(u64),
I64(i64),
}
impl Default for IndexValue {
fn default() -> Self {
IndexValue::F64(0.0)
}
}
fn value_strategy() -> impl Strategy<Value = IndexValue> {
prop_oneof![
any::<f64>().prop_map(IndexValue::F64),
any::<u64>().prop_map(IndexValue::U64),
any::<i64>().prop_map(IndexValue::I64),
any::<String>().prop_map(IndexValue::Str),
]
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> { fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![ prop_oneof![
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }), (0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }), (0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }), (0u64..20u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
((0u64..20u64), (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
(0u64..1u64).prop_map(|_| IndexingOp::Commit), (0u64..1u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge), (0u64..1u64).prop_map(|_| IndexingOp::Merge),
] ]
@@ -1595,7 +1649,17 @@ mod tests {
prop_oneof![ prop_oneof![
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }), 5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }), 5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }), 50 => (0u64..100u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
50 => (0u64..100u64, (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit), 2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge), 1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
] ]
@@ -1604,19 +1668,27 @@ mod tests {
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) { fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new(); let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new(); let mut deleted_ids = HashSet::new();
for &op in ops { for op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id, value: _ } => {
*existing_ids.entry(id).or_insert(0) += 1; *existing_ids.entry(*id).or_insert(0) += 1;
deleted_ids.remove(&id); deleted_ids.remove(id);
}
IndexingOp::AddMultipleDoc {
id,
num_docs,
value: _,
} => {
*existing_ids.entry(*id).or_insert(0) += num_docs;
deleted_ids.remove(id);
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id); existing_ids.remove(&id);
deleted_ids.insert(id); deleted_ids.insert(*id);
} }
IndexingOp::DeleteDocQuery { id } => { IndexingOp::DeleteDocQuery { id } => {
existing_ids.remove(&id); existing_ids.remove(&id);
deleted_ids.insert(id); deleted_ids.insert(*id);
} }
_ => {} _ => {}
} }
@@ -1626,16 +1698,19 @@ mod tests {
fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> { fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
let mut id_list = Vec::new(); let mut id_list = Vec::new();
for &op in ops { for op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id, value: _ } => {
id_list.push(id); id_list.push(*id);
}
IndexingOp::AddMultipleDoc { id, .. } => {
id_list.push(*id);
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
id_list.retain(|el| *el != id); id_list.retain(|el| el != id);
} }
IndexingOp::DeleteDocQuery { id } => { IndexingOp::DeleteDocQuery { id } => {
id_list.retain(|el| *el != id); id_list.retain(|el| el != id);
} }
_ => {} _ => {}
} }
@@ -1716,42 +1791,59 @@ mod tests {
let ip_from_id = |id| Ipv6Addr::from_u128(id as u128); let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);
for &op in ops { let add_docs = |index_writer: &mut IndexWriter,
match op { id: u64,
IndexingOp::AddDoc { id } => { value: IndexValue,
let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); num: u64|
let ip = ip_from_id(id); -> crate::Result<()> {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
if !id_is_full_doc(id) { let ip = ip_from_id(id);
// every 3rd doc has no ip field let doc = if !id_is_full_doc(id) {
index_writer.add_document(doc!( // every 3rd doc has no ip field
id_field=>id, doc!(
))?; id_field=>id,
} else { )
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()}); } else {
index_writer.add_document(doc!(id_field=>id, let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string(), "val": value});
json_field=>json, doc!(id_field=>id,
bytes_field => id.to_le_bytes().as_slice(), json_field=>json,
id_opt_field => id, bytes_field => id.to_le_bytes().as_slice(),
ip_field => ip, id_opt_field => id,
ips_field => ip, ip_field => ip,
ips_field => ip, ips_field => ip,
multi_numbers=> id, ips_field => ip,
multi_numbers => id, multi_numbers=> id,
bool_field => (id % 2u64) != 0, multi_numbers => id,
i64_field => id as i64, bool_field => (id % 2u64) != 0,
f64_field => id as f64, i64_field => id as i64,
date_field => DateTime::from_timestamp_secs(id as i64), f64_field => id as f64,
multi_bools => (id % 2u64) != 0, date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) == 0, multi_bools => (id % 2u64) != 0,
text_field => id.to_string(), multi_bools => (id % 2u64) == 0,
facet_field => facet, text_field => id.to_string(),
large_text_field => LOREM, facet_field => facet,
multi_text_fields => multi_text_field_text1, large_text_field => LOREM,
multi_text_fields => multi_text_field_text2, multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text3, multi_text_fields => multi_text_field_text2,
))?; multi_text_fields => multi_text_field_text3,
} )
};
for _ in 0..num {
index_writer.add_document(doc.clone())?;
}
Ok(())
};
for op in ops {
match op.clone() {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
} => {
add_docs(&mut index_writer, id, value, num_docs)?;
}
IndexingOp::AddDoc { id, value } => {
add_docs(&mut index_writer, id, value, 1)?;
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id)); index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -2032,18 +2124,22 @@ mod tests {
top_docs.iter().map(|el| el.1).collect::<Vec<_>>() top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
}; };
let count_search = |term: &str, field| {
let query = QueryParser::for_index(&index, vec![field])
.parse_query(term)
.unwrap();
searcher.search(&query, &Count).unwrap()
};
let do_search2 = |term: Term| { let count_search2 = |term: Term| {
let query = TermQuery::new(term, IndexRecordOption::Basic); let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &Count).unwrap()
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
}; };
for (id, count) in &expected_ids_and_num_occurrences { for (id, count) in &expected_ids_and_num_occurrences {
// skip expensive queries
let (existing_id, count) = (*id, *count); let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64; let get_num_hits = |field| count_search(&existing_id.to_string(), field) as u64;
assert_eq!(get_num_hits(id_field), count); assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) { if !id_is_full_doc(existing_id) {
continue; continue;
@@ -2053,29 +2149,31 @@ mod tests {
assert_eq!(get_num_hits(f64_field), count); assert_eq!(get_num_hits(f64_field), count);
// Test multi text // Test multi text
assert_eq!( if num_docs_with_values < 1000 {
do_search("\"test1 test2\"", multi_text_fields).len(), assert_eq!(
num_docs_with_values do_search("\"test1 test2\"", multi_text_fields).len(),
); num_docs_with_values
assert_eq!( );
do_search("\"test2 test3\"", multi_text_fields).len(), assert_eq!(
num_docs_with_values do_search("\"test2 test3\"", multi_text_fields).len(),
); num_docs_with_values
);
}
// Test bytes // Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice()); let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, count); assert_eq!(count_search2(term) as u64, count);
// Test date // Test date
let term = Term::from_field_date( let term = Term::from_field_date(
date_field, date_field,
DateTime::from_timestamp_secs(existing_id as i64), DateTime::from_timestamp_secs(existing_id as i64),
); );
assert_eq!(do_search2(term).len() as u64, count); assert_eq!(count_search2(term) as u64, count);
} }
for deleted_id in deleted_ids { for deleted_id in deleted_ids {
let assert_field = |field| { let assert_field = |field| {
assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0); assert_eq!(count_search(&deleted_id.to_string(), field) as u64, 0);
}; };
assert_field(text_field); assert_field(text_field);
assert_field(f64_field); assert_field(f64_field);
@@ -2084,12 +2182,12 @@ mod tests {
// Test bytes // Test bytes
let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice()); let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, 0); assert_eq!(count_search2(term), 0);
// Test date // Test date
let term = let term =
Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64)); Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
assert_eq!(do_search2(term).len() as u64, 0); assert_eq!(count_search2(term), 0);
} }
// search ip address // search ip address
// //
@@ -2098,13 +2196,13 @@ mod tests {
if !id_is_full_doc(existing_id) { if !id_is_full_doc(existing_id) {
continue; continue;
} }
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let ip_addr = Ipv6Addr::from_u128(existing_id as u128); let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
// Test incoming ip as ipv6 // Test incoming ip as ipv6
assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count); assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count);
let term = Term::from_field_ip_addr(ip_field, ip_addr); let term = Term::from_field_ip_addr(ip_field, ip_addr);
assert_eq!(do_search2(term).len() as u64, count); assert_eq!(count_search2(term) as u64, count);
// Test incoming ip as ipv4 // Test incoming ip as ipv4
if let Some(ip_addr) = ip_addr.to_ipv4_mapped() { if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
@@ -2121,7 +2219,7 @@ mod tests {
if !sample.is_empty() { if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2); let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| { let calc_expected_count = |sample: &[(&u64, &u64)]| {
sample sample
.iter() .iter()
.filter(|(id, _)| id_is_full_doc(**id)) .filter(|(id, _)| id_is_full_doc(**id))
@@ -2137,18 +2235,17 @@ mod tests {
} }
// Query first half // Query first half
if !left_sample.is_empty() { let expected_count = calc_expected_count(left_sample);
let expected_count = expected_count(left_sample); if !left_sample.is_empty() && expected_count < 1000 {
let start_range = *left_sample[0].0; let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0; let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range); let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count); assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field // Range query on ip field
let ip1 = ip_from_id(start_range); let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range); let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2); let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count); assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2); let query = gen_query_inclusive("ip", "*", ip2);
@@ -2160,19 +2257,19 @@ mod tests {
assert_eq!(do_search_ip_field(&query), expected_count); assert_eq!(do_search_ip_field(&query), expected_count);
} }
// Query second half // Query second half
if !right_sample.is_empty() { let expected_count = calc_expected_count(right_sample);
let expected_count = expected_count(right_sample); if !right_sample.is_empty() && expected_count < 1000 {
let start_range = *right_sample[0].0; let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0; let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field // Range query on id opt field
let query = let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string()); gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count); assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field // Range query on ip field
let ip1 = ip_from_id(start_range); let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range); let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2); let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count); assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*"); let query = gen_query_inclusive("ip", ip1, "*");
@@ -2197,7 +2294,7 @@ mod tests {
}; };
let ip = ip_from_id(existing_id); let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
// Range query on single value field // Range query on single value field
let query = gen_query_inclusive("ip", ip, ip); let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count); assert_eq!(do_search_ip_field(&query), count);
@@ -2257,7 +2354,7 @@ mod tests {
#[test] #[test]
fn test_fast_field_range() { fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect(); let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok()); assert!(test_operation_strategy(&ops, false, true).is_ok());
} }
@@ -2265,8 +2362,8 @@ mod tests {
fn test_sort_index_on_opt_field_regression() { fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 81 }, IndexingOp::add(81),
IndexingOp::AddDoc { id: 70 }, IndexingOp::add(70),
IndexingOp::DeleteDoc { id: 70 } IndexingOp::DeleteDoc { id: 70 }
], ],
true, true,
@@ -2275,14 +2372,45 @@ mod tests {
.is_ok()); .is_ok());
} }
#[test]
fn test_simple_multiple_doc() {
assert!(test_operation_strategy(
&[
IndexingOp::AddMultipleDoc {
id: 7,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 92,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 30,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 33,
num_docs: 800,
value: IndexValue::U64(0),
},
],
true,
false
)
.is_ok());
}
#[test] #[test]
fn test_ip_range_query_multivalue_bug() { fn test_ip_range_query_multivalue_bug() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 2 }, IndexingOp::add(2),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::Merge IndexingOp::Merge
], ],
@@ -2296,11 +2424,11 @@ mod tests {
fn test_ff_num_ips_regression() { fn test_ff_num_ips_regression() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 13 }, IndexingOp::add(13),
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::DeleteDocQuery { id: 13 }, IndexingOp::DeleteDocQuery { id: 13 },
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::Commit, IndexingOp::Commit,
], ],
false, false,
@@ -2312,7 +2440,7 @@ mod tests {
#[test] #[test]
fn test_minimal_sort_force_end_merge() { fn test_minimal_sort_force_end_merge() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[IndexingOp::AddDoc { id: 23 }, IndexingOp::AddDoc { id: 13 },], &[IndexingOp::add(23), IndexingOp::add(13),],
false, false,
false false
) )
@@ -2373,8 +2501,8 @@ mod tests {
fn test_minimal_sort_force_end_merge_with_delete() { fn test_minimal_sort_force_end_merge_with_delete() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 23 }, IndexingOp::add(23),
IndexingOp::AddDoc { id: 13 }, IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 } IndexingOp::DeleteDoc { id: 13 }
], ],
true, true,
@@ -2387,8 +2515,8 @@ mod tests {
fn test_minimal_no_sort_no_force_end_merge() { fn test_minimal_no_sort_no_force_end_merge() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 23 }, IndexingOp::add(23),
IndexingOp::AddDoc { id: 13 }, IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 } IndexingOp::DeleteDoc { id: 13 }
], ],
false, false,
@@ -2399,7 +2527,7 @@ mod tests {
#[test] #[test]
fn test_minimal_sort_merge() { fn test_minimal_sort_merge() {
assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok()); assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
} }
use proptest::prelude::*; use proptest::prelude::*;
@@ -2495,14 +2623,14 @@ mod tests {
fn test_delete_bug_reproduction_ip_addr() { fn test_delete_bug_reproduction_ip_addr() {
use IndexingOp::*; use IndexingOp::*;
let ops = &[ let ops = &[
AddDoc { id: 1 }, IndexingOp::add(1),
AddDoc { id: 2 }, IndexingOp::add(2),
Commit, Commit,
AddDoc { id: 3 }, IndexingOp::add(3),
DeleteDoc { id: 1 }, DeleteDoc { id: 1 },
Commit, Commit,
Merge, Merge,
AddDoc { id: 4 }, IndexingOp::add(4),
Commit, Commit,
]; ];
test_operation_strategy(&ops[..], false, true).unwrap(); test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2511,7 +2639,13 @@ mod tests {
#[test] #[test]
fn test_merge_regression_1() { fn test_merge_regression_1() {
use IndexingOp::*; use IndexingOp::*;
let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge]; let ops = &[
IndexingOp::add(15),
Commit,
IndexingOp::add(9),
Commit,
Merge,
];
test_operation_strategy(&ops[..], false, true).unwrap(); test_operation_strategy(&ops[..], false, true).unwrap();
} }
@@ -2519,9 +2653,9 @@ mod tests {
fn test_range_query_bug_1() { fn test_range_query_bug_1() {
use IndexingOp::*; use IndexingOp::*;
let ops = &[ let ops = &[
AddDoc { id: 9 }, IndexingOp::add(9),
AddDoc { id: 0 }, IndexingOp::add(0),
AddDoc { id: 13 }, IndexingOp::add(13),
Commit, Commit,
]; ];
test_operation_strategy(&ops[..], false, true).unwrap(); test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2529,12 +2663,11 @@ mod tests {
#[test] #[test]
fn test_range_query_bug_2() { fn test_range_query_bug_2() {
use IndexingOp::*;
let ops = &[ let ops = &[
AddDoc { id: 3 }, IndexingOp::add(3),
AddDoc { id: 6 }, IndexingOp::add(6),
AddDoc { id: 9 }, IndexingOp::add(9),
AddDoc { id: 10 }, IndexingOp::add(10),
]; ];
test_operation_strategy(&ops[..], false, false).unwrap(); test_operation_strategy(&ops[..], false, false).unwrap();
} }
@@ -2556,7 +2689,7 @@ mod tests {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::DeleteDoc { id: 0 }, IndexingOp::DeleteDoc { id: 0 },
IndexingOp::AddDoc { id: 6 }, IndexingOp::add(6),
IndexingOp::DeleteDocQuery { id: 11 }, IndexingOp::DeleteDocQuery { id: 11 },
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::Merge, IndexingOp::Merge,
@@ -2573,10 +2706,13 @@ mod tests {
fn test_bug_1617_2() { fn test_bug_1617_2() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 13 }, IndexingOp::AddDoc {
id: 13,
value: Default::default()
},
IndexingOp::DeleteDoc { id: 13 }, IndexingOp::DeleteDoc { id: 13 },
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::AddDoc { id: 30 }, IndexingOp::add(30),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::Merge, IndexingOp::Merge,
], ],
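
The new strategies above weight additions much more heavily than deletes, commits, and merges. A self-contained sketch of the weighted prop_oneof! pattern, assuming the proptest crate and a simplified op type (not tantivy's IndexingOp):

```
use proptest::prelude::*;

// Simplified stand-in for the indexing operations generated in the test.
#[derive(Debug, Clone)]
enum Op {
    Add { id: u64, num_docs: u64 },
    Delete { id: u64 },
    Commit,
}

fn op_strategy() -> impl Strategy<Value = Op> {
    prop_oneof![
        // Weights bias the generator towards additions.
        50 => (0u64..100u64, 1u64..100u64)
            .prop_map(|(id, num_docs)| Op::Add { id, num_docs }),
        5 => (0u64..100u64).prop_map(|id| Op::Delete { id }),
        2 => Just(Op::Commit),
    ]
}

proptest! {
    #[test]
    fn ops_are_well_formed(ops in proptest::collection::vec(op_strategy(), 1..20)) {
        for op in &ops {
            if let Op::Add { num_docs, .. } = op {
                prop_assert!(*num_docs >= 1);
            }
        }
    }
}
```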

View File

@@ -202,9 +202,8 @@ impl SegmentWriter {
match field_entry.field_type() { match field_entry.field_type() {
FieldType::Facet(_) => { FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
let facet_str = value.as_facet().ok_or_else(make_schema_error)?; let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str); let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +219,14 @@ impl SegmentWriter {
} }
FieldType::Str(_) => { FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default(); let mut indexing_position = IndexingPosition::default();
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
let mut token_stream = if let Some(text) = value.as_str() { let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer = let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize]; &mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text) text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() { } else if let Some(tok_str) = value.into_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone())) BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else { } else {
continue; continue;
@@ -250,9 +248,8 @@ impl SegmentWriter {
} }
FieldType::U64(_) => { FieldType::U64(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?; let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +262,8 @@ impl SegmentWriter {
} }
FieldType::Date(_) => { FieldType::Date(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
num_vals += 1; num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?; let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +277,8 @@ impl SegmentWriter {
} }
FieldType::I64(_) => { FieldType::I64(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?; let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +291,8 @@ impl SegmentWriter {
} }
FieldType::F64(_) => { FieldType::F64(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?; let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val); term_buffer.set_f64(f64_val);
@@ -312,10 +304,8 @@ impl SegmentWriter {
} }
FieldType::Bool(_) => { FieldType::Bool(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?; let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val); term_buffer.set_bool(bool_val);
@@ -327,10 +317,8 @@ impl SegmentWriter {
} }
FieldType::Bytes(_) => { FieldType::Bytes(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?; let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes); term_buffer.set_bytes(bytes);
@@ -364,9 +352,8 @@ impl SegmentWriter {
} }
FieldType::IpAddr(_) => { FieldType::IpAddr(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking. let value = value.as_value();
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?; let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound; use std::ops::Bound;
use crate::query::Occur; use crate::query::Occur;
use crate::schema::{Field, Term, Type}; use crate::schema::{Term, Type};
use crate::Score; use crate::Score;
#[derive(Clone)] #[derive(Clone)]
@@ -20,8 +20,6 @@ pub enum LogicalLiteral {
upper: Bound<Term>, upper: Bound<Term>,
}, },
Set { Set {
field: Field,
value_type: Type,
elements: Vec<Term>, elements: Vec<Term>,
}, },
All, All,

View File

@@ -832,17 +832,11 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self let (field, json_path) = try_tuple!(self
.split_full_path(&full_path) .split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))); .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let (elements, errors) = elements let (elements, errors) = elements
.into_iter() .into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element)) .map(|element| self.compute_boundary_term(field, json_path, &element))
.partition_result(); .partition_result();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
elements,
field,
value_type,
}));
(Some(logical_ast), errors) (Some(logical_ast), errors)
} }
UserInputLeaf::Exists { .. } => ( UserInputLeaf::Exists { .. } => (

View File

@@ -17,15 +17,6 @@ pub trait Value<'a>: Send + Sync + Debug {
/// Returns the field value represented by an enum which borrows it's data. /// Returns the field value represented by an enum which borrows it's data.
fn as_value(&self) -> ReferenceValue<'a, Self>; fn as_value(&self) -> ReferenceValue<'a, Self>;
#[inline]
/// Returns if the value is `null` or not.
fn is_null(&self) -> bool {
matches!(
self.as_value(),
ReferenceValue::Leaf(ReferenceValueLeaf::Null)
)
}
#[inline] #[inline]
/// If the Value is a leaf, returns the associated leaf. Returns None otherwise. /// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> { fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
@@ -117,18 +108,6 @@ pub trait Value<'a>: Send + Sync + Debug {
None None
} }
} }
#[inline]
/// Returns true if the Value is an array.
fn is_array(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
#[inline]
/// Returns true if the Value is an object.
fn is_object(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
} }
/// A enum representing a leaf value for tantivy to index. /// A enum representing a leaf value for tantivy to index.