Compare commits


14 Commits

Author SHA1 Message Date
Pascal Seitz
2ce485b8cc skip estimate phase for merge multivalue index
Precompute stats for the merge of the multivalue index and disable the Line
encoding for the multivalue index. That combination allows the first
estimation pass to be skipped, which yields up to a 2x speedup in merge
performance on multivalue indices (see the sketch after this entry).

This change may decrease compression, since the Line codec compresses very
well for documents that have a fixed number of values per doc. The Line codec
should eventually be replaced.

```
merge_multi_and_multi          Avg: 22.7880ms (-47.15%)    Median: 22.5469ms (-47.38%)    [22.3691ms .. 25.8392ms]
merge_dense_and_dense          Avg: 14.4398ms (+2.18%)     Median: 14.2465ms (+0.74%)     [14.1620ms .. 16.1270ms]
merge_sparse_and_sparse        Avg: 10.6559ms (+1.10%)     Median: 10.6318ms (+0.91%)     [10.5527ms .. 11.2848ms]
merge_sparse_and_dense         Avg: 12.4886ms (+1.52%)     Median: 12.4044ms (+0.84%)     [12.3261ms .. 13.9439ms]
merge_multi_and_dense          Avg: 25.6686ms (-45.56%)    Median: 25.4851ms (-45.84%)    [25.1618ms .. 27.6226ms]
merge_multi_and_sparse         Avg: 24.3278ms (-47.00%)    Median: 24.1917ms (-47.34%)    [23.7159ms .. 27.0513ms]
```
2024-06-11 20:22:00 +08:00
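The columnar diffs below add an optional `ColumnStats` to the multivalue-index serialization path so the bitpacked codec can skip its estimation scan. A minimal self-contained sketch of the single-pass vs. two-pass idea, using simplified stand-in types rather than the real `tantivy_columnar` API:

```rust
// Minimal sketch, not the real tantivy_columnar API: with precomputed stats
// the bitpacked serializer can run in a single pass; without them it needs a
// first pass over the values just to find min/max.

#[derive(Clone, Copy)]
struct Stats {
    min: u64,
    max: u64,
}

fn bits_per_value(stats: Stats) -> u32 {
    64 - (stats.max - stats.min).leading_zeros()
}

/// Two-pass path: one scan to estimate, one scan to serialize.
fn plan_two_pass(values: &[u64]) -> u32 {
    let stats = Stats {
        min: values.iter().copied().min().unwrap_or(0),
        max: values.iter().copied().max().unwrap_or(0),
    };
    bits_per_value(stats)
}

/// Single-pass path: the caller already knows the stats, e.g. a stacked
/// multivalue start-offset index always runs from 0 to the number of values.
fn plan_single_pass(precomputed: Stats) -> u32 {
    bits_per_value(precomputed)
}

fn main() {
    // A multivalue start-offset index is monotonically increasing from 0.
    let start_offsets: Vec<u64> = (0..=10).map(|doc| doc * 2).collect();
    let precomputed = Stats { min: 0, max: *start_offsets.last().unwrap() };
    assert_eq!(plan_two_pass(&start_offsets), plan_single_pass(precomputed));
    println!("bitpacking with {} bits per value", plan_single_pass(precomputed));
}
```

For a stacked merge the stats are trivially known up front: the offsets start at 0, end at the total number of values, and have a GCD of 1, which is exactly what the `merge_column_index_stacked` hunk below fills in.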
PSeitz
c3b92a5412 fix compiler warning, cleanup (#2393)
fix compiler warning for missing feature flag
remove unused variables
clean up unused methods
2024-06-11 16:03:50 +08:00
PSeitz
2f55511064 extend indexwriter proptests (#2342)
* index random values in proptest

* add proptest with multiple docs
2024-06-11 16:02:57 +08:00
trinity-1686a
08b9fc0b31 fix de-escaping too much in query parser (#2427)
* fix de-escaping too much in query parser
2024-06-10 11:19:01 +02:00
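The query-parser diff further down only strips a backslash when the escaped character actually requires escaping, and keeps the escape sequence otherwise. A stand-alone sketch of that de-escaping rule (the escape set is copied from the diff, but treat it as illustrative):

```rust
// Only remove a backslash if the escaped character actually needs escaping;
// otherwise keep the escape sequence verbatim.
const NEEDS_ESCAPE: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];

fn interpret_escape(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut in_escape = false;
    let requires_escape = |c: char| c.is_whitespace() || NEEDS_ESCAPE.contains(&c) || c == '-';
    for c in source.chars() {
        if in_escape {
            if !requires_escape(c) {
                out.push('\\'); // re-add the escape for chars that never needed it
            }
            out.push(c);
            in_escape = false;
        } else if c == '\\' {
            in_escape = true;
        } else {
            out.push(c);
        }
    }
    out
}

fn main() {
    assert_eq!(interpret_escape(r"abc\:def"), "abc:def"); // ':' needs escaping
    assert_eq!(interpret_escape(r"abc\*"), r"abc\*"); // '*' does not, keep the backslash
    println!("ok");
}
```

The new `test_queries_with_colons` cases in the diff exercise exactly these two behaviors.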
PSeitz
714f363d43 add bench & test for columnar merging (#2428)
* add merge columnar proptest

* add columnar merge benchmark
2024-06-10 16:26:16 +08:00
PSeitz
93ff7365b0 reduce top hits aggregation memory consumption (#2426)
Move the request structure out of the top hits aggregation collector and use
the passed structure by reference instead (see the sketch after this entry).

full
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 425.9680ms (-21.38%)    Median: 415.1097ms (-23.56%)    [395.5303ms .. 484.6325ms]
dense
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 440.0817ms (-19.68%)    Median: 432.2286ms (-21.10%)    [403.5632ms .. 497.7541ms]
sparse
terms_many_with_top_hits    Memory: 13.1 MB (-49.31%)    Avg: 33.3568ms (-32.19%)    Median: 33.0834ms (-31.86%)    [32.5126ms .. 35.7397ms]
multivalue
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 414.2340ms (-25.44%)    Median: 413.4144ms (-25.64%)    [403.9919ms .. 430.3170ms]
2024-06-06 22:32:58 +08:00
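The memory reduction comes from the per-bucket collectors borrowing one shared request instead of each owning a clone (the diff below changes `TopHitsTopNComputer::new(req.clone())` to `TopHitsTopNComputer::new(req)`). A minimal sketch of the pattern, with hypothetical names:

```rust
// Minimal sketch of the pattern (names are hypothetical): many per-bucket
// computers borrow one shared request instead of each owning a clone of it.

struct TopHitsRequest {
    size: usize,
    doc_value_fields: Vec<String>, // can be sizeable when many fields are requested
}

struct TopHitsComputer<'req> {
    req: &'req TopHitsRequest, // borrowed: constructing a computer is cheap
    collected: Vec<u32>,
}

impl<'req> TopHitsComputer<'req> {
    fn new(req: &'req TopHitsRequest) -> Self {
        Self { req, collected: Vec::new() }
    }

    fn collect(&mut self, doc: u32) {
        if self.collected.len() < self.req.size {
            self.collected.push(doc);
        }
    }
}

fn main() {
    let req = TopHitsRequest {
        size: 3,
        doc_value_fields: vec!["price".to_string(), "title".to_string()],
    };
    // One computer per term bucket, all sharing the same request.
    let mut computers: Vec<TopHitsComputer> =
        (0..1_000).map(|_| TopHitsComputer::new(&req)).collect();
    for computer in &mut computers {
        computer.collect(42);
    }
    println!(
        "{} computers share one request with {} docvalue fields",
        computers.len(),
        req.doc_value_fields.len()
    );
}
```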
Adam Reichold
8151925068 Panicking in spawned Rayon tasks will abort the process by default. (#2409) 2024-06-04 17:04:30 +09:00
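Rayon has no caller to propagate a panic to for detached `rayon::spawn` tasks, so making such panics abort the process keeps them from going unnoticed. A hedged sketch of one way to get that behavior via `ThreadPoolBuilder::panic_handler` (not necessarily the commit's exact code; assumes a `rayon` dependency):

```rust
// Sketch (not necessarily the commit's exact code): build a rayon thread pool
// whose panic handler aborts the process, so a panic inside a detached
// spawned task cannot be lost.
use rayon::ThreadPoolBuilder;

fn main() {
    let pool = ThreadPoolBuilder::new()
        .panic_handler(|_payload| {
            eprintln!("panic in rayon task, aborting");
            std::process::abort();
        })
        .build()
        .expect("failed to build thread pool");

    pool.spawn(|| {
        // Uncommenting the next line would abort the whole process instead of
        // only terminating this background task.
        // panic!("boom");
    });

    pool.install(|| println!("pool is usable as usual"));
}
```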
dependabot[bot]
b960e40bc8 Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423)
Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version.
- [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases)
- [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0)

---
updated-dependencies:
- dependency-name: sketches-ddsketch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-06-04 15:50:23 +08:00
giovannicuccu
1095c9b073 Issue 1787 extended stats (#2247)
* first version of extended stats along with its tests

* using IntermediateExtendStats instead of IntermediateStats with all tests passing

* Created struct for request and response

* first test with extended_stats

* kahan summation and tests with approximate equality

* version ready for merge

* removed approx dependency

* refactor for using ExtendedStats only when needed

* interim version

* refined version with code formatted

* refactored a struct

* cosmetic refactor

* fix after merge

* fix format

* added extended_stat bench

* merge and new benchmark for extended stats

* split stat segment collectors

* wrapped intermediate extended stat with a box to limit memory usage

* Revert "wrapped intermediate extended stat with a box to limit memory usage"

This reverts commit 5b4aa9f393.

* some code reformat, commented kahan summation

* refactor after review

* refactor after code review

* fix after incorrectly restoring kahan summation

* modifications for code review + bug fix in merge_fruit

* refactor assert_nearly_equals macro

* update after code review

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
2024-06-04 14:25:17 +08:00
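The extended-stats work above switches the running sums to Kahan (compensated) summation, which the stats diff below implements with a `delta` correction term. A stand-alone illustration of the algorithm:

```rust
// Kahan (compensated) summation: keep a small correction term `delta` so the
// low-order bits lost when adding a small value to a large running sum are
// folded back in on the next addition.
struct KahanSum {
    sum: f64,
    delta: f64,
}

impl KahanSum {
    fn new() -> Self {
        Self { sum: 0.0, delta: 0.0 }
    }

    fn add(&mut self, value: f64) {
        let y = value - self.delta;
        let t = self.sum + y;
        self.delta = (t - self.sum) - y;
        self.sum = t;
    }
}

fn main() {
    let mut naive = 0.0f64;
    let mut kahan = KahanSum::new();
    for _ in 0..10_000_000 {
        naive += 0.1;
        kahan.add(0.1);
    }
    // The compensated sum stays much closer to the expected 1_000_000.
    println!("naive: {:.10}", naive);
    println!("kahan: {:.10}", kahan.sum);
}
```

The same `y`/`t`/`delta` update appears in `IntermediateStats::collect` and `merge_fruits` in the diff below.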
PSeitz
c0686515a9 update one_shot (#2420) 2024-05-31 11:07:35 +08:00
trinity-1686a
455156f51c improve query parser (#2416)
* support escape sequences in more places

and fix a bug with single-quoted strings

* add query parser test for range query on default field
2024-05-30 17:29:27 +02:00
Meng Zhang
4143d31865 chore: fix build as the rev is gone (#2417) 2024-05-29 09:49:16 +08:00
Hamir Mahal
0c634adbe1 style: simplify strings with string interpolation (#2412)
* style: simplify strings with string interpolation

* fix: formatting
2024-05-27 09:16:47 +02:00
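The cleanup relies on inline format arguments (identifiers captured directly in the format string), as in the `index_writer` example diff below; a minimal before/after:

```rust
fn main() {
    let i = 7;
    let opstamp = 42;

    // Before: positional format arguments.
    println!("add doc {} from thread 1 - opstamp {}", i, opstamp);

    // After: the variables are captured directly in the format string,
    // which is the style applied across the repository in this commit.
    println!("add doc {i} from thread 1 - opstamp {opstamp}");
}
```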
PSeitz
2e3641c2ae return CompactDocValue instead of trait (#2410)
CompactDocValue is easier to handle than the trait object in some cases, such as
comparison and conversion.
2024-05-27 07:33:50 +02:00
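A schematic contrast of why a concrete value type is easier to work with than a trait object for comparison and conversion; the types below are made up for illustration and are not the actual `CompactDocValue` API:

```rust
// Made-up types for illustration only (this is not the CompactDocValue API):
// a concrete value can derive PartialEq/PartialOrd and offer From conversions,
// while a trait object has to go through accessor methods or downcasts.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
enum Value {
    U64(u64),
    F64(f64),
}

impl From<u64> for Value {
    fn from(v: u64) -> Self {
        Value::U64(v)
    }
}

trait ValueLike {
    fn as_u64(&self) -> Option<u64>;
}

impl ValueLike for Value {
    fn as_u64(&self) -> Option<u64> {
        match self {
            Value::U64(v) => Some(*v),
            Value::F64(_) => None,
        }
    }
}

fn main() {
    let a: Value = 3u64.into();
    let b: Value = 5u64.into();
    // Concrete type: direct comparison and conversion.
    assert!(a < b);
    assert!(Value::F64(1.5).as_u64().is_none());

    // Trait object: every operation goes through the trait surface.
    let c: &dyn ValueLike = &a;
    let d: &dyn ValueLike = &b;
    assert!(c.as_u64() < d.as_u64());
    println!("ok");
}
```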
61 changed files with 2299 additions and 532 deletions

View File

@@ -15,8 +15,7 @@ rust-version = "1.63"
exclude = ["benches/*.json", "benches/*.txt"] exclude = ["benches/*.json", "benches/*.txt"]
[dependencies] [dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged oneshot = "0.1.7"
oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "c10a3ba" }
base64 = "0.22.0" base64 = "0.22.0"
byteorder = "1.4.3" byteorder = "1.4.3"
crc32fast = "1.3.2" crc32fast = "1.3.2"
@@ -64,7 +63,7 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" } tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" } common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" } tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] } sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true } futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7" fnv = "1.0.7"

View File

@@ -47,6 +47,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, average_f64); register!(group, average_f64);
register!(group, average_f64_u64); register!(group, average_f64_u64);
register!(group, stats_f64); register!(group, stats_f64);
register!(group, extendedstats_f64);
register!(group, percentiles_f64); register!(group, percentiles_f64);
register!(group, terms_few); register!(group, terms_few);
register!(group, terms_many); register!(group, terms_many);
@@ -105,7 +106,12 @@ fn stats_f64(index: &Index) {
}); });
exec_term_with_agg(index, agg_req) exec_term_with_agg(index, agg_req)
} }
fn extendedstats_f64(index: &Index) {
let agg_req = json!({
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) { fn percentiles_f64(index: &Index) {
let agg_req = json!({ let agg_req = json!({
"mypercentiles": { "mypercentiles": {
@@ -349,7 +355,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap(); let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000) let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num)) .map(|num| format!("author{num}"))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
{ {
let mut rng = StdRng::from_seed([1u8; 32]); let mut rng = StdRng::from_seed([1u8; 32]);

View File

@@ -141,12 +141,12 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let parse_json = false; let parse_json = false;
// for parse_json in [false, true] { // for parse_json in [false, true] {
let suffix = if parse_json { let suffix = if parse_json {
format!("{}-with-json-parsing", suffix) format!("{suffix}-with-json-parsing")
} else { } else {
suffix.to_string() suffix.to_string()
}; };
let bench_name = format!("{}{}", prefix, suffix); let bench_name = format!("{prefix}{suffix}");
group.bench_function(bench_name, |b| { group.bench_function(bench_name, |b| {
benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic) benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
}); });

View File

@@ -23,6 +23,12 @@ downcast-rs = "1.2.0"
proptest = "1" proptest = "1"
more-asserts = "0.3.1" more-asserts = "0.3.1"
rand = "0.8" rand = "0.8"
binggan = "0.8.1"
[[bench]]
name = "bench_merge"
harness = false
[features] [features]
unstable = [] unstable = []

View File

@@ -0,0 +1,97 @@
#![feature(test)]
extern crate test;
use core::fmt;
use std::fmt::{Display, Formatter};
use binggan::{black_box, BenchRunner};
use tantivy_columnar::*;
enum Card {
Multi,
Sparse,
Dense,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::Multi => write!(f, "multi"),
Card::Sparse => write!(f, "sparse"),
Card::Dense => write!(f, "dense"),
}
}
}
const NUM_DOCS: u32 = 1_000_000;
fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
use tantivy_columnar::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
match card {
Card::Multi => {
columnar_writer.record_numerical(0, "price", 10u64);
columnar_writer.record_numerical(0, "price", 10u64);
}
_ => {}
}
for i in 0..num_docs {
match card {
Card::Multi | Card::Sparse => {
if i % 8 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
Card::Dense => {
if i % 6 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
}
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}
fn main() {
let mut inputs = Vec::new();
let mut add_combo = |card1: Card, card2: Card| {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar(card1, NUM_DOCS),
generate_columnar(card2, NUM_DOCS),
],
));
};
add_combo(Card::Multi, Card::Multi);
add_combo(Card::Dense, Card::Dense);
add_combo(Card::Sparse, Card::Sparse);
add_combo(Card::Sparse, Card::Dense);
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(
input_name,
columnar_readers,
move |columnar_readers: &Vec<ColumnarReader>| {
let mut out = vec![];
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
black_box(out);
},
);
}
group.run();
}

View File

@@ -73,14 +73,18 @@ fn detect_cardinality(
pub fn merge_column_index<'a>( pub fn merge_column_index<'a>(
columns: &'a [ColumnIndex], columns: &'a [ColumnIndex],
merge_row_order: &'a MergeRowOrder, merge_row_order: &'a MergeRowOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> { ) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be // For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes. // downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns, merge_row_order); let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
match merge_row_order { match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => { MergeRowOrder::Stack(stack_merge_order) => merge_column_index_stacked(
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order) columns,
} cardinality_after_merge,
stack_merge_order,
num_values,
),
MergeRowOrder::Shuffled(complex_merge_order) => { MergeRowOrder::Shuffled(complex_merge_order) => {
merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order) merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
} }
@@ -167,8 +171,12 @@ mod tests {
], ],
) )
.into(); .into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order); let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 3);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else { let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index") panic!("Excpected a multivalued index")
}; };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect(); let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
@@ -200,8 +208,12 @@ mod tests {
], ],
) )
.into(); .into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order); let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 6);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else { let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index") panic!("Excpected a multivalued index")
}; };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect(); let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();

View File

@@ -22,7 +22,10 @@ pub fn merge_column_index_shuffled<'a>(
Cardinality::Multivalued => { Cardinality::Multivalued => {
let multivalue_start_index = let multivalue_start_index =
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order); merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(multivalue_start_index) SerializableColumnIndex::Multivalued {
indices: multivalue_start_index,
stats: None,
}
} }
} }
} }

View File

@@ -1,6 +1,8 @@
use std::iter; use std::iter;
use std::num::NonZeroU64;
use crate::column_index::{SerializableColumnIndex, Set}; use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_values::ColumnStats;
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder}; use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -12,6 +14,7 @@ pub fn merge_column_index_stacked<'a>(
columns: &'a [ColumnIndex], columns: &'a [ColumnIndex],
cardinality_after_merge: Cardinality, cardinality_after_merge: Cardinality,
stack_merge_order: &'a StackMergeOrder, stack_merge_order: &'a StackMergeOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> { ) -> SerializableColumnIndex<'a> {
match cardinality_after_merge { match cardinality_after_merge {
Cardinality::Full => SerializableColumnIndex::Full, Cardinality::Full => SerializableColumnIndex::Full,
@@ -27,7 +30,17 @@ pub fn merge_column_index_stacked<'a>(
columns, columns,
stack_merge_order, stack_merge_order,
}; };
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index)) SerializableColumnIndex::Multivalued {
indices: Box::new(stacked_multivalued_index),
stats: Some(ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
// The values in the multivalue index are the positions of the values
min_value: 0,
max_value: num_values as u64,
// This is num docs, but it starts at 0 so we need +1
num_rows: stack_merge_order.num_rows() + 1,
}),
}
} }
} }
} }

View File

@@ -6,20 +6,29 @@ use std::sync::Arc;
use common::OwnedBytes; use common::OwnedBytes;
use crate::column_values::{ use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues, load_u64_based_column_values, serialize_u64_based_column_values,
serialize_u64_with_codec_and_stats, CodecType, ColumnStats, ColumnValues,
}; };
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{DocId, RowId}; use crate::{DocId, RowId};
pub fn serialize_multivalued_index( pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>, multivalued_index: &dyn Iterable<RowId>,
stats: Option<ColumnStats>,
output: &mut impl Write, output: &mut impl Write,
) -> io::Result<()> { ) -> io::Result<()> {
serialize_u64_based_column_values( if let Some(stats) = stats {
multivalued_index, // TODO: Add something with higher compression that doesn't require a full scan upfront
&[CodecType::Bitpacked, CodecType::Linear], let estimator = CodecType::Bitpacked.estimator();
output, assert!(!estimator.requires_full_scan());
)?; serialize_u64_with_codec_and_stats(multivalued_index, estimator, stats, output)?;
} else {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
}
Ok(()) Ok(())
} }
@@ -52,7 +61,7 @@ impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
impl MultiValueIndex { impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex { pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap(); serialize_multivalued_index(&start_offsets, None, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer); let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes).unwrap() open_multivalued_index(bytes).unwrap()
} }

View File

@@ -196,6 +196,7 @@ impl Set<RowId> for OptionalIndex {
} = row_addr_from_row_id(doc_id); } = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize]; let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta); let block = self.block(block_meta);
let block_offset_row_id = match block { let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank(in_block_row_id), Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id), Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),

View File

@@ -6,6 +6,7 @@ use common::{CountingWriter, OwnedBytes};
use crate::column_index::multivalued_index::serialize_multivalued_index; use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index; use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex; use crate::column_index::ColumnIndex;
use crate::column_values::ColumnStats;
use crate::iterable::Iterable; use crate::iterable::Iterable;
use crate::{Cardinality, RowId}; use crate::{Cardinality, RowId};
@@ -15,9 +16,12 @@ pub enum SerializableColumnIndex<'a> {
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>, non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
num_rows: RowId, num_rows: RowId,
}, },
// TODO remove the Arc<dyn> apart from serialization this is not Multivalued {
// dynamic at all. /// Iterator emitting the indices for the index
Multivalued(Box<dyn Iterable<RowId> + 'a>), indices: Box<dyn Iterable<RowId> + 'a>,
/// In the merge case we can precompute the column stats
stats: Option<ColumnStats>,
},
} }
impl<'a> SerializableColumnIndex<'a> { impl<'a> SerializableColumnIndex<'a> {
@@ -25,7 +29,7 @@ impl<'a> SerializableColumnIndex<'a> {
match self { match self {
SerializableColumnIndex::Full => Cardinality::Full, SerializableColumnIndex::Full => Cardinality::Full,
SerializableColumnIndex::Optional { .. } => Cardinality::Optional, SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued, SerializableColumnIndex::Multivalued { .. } => Cardinality::Multivalued,
} }
} }
} }
@@ -44,9 +48,10 @@ pub fn serialize_column_index(
non_null_row_ids, non_null_row_ids,
num_rows, num_rows,
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?, } => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
SerializableColumnIndex::Multivalued(multivalued_index) => { SerializableColumnIndex::Multivalued {
serialize_multivalued_index(&*multivalued_index, &mut output)? indices: multivalued_index,
} stats,
} => serialize_multivalued_index(&*multivalued_index, stats, &mut output)?,
} }
let column_index_num_bytes = output.written_bytes() as u32; let column_index_num_bytes = output.written_bytes() as u32;
Ok(column_index_num_bytes) Ok(column_index_num_bytes)

View File

@@ -32,7 +32,8 @@ pub use u128_based::{
}; };
pub use u64_based::{ pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values, load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES, serialize_u64_based_column_values, serialize_u64_with_codec_and_stats, CodecType,
ALL_U64_CODEC_TYPES,
}; };
pub use vec_column::VecColumn; pub use vec_column::VecColumn;

View File

@@ -128,6 +128,9 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
bit_packer.close(wrt)?; bit_packer.close(wrt)?;
Ok(()) Ok(())
} }
fn codec_type(&self) -> super::CodecType {
super::CodecType::Bitpacked
}
} }
pub struct BitpackedCodec; pub struct BitpackedCodec;

View File

@@ -163,6 +163,10 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
Ok(()) Ok(())
} }
fn codec_type(&self) -> super::CodecType {
super::CodecType::BlockwiseLinear
}
} }
pub struct BlockwiseLinearCodec; pub struct BlockwiseLinearCodec;

View File

@@ -153,6 +153,12 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
self.collect_before_line_estimation(value); self.collect_before_line_estimation(value);
} }
} }
fn requires_full_scan(&self) -> bool {
true
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Linear
}
} }
impl LinearCodecEstimator { impl LinearCodecEstimator {

View File

@@ -37,7 +37,11 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
/// This method will be called for each element of the column during /// This method will be called for each element of the column during
/// `estimation`. /// `estimation`.
fn collect(&mut self, value: u64); fn collect(&mut self, value: u64);
/// Finalizes the first pass phase. /// Returns true if the estimator needs a full pass over the column before serialization
fn requires_full_scan(&self) -> bool {
false
}
fn codec_type(&self) -> CodecType;
fn finalize(&mut self) {} fn finalize(&mut self) {}
/// Returns an accurate estimation of the number of bytes that will /// Returns an accurate estimation of the number of bytes that will
/// be used to represent this column. /// be used to represent this column.
@@ -150,34 +154,45 @@ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
wrt: &mut dyn Write, wrt: &mut dyn Write,
) -> io::Result<()> { ) -> io::Result<()> {
let mut stats_collector = StatsCollector::default(); let mut stats_collector = StatsCollector::default();
let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> = let mut estimators: Vec<Box<dyn ColumnCodecEstimator>> = Vec::with_capacity(codec_types.len());
Vec::with_capacity(codec_types.len());
for &codec_type in codec_types { for &codec_type in codec_types {
estimators.push((codec_type, codec_type.estimator())); estimators.push(codec_type.estimator());
} }
for val in vals.boxed_iter() { for val in vals.boxed_iter() {
let val_u64 = val.to_u64(); let val_u64 = val.to_u64();
stats_collector.collect(val_u64); stats_collector.collect(val_u64);
for (_, estimator) in &mut estimators { for estimator in &mut estimators {
estimator.collect(val_u64); estimator.collect(val_u64);
} }
} }
for (_, estimator) in &mut estimators { for estimator in &mut estimators {
estimator.finalize(); estimator.finalize();
} }
let stats = stats_collector.stats(); let stats = stats_collector.stats();
let (_, best_codec, best_codec_estimator) = estimators let (_, best_codec) = estimators
.into_iter() .into_iter()
.flat_map(|(codec_type, estimator)| { .flat_map(|estimator| {
let num_bytes = estimator.estimate(&stats)?; let num_bytes = estimator.estimate(&stats)?;
Some((num_bytes, codec_type, estimator)) Some((num_bytes, estimator))
}) })
.min_by_key(|(num_bytes, _, _)| *num_bytes) .min_by_key(|(num_bytes, _)| *num_bytes)
.ok_or_else(|| { .ok_or_else(|| {
io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.") io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
})?; })?;
best_codec.to_code().serialize(wrt)?; serialize_u64_with_codec_and_stats(vals, best_codec, stats, wrt)?;
best_codec_estimator.serialize( Ok(())
}
/// Serializes a given column of u64-mapped values.
/// The codec estimator needs to be collected fully for the Line codec before calling this.
pub fn serialize_u64_with_codec_and_stats<T: MonotonicallyMappableToU64>(
vals: &dyn Iterable<T>,
codec: Box<dyn ColumnCodecEstimator>,
stats: ColumnStats,
wrt: &mut dyn Write,
) -> io::Result<()> {
codec.codec_type().to_code().serialize(wrt)?;
codec.serialize(
&stats, &stats,
&mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64), &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
wrt, wrt,

View File

@@ -3,7 +3,7 @@ mod merge_mapping;
mod term_merger; mod term_merger;
use std::collections::{BTreeMap, HashSet}; use std::collections::{BTreeMap, HashSet};
use std::io; use std::io::{self};
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use std::sync::Arc; use std::sync::Arc;
@@ -156,8 +156,15 @@ fn merge_column(
column_values.push(None); column_values.push(None);
} }
} }
let merged_column_index = let num_values: u32 = column_values
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order); .iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues { let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..], column_indexes: &column_indexes[..],
column_values: &column_values[..], column_values: &column_values[..],
@@ -183,8 +190,15 @@ fn merge_column(
} }
} }
let merged_column_index = let num_values: u32 = column_values
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order); .iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues { let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..], column_indexes: &column_indexes[..],
column_values: &column_values, column_values: &column_values,
@@ -214,8 +228,19 @@ fn merge_column(
} }
} }
} }
let merged_column_index = let num_values: u32 = bytes_columns
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order); .iter()
.map(|vals| {
vals.as_ref()
.map(|idx| idx.term_ord_column.values.num_vals())
.unwrap_or(0)
})
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?; merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
} }
} }

View File

@@ -644,7 +644,10 @@ fn send_to_serialize_column_mappable_to_u128<
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder(); let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values); consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows); let multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index)) SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: Default::default(), // TODO: implement stats for u128
}
} }
}; };
crate::column::serialize_column_mappable_to_u128( crate::column::serialize_column_mappable_to_u128(
@@ -699,7 +702,10 @@ fn send_to_serialize_column_mappable_to_u64(
if sort_values_within_row { if sort_values_within_row {
sort_values_within_row_in_place(multivalued_index, values); sort_values_within_row_in_place(multivalued_index, values);
} }
SerializableColumnIndex::Multivalued(Box::new(multivalued_index)) SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: None,
}
} }
}; };
crate::column::serialize_column_mappable_to_u64( crate::column::serialize_column_mappable_to_u64(

View File

@@ -738,35 +738,22 @@ proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))] #![proptest_config(ProptestConfig::with_cases(1000))]
#[test] #[test]
fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) { fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter() test_columnar_docs(columnar_docs);
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
} }
#[test] fn test_columnar_docs(columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>) {
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
let columnar_readers: Vec<ColumnarReader> = columnar_docs let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter() .iter()
.map(|docs| build_columnar(&docs[..])) .map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect(); let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new(); let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]); let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar( crate::merge_columnar(
&columnar_readers_arr[..], &columnar_readers_arr[..],
&[], &[],
crate::MergeRowOrder::Stack(stack_merge_order), stack_merge_order,
&mut output, &mut output,
) )
.unwrap(); .unwrap();
@@ -777,6 +764,24 @@ fn test_columnar_merging_empty_columnar() {
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_simple() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
vec![],
vec![vec![
("c1", ColumnValue::Numerical(0u64.into())),
("c1", ColumnValue::Numerical(0u64.into())),
]],
];
test_columnar_docs(columnar_docs);
}
#[test] #[test]
fn test_columnar_merging_number_columns() { fn test_columnar_merging_number_columns() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![ let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
@@ -793,25 +798,7 @@ fn test_columnar_merging_number_columns() {
vec![("c2", ColumnValue::Numerical(u64::MAX.into()))], vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
], ],
]; ];
let columnar_readers: Vec<ColumnarReader> = columnar_docs test_columnar_docs(columnar_docs);
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
// TODO add non trivial remap and merge // TODO add non trivial remap and merge

View File

@@ -22,3 +22,6 @@ serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies] [dev-dependencies]
proptest = "1.0.0" proptest = "1.0.0"
rand = "0.8.4" rand = "0.8.4"
[features]
unstable = [] # useful for benches.

View File

@@ -4,7 +4,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, Schema, INDEXED, STORED, STRING}; use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
@@ -64,6 +64,7 @@ fn main() -> tantivy::Result<()> {
assert!(retrieved_doc assert!(retrieved_doc
.get_first(occurred_at) .get_first(occurred_at)
.unwrap() .unwrap()
.as_value()
.as_datetime() .as_datetime()
.is_some(),); .is_some(),);
assert_eq!( assert_eq!(

View File

@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ))?;
println!("add doc {} from thread 1 - opstamp {}", i, opstamp); println!("add doc {i} from thread 1 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(20)); thread::sleep(Duration::from_millis(20));
} }
Result::<(), TantivyError>::Ok(()) Result::<(), TantivyError>::Ok(())
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
body => "Some great book description..." body => "Some great book description..."
))? ))?
}; };
println!("add doc {} from thread 2 - opstamp {}", i, opstamp); println!("add doc {i} from thread 2 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(10)); thread::sleep(Duration::from_millis(10));
} }
Result::<(), TantivyError>::Ok(()) Result::<(), TantivyError>::Ok(())

View File

@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::iter::once; use std::iter::once;
use nom::branch::alt; use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to // Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
// special characters. // special characters.
const SPECIAL_CHARS: &[char] = &[ const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ', '+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
]; ];
/// consume a field name followed by colon. Return the field name with escape sequence /// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
)(inp) )(inp)
} }
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
fn interpret_escape(source: &str) -> String {
let mut res = String::with_capacity(source.len());
let mut in_escape = false;
let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
for c in source.chars() {
if in_escape {
if !require_escape(c) {
// we re-add the escape sequence
res.push('\\');
}
res.push(c);
in_escape = false;
} else if c == '\\' {
in_escape = true;
} else {
res.push(c);
}
}
res
}
/// Consume a word outside of any context. /// Consume a word outside of any context.
// TODO should support escape sequences // TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> { fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res( map_res(
recognize(tuple(( recognize(tuple((
satisfy(|c| { alt((
!c.is_whitespace() preceded(char('\\'), anychar),
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c) satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
}), )),
many0(satisfy(|c: char| { many0(alt((
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c) preceded(char('\\'), anychar),
})), satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
))),
))), ))),
|s| match s { |s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)), "OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
_ => Ok(s), s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
s => Ok(Cow::Borrowed(s)),
}, },
)(inp) )(inp)
} }
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ { fn word_infallible(
|inp| { delimiter: &str,
opt_i_err( emit_error: bool,
preceded( ) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
multispace0, // emit error is set when receiving an unescaped `:` should emit an error
recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c) move |inp| {
}))), map(
opt_i_err(
preceded(
multispace0,
recognize(many1(alt((
preceded(char::<&str, _>('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
)))),
),
"expected word",
), ),
"expected word", |(opt_s, mut errors)| match opt_s {
Some(s) => {
if emit_error
&& (s
.as_bytes()
.windows(2)
.any(|window| window[0] != b'\\' && window[1] == b':')
|| s.starts_with(':'))
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if s.contains('\\') {
(Some(Cow::Owned(interpret_escape(s))), errors)
} else {
(Some(Cow::Borrowed(s)), errors)
}
}
None => (None, errors),
},
)(inp) )(inp)
} }
} }
@@ -159,7 +216,7 @@ fn simple_term_infallible(
(value((), char('\'')), simple_quotes), (value((), char('\'')), simple_quotes),
), ),
// numbers are parsed with words in this case, as we allow string starting with a - // numbers are parsed with words in this case, as we allow string starting with a -
map(word_infallible(delimiter), |(text, errors)| { map(word_infallible(delimiter, true), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors) (text.map(|text| (Delimiter::None, text.to_string())), errors)
}), }),
)(inp) )(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|((field_name, _, leaf), mut errors)| { |((field_name, _, leaf), mut errors)| {
( (
leaf.map(|leaf| { leaf.map(|leaf| {
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if matches!(&leaf, UserInputLeaf::Literal(literal) if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None) if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
&& field_name.is_none() && field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
tuple_infallible(( tuple_infallible((
opt_i(anychar), opt_i(anychar),
space0_infallible, space0_infallible,
word_infallible("]}"), word_infallible("]}", false),
space1_infallible, space1_infallible,
opt_i_err( opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))), terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
"missing keyword TO", "missing keyword TO",
), ),
word_infallible("]}"), word_infallible("]}", false),
opt_i_err(one_of("]}"), "missing range delimiter"), opt_i_err(one_of("]}"), "missing range delimiter"),
)), )),
|( |(
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind), (lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs, errs,
)| { )| {
let lower_bound = match (lower_bound_kind, lower) { let lower_bound = match (lower_bound_kind, lower.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded, (_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded, (_, None) => UserInputBound::Unbounded,
// if it is some, TO was actually the bound (i.e. [TO TO something]) // if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()), (Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
_ => unreachable!("precondition failed, range did not start with [ or {{"), _ => unreachable!("precondition failed, range did not start with [ or {{"),
}; };
let upper_bound = match (upper_bound_kind, upper) { let upper_bound = match (upper_bound_kind, upper.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded, (_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded, (_, None) => UserInputBound::Unbounded,
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()), (Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
( (
( (
value((), tag(">=")), value((), tag(">=")),
map(word_infallible(""), |(bound, err)| { map(word_infallible("", false), |(bound, err)| {
( (
( (
bound bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
), ),
( (
value((), tag("<=")), value((), tag("<=")),
map(word_infallible(""), |(bound, err)| { map(word_infallible("", false), |(bound, err)| {
( (
( (
UserInputBound::Unbounded, UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
), ),
( (
value((), tag(">")), value((), tag(">")),
map(word_infallible(""), |(bound, err)| { map(word_infallible("", false), |(bound, err)| {
( (
( (
bound bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
), ),
( (
value((), tag("<")), value((), tag("<")),
map(word_infallible(""), |(bound, err)| { map(word_infallible("", false), |(bound, err)| {
( (
( (
UserInputBound::Unbounded, UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]"); test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]"); test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
} }
#[test] #[test]
@@ -1590,5 +1644,21 @@ mod test {
r#"myfield:'hello\"happy\'tax'"#, r#"myfield:'hello\"happy\'tax'"#,
r#""myfield":'hello"happy'tax'"#, r#""myfield":'hello"happy'tax'"#,
); );
// we don't process escape sequence for chars which don't require it
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
#[test]
fn test_queries_with_colons() {
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
}
#[test]
fn test_invalid_field() {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
} }
} }

View File

@@ -34,7 +34,7 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation, DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
}; };
use super::metric::{ use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
}; };
@@ -146,6 +146,11 @@ pub enum AggregationVariants {
/// extracted values. /// extracted values.
#[serde(rename = "stats")] #[serde(rename = "stats")]
Stats(StatsAggregation), Stats(StatsAggregation),
/// Computes a collection of estended statistics (`min`, `max`, `sum`, `count`, `avg`,
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
/// `std_deviation_sampling`) over the extracted values.
#[serde(rename = "extended_stats")]
ExtendedStats(ExtendedStatsAggregation),
/// Computes the sum of the extracted values. /// Computes the sum of the extracted values.
#[serde(rename = "sum")] #[serde(rename = "sum")]
Sum(SumAggregation), Sum(SumAggregation),
@@ -170,6 +175,7 @@ impl AggregationVariants {
AggregationVariants::Max(max) => vec![max.field_name()], AggregationVariants::Max(max) => vec![max.field_name()],
AggregationVariants::Min(min) => vec![min.field_name()], AggregationVariants::Min(min) => vec![min.field_name()],
AggregationVariants::Stats(stats) => vec![stats.field_name()], AggregationVariants::Stats(stats) => vec![stats.field_name()],
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
AggregationVariants::Sum(sum) => vec![sum.field_name()], AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()], AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(), AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
@@ -197,6 +203,12 @@ impl AggregationVariants {
_ => None, _ => None,
} }
} }
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
match &self {
AggregationVariants::TopHits(top_hits) => Some(top_hits),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> { pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self { match &self {

View File

@@ -11,8 +11,8 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation, DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
}; };
use super::metric::{ use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation, AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
SumAggregation, StatsAggregation, SumAggregation,
}; };
use super::segment_agg_result::AggregationLimits; use super::segment_agg_result::AggregationLimits;
use super::VecWithNames; use super::VecWithNames;
@@ -276,6 +276,10 @@ impl AggregationWithAccessor {
field: ref field_name, field: ref field_name,
.. ..
}) })
| ExtendedStats(ExtendedStatsAggregation {
field: ref field_name,
..
})
| Sum(SumAggregation { | Sum(SumAggregation {
field: ref field_name, field: ref field_name,
.. ..
@@ -335,8 +339,8 @@ fn get_missing_val(
} }
_ => { _ => {
return Err(crate::TantivyError::InvalidArgument(format!( return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {:?} for field {} is not supported for column type {:?}", "Missing value {missing:?} for field {field_name} is not supported for column \
missing, field_name, column_type type {column_type:?}"
))); )));
} }
}; };
@@ -403,7 +407,7 @@ fn get_dynamic_columns(
.iter() .iter()
.map(|h| h.open()) .map(|h| h.open())
.collect::<io::Result<_>>()?; .collect::<io::Result<_>>()?;
assert!(!ff_fields.is_empty(), "field {} not found", field_name); assert!(!ff_fields.is_empty(), "field {field_name} not found");
Ok(cols) Ok(cols)
} }

View File

@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::bucket::GetDocCount; use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult}; use super::metric::{
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
};
use super::{AggregationError, Key}; use super::{AggregationError, Key};
use crate::TantivyError; use crate::TantivyError;
@@ -88,6 +90,8 @@ pub enum MetricResult {
Min(SingleMetricResult), Min(SingleMetricResult),
/// Stats metric result. /// Stats metric result.
Stats(Stats), Stats(Stats),
/// ExtendedStats metric result.
ExtendedStats(Box<ExtendedStats>),
/// Sum metric result. /// Sum metric result.
Sum(SingleMetricResult), Sum(SingleMetricResult),
/// Percentiles metric result. /// Percentiles metric result.
@@ -104,6 +108,7 @@ impl MetricResult {
MetricResult::Max(max) => Ok(max.value), MetricResult::Max(max) => Ok(max.value),
MetricResult::Min(min) => Ok(min.value), MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property), MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value), MetricResult::Sum(sum) => Ok(sum.value),
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError( MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()), AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),

View File

@@ -357,8 +357,7 @@ impl SegmentTermCollector {
) -> crate::Result<Self> { ) -> crate::Result<Self> {
if field_type == ColumnType::Bytes { if field_type == ColumnType::Bytes {
return Err(TantivyError::InvalidArgument(format!( return Err(TantivyError::InvalidArgument(format!(
"terms aggregation is not supported for column type {:?}", "terms aggregation is not supported for column type {field_type:?}"
field_type
))); )));
} }
let term_buckets = TermBuckets::default(); let term_buckets = TermBuckets::default();

View File

@@ -19,8 +19,8 @@ use super::bucket::{
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation, GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
}; };
use super::metric::{ use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats, IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer, IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
}; };
use super::segment_agg_result::AggregationLimits; use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey}; use super::{format_date, AggregationError, Key, SerializedKey};
@@ -215,6 +215,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats( Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(), IntermediateStats::default(),
)), )),
ExtendedStats(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum( Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(), IntermediateSum::default(),
)), )),
@@ -222,7 +225,7 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
IntermediateMetricResult::Percentiles(PercentilesCollector::default()), IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
), ),
TopHits(ref req) => IntermediateAggregationResult::Metric( TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())), IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
), ),
} }
} }
@@ -282,6 +285,8 @@ pub enum IntermediateMetricResult {
Min(IntermediateMin), Min(IntermediateMin),
/// Intermediate stats result. /// Intermediate stats result.
Stats(IntermediateStats), Stats(IntermediateStats),
/// Intermediate stats result.
ExtendedStats(IntermediateExtendedStats),
/// Intermediate sum result. /// Intermediate sum result.
Sum(IntermediateSum), Sum(IntermediateSum),
/// Intermediate top_hits result /// Intermediate top_hits result
@@ -306,6 +311,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::Stats(intermediate_stats) => { IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize()) MetricResult::Stats(intermediate_stats.finalize())
} }
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
MetricResult::ExtendedStats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => { IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into()) MetricResult::Sum(intermediate_sum.finalize().into())
} }
@@ -346,6 +354,12 @@ impl IntermediateMetricResult {
) => { ) => {
stats_left.merge_fruits(stats_right); stats_left.merge_fruits(stats_right);
} }
(
IntermediateMetricResult::ExtendedStats(extended_stats_left),
IntermediateMetricResult::ExtendedStats(extended_stats_right),
) => {
extended_stats_left.merge_fruits(extended_stats_right);
}
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => { (IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right); sum_left.merge_fruits(sum_right);
} }

File diff suppressed because it is too large.

View File

@@ -18,6 +18,7 @@
mod average; mod average;
mod count; mod count;
mod extended_stats;
mod max; mod max;
mod min; mod min;
mod percentiles; mod percentiles;
@@ -29,6 +30,7 @@ use std::collections::HashMap;
pub use average::*; pub use average::*;
pub use count::*; pub use count::*;
pub use extended_stats::*;
pub use max::*; pub use max::*;
pub use min::*; pub use min::*;
pub use percentiles::*; pub use percentiles::*;

View File

@@ -1,3 +1,5 @@
use std::fmt::Debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::*;
@@ -85,13 +87,15 @@ impl Stats {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats { pub struct IntermediateStats {
/// The number of extracted values. /// The number of extracted values.
count: u64, pub(crate) count: u64,
/// The sum of the extracted values. /// The sum of the extracted values.
sum: f64, pub(crate) sum: f64,
/// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
pub(crate) delta: f64,
/// The min value. /// The min value.
min: f64, pub(crate) min: f64,
/// The max value. /// The max value.
max: f64, pub(crate) max: f64,
} }
impl Default for IntermediateStats { impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
Self { Self {
count: 0, count: 0,
sum: 0.0, sum: 0.0,
delta: 0.0,
min: f64::MAX, min: f64::MAX,
max: f64::MIN, max: f64::MIN,
} }
@@ -109,7 +114,13 @@ impl IntermediateStats {
/// Merges the other stats intermediate result into self. /// Merges the other stats intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateStats) { pub fn merge_fruits(&mut self, other: IntermediateStats) {
self.count += other.count; self.count += other.count;
self.sum += other.sum;
// kahan algorithm for sum
let y = other.sum - (self.delta + other.delta);
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(other.min); self.min = self.min.min(other.min);
self.max = self.max.max(other.max); self.max = self.max.max(other.max);
} }
@@ -141,9 +152,15 @@ impl IntermediateStats {
} }
#[inline] #[inline]
fn collect(&mut self, value: f64) { pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
self.count += 1; self.count += 1;
self.sum += value;
// kahan algorithm for sum
let y = value - self.delta;
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(value); self.min = self.min.min(value);
self.max = self.max.max(value); self.max = self.max.max(value);
} }
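The `delta` field above is the running compensation term of Kahan (compensated) summation. As a standalone illustration of why the extra term matters, here is a minimal sketch, not taken from the crate, that contrasts naive accumulation with the compensated loop used in `collect`; the same bookkeeping is what `merge_fruits` carries over when two partial sums are combined.

```rust
/// Minimal sketch of Kahan (compensated) summation, mirroring the
/// `sum`/`delta` bookkeeping in `IntermediateStats::collect`.
fn kahan_sum(values: &[f64]) -> f64 {
    let mut sum = 0.0;
    let mut delta = 0.0; // compensation for low-order bits lost to rounding
    for &value in values {
        let y = value - delta;
        let t = sum + y;
        delta = (t - sum) - y; // recovers the part of `y` that was rounded away
        sum = t;
    }
    sum
}

fn main() {
    // Ten 1.0s added to 1e16: naive summation rounds every 1.0 away,
    // the compensated loop keeps them.
    let mut values = vec![1e16];
    values.extend(std::iter::repeat(1.0).take(10));
    let naive: f64 = values.iter().sum();
    assert_eq!(naive, 1e16);
    assert_eq!(kahan_sum(&values), 1e16 + 10.0);
}
```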
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use serde_json::Value; use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations}; use crate::aggregation::agg_req::{Aggregation, Aggregations};


@@ -1,7 +1,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use columnar::{ColumnarReader, DynamicColumn}; use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR; use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
use common::DateTime; use common::DateTime;
use regex::Regex; use regex::Regex;
@@ -131,8 +131,8 @@ impl<'de> Deserialize<'de> for KeyOrder {
))?; ))?;
if key_order.next().is_some() { if key_order.next().is_some() {
return Err(serde::de::Error::custom(format!( return Err(serde::de::Error::custom(format!(
"Expected exactly one key-value pair in sort parameter of top_hits, found {:?}", "Expected exactly one key-value pair in sort parameter of top_hits, found \
key_order {key_order:?}"
))); )));
} }
Ok(Self { field, order }) Ok(Self { field, order })
@@ -144,27 +144,22 @@ fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex // Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*")); let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
Regex::new(&sanitized.replace('*', ".*")).map_err(|e| { Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
crate::TantivyError::SchemaError(format!( crate::TantivyError::SchemaError(format!("Invalid regex '{glob}' in docvalue_fields: {e}"))
"Invalid regex '{}' in docvalue_fields: {}",
glob, e
))
}) })
} }
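For readers unfamiliar with the glob handling above: `regex::escape` turns `*` into `\*`, the replace turns that back into `.*`, and the `^...$` anchors force a whole-field match. A small sketch of that escape-then-replace idea, using an illustrative pattern rather than a value from this PR:

```rust
use regex::Regex;

fn main() {
    // Hypothetical glob, used only to illustrate the escape-then-replace idea.
    let glob = "attributes.*";
    let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
    assert_eq!(sanitized, r"^attributes\..*$");

    let re = Regex::new(&sanitized).unwrap();
    assert!(re.is_match("attributes.color"));
    assert!(!re.is_match("other.color"));
}
```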
fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> { fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError( Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!( AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported, only `docvalue_fields` is supported in \ "The `{parameter}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation", `top_hits` aggregation"
parameter
)), )),
)) ))
} }
fn unsupported_err(parameter: &str) -> crate::Result<()> { fn unsupported_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError( Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!( AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported in the `top_hits` aggregation", "The `{parameter}` parameter is not supported in the `top_hits` aggregation"
parameter
)), )),
)) ))
} }
@@ -217,8 +212,7 @@ impl TopHitsAggregation {
.collect::<Vec<_>>(); .collect::<Vec<_>>();
assert!( assert!(
!fields.is_empty(), !fields.is_empty(),
"No fields matched the glob '{}' in docvalue_fields", "No fields matched the glob '{field}' in docvalue_fields"
field
); );
Ok(fields) Ok(fields)
}) })
@@ -254,7 +248,7 @@ impl TopHitsAggregation {
.map(|field| { .map(|field| {
let accessors = accessors let accessors = accessors
.get(field) .get(field)
.unwrap_or_else(|| panic!("field '{}' not found in accessors", field)); .unwrap_or_else(|| panic!("field '{field}' not found in accessors"));
let values: Vec<FastFieldValue> = accessors let values: Vec<FastFieldValue> = accessors
.iter() .iter()
@@ -449,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
impl TopHitsTopNComputer { impl TopHitsTopNComputer {
/// Create a new TopHitsCollector /// Create a new TopHitsCollector
pub fn new(req: TopHitsAggregation) -> Self { pub fn new(req: &TopHitsAggregation) -> Self {
Self { Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)), top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req, req: req.clone(),
} }
} }
@@ -497,7 +491,6 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector { pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal, segment_ordinal: SegmentOrdinal,
accessor_idx: usize, accessor_idx: usize,
req: TopHitsAggregation,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>, top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
} }
@@ -508,7 +501,6 @@ impl TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal, segment_ordinal: SegmentOrdinal,
) -> Self { ) -> Self {
Self { Self {
req: req.clone(),
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)), top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
segment_ordinal, segment_ordinal,
accessor_idx, accessor_idx,
@@ -517,14 +509,13 @@ impl TopHitsSegmentCollector {
fn into_top_hits_collector( fn into_top_hits_collector(
self, self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>, value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregation,
) -> TopHitsTopNComputer { ) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone()); let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec(); let top_results = self.top_n.into_vec();
for res in top_results { for res in top_results {
let doc_value_fields = self let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
.req
.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect( top_hits_computer.collect(
DocSortValuesAndFields { DocSortValuesAndFields {
sorts: res.feature, sorts: res.feature,
@@ -536,34 +527,15 @@ impl TopHitsSegmentCollector {
top_hits_computer top_hits_computer
} }
}
impl SegmentAggregationCollector for TopHitsSegmentCollector { /// TODO add a specialized variant for a single sort field
fn add_intermediate_aggregation_result( fn collect_with(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let intermediate_result =
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
fn collect(
&mut self, &mut self,
doc_id: crate::DocId, doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor, req: &TopHitsAggregation,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> { ) -> crate::Result<()> {
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors; let sorts: Vec<DocValueAndOrder> = req
let sorts: Vec<DocValueAndOrder> = self
.req
.sort .sort
.iter() .iter()
.enumerate() .enumerate()
@@ -588,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
); );
Ok(()) Ok(())
} }
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let intermediate_result = IntermediateMetricResult::TopHits(
self.into_top_hits_collector(value_accessors, tophits_req),
);
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
/// TODO: Consider a caching layer to reduce the call overhead
fn collect(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
self.collect_with(doc_id, tophits_req, accessors)?;
Ok(())
}
fn collect_block( fn collect_block(
&mut self, &mut self,
docs: &[crate::DocId], docs: &[crate::DocId],
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor, agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> { ) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
// TODO: Consider getting fields with the column block accessor. // TODO: Consider getting fields with the column block accessor.
for doc in docs { for doc in docs {
self.collect(*doc, agg_with_accessor)?; self.collect_with(*doc, tophits_req, accessors)?;
} }
Ok(()) Ok(())
} }
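The refactor above drops the per-segment `TopHitsAggregation` clone: the segment collector now keeps only `accessor_idx` and borrows the request from the shared accessor state at collect time via `as_top_hits()`, so the request is no longer duplicated into every segment collector. A generic sketch of that pattern, with illustrative names rather than tantivy's real types; the trade-off is an extra lookup per `collect` call, which the `TODO: Consider a caching layer` note above acknowledges.

```rust
// Illustrative only: a segment-level collector that keeps an index into a
// shared list of requests instead of owning a clone of the request.
struct Request {
    size: usize,
}

struct SegmentCollector {
    accessor_idx: usize, // which request/accessor slot this collector belongs to
    hits: Vec<u32>,
}

impl SegmentCollector {
    // The request is borrowed from shared state at call time rather than
    // cloned into every collector, keeping per-segment memory small.
    fn collect(&mut self, doc: u32, requests: &[Request]) {
        let req = &requests[self.accessor_idx];
        if self.hits.len() < req.size {
            self.hits.push(doc);
        }
    }
}

fn main() {
    let requests = vec![Request { size: 2 }];
    let mut collector = SegmentCollector { accessor_idx: 0, hits: Vec::new() };
    for doc in 0..10 {
        collector.collect(doc, &requests);
    }
    assert_eq!(collector.hits, vec![0, 1]);
}
```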


@@ -158,15 +158,14 @@ use serde::de::{self, Visitor};
use serde::{Deserialize, Deserializer, Serialize}; use serde::{Deserialize, Deserializer, Serialize};
fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> { fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> {
let parsed = value.parse::<f64>().map_err(|_err| { let parsed = value
de::Error::custom(format!("Failed to parse f64 from string: {:?}", value)) .parse::<f64>()
})?; .map_err(|_err| de::Error::custom(format!("Failed to parse f64 from string: {value:?}")))?;
// Check if the parsed value is NaN or infinity // Check if the parsed value is NaN or infinity
if parsed.is_nan() || parsed.is_infinite() { if parsed.is_nan() || parsed.is_infinite() {
Err(de::Error::custom(format!( Err(de::Error::custom(format!(
"Value is not a valid f64 (NaN or Infinity): {:?}", "Value is not a valid f64 (NaN or Infinity): {value:?}"
value
))) )))
} else { } else {
Ok(parsed) Ok(parsed)


@@ -11,12 +11,12 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector}; use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::IntermediateAggregationResults; use super::intermediate_agg_result::IntermediateAggregationResults;
use super::metric::{ use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation, SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation, SumAggregation,
}; };
use crate::aggregation::bucket::TermMissingAgg; use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::TopHitsSegmentCollector; use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug { pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
@@ -148,6 +148,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx, accessor_idx,
*missing, *missing,
))), ))),
ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
)),
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req( Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type, req.field_type,
SegmentStatsType::Sum, SegmentStatsType::Sum,
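`SegmentExtendedStatsCollector::from_req` takes a `sigma` parameter in addition to `missing`. For orientation, extended stats typically augment plain stats with a sum of squares, from which variance, standard deviation, and sigma-scaled bounds are derived. The sketch below shows that arithmetic only; the field names are assumptions for illustration, not the crate's actual types. In Elasticsearch's `extended_stats` the default sigma is 2 (mean plus or minus two standard deviations); whether tantivy mirrors that default is not shown in this hunk.

```rust
/// Illustrative arithmetic for an `extended_stats`-style result.
/// Field names are assumptions for this sketch, not tantivy's types.
struct ExtendedStats {
    count: u64,
    sum: f64,
    sum_of_squares: f64,
    sigma: f64,
}

impl ExtendedStats {
    fn mean(&self) -> f64 {
        self.sum / self.count as f64
    }
    /// Population variance from the running sums.
    fn variance(&self) -> f64 {
        self.sum_of_squares / self.count as f64 - self.mean().powi(2)
    }
    fn std_deviation(&self) -> f64 {
        self.variance().sqrt()
    }
    /// `sigma` controls how wide the reported bounds are.
    fn std_deviation_bounds(&self) -> (f64, f64) {
        let (m, sd) = (self.mean(), self.std_deviation());
        (m - self.sigma * sd, m + self.sigma * sd)
    }
}

fn main() {
    // Values 2, 4, 6, 8: sum = 20, sum of squares = 120.
    let stats = ExtendedStats { count: 4, sum: 20.0, sum_of_squares: 120.0, sigma: 2.0 };
    assert_eq!(stats.mean(), 5.0);
    assert_eq!(stats.variance(), 5.0); // 120/4 - 25
    let (lo, hi) = stats.std_deviation_bounds();
    assert!(lo < 5.0 && hi > 5.0);
}
```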


@@ -598,7 +598,7 @@ mod tests {
let mid = n % 4; let mid = n % 4;
n /= 4; n /= 4;
let leaf = n % 5; let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf)) Facet::from(&format!("/top{top}/mid{mid}/leaf{leaf}"))
}) })
.collect(); .collect();
for i in 0..num_facets * 10 { for i in 0..num_facets * 10 {
@@ -737,7 +737,7 @@ mod tests {
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)] vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter() .into_iter()
.flat_map(|(c, count)| { .flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c)); let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet); let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count) iter::repeat(doc).take(count)
}) })
@@ -785,7 +785,7 @@ mod tests {
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)] let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter() .into_iter()
.flat_map(|(c, count)| { .flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c)); let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet); let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count) iter::repeat(doc).take(count)
}) })


@@ -871,7 +871,10 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT}; use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339; use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader}; use crate::{
assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
SegmentReader,
};
fn make_index() -> crate::Result<Index> { fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();


@@ -195,7 +195,7 @@ mod tests {
let (tx, rx) = crossbeam_channel::bounded::<()>(0); let (tx, rx) = crossbeam_channel::bounded::<()>(0);
let rx = Arc::new(rx); let rx = Arc::new(rx);
let executor = Executor::multi_thread(3, "search-test").unwrap(); let executor = Executor::multi_thread(3, "search-test").unwrap();
for i in 0..1000 { for _ in 0..1000 {
let counter_clone: Arc<AtomicU64> = counter.clone(); let counter_clone: Arc<AtomicU64> = counter.clone();
let other_counter_clone: Arc<AtomicU64> = other_counter.clone(); let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
@@ -203,18 +203,18 @@ mod tests {
let rx_clone2 = rx.clone(); let rx_clone2 = rx.clone();
let fut = executor.spawn_blocking(move || { let fut = executor.spawn_blocking(move || {
counter_clone.fetch_add(1, Ordering::SeqCst); counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone.recv().unwrap(); let _ = rx_clone.recv();
}); });
futures.push(fut); futures.push(fut);
let other_fut = executor.spawn_blocking(move || { let other_fut = executor.spawn_blocking(move || {
other_counter_clone.fetch_add(1, Ordering::SeqCst); other_counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone2.recv().unwrap(); let _ = rx_clone2.recv();
}); });
other_futures.push(other_fut); other_futures.push(other_fut);
} }
// We execute 100 futures. // We execute 100 futures.
for i in 0..100 { for _ in 0..100 {
tx.send(()).unwrap(); tx.send(()).unwrap();
} }
@@ -226,7 +226,7 @@ mod tests {
drop(other_futures); drop(other_futures);
// We execute 100 futures. // We execute 100 futures.
for i in 0..100 { for _ in 0..100 {
tx.send(()).unwrap(); tx.send(()).unwrap();
} }


@@ -338,14 +338,14 @@ mod tests {
let mut term = Term::from_field_json_path(field, "attributes.color", false); let mut term = Term::from_field_json_path(field, "attributes.color", false);
term.append_type_and_str("red"); term.append_type_and_str("red");
assert_eq!( assert_eq!(
format!("{:?}", term), format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")" "Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
); );
let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false); let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
term.append_type_and_fast_value(400i64); term.append_type_and_fast_value(400i64);
assert_eq!( assert_eq!(
format!("{:?}", term), format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)" "Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
); );
} }


@@ -566,7 +566,7 @@ mod tests {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap(); let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let num_paths = 10; let num_paths = 10;
let paths: Vec<PathBuf> = (0..num_paths) let paths: Vec<PathBuf> = (0..num_paths)
.map(|i| PathBuf::from(&*format!("file_{}", i))) .map(|i| PathBuf::from(&*format!("file_{i}")))
.collect(); .collect();
{ {
for path in &paths { for path in &paths {


@@ -62,7 +62,7 @@ impl FacetReader {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED}; use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument}; use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test] #[test]
@@ -88,7 +88,9 @@ mod tests {
let doc = searcher let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32)) .doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap(); .unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet()); let value = doc
.get_first(facet_field)
.and_then(|v| v.as_value().as_facet());
assert_eq!(value, None); assert_eq!(value, None);
} }


@@ -252,9 +252,8 @@ impl IndexBuilder {
let field_type = entry.field_type().value_type(); let field_type = entry.field_type().value_type();
if !supported_field_types.contains(&field_type) { if !supported_field_types.contains(&field_type) {
return Err(TantivyError::InvalidArgument(format!( return Err(TantivyError::InvalidArgument(format!(
"Unsupported field type in sort_by_field: {:?}. Supported field types: \ "Unsupported field type in sort_by_field: {field_type:?}. Supported field \
{:?} ", types: {supported_field_types:?} ",
field_type, supported_field_types,
))); )));
} }
} }


@@ -318,14 +318,14 @@ impl SegmentReader {
if create_canonical { if create_canonical {
// Without expand dots enabled dots need to be escaped. // Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\."); let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path); let full_path = format!("{field_name}.{escaped_json_path}");
let full_path_unescaped = format!("{}.{}", field_name, &json_path); let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string()); map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path full_path
} else { } else {
// With expand dots enabled, we can use '.' instead of '\u{1}'. // With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path); json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path) format!("{field_name}.{json_path}")
} }
}; };
indexed_fields.extend( indexed_fields.extend(


@@ -808,7 +808,7 @@ mod tests {
use proptest::prop_oneof; use proptest::prop_oneof;
use super::super::operation::UserOperation; use super::super::operation::UserOperation;
use crate::collector::TopDocs; use crate::collector::{Count, TopDocs};
use crate::directory::error::LockError; use crate::directory::error::LockError;
use crate::error::*; use crate::error::*;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
@@ -816,7 +816,7 @@ mod tests {
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::{ use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema, self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
}; };
use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{ use crate::{
@@ -1572,20 +1572,74 @@ mod tests {
Ok(()) Ok(())
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone)]
enum IndexingOp { enum IndexingOp {
AddDoc { id: u64 }, AddMultipleDoc {
DeleteDoc { id: u64 }, id: u64,
DeleteDocQuery { id: u64 }, num_docs: u64,
value: IndexValue,
},
AddDoc {
id: u64,
value: IndexValue,
},
DeleteDoc {
id: u64,
},
DeleteDocQuery {
id: u64,
},
Commit, Commit,
Merge, Merge,
} }
impl IndexingOp {
fn add(id: u64) -> Self {
IndexingOp::AddDoc {
id,
value: IndexValue::F64(id as f64),
}
}
}
use serde::Serialize;
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
enum IndexValue {
Str(String),
F64(f64),
U64(u64),
I64(i64),
}
impl Default for IndexValue {
fn default() -> Self {
IndexValue::F64(0.0)
}
}
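Because `IndexValue` derives `Serialize` with `#[serde(untagged)]`, each variant serializes as its bare inner value rather than as a variant-name wrapper, which is what lets the proptest value be dropped directly into the `json!` document further down. A quick illustration, assuming `serde`/`serde_json` are available as in the test crate:

```rust
use serde::Serialize;

// Mirrors the enum defined above.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
enum IndexValue {
    Str(String),
    F64(f64),
    U64(u64),
    I64(i64),
}

fn main() {
    // Untagged enums serialize as the inner value, with no variant name wrapper.
    assert_eq!(serde_json::to_string(&IndexValue::U64(7)).unwrap(), "7");
    assert_eq!(
        serde_json::to_string(&IndexValue::Str("abc".to_string())).unwrap(),
        "\"abc\""
    );
}
```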
fn value_strategy() -> impl Strategy<Value = IndexValue> {
prop_oneof![
any::<f64>().prop_map(IndexValue::F64),
any::<u64>().prop_map(IndexValue::U64),
any::<i64>().prop_map(IndexValue::I64),
any::<String>().prop_map(IndexValue::Str),
]
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> { fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![ prop_oneof![
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }), (0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }), (0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }), (0u64..20u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
((0u64..20u64), (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
(0u64..1u64).prop_map(|_| IndexingOp::Commit), (0u64..1u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge), (0u64..1u64).prop_map(|_| IndexingOp::Merge),
] ]
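Strategies like the one above feed the `proptest!` blocks used elsewhere in these tests. A minimal, simplified sketch of that wiring, with a reduced op type rather than the real harness that drives the `IndexWriter`:

```rust
use proptest::prelude::*;

#[derive(Debug, Clone)]
enum Op {
    AddDoc { id: u64 },
    Commit,
}

fn op_strategy() -> impl Strategy<Value = Op> {
    prop_oneof![
        (0u64..20u64).prop_map(|id| Op::AddDoc { id }),
        Just(Op::Commit),
    ]
}

proptest! {
    #[test]
    fn ops_are_generated(ops in proptest::collection::vec(op_strategy(), 1..20)) {
        // The real test would replay these ops against an index; here we only
        // check that the generator produces non-empty sequences.
        prop_assert!(!ops.is_empty());
    }
}
```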
@@ -1595,7 +1649,17 @@ mod tests {
prop_oneof![ prop_oneof![
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }), 5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }), 5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }), 50 => (0u64..100u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
50 => (0u64..100u64, (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit), 2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge), 1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
] ]
@@ -1604,19 +1668,27 @@ mod tests {
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) { fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new(); let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new(); let mut deleted_ids = HashSet::new();
for &op in ops { for op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id, value: _ } => {
*existing_ids.entry(id).or_insert(0) += 1; *existing_ids.entry(*id).or_insert(0) += 1;
deleted_ids.remove(&id); deleted_ids.remove(id);
}
IndexingOp::AddMultipleDoc {
id,
num_docs,
value: _,
} => {
*existing_ids.entry(*id).or_insert(0) += num_docs;
deleted_ids.remove(id);
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id); existing_ids.remove(&id);
deleted_ids.insert(id); deleted_ids.insert(*id);
} }
IndexingOp::DeleteDocQuery { id } => { IndexingOp::DeleteDocQuery { id } => {
existing_ids.remove(&id); existing_ids.remove(&id);
deleted_ids.insert(id); deleted_ids.insert(*id);
} }
_ => {} _ => {}
} }
@@ -1626,16 +1698,19 @@ mod tests {
fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> { fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
let mut id_list = Vec::new(); let mut id_list = Vec::new();
for &op in ops { for op in ops {
match op { match op {
IndexingOp::AddDoc { id } => { IndexingOp::AddDoc { id, value: _ } => {
id_list.push(id); id_list.push(*id);
}
IndexingOp::AddMultipleDoc { id, .. } => {
id_list.push(*id);
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
id_list.retain(|el| *el != id); id_list.retain(|el| el != id);
} }
IndexingOp::DeleteDocQuery { id } => { IndexingOp::DeleteDocQuery { id } => {
id_list.retain(|el| *el != id); id_list.retain(|el| el != id);
} }
_ => {} _ => {}
} }
@@ -1716,42 +1791,59 @@ mod tests {
let ip_from_id = |id| Ipv6Addr::from_u128(id as u128); let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);
for &op in ops { let add_docs = |index_writer: &mut IndexWriter,
match op { id: u64,
IndexingOp::AddDoc { id } => { value: IndexValue,
let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); num: u64|
let ip = ip_from_id(id); -> crate::Result<()> {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
if !id_is_full_doc(id) { let ip = ip_from_id(id);
// every 3rd doc has no ip field let doc = if !id_is_full_doc(id) {
index_writer.add_document(doc!( // every 3rd doc has no ip field
id_field=>id, doc!(
))?; id_field=>id,
} else { )
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()}); } else {
index_writer.add_document(doc!(id_field=>id, let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string(), "val": value});
json_field=>json, doc!(id_field=>id,
bytes_field => id.to_le_bytes().as_slice(), json_field=>json,
id_opt_field => id, bytes_field => id.to_le_bytes().as_slice(),
ip_field => ip, id_opt_field => id,
ips_field => ip, ip_field => ip,
ips_field => ip, ips_field => ip,
multi_numbers=> id, ips_field => ip,
multi_numbers => id, multi_numbers=> id,
bool_field => (id % 2u64) != 0, multi_numbers => id,
i64_field => id as i64, bool_field => (id % 2u64) != 0,
f64_field => id as f64, i64_field => id as i64,
date_field => DateTime::from_timestamp_secs(id as i64), f64_field => id as f64,
multi_bools => (id % 2u64) != 0, date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) == 0, multi_bools => (id % 2u64) != 0,
text_field => id.to_string(), multi_bools => (id % 2u64) == 0,
facet_field => facet, text_field => id.to_string(),
large_text_field => LOREM, facet_field => facet,
multi_text_fields => multi_text_field_text1, large_text_field => LOREM,
multi_text_fields => multi_text_field_text2, multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text3, multi_text_fields => multi_text_field_text2,
))?; multi_text_fields => multi_text_field_text3,
} )
};
for _ in 0..num {
index_writer.add_document(doc.clone())?;
}
Ok(())
};
for op in ops {
match op.clone() {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
} => {
add_docs(&mut index_writer, id, value, num_docs)?;
}
IndexingOp::AddDoc { id, value } => {
add_docs(&mut index_writer, id, value, 1)?;
} }
IndexingOp::DeleteDoc { id } => { IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id)); index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1979,7 +2071,13 @@ mod tests {
.unwrap(); .unwrap();
// test store iterator // test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) { for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap(); let id = doc
.unwrap()
.get_first(id_field)
.unwrap()
.as_value()
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id)); assert!(expected_ids_and_num_occurrences.contains_key(&id));
} }
// test store random access // test store random access
@@ -2026,18 +2124,22 @@ mod tests {
top_docs.iter().map(|el| el.1).collect::<Vec<_>>() top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
}; };
let count_search = |term: &str, field| {
let query = QueryParser::for_index(&index, vec![field])
.parse_query(term)
.unwrap();
searcher.search(&query, &Count).unwrap()
};
let do_search2 = |term: Term| { let count_search2 = |term: Term| {
let query = TermQuery::new(term, IndexRecordOption::Basic); let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &Count).unwrap()
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
}; };
for (id, count) in &expected_ids_and_num_occurrences { for (id, count) in &expected_ids_and_num_occurrences {
// skip expensive queries
let (existing_id, count) = (*id, *count); let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64; let get_num_hits = |field| count_search(&existing_id.to_string(), field) as u64;
assert_eq!(get_num_hits(id_field), count); assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) { if !id_is_full_doc(existing_id) {
continue; continue;
@@ -2047,29 +2149,31 @@ mod tests {
assert_eq!(get_num_hits(f64_field), count); assert_eq!(get_num_hits(f64_field), count);
// Test multi text // Test multi text
assert_eq!( if num_docs_with_values < 1000 {
do_search("\"test1 test2\"", multi_text_fields).len(), assert_eq!(
num_docs_with_values do_search("\"test1 test2\"", multi_text_fields).len(),
); num_docs_with_values
assert_eq!( );
do_search("\"test2 test3\"", multi_text_fields).len(), assert_eq!(
num_docs_with_values do_search("\"test2 test3\"", multi_text_fields).len(),
); num_docs_with_values
);
}
// Test bytes // Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice()); let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, count); assert_eq!(count_search2(term) as u64, count);
// Test date // Test date
let term = Term::from_field_date( let term = Term::from_field_date(
date_field, date_field,
DateTime::from_timestamp_secs(existing_id as i64), DateTime::from_timestamp_secs(existing_id as i64),
); );
assert_eq!(do_search2(term).len() as u64, count); assert_eq!(count_search2(term) as u64, count);
} }
for deleted_id in deleted_ids { for deleted_id in deleted_ids {
let assert_field = |field| { let assert_field = |field| {
assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0); assert_eq!(count_search(&deleted_id.to_string(), field) as u64, 0);
}; };
assert_field(text_field); assert_field(text_field);
assert_field(f64_field); assert_field(f64_field);
@@ -2078,12 +2182,12 @@ mod tests {
// Test bytes // Test bytes
let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice()); let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, 0); assert_eq!(count_search2(term), 0);
// Test date // Test date
let term = let term =
Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64)); Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
assert_eq!(do_search2(term).len() as u64, 0); assert_eq!(count_search2(term), 0);
} }
// search ip address // search ip address
// //
@@ -2092,13 +2196,13 @@ mod tests {
if !id_is_full_doc(existing_id) { if !id_is_full_doc(existing_id) {
continue; continue;
} }
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let ip_addr = Ipv6Addr::from_u128(existing_id as u128); let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
// Test incoming ip as ipv6 // Test incoming ip as ipv6
assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count); assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count);
let term = Term::from_field_ip_addr(ip_field, ip_addr); let term = Term::from_field_ip_addr(ip_field, ip_addr);
assert_eq!(do_search2(term).len() as u64, count); assert_eq!(count_search2(term) as u64, count);
// Test incoming ip as ipv4 // Test incoming ip as ipv4
if let Some(ip_addr) = ip_addr.to_ipv4_mapped() { if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
@@ -2115,7 +2219,7 @@ mod tests {
if !sample.is_empty() { if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2); let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| { let calc_expected_count = |sample: &[(&u64, &u64)]| {
sample sample
.iter() .iter()
.filter(|(id, _)| id_is_full_doc(**id)) .filter(|(id, _)| id_is_full_doc(**id))
@@ -2131,18 +2235,17 @@ mod tests {
} }
// Query first half // Query first half
if !left_sample.is_empty() { let expected_count = calc_expected_count(left_sample);
let expected_count = expected_count(left_sample); if !left_sample.is_empty() && expected_count < 1000 {
let start_range = *left_sample[0].0; let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0; let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range); let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count); assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field // Range query on ip field
let ip1 = ip_from_id(start_range); let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range); let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2); let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count); assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2); let query = gen_query_inclusive("ip", "*", ip2);
@@ -2154,19 +2257,19 @@ mod tests {
assert_eq!(do_search_ip_field(&query), expected_count); assert_eq!(do_search_ip_field(&query), expected_count);
} }
// Query second half // Query second half
if !right_sample.is_empty() { let expected_count = calc_expected_count(right_sample);
let expected_count = expected_count(right_sample); if !right_sample.is_empty() && expected_count < 1000 {
let start_range = *right_sample[0].0; let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0; let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field // Range query on id opt field
let query = let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string()); gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count); assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field // Range query on ip field
let ip1 = ip_from_id(start_range); let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range); let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2); let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count); assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*"); let query = gen_query_inclusive("ip", ip1, "*");
@@ -2191,7 +2294,7 @@ mod tests {
}; };
let ip = ip_from_id(existing_id); let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
// Range query on single value field // Range query on single value field
let query = gen_query_inclusive("ip", ip, ip); let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count); assert_eq!(do_search_ip_field(&query), count);
@@ -2251,7 +2354,7 @@ mod tests {
#[test] #[test]
fn test_fast_field_range() { fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect(); let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok()); assert!(test_operation_strategy(&ops, false, true).is_ok());
} }
@@ -2259,8 +2362,8 @@ mod tests {
fn test_sort_index_on_opt_field_regression() { fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 81 }, IndexingOp::add(81),
IndexingOp::AddDoc { id: 70 }, IndexingOp::add(70),
IndexingOp::DeleteDoc { id: 70 } IndexingOp::DeleteDoc { id: 70 }
], ],
true, true,
@@ -2269,14 +2372,45 @@ mod tests {
.is_ok()); .is_ok());
} }
#[test]
fn test_simple_multiple_doc() {
assert!(test_operation_strategy(
&[
IndexingOp::AddMultipleDoc {
id: 7,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 92,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 30,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 33,
num_docs: 800,
value: IndexValue::U64(0),
},
],
true,
false
)
.is_ok());
}
#[test] #[test]
fn test_ip_range_query_multivalue_bug() { fn test_ip_range_query_multivalue_bug() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 2 }, IndexingOp::add(2),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::Merge IndexingOp::Merge
], ],
@@ -2290,11 +2424,11 @@ mod tests {
fn test_ff_num_ips_regression() { fn test_ff_num_ips_regression() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 13 }, IndexingOp::add(13),
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::DeleteDocQuery { id: 13 }, IndexingOp::DeleteDocQuery { id: 13 },
IndexingOp::AddDoc { id: 1 }, IndexingOp::add(1),
IndexingOp::Commit, IndexingOp::Commit,
], ],
false, false,
@@ -2306,7 +2440,7 @@ mod tests {
#[test] #[test]
fn test_minimal_sort_force_end_merge() { fn test_minimal_sort_force_end_merge() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[IndexingOp::AddDoc { id: 23 }, IndexingOp::AddDoc { id: 13 },], &[IndexingOp::add(23), IndexingOp::add(13),],
false, false,
false false
) )
@@ -2367,8 +2501,8 @@ mod tests {
fn test_minimal_sort_force_end_merge_with_delete() { fn test_minimal_sort_force_end_merge_with_delete() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 23 }, IndexingOp::add(23),
IndexingOp::AddDoc { id: 13 }, IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 } IndexingOp::DeleteDoc { id: 13 }
], ],
true, true,
@@ -2381,8 +2515,8 @@ mod tests {
fn test_minimal_no_sort_no_force_end_merge() { fn test_minimal_no_sort_no_force_end_merge() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 23 }, IndexingOp::add(23),
IndexingOp::AddDoc { id: 13 }, IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 } IndexingOp::DeleteDoc { id: 13 }
], ],
false, false,
@@ -2393,7 +2527,7 @@ mod tests {
#[test] #[test]
fn test_minimal_sort_merge() { fn test_minimal_sort_merge() {
assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok()); assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
} }
use proptest::prelude::*; use proptest::prelude::*;
@@ -2489,14 +2623,14 @@ mod tests {
fn test_delete_bug_reproduction_ip_addr() { fn test_delete_bug_reproduction_ip_addr() {
use IndexingOp::*; use IndexingOp::*;
let ops = &[ let ops = &[
AddDoc { id: 1 }, IndexingOp::add(1),
AddDoc { id: 2 }, IndexingOp::add(2),
Commit, Commit,
AddDoc { id: 3 }, IndexingOp::add(3),
DeleteDoc { id: 1 }, DeleteDoc { id: 1 },
Commit, Commit,
Merge, Merge,
AddDoc { id: 4 }, IndexingOp::add(4),
Commit, Commit,
]; ];
test_operation_strategy(&ops[..], false, true).unwrap(); test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2505,7 +2639,13 @@ mod tests {
#[test] #[test]
fn test_merge_regression_1() { fn test_merge_regression_1() {
use IndexingOp::*; use IndexingOp::*;
let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge]; let ops = &[
IndexingOp::add(15),
Commit,
IndexingOp::add(9),
Commit,
Merge,
];
test_operation_strategy(&ops[..], false, true).unwrap(); test_operation_strategy(&ops[..], false, true).unwrap();
} }
@@ -2513,9 +2653,9 @@ mod tests {
fn test_range_query_bug_1() { fn test_range_query_bug_1() {
use IndexingOp::*; use IndexingOp::*;
let ops = &[ let ops = &[
AddDoc { id: 9 }, IndexingOp::add(9),
AddDoc { id: 0 }, IndexingOp::add(0),
AddDoc { id: 13 }, IndexingOp::add(13),
Commit, Commit,
]; ];
test_operation_strategy(&ops[..], false, true).unwrap(); test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2523,12 +2663,11 @@ mod tests {
#[test] #[test]
fn test_range_query_bug_2() { fn test_range_query_bug_2() {
use IndexingOp::*;
let ops = &[ let ops = &[
AddDoc { id: 3 }, IndexingOp::add(3),
AddDoc { id: 6 }, IndexingOp::add(6),
AddDoc { id: 9 }, IndexingOp::add(9),
AddDoc { id: 10 }, IndexingOp::add(10),
]; ];
test_operation_strategy(&ops[..], false, false).unwrap(); test_operation_strategy(&ops[..], false, false).unwrap();
} }
@@ -2550,7 +2689,7 @@ mod tests {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::DeleteDoc { id: 0 }, IndexingOp::DeleteDoc { id: 0 },
IndexingOp::AddDoc { id: 6 }, IndexingOp::add(6),
IndexingOp::DeleteDocQuery { id: 11 }, IndexingOp::DeleteDocQuery { id: 11 },
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::Merge, IndexingOp::Merge,
@@ -2567,10 +2706,13 @@ mod tests {
fn test_bug_1617_2() { fn test_bug_1617_2() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
&[ &[
IndexingOp::AddDoc { id: 13 }, IndexingOp::AddDoc {
id: 13,
value: Default::default()
},
IndexingOp::DeleteDoc { id: 13 }, IndexingOp::DeleteDoc { id: 13 },
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::AddDoc { id: 30 }, IndexingOp::add(30),
IndexingOp::Commit, IndexingOp::Commit,
IndexingOp::Merge, IndexingOp::Merge,
], ],


@@ -696,7 +696,7 @@ impl IndexMerger {
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() { for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize]; let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() { if let Some(doc_bytes_res) = doc_bytes_it.next() {
let (_, doc_bytes) = doc_bytes_res?; let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?; store_writer.store_bytes(&doc_bytes)?;
} else { } else {
return Err(DataCorruption::comment_only(format!( return Err(DataCorruption::comment_only(format!(
@@ -728,7 +728,7 @@ impl IndexMerger {
|| store_reader.decompressor() != store_writer.compressor().into() || store_reader.decompressor() != store_writer.compressor().into()
{ {
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) { for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let (_, doc_bytes) = doc_bytes_res?; let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?; store_writer.store_bytes(&doc_bytes)?;
} }
} else { } else {
@@ -787,6 +787,8 @@ impl IndexMerger {
mod tests { mod tests {
use columnar::Column; use columnar::Column;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use schema::FAST; use schema::FAST;
use crate::collector::tests::{ use crate::collector::tests::{
@@ -794,10 +796,11 @@ mod tests {
}; };
use crate::collector::{Count, FacetCollector}; use crate::collector::{Count, FacetCollector};
use crate::index::{Index, SegmentId}; use crate::index::{Index, SegmentId};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::{ use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term, Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT, TextFieldIndexing, Value, INDEXED, TEXT,
}; };
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::{ use crate::{
@@ -909,15 +912,24 @@ mod tests {
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b")); assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("af b")
);
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c")); assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c")
);
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d")); assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c d")
);
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?; let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
@@ -1522,6 +1534,112 @@ mod tests {
Ok(()) Ok(())
} }
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum IndexingOp {
ZeroVal,
OneVal { val: u64 },
TwoVal { val: u64 },
Commit,
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
]
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
assert!(test_merge_int_fields(&ops[..]).is_ok());
}
}
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
if ops.iter().all(|op| *op == IndexingOp::Commit) {
return Ok(());
}
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
.iter()
.filter(|op| *op != &IndexingOp::Commit)
.map(|op| match op {
IndexingOp::ZeroVal => vec![],
IndexingOp::OneVal { val } => vec![*val],
IndexingOp::TwoVal { val } => vec![*val, *val],
IndexingOp::Commit => unreachable!(),
})
.enumerate()
.map(|(id, val)| (id as u32, val))
.collect();
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc).unwrap();
};
for op in ops {
match op {
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
IndexingOp::Commit => {
index_writer.commit().expect("commit failed");
}
}
}
index_writer.commit().expect("commit failed");
}
{
let mut segment_ids = index.searchable_segment_ids()?;
segment_ids.sort();
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
reader.reload()?;
let mut vals: Vec<u64> = Vec::new();
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
vals.clear();
vals.extend(col.values_for_doc(doc));
assert_eq!(&vals[..], expected);
};
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
for (doc_id, vals) in column_expected.iter() {
test_vals(col, *doc_id, vals);
}
};
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let col = segment
.fast_fields()
.column_opt::<u64>("intvals")
.unwrap()
.unwrap();
test_col(&col, &expected_doc_and_vals);
}
Ok(())
}
#[test] #[test]
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> { fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();


@@ -7,7 +7,7 @@ mod tests {
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::{ use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions, TextFieldIndexing, TextOptions, Value,
}; };
use crate::{ use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument, DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
@@ -280,13 +280,16 @@ mod tests {
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos)) .doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap(); .unwrap();
assert_eq!( assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(), doc.get_first(my_text_field).unwrap().as_value().as_str(),
Some("blubber") Some("blubber")
); );
let doc = searcher let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0)) .doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap(); .unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000)); assert_eq!(
doc.get_first(int_field).unwrap().as_value().as_u64(),
Some(1000)
);
} }
} }


@@ -216,7 +216,7 @@ mod tests_mmap {
let test_query = |query_str: &str| { let test_query = |query_str: &str| {
let query = parse_query.parse_query(query_str).unwrap(); let query = parse_query.parse_query(query_str).unwrap();
let num_docs = searcher.search(&query, &Count).unwrap(); let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1, "{}", query_str); assert_eq!(num_docs, 1, "{query_str}");
}; };
test_query(format!("json.{field_name_out}:test1").as_str()); test_query(format!("json.{field_name_out}:test1").as_str());
test_query(format!("json.a{field_name_out}:test2").as_str()); test_query(format!("json.a{field_name_out}:test2").as_str());
@@ -590,10 +590,10 @@ mod tests_mmap {
let query_parser = QueryParser::for_index(&index, vec![]); let query_parser = QueryParser::for_index(&index, vec![]);
// Test if field name can be queried // Test if field name can be queried
for (indexed_field, val) in fields_and_vals.iter() { for (indexed_field, val) in fields_and_vals.iter() {
let query_str = &format!("{}:{}", indexed_field, val); let query_str = &format!("{indexed_field}:{val}");
let query = query_parser.parse_query(query_str).unwrap(); let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap(); let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val); assert!(!count_docs.is_empty(), "{indexed_field}:{val}");
} }
// Test if field name can be used for aggregation // Test if field name can be used for aggregation
for (field_name, val) in fields_and_vals.iter() { for (field_name, val) in fields_and_vals.iter() {


@@ -202,9 +202,8 @@ impl SegmentWriter {
         match field_entry.field_type() {
             FieldType::Facet(_) => {
                 let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
                     let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +219,14 @@ impl SegmentWriter {
             }
             FieldType::Str(_) => {
                 let mut indexing_position = IndexingPosition::default();
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     let mut token_stream = if let Some(text) = value.as_str() {
                         let text_analyzer =
                             &mut self.per_field_text_analyzers[field.field_id() as usize];
                         text_analyzer.token_stream(text)
-                    } else if let Some(tok_str) = value.as_pre_tokenized_text() {
+                    } else if let Some(tok_str) = value.into_pre_tokenized_text() {
                         BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
                     } else {
                         continue;
@@ -250,9 +248,8 @@ impl SegmentWriter {
             }
             FieldType::U64(_) => {
                 let mut num_vals = 0;
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     num_vals += 1;
                     let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +262,8 @@ impl SegmentWriter {
             }
             FieldType::Date(_) => {
                 let mut num_vals = 0;
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value_access = value_access as D::Value<'_>;
-                    let value = value_access.as_value();
+                for value in values {
+                    let value = value.as_value();
                     num_vals += 1;
                     let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +277,8 @@ impl SegmentWriter {
             }
             FieldType::I64(_) => {
                 let mut num_vals = 0;
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     num_vals += 1;
                     let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +291,8 @@ impl SegmentWriter {
             }
             FieldType::F64(_) => {
                 let mut num_vals = 0;
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     num_vals += 1;
                     let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
                     term_buffer.set_f64(f64_val);
@@ -312,10 +304,8 @@ impl SegmentWriter {
             }
             FieldType::Bool(_) => {
                 let mut num_vals = 0;
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     num_vals += 1;
                     let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
                     term_buffer.set_bool(bool_val);
@@ -327,10 +317,8 @@ impl SegmentWriter {
             }
             FieldType::Bytes(_) => {
                 let mut num_vals = 0;
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     num_vals += 1;
                     let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
                     term_buffer.set_bytes(bytes);
@@ -364,9 +352,8 @@ impl SegmentWriter {
             }
             FieldType::IpAddr(_) => {
                 let mut num_vals = 0;
-                for value_access in values {
-                    // Used to help with linting and type checking.
-                    let value = value_access as D::Value<'_>;
+                for value in values {
+                    let value = value.as_value();
                     num_vals += 1;
                     let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
@@ -500,8 +487,8 @@ mod tests {
     use crate::postings::{Postings, TermInfo};
     use crate::query::{PhraseQuery, QueryParser};
    use crate::schema::{
-        Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
-        STRING, TEXT,
+        Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
+        STORED, STRING, TEXT,
    };
    use crate::store::{Compressor, StoreReader, StoreWriter};
    use crate::time::format_description::well_known::Rfc3339;
@@ -555,9 +542,12 @@ mod tests {
        let doc = reader.get::<TantivyDocument>(0).unwrap();
        assert_eq!(doc.field_values().count(), 2);
-        assert_eq!(doc.get_all(text_field).next().unwrap().as_str(), Some("A"));
        assert_eq!(
-            doc.get_all(text_field).nth(1).unwrap().as_str(),
+            doc.get_all(text_field).next().unwrap().as_value().as_str(),
+            Some("A")
+        );
+        assert_eq!(
+            doc.get_all(text_field).nth(1).unwrap().as_value().as_str(),
            Some("title")
        );
    }
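
The indexing loops above now go through `Value::as_value()` instead of the old `value_access as D::Value<'_>` cast, and the updated test shows the matching read-side pattern. Below is a minimal sketch of that calling convention on a freshly built document; the field name and text are illustrative, not taken from the change itself.

```rust
use tantivy::schema::{Schema, Value, STORED, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    // Hypothetical field, only used to illustrate the accessors.
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let _schema = schema_builder.build();

    let mut doc = TantivyDocument::default();
    doc.add_text(title, "Of Mice and Men");

    // `get_first`/`get_all` now return `CompactDocValue`; the `Value` trait
    // must be in scope to go through `as_value()` to the concrete data.
    let first = doc.get_first(title).unwrap();
    assert_eq!(first.as_value().as_str(), Some("Of Mice and Men"));
}
```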


@@ -397,16 +397,20 @@ pub mod tests {
 #[macro_export]
 macro_rules! assert_nearly_equals {
     ($left:expr, $right:expr) => {{
-        match (&$left, &$right) {
-            (left_val, right_val) => {
+        assert_nearly_equals!($left, $right, 0.0005);
+    }};
+    ($left:expr, $right:expr, $epsilon:expr) => {{
+        match (&$left, &$right, &$epsilon) {
+            (left_val, right_val, epsilon_val) => {
                 let diff = (left_val - right_val).abs();
-                let add = left_val.abs() + right_val.abs();
-                if diff > 0.0005 * add {
+                if diff > *epsilon_val {
                     panic!(
-                        r#"assertion failed: `(left ~= right)`
+                        r#"assertion failed: `abs(left-right)>epsilon`
  left: `{:?}`,
- right: `{:?}`"#,
-                        &*left_val, &*right_val
+ right: `{:?}`,
+ epsilon: `{:?}`"#,
+                        &*left_val, &*right_val, &*epsilon_val
                     )
                 }
             }
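
The macro now accepts an optional third `epsilon` argument: the two-argument form delegates with a default of `0.0005`, and the check is an absolute `abs(left - right) > epsilon` rather than the old relative `0.0005 * (|left| + |right|)` bound. A hedged sketch of both forms, assuming the macro is in scope inside a tantivy test module (it is `#[macro_export]`ed, so it resolves at the crate root):

```rust
// Inside a test module of the tantivy workspace; the macro is exported at the
// crate root via #[macro_export].
#[test]
fn nearly_equal_scores() {
    let score: f32 = 1.0003;
    // Two-argument form: uses the default epsilon of 0.0005.
    assert_nearly_equals!(1.0, score);
    // Three-argument form: explicit absolute tolerance (new in this change).
    assert_nearly_equals!(1.0, score, 0.001);
}
```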


@@ -138,8 +138,7 @@ impl FuzzyTermQuery {
             if json_path_type != Type::Str {
                 return Err(InvalidArgument(format!(
                     "The fuzzy term query requires a string path type for a json term. Found \
-                     {:?}",
-                    json_path_type
+                     {json_path_type:?}"
                 )));
             }
         }
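
This hunk, like several smaller ones below, is the same mechanical cleanup: positional `format!` arguments replaced by inlined format arguments (stable since Rust 1.58), which also lets trailing argument lists be dropped. A standalone illustration, not taken from tantivy itself:

```rust
fn main() {
    let json_path_type = "Str"; // stand-in value, purely for illustration
    // Positional argument (the old style in these hunks):
    let old = format!("Found {:?}", json_path_type);
    // Inlined format argument (the new style, Rust 1.58+):
    let new = format!("Found {json_path_type:?}");
    assert_eq!(old, new);
}
```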


@@ -2,7 +2,7 @@ use std::fmt;
 use std::ops::Bound;
 use crate::query::Occur;
-use crate::schema::{Field, Term, Type};
+use crate::schema::{Term, Type};
 use crate::Score;
 #[derive(Clone)]
@@ -20,8 +20,6 @@ pub enum LogicalLiteral {
         upper: Bound<Term>,
     },
     Set {
-        field: Field,
-        value_type: Type,
         elements: Vec<Term>,
     },
     All,


@@ -832,17 +832,11 @@ impl QueryParser {
                 let (field, json_path) = try_tuple!(self
                     .split_full_path(&full_path)
                     .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
-                let field_entry = self.schema.get_field_entry(field);
-                let value_type = field_entry.field_type().value_type();
                 let (elements, errors) = elements
                     .into_iter()
                     .map(|element| self.compute_boundary_term(field, json_path, &element))
                     .partition_result();
-                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
-                    elements,
-                    field,
-                    value_type,
-                }));
+                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
                 (Some(logical_ast), errors)
             }
             UserInputLeaf::Exists { .. } => (


@@ -185,7 +185,7 @@ mod test {
             Err(crate::TantivyError::InvalidArgument(msg)) => {
                 assert!(msg.contains("error: unclosed group"))
             }
-            res => panic!("unexpected result: {:?}", res),
+            res => panic!("unexpected result: {res:?}"),
         }
     }
 }


@@ -157,29 +157,24 @@ impl CompactDoc {
     }
     /// field_values accessor
-    pub fn field_values(
-        &self,
-    ) -> impl Iterator<Item = (Field, ReferenceValue<'_, CompactDocValue<'_>>)> {
+    pub fn field_values(&self) -> impl Iterator<Item = (Field, CompactDocValue<'_>)> {
         self.field_values.iter().map(|field_val| {
             let field = Field::from_field_id(field_val.field as u32);
-            let val = self.extract_value(field_val.value_addr).unwrap();
+            let val = self.get_compact_doc_value(field_val.value_addr);
             (field, val)
         })
     }
     /// Returns all of the `ReferenceValue`s associated the given field
-    pub fn get_all(
-        &self,
-        field: Field,
-    ) -> impl Iterator<Item = ReferenceValue<'_, CompactDocValue<'_>>> + '_ {
+    pub fn get_all(&self, field: Field) -> impl Iterator<Item = CompactDocValue<'_>> + '_ {
         self.field_values
             .iter()
             .filter(move |field_value| Field::from_field_id(field_value.field as u32) == field)
-            .map(|val| self.extract_value(val.value_addr).unwrap())
+            .map(|val| self.get_compact_doc_value(val.value_addr))
     }
     /// Returns the first `ReferenceValue` associated the given field
-    pub fn get_first(&self, field: Field) -> Option<ReferenceValue<'_, CompactDocValue<'_>>> {
+    pub fn get_first(&self, field: Field) -> Option<CompactDocValue<'_>> {
         self.get_all(field).next()
     }
@@ -299,58 +294,11 @@ impl CompactDoc {
         }
     }
-    fn extract_value(
-        &self,
-        ref_value: ValueAddr,
-    ) -> io::Result<ReferenceValue<'_, CompactDocValue<'_>>> {
-        match ref_value.type_id {
-            ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
-            ValueType::Str => {
-                let str_ref = self.extract_str(ref_value.val_addr);
-                Ok(ReferenceValueLeaf::Str(str_ref).into())
-            }
-            ValueType::Facet => {
-                let str_ref = self.extract_str(ref_value.val_addr);
-                Ok(ReferenceValueLeaf::Facet(str_ref).into())
-            }
-            ValueType::Bytes => {
-                let data = self.extract_bytes(ref_value.val_addr);
-                Ok(ReferenceValueLeaf::Bytes(data).into())
-            }
-            ValueType::U64 => self
-                .read_from::<u64>(ref_value.val_addr)
-                .map(ReferenceValueLeaf::U64)
-                .map(Into::into),
-            ValueType::I64 => self
-                .read_from::<i64>(ref_value.val_addr)
-                .map(ReferenceValueLeaf::I64)
-                .map(Into::into),
-            ValueType::F64 => self
-                .read_from::<f64>(ref_value.val_addr)
-                .map(ReferenceValueLeaf::F64)
-                .map(Into::into),
-            ValueType::Bool => Ok(ReferenceValueLeaf::Bool(ref_value.val_addr != 0).into()),
-            ValueType::Date => self
-                .read_from::<i64>(ref_value.val_addr)
-                .map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
-                .map(Into::into),
-            ValueType::IpAddr => self
-                .read_from::<u128>(ref_value.val_addr)
-                .map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
-                .map(Into::into),
-            ValueType::PreTokStr => self
-                .read_from::<PreTokenizedString>(ref_value.val_addr)
-                .map(Into::into)
-                .map(ReferenceValueLeaf::PreTokStr)
-                .map(Into::into),
-            ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
-                self,
-                ref_value.val_addr,
-            )?)),
-            ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
-                self,
-                ref_value.val_addr,
-            )?)),
-        }
+    /// Get CompactDocValue for address
+    fn get_compact_doc_value(&self, value_addr: ValueAddr) -> CompactDocValue<'_> {
+        CompactDocValue {
+            container: self,
+            value_addr,
         }
     }
@@ -410,7 +358,7 @@ impl PartialEq for CompactDoc {
         let convert_to_comparable_map = |doc: &CompactDoc| {
             let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
             for field_value in doc.field_values.iter() {
-                let value: OwnedValue = doc.extract_value(field_value.value_addr).unwrap().into();
+                let value: OwnedValue = doc.get_compact_doc_value(field_value.value_addr).into();
                 let value = serde_json::to_string(&value).unwrap();
                 field_value_set
                     .entry(Field::from_field_id(field_value.field as u32))
@@ -444,7 +392,19 @@ impl DocumentDeserialize for CompactDoc {
 #[derive(Debug, Clone, Copy)]
 pub struct CompactDocValue<'a> {
     container: &'a CompactDoc,
-    value: ValueAddr,
+    value_addr: ValueAddr,
+}
+impl PartialEq for CompactDocValue<'_> {
+    fn eq(&self, other: &Self) -> bool {
+        let value1: OwnedValue = (*self).into();
+        let value2: OwnedValue = (*other).into();
+        value1 == value2
+    }
+}
+impl<'a> From<CompactDocValue<'a>> for OwnedValue {
+    fn from(value: CompactDocValue) -> Self {
+        value.as_value().into()
+    }
 }
 impl<'a> Value<'a> for CompactDocValue<'a> {
     type ArrayIter = CompactDocArrayIter<'a>;
@@ -452,7 +412,67 @@ impl<'a> Value<'a> for CompactDocValue<'a> {
     type ObjectIter = CompactDocObjectIter<'a>;
     fn as_value(&self) -> ReferenceValue<'a, Self> {
-        self.container.extract_value(self.value).unwrap()
+        self.get_ref_value().unwrap()
+    }
+}
+impl<'a> CompactDocValue<'a> {
+    fn get_ref_value(&self) -> io::Result<ReferenceValue<'a, CompactDocValue<'a>>> {
+        let addr = self.value_addr.val_addr;
+        match self.value_addr.type_id {
+            ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
+            ValueType::Str => {
+                let str_ref = self.container.extract_str(addr);
+                Ok(ReferenceValueLeaf::Str(str_ref).into())
+            }
+            ValueType::Facet => {
+                let str_ref = self.container.extract_str(addr);
+                Ok(ReferenceValueLeaf::Facet(str_ref).into())
+            }
+            ValueType::Bytes => {
+                let data = self.container.extract_bytes(addr);
+                Ok(ReferenceValueLeaf::Bytes(data).into())
+            }
+            ValueType::U64 => self
+                .container
+                .read_from::<u64>(addr)
+                .map(ReferenceValueLeaf::U64)
+                .map(Into::into),
+            ValueType::I64 => self
+                .container
+                .read_from::<i64>(addr)
+                .map(ReferenceValueLeaf::I64)
+                .map(Into::into),
+            ValueType::F64 => self
+                .container
+                .read_from::<f64>(addr)
+                .map(ReferenceValueLeaf::F64)
+                .map(Into::into),
+            ValueType::Bool => Ok(ReferenceValueLeaf::Bool(addr != 0).into()),
+            ValueType::Date => self
+                .container
+                .read_from::<i64>(addr)
+                .map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
+                .map(Into::into),
+            ValueType::IpAddr => self
+                .container
+                .read_from::<u128>(addr)
+                .map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
+                .map(Into::into),
+            ValueType::PreTokStr => self
+                .container
+                .read_from::<PreTokenizedString>(addr)
+                .map(Into::into)
+                .map(ReferenceValueLeaf::PreTokStr)
+                .map(Into::into),
+            ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
+                self.container,
+                addr,
+            )?)),
+            ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
+                self.container,
+                addr,
+            )?)),
+        }
     }
 }
@@ -537,7 +557,7 @@ impl BinarySerializable for ValueType {
         } else {
             return Err(io::Error::new(
                 io::ErrorKind::InvalidData,
-                format!("Invalid value type id: {}", num),
+                format!("Invalid value type id: {num}"),
             ));
         };
         Ok(type_id)
@@ -601,9 +621,9 @@ impl<'a> Iterator for CompactDocObjectIter<'a> {
             let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
             let value = CompactDocValue {
                 container: self.container,
-                value,
+                value_addr: value,
             };
-            return Some((key, value));
+            Some((key, value))
         }
     }
@@ -635,9 +655,9 @@ impl<'a> Iterator for CompactDocArrayIter<'a> {
             let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
             let value = CompactDocValue {
                 container: self.container,
-                value,
+                value_addr: value,
             };
-            return Some(value);
+            Some(value)
         }
     }
@@ -668,7 +688,7 @@ impl<'a> Iterator for FieldValueIterRef<'a> {
                 Field::from_field_id(field_value.field as u32),
                 CompactDocValue::<'a> {
                     container: self.container,
-                    value: field_value.value_addr,
+                    value_addr: field_value.value_addr,
                 },
             )
         })
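
With `get_first`/`get_all` returning `CompactDocValue` directly, the new `From<CompactDocValue> for OwnedValue` and `PartialEq` impls let callers materialize or compare values without handling `ReferenceValue` by hand. A minimal sketch under those assumptions; the field name and text are illustrative:

```rust
use tantivy::schema::{OwnedValue, Schema, STORED, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let _schema = schema_builder.build();

    let mut doc = TantivyDocument::default();
    doc.add_text(title, "Frankenstein");

    // `From<CompactDocValue> for OwnedValue` materializes the borrowed value.
    let owned = OwnedValue::from(doc.get_first(title).unwrap());
    assert_eq!(owned, OwnedValue::Str("Frankenstein".to_string()));

    // `PartialEq` on `CompactDocValue` compares by owned content.
    assert_eq!(
        doc.get_first(title).unwrap(),
        doc.get_all(title).next().unwrap()
    );
}
```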


@@ -58,9 +58,8 @@ where W: Write
             return Err(io::Error::new(
                 io::ErrorKind::Other,
                 format!(
-                    "Unexpected number of entries written to serializer, expected {} entries, got \
-                     {} entries",
-                    num_field_values, actual_length,
+                    "Unexpected number of entries written to serializer, expected \
+                     {num_field_values} entries, got {actual_length} entries",
                 ),
             ));
         }


@@ -17,15 +17,6 @@ pub trait Value<'a>: Send + Sync + Debug {
     /// Returns the field value represented by an enum which borrows it's data.
     fn as_value(&self) -> ReferenceValue<'a, Self>;
-    #[inline]
-    /// Returns if the value is `null` or not.
-    fn is_null(&self) -> bool {
-        matches!(
-            self.as_value(),
-            ReferenceValue::Leaf(ReferenceValueLeaf::Null)
-        )
-    }
     #[inline]
     /// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
     fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
@@ -117,18 +108,6 @@ pub trait Value<'a>: Send + Sync + Debug {
             None
         }
     }
-    #[inline]
-    /// Returns true if the Value is an array.
-    fn is_array(&self) -> bool {
-        matches!(self.as_value(), ReferenceValue::Object(_))
-    }
-    #[inline]
-    /// Returns true if the Value is an object.
-    fn is_object(&self) -> bool {
-        matches!(self.as_value(), ReferenceValue::Object(_))
-    }
 }
 /// A enum representing a leaf value for tantivy to index.
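
`is_null`, `is_array` and `is_object` are removed from the `Value` trait (note that the removed `is_array` actually matched `ReferenceValue::Object(_)`). Callers can match on `as_value()` instead. A hedged sketch of the equivalent checks, with import paths as they appear in recent tantivy versions:

```rust
use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};

// Equivalent of the removed helpers, expressed against `as_value()`.
fn describe<'a, V: Value<'a>>(value: &V) -> &'static str {
    match value.as_value() {
        ReferenceValue::Leaf(ReferenceValueLeaf::Null) => "null",
        ReferenceValue::Leaf(_) => "other leaf",
        ReferenceValue::Array(_) => "array",
        ReferenceValue::Object(_) => "object",
    }
}

fn main() {
    use tantivy::schema::{Schema, STORED, TEXT};
    use tantivy::TantivyDocument;

    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let mut doc = TantivyDocument::default();
    doc.add_text(title, "hello");
    assert_eq!(describe(&doc.get_first(title).unwrap()), "other leaf");
}
```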


@@ -659,9 +659,9 @@ mod tests {
         let schema = schema_builder.build();
         let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
         let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
-        let date = doc.get_first(date_field).unwrap();
+        let date = OwnedValue::from(doc.get_first(date_field).unwrap());
         // Time zone is converted to UTC
-        assert_eq!("Leaf(Date(2019-10-12T05:20:50.52Z))", format!("{date:?}"));
+        assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
     }
     #[test]


@@ -60,7 +60,7 @@ pub mod tests {
     use crate::directory::{Directory, RamDirectory, WritePtr};
     use crate::fastfield::AliveBitSet;
     use crate::schema::{
-        self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT,
+        self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, Value, STORED, TEXT,
     };
     use crate::{Index, IndexWriter, Term};
@@ -122,6 +122,7 @@ pub mod tests {
                 .get::<TantivyDocument>(i)?
                 .get_first(field_title)
                 .unwrap()
+                .as_value()
                 .as_str()
                 .unwrap(),
             format!("Doc {i}")
@@ -133,6 +134,7 @@ pub mod tests {
         let title_content = doc
             .get_first(field_title)
             .unwrap()
+            .as_value()
             .as_str()
             .unwrap()
             .to_string();


@@ -241,23 +241,12 @@ impl StoreReader {
         &'b self,
         alive_bitset: Option<&'a AliveBitSet>,
     ) -> impl Iterator<Item = crate::Result<D>> + 'b {
-        self.enumerate(alive_bitset)
-            .map(|res| res.map(|(_, doc)| doc))
-    }
-    /// A variant of [`iter`][Self::iter] which also yields document ID.
-    pub fn enumerate<'a: 'b, 'b, D: DocumentDeserialize>(
-        &'b self,
-        alive_bitset: Option<&'a AliveBitSet>,
-    ) -> impl Iterator<Item = crate::Result<(DocId, D)>> + 'b {
         self.iter_raw(alive_bitset).map(|doc_bytes_res| {
-            let (doc_id, mut doc_bytes) = doc_bytes_res?;
+            let mut doc_bytes = doc_bytes_res?;
             let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
                 .map_err(crate::TantivyError::from)?;
-            let doc = D::deserialize(deserializer).map_err(crate::TantivyError::from)?;
-            Ok((doc_id, doc))
+            D::deserialize(deserializer).map_err(crate::TantivyError::from)
         })
     }
@@ -267,7 +256,7 @@ impl StoreReader {
     pub(crate) fn iter_raw<'a: 'b, 'b>(
         &'b self,
         alive_bitset: Option<&'a AliveBitSet>,
-    ) -> impl Iterator<Item = crate::Result<(DocId, OwnedBytes)>> + 'b {
+    ) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
         let last_doc_id = self
             .block_checkpoints()
             .last()
@@ -295,14 +284,14 @@ impl StoreReader {
                 let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
                 let res = if alive {
-                    Some((doc_id, curr_block.clone(), doc_pos))
+                    Some((curr_block.clone(), doc_pos))
                 } else {
                     None
                 };
                 doc_pos += 1;
                 res
             })
-            .map(move |(doc_id, block, doc_pos)| {
+            .map(move |(block, doc_pos)| {
                 let block = block
                     .ok_or_else(|| {
                         DataCorruption::comment_only(
@@ -315,7 +304,7 @@ impl StoreReader {
                     })?;
                 let range = block_read_index(&block, doc_pos)?;
-                Ok((doc_id, block.slice(range)))
+                Ok(block.slice(range))
             })
     }
@@ -414,7 +403,7 @@ mod tests {
     use super::*;
     use crate::directory::RamDirectory;
-    use crate::schema::{Field, TantivyDocument};
+    use crate::schema::{Field, TantivyDocument, Value};
     use crate::store::tests::write_lorem_ipsum_store;
     use crate::store::Compressor;
     use crate::Directory;
@@ -422,7 +411,7 @@ mod tests {
     const BLOCK_SIZE: usize = 16_384;
     fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
-        doc.get_first(*field).and_then(|f| f.as_str())
+        doc.get_first(*field).and_then(|f| f.as_value().as_str())
     }
     #[test]
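
`StoreReader::enumerate` is folded away and `iter`/`iter_raw` no longer yield the `DocId`. When no deletes are involved (`alive_bitset` is `None`), the iteration position can stand in for the doc id. A hedged sketch, assuming a `SegmentReader` obtained from a searcher and the usual `get_store_reader` accessor; the cache size and the printed summary are illustrative only:

```rust
use tantivy::{DocId, Searcher, TantivyDocument};

// Iterate the doc store of every segment; positions double as doc ids here
// only because no alive_bitset (i.e. no deletes) is involved.
fn dump_store(searcher: &Searcher) -> tantivy::Result<()> {
    for segment_reader in searcher.segment_readers() {
        let store_reader = segment_reader.get_store_reader(/* cache_num_blocks */ 1)?;
        for (pos, doc) in store_reader.iter::<TantivyDocument>(None).enumerate() {
            let doc: TantivyDocument = doc?;
            let doc_id = pos as DocId;
            println!("doc {doc_id}: {} stored field values", doc.field_values().count());
        }
    }
    Ok(())
}
```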


@@ -93,7 +93,7 @@ fn open_fst_index(fst_file: FileSlice) -> io::Result<tantivy_fst::Map<OwnedBytes
     let fst = Fst::new(bytes).map_err(|err| {
         io::Error::new(
             io::ErrorKind::InvalidData,
-            format!("Fst data is corrupted: {:?}", err),
+            format!("Fst data is corrupted: {err:?}"),
         )
     })?;
     Ok(tantivy_fst::Map::from(fst))


@@ -95,7 +95,7 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
 #[test]
 fn test_term_dictionary_stream() -> crate::Result<()> {
     let ids: Vec<_> = (0u32..10_000u32)
-        .map(|i| (format!("doc{:0>6}", i), i))
+        .map(|i| (format!("doc{i:0>6}"), i))
         .collect();
     let buffer: Vec<u8> = {
         let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
@@ -156,7 +156,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
 #[test]
 fn test_stream_range() -> crate::Result<()> {
     let ids: Vec<_> = (0u32..10_000u32)
-        .map(|i| (format!("doc{:0>6}", i), i))
+        .map(|i| (format!("doc{i:0>6}"), i))
         .collect();
     let buffer: Vec<u8> = {
         let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();


@@ -96,7 +96,7 @@ mod tests {
         {
             let mut add_token = |token: &Token| {
                 let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
-                tokens.push(format!("{}", facet));
+                tokens.push(format!("{facet}"));
             };
             FacetTokenizer::default()
                 .token_stream(facet.encoded_str())
@@ -116,7 +116,7 @@ mod tests {
         {
             let mut add_token = |token: &Token| {
                 let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
-                tokens.push(format!("{}", facet));
+                tokens.push(format!("{facet}"));
             };
             FacetTokenizer::default()
                 .token_stream(facet.encoded_str()) // ok test