Compare commits

..

19 Commits

Author SHA1 Message Date
Pascal Seitz
2ce485b8cc skip estimate phase for merge multivalue index
precompute stats for the merged multivalue index and disable Line encoding for
the multivalue index. That combination allows skipping the first estimation
pass and gives up to 2x merge performance on multivalue indices.

This change may decrease compression, since the Line codec compresses very well
for documents that have a fixed number of values per doc. The Line codec should
eventually be replaced.

```
merge_multi_and_multi          Avg: 22.7880ms (-47.15%)    Median: 22.5469ms (-47.38%)    [22.3691ms .. 25.8392ms]
merge_dense_and_dense          Avg: 14.4398ms (+2.18%)     Median: 14.2465ms (+0.74%)     [14.1620ms .. 16.1270ms]
merge_sparse_and_sparse        Avg: 10.6559ms (+1.10%)     Median: 10.6318ms (+0.91%)     [10.5527ms .. 11.2848ms]
merge_sparse_and_dense         Avg: 12.4886ms (+1.52%)     Median: 12.4044ms (+0.84%)     [12.3261ms .. 13.9439ms]
merge_multi_and_dense          Avg: 25.6686ms (-45.56%)    Median: 25.4851ms (-45.84%)    [25.1618ms .. 27.6226ms]
merge_multi_and_sparse         Avg: 24.3278ms (-47.00%)    Median: 24.1917ms (-47.34%)    [23.7159ms .. 27.0513ms]
```
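For illustration, a minimal sketch of how the stats for the stacked multivalue start-offset column can be computed without scanning the values. The field values mirror `merge_column_index_stacked` in the diffs below; the helper and the locally mirrored struct are hypothetical and only keep the sketch self-contained.

```rust
use std::num::NonZeroU64;

// Local mirror of the crate-internal `ColumnStats` (see the column_values diff
// below); defined here only to keep the sketch self-contained.
struct ColumnStats {
    gcd: NonZeroU64,
    min_value: u64,
    max_value: u64,
    num_rows: u32,
}

// Hypothetical helper: the start offsets of a multivalue index grow
// monotonically from 0 to the total number of values, so their stats are known
// up front and the estimation pass over the column can be skipped.
fn precomputed_multivalue_stats(num_values: u32, num_docs: u32) -> ColumnStats {
    ColumnStats {
        gcd: NonZeroU64::new(1).unwrap(),
        // The values in the multivalue index are the positions of the values.
        min_value: 0,
        max_value: num_values as u64,
        // One start offset per doc, plus one trailing end offset.
        num_rows: num_docs + 1,
    }
}
```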
2024-06-11 20:22:00 +08:00
PSeitz
c3b92a5412 fix compiler warning, cleanup (#2393)
fix compiler warning for missing feature flag
remove unused variables
cleanup unused methods
2024-06-11 16:03:50 +08:00
PSeitz
2f55511064 extend indexwriter proptests (#2342)
* index random values in proptest

* add proptest with multiple docs
2024-06-11 16:02:57 +08:00
trinity-1686a
08b9fc0b31 fix de-escaping too much in query parser (#2427)
* fix de-escaping too much in query parser
2024-06-10 11:19:01 +02:00
PSeitz
714f363d43 add bench & test for columnar merging (#2428)
* add merge columnar proptest

* add columnar merge benchmark
2024-06-10 16:26:16 +08:00
PSeitz
93ff7365b0 reduce top hits aggregation memory consumption (#2426)
move the request structure out of the top hits aggregation segment collector and
use the request passed in by reference instead

```
full
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 425.9680ms (-21.38%)    Median: 415.1097ms (-23.56%)    [395.5303ms .. 484.6325ms]
dense
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 440.0817ms (-19.68%)    Median: 432.2286ms (-21.10%)    [403.5632ms .. 497.7541ms]
sparse
terms_many_with_top_hits    Memory: 13.1 MB (-49.31%)    Avg: 33.3568ms (-32.19%)    Median: 33.0834ms (-31.86%)    [32.5126ms .. 35.7397ms]
multivalue
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 414.2340ms (-25.44%)    Median: 413.4144ms (-25.64%)    [403.9919ms .. 430.3170ms]
```
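The underlying pattern, visible in the top_hits diff further down: the per-segment collector no longer stores a clone of the request and instead borrows it at collect time. A rough sketch with simplified stand-in types, not the crate's real ones:

```rust
// Simplified stand-in types, only to show the shape of the refactor: the
// request lives once per aggregation, and segment collectors borrow it instead
// of each holding a clone.
struct TopHitsRequest {
    size: usize,
}

struct TopHitsSegmentCollector {
    accessor_idx: usize, // identifies which request/accessors this collector uses
    top_docs: Vec<u32>,  // previously the collector also stored a cloned request
}

impl TopHitsSegmentCollector {
    // The request is passed by reference at collect time, so it is no longer
    // cloned into every segment collector; that is where the memory saving in
    // the numbers above comes from.
    fn collect_with(&mut self, doc_id: u32, req: &TopHitsRequest) {
        if self.top_docs.len() < req.size {
            self.top_docs.push(doc_id);
        }
    }
}
```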
2024-06-06 22:32:58 +08:00
Adam Reichold
8151925068 Panicking in spawned Rayon tasks will abort the process by default. (#2409) 2024-06-04 17:04:30 +09:00
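For context, one generic way to get "a panic in a spawned task aborts the process" semantics; this is an illustrative pattern only, not necessarily what #2409 does:

```rust
use std::panic::{catch_unwind, AssertUnwindSafe};

// Illustrative only: run the task and abort the whole process if it panics,
// instead of leaving the thread pool in an undefined state.
fn spawn_abort_on_panic<F: FnOnce() + Send + 'static>(pool: &rayon::ThreadPool, task: F) {
    pool.spawn(move || {
        if catch_unwind(AssertUnwindSafe(task)).is_err() {
            std::process::abort();
        }
    });
}
```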
dependabot[bot]
b960e40bc8 Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423)
Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version.
- [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases)
- [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0)

---
updated-dependencies:
- dependency-name: sketches-ddsketch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-06-04 15:50:23 +08:00
giovannicuccu
1095c9b073 Issue 1787 extended stats (#2247)
* first version of extended stats along with its tests

* using IntermediateExtendStats instead of IntermediateStats with all tests passing

* Created struct for request and response

* first test with extended_stats

* kahan summation and tests with approximate equality

* version ready for merge

* removed approx dependency

* refactor for using ExtendedStats only when needed

* interim version

* refined version with code formatted

* refactored a struct

* cosmetic refactor

* fix after merge

* fix format

* added extended_stat bench

* merge and new benchmark for extended stats

* split stat segment collectors

* wrapped intermediate extended stat with a box to limit memory usage

* Revert "wrapped intermediate extended stat with a box to limit memory usage"

This reverts commit 5b4aa9f393.

* some code reformat, commented kahan summation

* refactor after review

* refactor after code review

* fix after incorrectly restoring kahan summation

* modifications for code review + bug fix in merge_fruit

* refactor assert_nearly_equals macro

* update after code review

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
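For reference, an extended_stats request has the same shape as the other metric aggregations. This minimal sketch mirrors the `extendedstats_f64` benchmark added in the diffs below ("score_f64" is that benchmark's field name; the wrapper function is only for illustration):

```rust
use serde_json::json;

// An `extended_stats` aggregation request, mirroring the `extendedstats_f64`
// benchmark added in the diffs below.
fn extended_stats_request() -> serde_json::Value {
    json!({
        "extendedstats_f64": { "extended_stats": { "field": "score_f64" } }
    })
}
```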
2024-06-04 14:25:17 +08:00
PSeitz
c0686515a9 update one_shot (#2420) 2024-05-31 11:07:35 +08:00
trinity-1686a
455156f51c improve query parser (#2416)
* support escape sequences in more places

and fix a bug with single-quoted strings

* add query parser test for range query on default field
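The behaviour is easiest to see from the parser tests added in this range; the calls below are copied from those tests (`test_parse_query_to_ast_helper` is the existing test helper, the right-hand side is the expected AST rendering, and the wrapping test name is only illustrative):

```rust
#[test]
fn escape_and_default_field_range_examples() {
    // Range queries on the default field:
    test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
    test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
    // An escaped ':' is de-escaped inside a bare term:
    test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
    // '*' does not require escaping, so the escape sequence is kept verbatim:
    test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
```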
2024-05-30 17:29:27 +02:00
Meng Zhang
4143d31865 chore: fix build as the rev is gone (#2417) 2024-05-29 09:49:16 +08:00
Hamir Mahal
0c634adbe1 style: simplify strings with string interpolation (#2412)
* style: simplify strings with string interpolation

* fix: formatting
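The change itself is mechanical; a minimal before/after pair in the spirit of the hunks below:

```rust
fn main() {
    let num = 42;
    // before: positional format argument
    let old = format!("author{}", num);
    // after: the identifier is captured directly in the format string
    let new = format!("author{num}");
    assert_eq!(old, new);
}
```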
2024-05-27 09:16:47 +02:00
PSeitz
2e3641c2ae return CompactDocValue instead of trait (#2410)
The CompactDocValue is easier to handle than the trait in some cases like comparison
and conversion
2024-05-27 07:33:50 +02:00
Paul Masurel
b806122c81 Fixing flaky test (#2407) 2024-05-22 10:10:55 +09:00
PSeitz
e1679f3fb9 compact doc (#2402)
* compact doc

* add any value type

* pass references when building CompactDoc

* remove OwnedValue from API

* clippy

* clippy

* fail on large documents

* fmt

* cleanup

* cleanup

* implement Value for different types

fix serde_json date Value implementation

* fmt

* cleanup

* fmt

* cleanup

* store positions instead of pos+len

* remove nodes array

* remove mediumvec

* cleanup

* infallible serialize into vec

* remove positions indirection

* remove 24MB limitation in document

use u32 for Addr
Remove the 3-byte addressing limitation and use VInt instead (an illustrative
vint sketch follows at the end of this list)

* cleanup

* extend test

* cleanup, add comments

* rename, remove pub
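For context, a generic sketch of variable-length integer (vint) encoding for a u32 address, in the LEB128 style with 7 payload bits per byte. This illustrates the idea only and is not necessarily the crate's exact wire format:

```rust
/// Encode a u32 as a variable-length integer: 7 payload bits per byte, with the
/// high bit set on every byte except the last.
fn vint_encode_u32(mut val: u32, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80);
    }
}

/// Decode a vint-encoded u32, returning the value and the number of bytes read.
fn vint_decode_u32(data: &[u8]) -> (u32, usize) {
    let mut result = 0u32;
    for (i, &b) in data.iter().enumerate() {
        result |= ((b & 0x7F) as u32) << (7 * i);
        if b & 0x80 == 0 {
            return (result, i + 1);
        }
    }
    panic!("truncated vint");
}
```

Small addresses stay compact while the full u32 range becomes addressable, which removes the previous fixed-width limit.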
2024-05-21 10:16:08 +02:00
dependabot[bot]
5a80420b10 --- (#2406)
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-21 04:36:32 +02:00
dependabot[bot]
aa26ff5029 Update binggan requirement from 0.6.2 to 0.7.0 (#2401)
---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-17 02:53:25 +02:00
dependabot[bot]
e197b59258 Update itertools requirement from 0.12.0 to 0.13.0 (#2400)
Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version.
- [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rust-itertools/itertools/compare/v0.12.0...v0.13.0)

---
updated-dependencies:
- dependency-name: itertools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-17 02:53:02 +02:00
71 changed files with 3141 additions and 686 deletions

View File

@@ -15,7 +15,6 @@ rust-version = "1.63"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
@@ -53,7 +52,7 @@ smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
fastdivide = "0.4.0"
itertools = "0.12.0"
itertools = "0.13.0"
measure_time = "0.8.2"
arc-swap = "1.5.0"
@@ -64,7 +63,7 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
@@ -72,7 +71,7 @@ fnv = "1.0.7"
winapi = "0.3.9"
[dev-dependencies]
binggan = "0.6.2"
binggan = "0.8.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"

View File

@@ -47,6 +47,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, extendedstats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
@@ -105,7 +106,12 @@ fn stats_f64(index: &Index) {
});
exec_term_with_agg(index, agg_req)
}
fn extendedstats_f64(index: &Index) {
let agg_req = json!({
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
@@ -349,7 +355,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.map(|num| format!("author{num}"))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);

View File

@@ -18,7 +18,7 @@ fn benchmark(
benchmark_dynamic_json(b, input, schema, commit, parse_json)
} else {
_benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
TantivyDocument::parse_json(&schema, doc_json).unwrap()
TantivyDocument::parse_json(schema, doc_json).unwrap()
})
}
}
@@ -90,8 +90,7 @@ fn benchmark_dynamic_json(
) {
let json_field = schema.get_field("json").unwrap();
_benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap();
tantivy::doc!(json_field=>json_val)
})
}
@@ -138,15 +137,16 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
for (prefix, schema, is_dynamic) in benches {
for commit in [false, true] {
let suffix = if commit { "with-commit" } else { "no-commit" };
for parse_json in [false] {
{
let parse_json = false;
// for parse_json in [false, true] {
let suffix = if parse_json {
format!("{}-with-json-parsing", suffix)
format!("{suffix}-with-json-parsing")
} else {
format!("{}", suffix)
suffix.to_string()
};
let bench_name = format!("{}{}", prefix, suffix);
let bench_name = format!("{prefix}{suffix}");
group.bench_function(bench_name, |b| {
benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
});

View File

@@ -9,7 +9,7 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.12.0"
itertools = "0.13.0"
fastdivide = "0.4.0"
stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"}
@@ -23,6 +23,12 @@ downcast-rs = "1.2.0"
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.8.1"
[[bench]]
name = "bench_merge"
harness = false
[features]
unstable = []

View File

@@ -0,0 +1,97 @@
#![feature(test)]
extern crate test;
use core::fmt;
use std::fmt::{Display, Formatter};
use binggan::{black_box, BenchRunner};
use tantivy_columnar::*;
enum Card {
Multi,
Sparse,
Dense,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::Multi => write!(f, "multi"),
Card::Sparse => write!(f, "sparse"),
Card::Dense => write!(f, "dense"),
}
}
}
const NUM_DOCS: u32 = 1_000_000;
fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
use tantivy_columnar::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
match card {
Card::Multi => {
columnar_writer.record_numerical(0, "price", 10u64);
columnar_writer.record_numerical(0, "price", 10u64);
}
_ => {}
}
for i in 0..num_docs {
match card {
Card::Multi | Card::Sparse => {
if i % 8 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
Card::Dense => {
if i % 6 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
}
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}
fn main() {
let mut inputs = Vec::new();
let mut add_combo = |card1: Card, card2: Card| {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar(card1, NUM_DOCS),
generate_columnar(card2, NUM_DOCS),
],
));
};
add_combo(Card::Multi, Card::Multi);
add_combo(Card::Dense, Card::Dense);
add_combo(Card::Sparse, Card::Sparse);
add_combo(Card::Sparse, Card::Dense);
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(
input_name,
columnar_readers,
move |columnar_readers: &Vec<ColumnarReader>| {
let mut out = vec![];
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
black_box(out);
},
);
}
group.run();
}

View File

@@ -73,14 +73,18 @@ fn detect_cardinality(
pub fn merge_column_index<'a>(
columns: &'a [ColumnIndex],
merge_row_order: &'a MergeRowOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => {
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
}
MergeRowOrder::Stack(stack_merge_order) => merge_column_index_stacked(
columns,
cardinality_after_merge,
stack_merge_order,
num_values,
),
MergeRowOrder::Shuffled(complex_merge_order) => {
merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
}
@@ -167,8 +171,12 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 3);
let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
@@ -200,8 +208,12 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 6);
let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();

View File

@@ -22,7 +22,10 @@ pub fn merge_column_index_shuffled<'a>(
Cardinality::Multivalued => {
let multivalue_start_index =
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(multivalue_start_index)
SerializableColumnIndex::Multivalued {
indices: multivalue_start_index,
stats: None,
}
}
}
}

View File

@@ -1,6 +1,8 @@
use std::iter;
use std::num::NonZeroU64;
use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_values::ColumnStats;
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -12,6 +14,7 @@ pub fn merge_column_index_stacked<'a>(
columns: &'a [ColumnIndex],
cardinality_after_merge: Cardinality,
stack_merge_order: &'a StackMergeOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> {
match cardinality_after_merge {
Cardinality::Full => SerializableColumnIndex::Full,
@@ -27,7 +30,17 @@ pub fn merge_column_index_stacked<'a>(
columns,
stack_merge_order,
};
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(stacked_multivalued_index),
stats: Some(ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
// The values in the multivalue index are the positions of the values
min_value: 0,
max_value: num_values as u64,
// This is num docs, but it starts at 0 so we need +1
num_rows: stack_merge_order.num_rows() + 1,
}),
}
}
}
}

View File

@@ -6,20 +6,29 @@ use std::sync::Arc;
use common::OwnedBytes;
use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
load_u64_based_column_values, serialize_u64_based_column_values,
serialize_u64_with_codec_and_stats, CodecType, ColumnStats, ColumnValues,
};
use crate::iterable::Iterable;
use crate::{DocId, RowId};
pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>,
stats: Option<ColumnStats>,
output: &mut impl Write,
) -> io::Result<()> {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
if let Some(stats) = stats {
// TODO: Add something with higher compression that doesn't require a full scan upfront
let estimator = CodecType::Bitpacked.estimator();
assert!(!estimator.requires_full_scan());
serialize_u64_with_codec_and_stats(multivalued_index, estimator, stats, output)?;
} else {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
}
Ok(())
}
@@ -52,7 +61,7 @@ impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
let mut buffer = Vec::new();
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
serialize_multivalued_index(&start_offsets, None, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes).unwrap()
}

View File

@@ -196,6 +196,7 @@ impl Set<RowId> for OptionalIndex {
} = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta);
let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),

View File

@@ -6,6 +6,7 @@ use common::{CountingWriter, OwnedBytes};
use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex;
use crate::column_values::ColumnStats;
use crate::iterable::Iterable;
use crate::{Cardinality, RowId};
@@ -15,9 +16,12 @@ pub enum SerializableColumnIndex<'a> {
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
num_rows: RowId,
},
// TODO remove the Arc<dyn> apart from serialization this is not
// dynamic at all.
Multivalued(Box<dyn Iterable<RowId> + 'a>),
Multivalued {
/// Iterator emitting the indices for the index
indices: Box<dyn Iterable<RowId> + 'a>,
/// In the merge case we can precompute the column stats
stats: Option<ColumnStats>,
},
}
impl<'a> SerializableColumnIndex<'a> {
@@ -25,7 +29,7 @@ impl<'a> SerializableColumnIndex<'a> {
match self {
SerializableColumnIndex::Full => Cardinality::Full,
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
SerializableColumnIndex::Multivalued { .. } => Cardinality::Multivalued,
}
}
}
@@ -44,9 +48,10 @@ pub fn serialize_column_index(
non_null_row_ids,
num_rows,
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
SerializableColumnIndex::Multivalued(multivalued_index) => {
serialize_multivalued_index(&*multivalued_index, &mut output)?
}
SerializableColumnIndex::Multivalued {
indices: multivalued_index,
stats,
} => serialize_multivalued_index(&*multivalued_index, stats, &mut output)?,
}
let column_index_num_bytes = output.written_bytes() as u32;
Ok(column_index_num_bytes)

View File

@@ -32,7 +32,8 @@ pub use u128_based::{
};
pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
serialize_u64_based_column_values, serialize_u64_with_codec_and_stats, CodecType,
ALL_U64_CODEC_TYPES,
};
pub use vec_column::VecColumn;

View File

@@ -128,6 +128,9 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
bit_packer.close(wrt)?;
Ok(())
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Bitpacked
}
}
pub struct BitpackedCodec;

View File

@@ -163,6 +163,10 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
Ok(())
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::BlockwiseLinear
}
}
pub struct BlockwiseLinearCodec;

View File

@@ -153,6 +153,12 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
self.collect_before_line_estimation(value);
}
}
fn requires_full_scan(&self) -> bool {
true
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Linear
}
}
impl LinearCodecEstimator {

View File

@@ -37,7 +37,11 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
/// This method will be called for each element of the column during
/// `estimation`.
fn collect(&mut self, value: u64);
/// Finalizes the first pass phase.
/// Returns true if the estimator needs a full pass over the column before serialization
fn requires_full_scan(&self) -> bool {
false
}
fn codec_type(&self) -> CodecType;
fn finalize(&mut self) {}
/// Returns an accurate estimation of the number of bytes that will
/// be used to represent this column.
@@ -150,34 +154,45 @@ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
wrt: &mut dyn Write,
) -> io::Result<()> {
let mut stats_collector = StatsCollector::default();
let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
Vec::with_capacity(codec_types.len());
let mut estimators: Vec<Box<dyn ColumnCodecEstimator>> = Vec::with_capacity(codec_types.len());
for &codec_type in codec_types {
estimators.push((codec_type, codec_type.estimator()));
estimators.push(codec_type.estimator());
}
for val in vals.boxed_iter() {
let val_u64 = val.to_u64();
stats_collector.collect(val_u64);
for (_, estimator) in &mut estimators {
for estimator in &mut estimators {
estimator.collect(val_u64);
}
}
for (_, estimator) in &mut estimators {
for estimator in &mut estimators {
estimator.finalize();
}
let stats = stats_collector.stats();
let (_, best_codec, best_codec_estimator) = estimators
let (_, best_codec) = estimators
.into_iter()
.flat_map(|(codec_type, estimator)| {
.flat_map(|estimator| {
let num_bytes = estimator.estimate(&stats)?;
Some((num_bytes, codec_type, estimator))
Some((num_bytes, estimator))
})
.min_by_key(|(num_bytes, _, _)| *num_bytes)
.min_by_key(|(num_bytes, _)| *num_bytes)
.ok_or_else(|| {
io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
})?;
best_codec.to_code().serialize(wrt)?;
best_codec_estimator.serialize(
serialize_u64_with_codec_and_stats(vals, best_codec, stats, wrt)?;
Ok(())
}
/// Serializes a given column of u64-mapped values.
/// The codec estimator needs to be collected fully for the Line codec before calling this.
pub fn serialize_u64_with_codec_and_stats<T: MonotonicallyMappableToU64>(
vals: &dyn Iterable<T>,
codec: Box<dyn ColumnCodecEstimator>,
stats: ColumnStats,
wrt: &mut dyn Write,
) -> io::Result<()> {
codec.codec_type().to_code().serialize(wrt)?;
codec.serialize(
&stats,
&mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
wrt,

View File

@@ -3,7 +3,7 @@ mod merge_mapping;
mod term_merger;
use std::collections::{BTreeMap, HashSet};
use std::io;
use std::io::{self};
use std::net::Ipv6Addr;
use std::sync::Arc;
@@ -156,8 +156,15 @@ fn merge_column(
column_values.push(None);
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = column_values
.iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..],
column_values: &column_values[..],
@@ -183,8 +190,15 @@ fn merge_column(
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = column_values
.iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..],
column_values: &column_values,
@@ -214,8 +228,19 @@ fn merge_column(
}
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = bytes_columns
.iter()
.map(|vals| {
vals.as_ref()
.map(|idx| idx.term_ord_column.values.num_vals())
.unwrap_or(0)
})
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
}
}

View File

@@ -644,7 +644,10 @@ fn send_to_serialize_column_mappable_to_u128<
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: Default::default(), // TODO: implement stats for u128
}
}
};
crate::column::serialize_column_mappable_to_u128(
@@ -699,7 +702,10 @@ fn send_to_serialize_column_mappable_to_u64(
if sort_values_within_row {
sort_values_within_row_in_place(multivalued_index, values);
}
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: None,
}
}
};
crate::column::serialize_column_mappable_to_u64(

View File

@@ -738,35 +738,22 @@ proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
test_columnar_docs(columnar_docs);
}
}
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
fn test_columnar_docs(columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
stack_merge_order,
&mut output,
)
.unwrap();
@@ -777,6 +764,24 @@ fn test_columnar_merging_empty_columnar() {
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_simple() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
vec![],
vec![vec![
("c1", ColumnValue::Numerical(0u64.into())),
("c1", ColumnValue::Numerical(0u64.into())),
]],
];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_number_columns() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
@@ -793,25 +798,7 @@ fn test_columnar_merging_number_columns() {
vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
],
];
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
test_columnar_docs(columnar_docs);
}
// TODO add non trivial remap and merge

View File

@@ -22,3 +22,6 @@ serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
proptest = "1.0.0"
rand = "0.8.4"
[features]
unstable = [] # useful for benches.

View File

@@ -151,7 +151,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
(result, vlen)
}
/// Write a `u32` as a vint payload.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
pub fn write_u32_vint<W: io::Write + ?Sized>(val: u32, writer: &mut W) -> io::Result<()> {
let mut buf = [0u8; 8];
let data = serialize_vint_u32(val, &mut buf);
writer.write_all(data)

View File

@@ -4,7 +4,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
@@ -61,10 +61,12 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_))
));
assert!(retrieved_doc
.get_first(occurred_at)
.unwrap()
.as_value()
.as_datetime()
.is_some(),);
assert_eq!(
retrieved_doc.to_json(&schema),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();
{
let facets = vec![
let facets = [
Facet::from("/ingredient/egg"),
Facet::from("/ingredient/oil"),
Facet::from("/ingredient/garlic"),
@@ -94,9 +94,8 @@ fn main() -> tantivy::Result<()> {
.doc::<TantivyDocument>(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str())
.and_then(|v| v.as_str().map(|el| el.to_string()))
.unwrap()
.to_owned()
})
.collect();
assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);

View File

@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))?;
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
println!("add doc {i} from thread 1 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(20));
}
Result::<(), TantivyError>::Ok(())
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
body => "Some great book description..."
))?
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
println!("add doc {i} from thread 2 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(10));
}
Result::<(), TantivyError>::Ok(())

View File

@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::iter::once;
use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
// special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
/// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
)(inp)
}
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
fn interpret_escape(source: &str) -> String {
let mut res = String::with_capacity(source.len());
let mut in_escape = false;
let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
for c in source.chars() {
if in_escape {
if !require_escape(c) {
// we re-add the escape sequence
res.push('\\');
}
res.push(c);
in_escape = false;
} else if c == '\\' {
in_escape = true;
} else {
res.push(c);
}
}
res
}
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> {
fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res(
recognize(tuple((
satisfy(|c| {
!c.is_whitespace()
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
}),
many0(satisfy(|c: char| {
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
alt((
preceded(char('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
)),
many0(alt((
preceded(char('\\'), anychar),
satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
))),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
_ => Ok(s),
s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
s => Ok(Cow::Borrowed(s)),
},
)(inp)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| {
opt_i_err(
preceded(
multispace0,
recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c)
}))),
fn word_infallible(
delimiter: &str,
emit_error: bool,
) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
// `emit_error` is set in contexts where an unescaped `:` should emit an error
move |inp| {
map(
opt_i_err(
preceded(
multispace0,
recognize(many1(alt((
preceded(char::<&str, _>('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
)))),
),
"expected word",
),
"expected word",
|(opt_s, mut errors)| match opt_s {
Some(s) => {
if emit_error
&& (s
.as_bytes()
.windows(2)
.any(|window| window[0] != b'\\' && window[1] == b':')
|| s.starts_with(':'))
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if s.contains('\\') {
(Some(Cow::Owned(interpret_escape(s))), errors)
} else {
(Some(Cow::Borrowed(s)), errors)
}
}
None => (None, errors),
},
)(inp)
}
}
@@ -159,7 +216,7 @@ fn simple_term_infallible(
(value((), char('\'')), simple_quotes),
),
// numbers are parsed with words in this case, as we allow string starting with a -
map(word_infallible(delimiter), |(text, errors)| {
map(word_infallible(delimiter, true), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|((field_name, _, leaf), mut errors)| {
(
leaf.map(|leaf| {
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
&& field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
tuple_infallible((
opt_i(anychar),
space0_infallible,
word_infallible("]}"),
word_infallible("]}", false),
space1_infallible,
opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
"missing keyword TO",
),
word_infallible("]}"),
word_infallible("]}", false),
opt_i_err(one_of("]}"), "missing range delimiter"),
)),
|(
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs,
)| {
let lower_bound = match (lower_bound_kind, lower) {
let lower_bound = match (lower_bound_kind, lower.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
// if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
_ => unreachable!("precondition failed, range did not start with [ or {{"),
};
let upper_bound = match (upper_bound_kind, upper) {
let upper_bound = match (upper_bound_kind, upper.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(
(
value((), tag(">=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag(">")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
}
#[test]
@@ -1590,5 +1644,21 @@ mod test {
r#"myfield:'hello\"happy\'tax'"#,
r#""myfield":'hello"happy'tax'"#,
);
// we don't process escape sequence for chars which don't require it
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
#[test]
fn test_queries_with_colons() {
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
}
#[test]
fn test_invalid_field() {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
}

View File

@@ -34,7 +34,7 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
};
@@ -146,6 +146,11 @@ pub enum AggregationVariants {
/// extracted values.
#[serde(rename = "stats")]
Stats(StatsAggregation),
/// Computes a collection of extended statistics (`min`, `max`, `sum`, `count`, `avg`,
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
/// `std_deviation_sampling`) over the extracted values.
#[serde(rename = "extended_stats")]
ExtendedStats(ExtendedStatsAggregation),
/// Computes the sum of the extracted values.
#[serde(rename = "sum")]
Sum(SumAggregation),
@@ -170,6 +175,7 @@ impl AggregationVariants {
AggregationVariants::Max(max) => vec![max.field_name()],
AggregationVariants::Min(min) => vec![min.field_name()],
AggregationVariants::Stats(stats) => vec![stats.field_name()],
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
@@ -197,6 +203,12 @@ impl AggregationVariants {
_ => None,
}
}
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
match &self {
AggregationVariants::TopHits(top_hits) => Some(top_hits),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self {

View File

@@ -11,8 +11,8 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
SumAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
StatsAggregation, SumAggregation,
};
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
@@ -276,6 +276,10 @@ impl AggregationWithAccessor {
field: ref field_name,
..
})
| ExtendedStats(ExtendedStatsAggregation {
field: ref field_name,
..
})
| Sum(SumAggregation {
field: ref field_name,
..
@@ -335,8 +339,8 @@ fn get_missing_val(
}
_ => {
return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {:?} for field {} is not supported for column type {:?}",
missing, field_name, column_type
"Missing value {missing:?} for field {field_name} is not supported for column \
type {column_type:?}"
)));
}
};
@@ -403,7 +407,7 @@ fn get_dynamic_columns(
.iter()
.map(|h| h.open())
.collect::<io::Result<_>>()?;
assert!(!ff_fields.is_empty(), "field {} not found", field_name);
assert!(!ff_fields.is_empty(), "field {field_name} not found");
Ok(cols)
}

View File

@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
use super::metric::{
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
};
use super::{AggregationError, Key};
use crate::TantivyError;
@@ -88,6 +90,8 @@ pub enum MetricResult {
Min(SingleMetricResult),
/// Stats metric result.
Stats(Stats),
/// ExtendedStats metric result.
ExtendedStats(Box<ExtendedStats>),
/// Sum metric result.
Sum(SingleMetricResult),
/// Percentiles metric result.
@@ -104,6 +108,7 @@ impl MetricResult {
MetricResult::Max(max) => Ok(max.value),
MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value),
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),

View File

@@ -357,8 +357,7 @@ impl SegmentTermCollector {
) -> crate::Result<Self> {
if field_type == ColumnType::Bytes {
return Err(TantivyError::InvalidArgument(format!(
"terms aggregation is not supported for column type {:?}",
field_type
"terms aggregation is not supported for column type {field_type:?}"
)));
}
let term_buckets = TermBuckets::default();

View File

@@ -19,8 +19,8 @@ use super::bucket::{
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
};
use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
};
use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey};
@@ -215,6 +215,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(),
)),
ExtendedStats(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(),
)),
@@ -222,7 +225,7 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
),
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
),
}
}
@@ -282,6 +285,8 @@ pub enum IntermediateMetricResult {
Min(IntermediateMin),
/// Intermediate stats result.
Stats(IntermediateStats),
/// Intermediate stats result.
ExtendedStats(IntermediateExtendedStats),
/// Intermediate sum result.
Sum(IntermediateSum),
/// Intermediate top_hits result
@@ -306,6 +311,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
MetricResult::ExtendedStats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
@@ -346,6 +354,12 @@ impl IntermediateMetricResult {
) => {
stats_left.merge_fruits(stats_right);
}
(
IntermediateMetricResult::ExtendedStats(extended_stats_left),
IntermediateMetricResult::ExtendedStats(extended_stats_right),
) => {
extended_stats_left.merge_fruits(extended_stats_right);
}
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right);
}

File diff suppressed because it is too large

View File

@@ -18,6 +18,7 @@
mod average;
mod count;
mod extended_stats;
mod max;
mod min;
mod percentiles;
@@ -29,6 +30,7 @@ use std::collections::HashMap;
pub use average::*;
pub use count::*;
pub use extended_stats::*;
pub use max::*;
pub use min::*;
pub use percentiles::*;

View File

@@ -1,3 +1,5 @@
use std::fmt::Debug;
use serde::{Deserialize, Serialize};
use super::*;
@@ -85,13 +87,15 @@ impl Stats {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {
/// The number of extracted values.
count: u64,
pub(crate) count: u64,
/// The sum of the extracted values.
sum: f64,
pub(crate) sum: f64,
/// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
pub(crate) delta: f64,
/// The min value.
min: f64,
pub(crate) min: f64,
/// The max value.
max: f64,
pub(crate) max: f64,
}
impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
Self {
count: 0,
sum: 0.0,
delta: 0.0,
min: f64::MAX,
max: f64::MIN,
}
@@ -109,7 +114,13 @@ impl IntermediateStats {
/// Merges the other stats intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateStats) {
self.count += other.count;
self.sum += other.sum;
// kahan algorithm for sum
let y = other.sum - (self.delta + other.delta);
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(other.min);
self.max = self.max.max(other.max);
}
@@ -141,9 +152,15 @@ impl IntermediateStats {
}
#[inline]
fn collect(&mut self, value: f64) {
pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
self.count += 1;
self.sum += value;
// kahan algorithm for sum
let y = value - self.delta;
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(value);
self.max = self.max.max(value);
}
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[cfg(test)]
mod tests {
use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations};
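The hunks above replace the plain running sum in IntermediateStats with Kahan (compensated) summation. A standalone illustration, not taken from the change itself, of why the `delta` compensation term matters when many small values are added to a large sum:

```rust
fn main() {
    // Naive summation: adding 1.0 to 1e16 is lost entirely, because 1.0 is
    // below the precision of f64 at that magnitude.
    let mut naive = 1e16f64;
    // Kahan summation keeps a compensation term (`delta` in the hunks above)
    // that accumulates the low-order bits lost by each addition.
    let (mut sum, mut delta) = (1e16f64, 0.0f64);
    for _ in 0..10_000_000 {
        naive += 1.0;
        let y = 1.0 - delta;
        let t = sum + y;
        delta = (t - sum) - y;
        sum = t;
    }
    // The compensated sum recovers the ten million additions; the naive one does not.
    println!("naive: {naive}, kahan: {sum}");
}
```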

View File

@@ -1,7 +1,7 @@
use std::collections::HashMap;
use std::net::Ipv6Addr;
use columnar::{ColumnarReader, DynamicColumn};
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
use common::DateTime;
use regex::Regex;
@@ -131,8 +131,8 @@ impl<'de> Deserialize<'de> for KeyOrder {
))?;
if key_order.next().is_some() {
return Err(serde::de::Error::custom(format!(
"Expected exactly one key-value pair in sort parameter of top_hits, found {:?}",
key_order
"Expected exactly one key-value pair in sort parameter of top_hits, found \
{key_order:?}"
)));
}
Ok(Self { field, order })
@@ -144,27 +144,22 @@ fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
crate::TantivyError::SchemaError(format!(
"Invalid regex '{}' in docvalue_fields: {}",
glob, e
))
crate::TantivyError::SchemaError(format!("Invalid regex '{glob}' in docvalue_fields: {e}"))
})
}
fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation",
parameter
"The `{parameter}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation"
)),
))
}
fn unsupported_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported in the `top_hits` aggregation",
parameter
"The `{parameter}` parameter is not supported in the `top_hits` aggregation"
)),
))
}
@@ -217,8 +212,7 @@ impl TopHitsAggregation {
.collect::<Vec<_>>();
assert!(
!fields.is_empty(),
"No fields matched the glob '{}' in docvalue_fields",
field
"No fields matched the glob '{field}' in docvalue_fields"
);
Ok(fields)
})
@@ -254,7 +248,7 @@ impl TopHitsAggregation {
.map(|field| {
let accessors = accessors
.get(field)
.unwrap_or_else(|| panic!("field '{}' not found in accessors", field));
.unwrap_or_else(|| panic!("field '{field}' not found in accessors"));
let values: Vec<FastFieldValue> = accessors
.iter()
@@ -449,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
impl TopHitsTopNComputer {
/// Create a new TopHitsCollector
pub fn new(req: TopHitsAggregation) -> Self {
pub fn new(req: &TopHitsAggregation) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req,
req: req.clone(),
}
}
@@ -497,7 +491,6 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
accessor_idx: usize,
req: TopHitsAggregation,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
}
@@ -508,7 +501,6 @@ impl TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
) -> Self {
Self {
req: req.clone(),
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
segment_ordinal,
accessor_idx,
@@ -517,14 +509,13 @@ impl TopHitsSegmentCollector {
fn into_top_hits_collector(
self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregation,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec();
for res in top_results {
let doc_value_fields = self
.req
.get_document_field_data(value_accessors, res.doc.doc_id);
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect(
DocSortValuesAndFields {
sorts: res.feature,
@@ -536,34 +527,15 @@ impl TopHitsSegmentCollector {
top_hits_computer
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let intermediate_result =
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
fn collect(
/// TODO add a specialized variant for a single sort field
fn collect_with(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
req: &TopHitsAggregation,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
let sorts: Vec<DocValueAndOrder> = self
.req
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
@@ -588,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
);
Ok(())
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let intermediate_result = IntermediateMetricResult::TopHits(
self.into_top_hits_collector(value_accessors, tophits_req),
);
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
/// TODO: Consider a caching layer to reduce the call overhead
fn collect(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
self.collect_with(doc_id, tophits_req, accessors)?;
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
// TODO: Consider getting fields with the column block accessor.
for doc in docs {
self.collect(*doc, agg_with_accessor)?;
self.collect_with(*doc, tophits_req, accessors)?;
}
Ok(())
}

View File

@@ -158,15 +158,14 @@ use serde::de::{self, Visitor};
use serde::{Deserialize, Deserializer, Serialize};
fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> {
let parsed = value.parse::<f64>().map_err(|_err| {
de::Error::custom(format!("Failed to parse f64 from string: {:?}", value))
})?;
let parsed = value
.parse::<f64>()
.map_err(|_err| de::Error::custom(format!("Failed to parse f64 from string: {value:?}")))?;
// Check if the parsed value is NaN or infinity
if parsed.is_nan() || parsed.is_infinite() {
Err(de::Error::custom(format!(
"Value is not a valid f64 (NaN or Infinity): {:?}",
value
"Value is not a valid f64 (NaN or Infinity): {value:?}"
)))
} else {
Ok(parsed)
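
For context, a helper like `parse_str_into_f64` is typically wired into serde via `deserialize_with`. The following is a hedged, self-contained sketch of that wiring; the struct and field names are hypothetical and only illustrate the rejection of NaN/Infinity:

```
use serde::de::Error as _;
use serde::{Deserialize, Deserializer};

// Hypothetical field-level deserializer mirroring the helper above:
// parse an f64 from a string and reject NaN/Infinity.
fn f64_from_str<'de, D: Deserializer<'de>>(deserializer: D) -> Result<f64, D::Error> {
    let s = String::deserialize(deserializer)?;
    let parsed: f64 = s
        .parse()
        .map_err(|_| D::Error::custom(format!("Failed to parse f64 from string: {s:?}")))?;
    if parsed.is_nan() || parsed.is_infinite() {
        return Err(D::Error::custom(format!(
            "Value is not a valid f64 (NaN or Infinity): {s:?}"
        )));
    }
    Ok(parsed)
}

// Hypothetical request struct using the helper.
#[derive(Deserialize)]
struct AggParams {
    #[serde(deserialize_with = "f64_from_str")]
    sigma: f64,
}
```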

View File

@@ -11,12 +11,12 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation,
};
use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::TopHitsSegmentCollector;
use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result(
@@ -148,6 +148,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
*missing,
))),
ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
)),
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Sum,

View File

@@ -598,7 +598,7 @@ mod tests {
let mid = n % 4;
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
Facet::from(&format!("/top{top}/mid{mid}/leaf{leaf}"))
})
.collect();
for i in 0..num_facets * 10 {
@@ -737,7 +737,7 @@ mod tests {
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
@@ -785,7 +785,7 @@ mod tests {
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})

View File

@@ -871,7 +871,10 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
use crate::{
assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
SegmentReader,
};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();

View File

@@ -184,28 +184,56 @@ mod tests {
fn test_cancel_cpu_intensive_tasks() {
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
let counter: Arc<AtomicU64> = Default::default();
let other_counter: Arc<AtomicU64> = Default::default();
let mut futures = Vec::new();
let mut other_futures = Vec::new();
let (tx, rx) = crossbeam_channel::bounded::<()>(0);
let rx = Arc::new(rx);
let executor = Executor::multi_thread(3, "search-test").unwrap();
for _ in 0..1_000 {
let counter_clone = counter.clone();
for _ in 0..1000 {
let counter_clone: Arc<AtomicU64> = counter.clone();
let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
let rx_clone = rx.clone();
let rx_clone2 = rx.clone();
let fut = executor.spawn_blocking(move || {
std::thread::sleep(Duration::from_millis(4));
counter_clone.fetch_add(1, Ordering::SeqCst)
counter_clone.fetch_add(1, Ordering::SeqCst);
let _ = rx_clone.recv();
});
futures.push(fut);
let other_fut = executor.spawn_blocking(move || {
other_counter_clone.fetch_add(1, Ordering::SeqCst);
let _ = rx_clone2.recv();
});
other_futures.push(other_fut);
}
std::thread::sleep(Duration::from_millis(5));
// The first few num_cores tasks should run, but the other should get cancelled.
drop(futures);
while Arc::strong_count(&counter) > 1 {
std::thread::sleep(Duration::from_millis(10));
// We execute 100 futures.
for _ in 0..100 {
tx.send(()).unwrap();
}
// with ideal timing, we expect the result to always be 6, but as long as we run some, and
// cancelled most, the test is a success
assert!(counter.load(Ordering::SeqCst) > 0);
assert!(counter.load(Ordering::SeqCst) < 50);
let counter_val = counter.load(Ordering::SeqCst);
let other_counter_val = other_counter.load(Ordering::SeqCst);
assert!(counter_val >= 30);
assert!(other_counter_val >= 30);
drop(other_futures);
// We execute 100 futures.
for _ in 0..100 {
tx.send(()).unwrap();
}
let counter_val2 = counter.load(Ordering::SeqCst);
assert!(counter_val2 >= counter_val + 100 - 6);
let other_counter_val2 = other_counter.load(Ordering::SeqCst);
assert!(other_counter_val2 <= other_counter_val + 6);
}
}

View File

@@ -338,14 +338,14 @@ mod tests {
let mut term = Term::from_field_json_path(field, "attributes.color", false);
term.append_type_and_str("red");
assert_eq!(
format!("{:?}", term),
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
);
let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
term.append_type_and_fast_value(400i64);
assert_eq!(
format!("{:?}", term),
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
);
}

View File

@@ -566,7 +566,7 @@ mod tests {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let num_paths = 10;
let paths: Vec<PathBuf> = (0..num_paths)
.map(|i| PathBuf::from(&*format!("file_{}", i)))
.map(|i| PathBuf::from(&*format!("file_{i}")))
.collect();
{
for path in &paths {

View File

@@ -62,8 +62,7 @@ impl FacetReader {
#[cfg(test)]
mod tests {
use crate::schema::document::Value;
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test]
@@ -89,7 +88,9 @@ mod tests {
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
let value = doc
.get_first(facet_field)
.and_then(|v| v.as_value().as_facet());
assert_eq!(value, None);
}

View File

@@ -252,9 +252,8 @@ impl IndexBuilder {
let field_type = entry.field_type().value_type();
if !supported_field_types.contains(&field_type) {
return Err(TantivyError::InvalidArgument(format!(
"Unsupported field type in sort_by_field: {:?}. Supported field types: \
{:?} ",
field_type, supported_field_types,
"Unsupported field type in sort_by_field: {field_type:?}. Supported field \
types: {supported_field_types:?} ",
)));
}
}

View File

@@ -318,14 +318,14 @@ impl SegmentReader {
if create_canonical {
// Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path = format!("{field_name}.{escaped_json_path}");
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
format!("{field_name}.{json_path}")
}
};
indexed_fields.extend(

View File

@@ -306,12 +306,10 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field),
None
);
assert!(searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.is_none());
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
@@ -344,7 +342,7 @@ mod tests_indexsorting {
Some("blublub")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None);
assert!(doc.get_first(my_string_field).is_none());
}
// sort by field desc
let index = create_test_index(

View File

@@ -808,16 +808,15 @@ mod tests {
use proptest::prop_oneof;
use super::super::operation::UserOperation;
use crate::collector::TopDocs;
use crate::collector::{Count, TopDocs};
use crate::directory::error::LockError;
use crate::error::*;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
@@ -1573,20 +1572,74 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone)]
enum IndexingOp {
AddDoc { id: u64 },
DeleteDoc { id: u64 },
DeleteDocQuery { id: u64 },
AddMultipleDoc {
id: u64,
num_docs: u64,
value: IndexValue,
},
AddDoc {
id: u64,
value: IndexValue,
},
DeleteDoc {
id: u64,
},
DeleteDocQuery {
id: u64,
},
Commit,
Merge,
}
impl IndexingOp {
fn add(id: u64) -> Self {
IndexingOp::AddDoc {
id,
value: IndexValue::F64(id as f64),
}
}
}
use serde::Serialize;
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
enum IndexValue {
Str(String),
F64(f64),
U64(u64),
I64(i64),
}
impl Default for IndexValue {
fn default() -> Self {
IndexValue::F64(0.0)
}
}
fn value_strategy() -> impl Strategy<Value = IndexValue> {
prop_oneof![
any::<f64>().prop_map(IndexValue::F64),
any::<u64>().prop_map(IndexValue::U64),
any::<i64>().prop_map(IndexValue::I64),
any::<String>().prop_map(IndexValue::Str),
]
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..20u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
((0u64..20u64), (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
@@ -1596,7 +1649,17 @@ mod tests {
prop_oneof![
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
50 => (0u64..100u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
50 => (0u64..100u64, (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
@@ -1605,19 +1668,27 @@ mod tests {
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new();
for &op in ops {
for op in ops {
match op {
IndexingOp::AddDoc { id } => {
*existing_ids.entry(id).or_insert(0) += 1;
deleted_ids.remove(&id);
IndexingOp::AddDoc { id, value: _ } => {
*existing_ids.entry(*id).or_insert(0) += 1;
deleted_ids.remove(id);
}
IndexingOp::AddMultipleDoc {
id,
num_docs,
value: _,
} => {
*existing_ids.entry(*id).or_insert(0) += num_docs;
deleted_ids.remove(id);
}
IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
deleted_ids.insert(*id);
}
IndexingOp::DeleteDocQuery { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
deleted_ids.insert(*id);
}
_ => {}
}
@@ -1627,16 +1698,19 @@ mod tests {
fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
let mut id_list = Vec::new();
for &op in ops {
for op in ops {
match op {
IndexingOp::AddDoc { id } => {
id_list.push(id);
IndexingOp::AddDoc { id, value: _ } => {
id_list.push(*id);
}
IndexingOp::AddMultipleDoc { id, .. } => {
id_list.push(*id);
}
IndexingOp::DeleteDoc { id } => {
id_list.retain(|el| *el != id);
id_list.retain(|el| el != id);
}
IndexingOp::DeleteDocQuery { id } => {
id_list.retain(|el| *el != id);
id_list.retain(|el| el != id);
}
_ => {}
}
@@ -1717,42 +1791,59 @@ mod tests {
let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_is_full_doc(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
))?;
} else {
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()});
index_writer.add_document(doc!(id_field=>id,
json_field=>json,
bytes_field => id.to_le_bytes().as_slice(),
id_opt_field => id,
ip_field => ip,
ips_field => ip,
ips_field => ip,
multi_numbers=> id,
multi_numbers => id,
bool_field => (id % 2u64) != 0,
i64_field => id as i64,
f64_field => id as f64,
date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) != 0,
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
))?;
}
let add_docs = |index_writer: &mut IndexWriter,
id: u64,
value: IndexValue,
num: u64|
-> crate::Result<()> {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
let doc = if !id_is_full_doc(id) {
// every 3rd doc has no ip field
doc!(
id_field=>id,
)
} else {
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string(), "val": value});
doc!(id_field=>id,
json_field=>json,
bytes_field => id.to_le_bytes().as_slice(),
id_opt_field => id,
ip_field => ip,
ips_field => ip,
ips_field => ip,
multi_numbers=> id,
multi_numbers => id,
bool_field => (id % 2u64) != 0,
i64_field => id as i64,
f64_field => id as f64,
date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) != 0,
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
)
};
for _ in 0..num {
index_writer.add_document(doc.clone())?;
}
Ok(())
};
for op in ops {
match op.clone() {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
} => {
add_docs(&mut index_writer, id, value, num_docs)?;
}
IndexingOp::AddDoc { id, value } => {
add_docs(&mut index_writer, id, value, 1)?;
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1980,7 +2071,13 @@ mod tests {
.unwrap();
// test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
let id = doc
.unwrap()
.get_first(id_field)
.unwrap()
.as_value()
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
}
// test store random access
@@ -2013,7 +2110,7 @@ mod tests {
let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_eq!(None, bool2.next())
assert!(bool2.next().is_none())
}
}
}
@@ -2027,18 +2124,22 @@ mod tests {
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
};
let count_search = |term: &str, field| {
let query = QueryParser::for_index(&index, vec![field])
.parse_query(term)
.unwrap();
searcher.search(&query, &Count).unwrap()
};
let do_search2 = |term: Term| {
let count_search2 = |term: Term| {
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
searcher.search(&query, &Count).unwrap()
};
for (id, count) in &expected_ids_and_num_occurrences {
// skip expensive queries
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
let get_num_hits = |field| count_search(&existing_id.to_string(), field) as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) {
continue;
@@ -2048,29 +2149,31 @@ mod tests {
assert_eq!(get_num_hits(f64_field), count);
// Test multi text
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_with_values
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_with_values
);
if num_docs_with_values < 1000 {
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_with_values
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_with_values
);
}
// Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
// Test date
let term = Term::from_field_date(
date_field,
DateTime::from_timestamp_secs(existing_id as i64),
);
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
}
for deleted_id in deleted_ids {
let assert_field = |field| {
assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0);
assert_eq!(count_search(&deleted_id.to_string(), field) as u64, 0);
};
assert_field(text_field);
assert_field(f64_field);
@@ -2079,12 +2182,12 @@ mod tests {
// Test bytes
let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, 0);
assert_eq!(count_search2(term), 0);
// Test date
let term =
Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
assert_eq!(do_search2(term).len() as u64, 0);
assert_eq!(count_search2(term), 0);
}
// search ip address
//
@@ -2093,13 +2196,13 @@ mod tests {
if !id_is_full_doc(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
// Test incoming ip as ipv6
assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count);
let term = Term::from_field_ip_addr(ip_field, ip_addr);
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
// Test incoming ip as ipv4
if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
@@ -2116,7 +2219,7 @@ mod tests {
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
let calc_expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
@@ -2132,18 +2235,17 @@ mod tests {
}
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
let expected_count = calc_expected_count(left_sample);
if !left_sample.is_empty() && expected_count < 1000 {
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
@@ -2155,19 +2257,19 @@ mod tests {
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let expected_count = calc_expected_count(right_sample);
if !right_sample.is_empty() && expected_count < 1000 {
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
@@ -2192,7 +2294,7 @@ mod tests {
};
let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
@@ -2252,7 +2354,7 @@ mod tests {
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
@@ -2260,8 +2362,8 @@ mod tests {
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 81 },
IndexingOp::AddDoc { id: 70 },
IndexingOp::add(81),
IndexingOp::add(70),
IndexingOp::DeleteDoc { id: 70 }
],
true,
@@ -2270,14 +2372,45 @@ mod tests {
.is_ok());
}
#[test]
fn test_simple_multiple_doc() {
assert!(test_operation_strategy(
&[
IndexingOp::AddMultipleDoc {
id: 7,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 92,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 30,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 33,
num_docs: 800,
value: IndexValue::U64(0),
},
],
true,
false
)
.is_ok());
}
#[test]
fn test_ip_range_query_multivalue_bug() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 2 },
IndexingOp::add(2),
IndexingOp::Commit,
IndexingOp::AddDoc { id: 1 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(1),
IndexingOp::add(1),
IndexingOp::Commit,
IndexingOp::Merge
],
@@ -2291,11 +2424,11 @@ mod tests {
fn test_ff_num_ips_regression() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(13),
IndexingOp::add(1),
IndexingOp::Commit,
IndexingOp::DeleteDocQuery { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(1),
IndexingOp::Commit,
],
false,
@@ -2307,7 +2440,7 @@ mod tests {
#[test]
fn test_minimal_sort_force_end_merge() {
assert!(test_operation_strategy(
&[IndexingOp::AddDoc { id: 23 }, IndexingOp::AddDoc { id: 13 },],
&[IndexingOp::add(23), IndexingOp::add(13),],
false,
false
)
@@ -2368,8 +2501,8 @@ mod tests {
fn test_minimal_sort_force_end_merge_with_delete() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 23 },
IndexingOp::AddDoc { id: 13 },
IndexingOp::add(23),
IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 }
],
true,
@@ -2382,8 +2515,8 @@ mod tests {
fn test_minimal_no_sort_no_force_end_merge() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 23 },
IndexingOp::AddDoc { id: 13 },
IndexingOp::add(23),
IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 }
],
false,
@@ -2394,7 +2527,7 @@ mod tests {
#[test]
fn test_minimal_sort_merge() {
assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok());
assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
}
use proptest::prelude::*;
@@ -2490,14 +2623,14 @@ mod tests {
fn test_delete_bug_reproduction_ip_addr() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 1 },
AddDoc { id: 2 },
IndexingOp::add(1),
IndexingOp::add(2),
Commit,
AddDoc { id: 3 },
IndexingOp::add(3),
DeleteDoc { id: 1 },
Commit,
Merge,
AddDoc { id: 4 },
IndexingOp::add(4),
Commit,
];
test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2506,7 +2639,13 @@ mod tests {
#[test]
fn test_merge_regression_1() {
use IndexingOp::*;
let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge];
let ops = &[
IndexingOp::add(15),
Commit,
IndexingOp::add(9),
Commit,
Merge,
];
test_operation_strategy(&ops[..], false, true).unwrap();
}
@@ -2514,9 +2653,9 @@ mod tests {
fn test_range_query_bug_1() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 9 },
AddDoc { id: 0 },
AddDoc { id: 13 },
IndexingOp::add(9),
IndexingOp::add(0),
IndexingOp::add(13),
Commit,
];
test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2524,12 +2663,11 @@ mod tests {
#[test]
fn test_range_query_bug_2() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 3 },
AddDoc { id: 6 },
AddDoc { id: 9 },
AddDoc { id: 10 },
IndexingOp::add(3),
IndexingOp::add(6),
IndexingOp::add(9),
IndexingOp::add(10),
];
test_operation_strategy(&ops[..], false, false).unwrap();
}
@@ -2551,7 +2689,7 @@ mod tests {
assert!(test_operation_strategy(
&[
IndexingOp::DeleteDoc { id: 0 },
IndexingOp::AddDoc { id: 6 },
IndexingOp::add(6),
IndexingOp::DeleteDocQuery { id: 11 },
IndexingOp::Commit,
IndexingOp::Merge,
@@ -2568,10 +2706,13 @@ mod tests {
fn test_bug_1617_2() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 13 },
IndexingOp::AddDoc {
id: 13,
value: Default::default()
},
IndexingOp::DeleteDoc { id: 13 },
IndexingOp::Commit,
IndexingOp::AddDoc { id: 30 },
IndexingOp::add(30),
IndexingOp::Commit,
IndexingOp::Merge,
],

View File

@@ -787,6 +787,8 @@ impl IndexMerger {
mod tests {
use columnar::Column;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use schema::FAST;
use crate::collector::tests::{
@@ -794,11 +796,11 @@ mod tests {
};
use crate::collector::{Count, FacetCollector};
use crate::index::{Index, SegmentId};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT,
TextFieldIndexing, Value, INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{
@@ -910,15 +912,24 @@ mod tests {
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("af b")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c d")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
@@ -1523,6 +1534,112 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum IndexingOp {
ZeroVal,
OneVal { val: u64 },
TwoVal { val: u64 },
Commit,
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
]
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
assert!(test_merge_int_fields(&ops[..]).is_ok());
}
}
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
if ops.iter().all(|op| *op == IndexingOp::Commit) {
return Ok(());
}
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
.iter()
.filter(|op| *op != &IndexingOp::Commit)
.map(|op| match op {
IndexingOp::ZeroVal => vec![],
IndexingOp::OneVal { val } => vec![*val],
IndexingOp::TwoVal { val } => vec![*val, *val],
IndexingOp::Commit => unreachable!(),
})
.enumerate()
.map(|(id, val)| (id as u32, val))
.collect();
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc).unwrap();
};
for op in ops {
match op {
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
IndexingOp::Commit => {
index_writer.commit().expect("commit failed");
}
}
}
index_writer.commit().expect("commit failed");
}
{
let mut segment_ids = index.searchable_segment_ids()?;
segment_ids.sort();
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
reader.reload()?;
let mut vals: Vec<u64> = Vec::new();
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
vals.clear();
vals.extend(col.values_for_doc(doc));
assert_eq!(&vals[..], expected);
};
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
for (doc_id, vals) in column_expected.iter() {
test_vals(col, *doc_id, vals);
}
};
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let col = segment
.fast_fields()
.column_opt::<u64>("intvals")
.unwrap()
.unwrap();
test_col(&col, &expected_doc_and_vals);
}
Ok(())
}
#[test]
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();

View File

@@ -5,10 +5,9 @@ mod tests {
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
TextFieldIndexing, TextOptions, Value,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
@@ -281,13 +280,16 @@ mod tests {
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(),
doc.get_first(my_text_field).unwrap().as_value().as_str(),
Some("blubber")
);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
assert_eq!(
doc.get_first(int_field).unwrap().as_value().as_u64(),
Some(1000)
);
}
}

View File

@@ -216,7 +216,7 @@ mod tests_mmap {
let test_query = |query_str: &str| {
let query = parse_query.parse_query(query_str).unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1, "{}", query_str);
assert_eq!(num_docs, 1, "{query_str}");
};
test_query(format!("json.{field_name_out}:test1").as_str());
test_query(format!("json.a{field_name_out}:test2").as_str());
@@ -590,10 +590,10 @@ mod tests_mmap {
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if field name can be queried
for (indexed_field, val) in fields_and_vals.iter() {
let query_str = &format!("{}:{}", indexed_field, val);
let query_str = &format!("{indexed_field}:{val}");
let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val);
assert!(!count_docs.is_empty(), "{indexed_field}:{val}");
}
// Test if field name can be used for aggregation
for (field_name, val) in fields_and_vals.iter() {

View File

@@ -202,9 +202,8 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +219,14 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
} else if let Some(tok_str) = value.into_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else {
continue;
@@ -250,9 +248,8 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +262,8 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
for value in values {
let value = value.as_value();
num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +277,8 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +291,8 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -312,10 +304,8 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -327,10 +317,8 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -364,9 +352,8 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
@@ -499,10 +486,9 @@ mod tests {
use crate::fastfield::FastValue;
use crate::postings::{Postings, TermInfo};
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::document::Value;
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
STRING, TEXT,
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
@@ -555,9 +541,15 @@ mod tests {
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap();
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
assert_eq!(doc.field_values().count(), 2);
assert_eq!(
doc.get_all(text_field).next().unwrap().as_value().as_str(),
Some("A")
);
assert_eq!(
doc.get_all(text_field).nth(1).unwrap().as_value().as_str(),
Some("title")
);
}
#[test]
fn test_simple_json_indexing() {
@@ -641,7 +633,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{
"toto": "titi",
"float": -0.2,
@@ -669,14 +661,10 @@ mod tests {
doc_id: 0u32,
})
.unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&doc.to_json(&schema),
)
.unwrap()
.get("json")
.unwrap()[0]
.as_object()
let serdeser_json_val = serde_json::from_str::<serde_json::Value>(&doc.to_json(&schema))
.unwrap()
.get("json")
.unwrap()[0]
.clone();
assert_eq!(json_val, serdeser_json_val);
let segment_reader = searcher.segment_reader(0u32);
@@ -840,7 +828,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STRING);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> =
let json_val: serde_json::Value =
serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
let doc = doc!(json_field=>json_val);
let index = Index::create_in_ram(schema);
@@ -880,7 +868,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
)
.unwrap();

View File

@@ -397,16 +397,20 @@ pub mod tests {
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
assert_nearly_equals!($left, $right, 0.0005);
}};
($left:expr, $right:expr, $epsilon:expr) => {{
match (&$left, &$right, &$epsilon) {
(left_val, right_val, epsilon_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
if diff > *epsilon_val {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
r#"assertion failed: `abs(left-right)>epsilon`
left: `{:?}`,
right: `{:?}`,
epsilon: `{:?}`"#,
&*left_val, &*right_val, &*epsilon_val
)
}
}
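
The reworked macro accepts an optional third argument and compares the absolute difference against it directly, falling back to `0.0005` when the epsilon is omitted. A short usage sketch (values chosen only for illustration):

```
#[test]
fn nearly_equals_usage() {
    // Two-argument form: uses the default epsilon of 0.0005.
    assert_nearly_equals!(0.3333f64, 1.0 / 3.0);

    // Three-argument form: explicit absolute tolerance.
    assert_nearly_equals!(10.0f64, 10.004, 0.01);

    // This one would panic: the difference (0.1) exceeds the epsilon (0.01).
    // assert_nearly_equals!(10.0f64, 10.1, 0.01);
}
```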
@@ -935,7 +939,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{
"signed": 2,
"float": 2.0,
@@ -1025,13 +1029,16 @@ pub mod tests {
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);
let values: Vec<&OwnedValue> = document.get_all(text_field).collect();
let values: Vec<OwnedValue> = document.get_all(text_field).map(OwnedValue::from).collect();
assert_eq!(values.len(), 2);
assert_eq!(values[0].as_str(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value"));
let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect();
assert_eq!(values[0].as_ref().as_str(), Some("tantivy"));
assert_eq!(values[1].as_ref().as_str(), Some("some other value"));
let values: Vec<OwnedValue> = document
.get_all(other_text_field)
.map(OwnedValue::from)
.collect();
assert_eq!(values.len(), 1);
assert_eq!(values[0].as_str(), Some("short"));
assert_eq!(values[0].as_ref().as_str(), Some("short"));
}
#[test]

View File

@@ -41,6 +41,7 @@
/// );
/// # }
/// ```
#[macro_export]
macro_rules! doc(
() => {
@@ -52,7 +53,7 @@ macro_rules! doc(
{
let mut document = $crate::TantivyDocument::default();
$(
document.add_field_value($field, $value);
document.add_field_value($field, &$value);
)*
document
}

View File

@@ -138,8 +138,7 @@ impl FuzzyTermQuery {
if json_path_type != Type::Str {
return Err(InvalidArgument(format!(
"The fuzzy term query requires a string path type for a json term. Found \
{:?}",
json_path_type
{json_path_type:?}"
)));
}
}

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound;
use crate::query::Occur;
use crate::schema::{Field, Term, Type};
use crate::schema::{Term, Type};
use crate::Score;
#[derive(Clone)]
@@ -20,8 +20,6 @@ pub enum LogicalLiteral {
upper: Bound<Term>,
},
Set {
field: Field,
value_type: Type,
elements: Vec<Term>,
},
All,

View File

@@ -832,17 +832,11 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let (elements, errors) = elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.partition_result();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements,
field,
value_type,
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (

View File

@@ -185,7 +185,7 @@ mod test {
Err(crate::TantivyError::InvalidArgument(msg)) => {
assert!(msg.contains("error: unclosed group"))
}
res => panic!("unexpected result: {:?}", res),
res => panic!("unexpected result: {res:?}"),
}
}
}

View File

@@ -1,93 +1,64 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io::{self, Read, Write};
use std::net::Ipv6Addr;
use common::DateTime;
use columnar::MonotonicallyMappableToU128;
use common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable, DateTime, VInt};
use serde_json::Map;
pub use CompactDoc as TantivyDocument;
use super::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::document::{
DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
};
use crate::schema::field_type::ValueParsingError;
use crate::schema::field_value::FieldValueIter;
use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, OwnedValue, Schema};
use crate::schema::{Facet, Field, NamedFieldDocument, OwnedValue, Schema};
use crate::tokenizer::PreTokenizedString;
/// TantivyDocument provides a default implementation of the `Document` trait.
/// It is the object that can be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered couples `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
pub struct TantivyDocument {
field_values: Vec<FieldValue>,
#[repr(packed)]
#[derive(Debug, Clone)]
/// A field value pair in the compact tantivy document
struct FieldValueAddr {
pub field: u16,
pub value_addr: ValueAddr,
}
impl Document for TantivyDocument {
type Value<'a> = &'a OwnedValue;
type FieldsValuesIter<'a> = FieldValueIter<'a>;
#[derive(Debug, Clone)]
/// The default document in tantivy. It encodes data in a compact form.
pub struct CompactDoc {
/// `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It
/// includes all the data of the document and also metadata like where the nodes are located
/// in an object or array.
pub node_data: Vec<u8>,
/// The root (Field, Value) pairs
field_values: Vec<FieldValueAddr>,
}
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldValueIter(self.field_values.iter())
impl Default for CompactDoc {
fn default() -> Self {
Self::new()
}
}
impl DocumentDeserialize for TantivyDocument {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut field_values = Vec::with_capacity(deserializer.size_hint());
while let Some((field, value)) = deserializer.next_field()? {
field_values.push(FieldValue::new(field, value));
}
Ok(Self { field_values })
}
}
impl From<Vec<FieldValue>> for TantivyDocument {
fn from(field_values: Vec<FieldValue>) -> Self {
Self { field_values }
}
}
impl PartialEq for TantivyDocument {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |field_values: &[FieldValue]| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in field_values.iter() {
let value = serde_json::to_string(field_value.value()).unwrap();
field_value_set
.entry(field_value.field())
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&self.field_values);
let other_field_values: HashMap<Field, HashSet<String>> =
convert_to_comparable_map(&other.field_values);
self_field_values.eq(&other_field_values)
}
}
impl Eq for TantivyDocument {}
impl IntoIterator for TantivyDocument {
type Item = FieldValue;
type IntoIter = std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter {
self.field_values.into_iter()
}
}
impl TantivyDocument {
impl CompactDoc {
/// Creates a new, empty document object
pub fn new() -> TantivyDocument {
TantivyDocument::default()
/// The reserved capacity is for the total serialized data
pub fn with_capacity(bytes: usize) -> CompactDoc {
CompactDoc {
node_data: Vec::with_capacity(bytes),
field_values: Vec::with_capacity(4),
}
}
/// Creates a new, empty document object
pub fn new() -> CompactDoc {
CompactDoc::with_capacity(1024)
}
/// Shrinks the capacity of the document to fit the data
pub fn shrink_to_fit(&mut self) {
self.node_data.shrink_to_fit();
self.field_values.shrink_to_fit();
}
/// Returns the length of the document.
@@ -99,83 +70,111 @@ impl TantivyDocument {
pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> {
let facet = Facet::from(path);
let value = OwnedValue::Facet(facet);
self.add_field_value(field, value);
self.add_leaf_field_value(field, ReferenceValueLeaf::Facet(facet.encoded_str()));
}
/// Add a text field.
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = OwnedValue::Str(text.to_string());
self.add_field_value(field, value);
pub fn add_text<S: AsRef<str>>(&mut self, field: Field, text: S) {
self.add_leaf_field_value(field, ReferenceValueLeaf::Str(text.as_ref()));
}
/// Add a pre-tokenized text field.
pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) {
self.add_field_value(field, pre_tokenized_text);
self.add_leaf_field_value(field, pre_tokenized_text);
}
/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a IP address field. Internally only Ipv6Addr is used.
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a i64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a f64 field
pub fn add_f64(&mut self, field: Field, value: f64) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a bool field
pub fn add_bool(&mut self, field: Field, value: bool) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value);
self.add_leaf_field_value(field, value);
}
/// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add_field_value(field, value.into());
pub fn add_bytes(&mut self, field: Field, value: &[u8]) {
self.add_leaf_field_value(field, value);
}
/// Add a dynamic object field
pub fn add_object(&mut self, field: Field, object: BTreeMap<String, OwnedValue>) {
self.add_field_value(field, object);
self.add_field_value(field, &OwnedValue::from(object));
}
/// Add a (field, value) to the document.
pub fn add_field_value<T: Into<OwnedValue>>(&mut self, field: Field, typed_val: T) {
///
/// `OwnedValue` implements Value, which should be easiest to use, but is not the most
/// performant.
pub fn add_field_value<'a, V: Value<'a>>(&mut self, field: Field, value: V) {
let field_value = FieldValueAddr {
field: field
.field_id()
.try_into()
.expect("support only up to u16::MAX field ids"),
value_addr: self.add_value(value),
};
self.field_values.push(field_value);
}
/// Add a (field, leaf value) to the document.
/// Leaf values don't have nested values.
pub fn add_leaf_field_value<'a, T: Into<ReferenceValueLeaf<'a>>>(
&mut self,
field: Field,
typed_val: T,
) {
let value = typed_val.into();
let field_value = FieldValue { field, value };
let field_value = FieldValueAddr {
field: field
.field_id()
.try_into()
.expect("support only up to u16::MAX field ids"),
value_addr: self.add_value_leaf(value),
};
self.field_values.push(field_value);
}
/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
pub fn field_values(&self) -> impl Iterator<Item = (Field, CompactDocValue<'_>)> {
self.field_values.iter().map(|field_val| {
let field = Field::from_field_id(field_val.field as u32);
let val = self.get_compact_doc_value(field_val.value_addr);
(field, val)
})
}
/// Returns all of the `FieldValue`s associated the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = &OwnedValue> {
/// Returns all of the `ReferenceValue`s associated the given field
pub fn get_all(&self, field: Field) -> impl Iterator<Item = CompactDocValue<'_>> + '_ {
self.field_values
.iter()
.filter(move |field_value| field_value.field() == field)
.map(FieldValue::value)
.filter(move |field_value| Field::from_field_id(field_value.field as u32) == field)
.map(|val| self.get_compact_doc_value(val.value_addr))
}
/// Returns the first `FieldValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<&OwnedValue> {
/// Returns the first `ReferenceValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<CompactDocValue<'_>> {
self.get_all(field).next()
}
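
Taken together, the accessors above change how callers read values back: `get_first`/`get_all` now return lightweight `CompactDocValue` handles instead of `&OwnedValue`, and the payload is reached through `as_value()`. A hedged usage sketch based only on the methods visible in this diff (the schema setup is illustrative):

```
// Illustrative only: assumes a schema with a text field named "title".
use tantivy::schema::{Schema, Value, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    // TantivyDocument is now an alias for CompactDoc; values are serialized
    // into its internal `node_data` buffer as they are added.
    let mut doc = TantivyDocument::new();
    doc.add_text(title, "hello");
    doc.add_text(title, "world");

    // Reading back goes through CompactDocValue and the Value trait.
    let first = doc.get_first(title).unwrap();
    assert_eq!(first.as_value().as_str(), Some("hello"));
    assert_eq!(doc.get_all(title).count(), 2);
}
```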
@@ -183,12 +182,12 @@ impl TantivyDocument {
pub fn convert_named_doc(
schema: &Schema,
named_doc: NamedFieldDocument,
) -> Result<TantivyDocument, DocParsingError> {
let mut document = TantivyDocument::new();
) -> Result<Self, DocParsingError> {
let mut document = Self::new();
for (field_name, values) in named_doc.0 {
if let Ok(field) = schema.get_field(&field_name) {
for value in values {
document.add_field_value(field, value);
document.add_field_value(field, &value);
}
}
}
@@ -196,7 +195,7 @@ impl TantivyDocument {
}
/// Build a document object from a json-object.
pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<TantivyDocument, DocParsingError> {
pub fn parse_json(schema: &Schema, doc_json: &str) -> Result<Self, DocParsingError> {
let json_obj: Map<String, serde_json::Value> =
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
Self::from_json_object(schema, json_obj)
@@ -206,8 +205,8 @@ impl TantivyDocument {
pub fn from_json_object(
schema: &Schema,
json_obj: Map<String, serde_json::Value>,
) -> Result<TantivyDocument, DocParsingError> {
let mut doc = TantivyDocument::default();
) -> Result<Self, DocParsingError> {
let mut doc = Self::default();
for (field_name, json_value) in json_obj {
if let Ok(field) = schema.get_field(&field_name) {
let field_entry = schema.get_field_entry(field);
@@ -218,20 +217,482 @@ impl TantivyDocument {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
doc.add_field_value(field, &value);
}
}
_ => {
let value = field_type
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add_field_value(field, value);
doc.add_field_value(field, &value);
}
}
}
}
Ok(doc)
}
fn add_value_leaf(&mut self, leaf: ReferenceValueLeaf) -> ValueAddr {
let type_id = ValueType::from(&leaf);
// Write into `node_data` and return u32 position as its address
// Null and bool are inlined into the address
let val_addr = match leaf {
ReferenceValueLeaf::Null => 0,
ReferenceValueLeaf::Str(bytes) => {
write_bytes_into(&mut self.node_data, bytes.as_bytes())
}
ReferenceValueLeaf::Facet(bytes) => {
write_bytes_into(&mut self.node_data, bytes.as_bytes())
}
ReferenceValueLeaf::Bytes(bytes) => write_bytes_into(&mut self.node_data, bytes),
ReferenceValueLeaf::U64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::I64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::F64(num) => write_into(&mut self.node_data, num),
ReferenceValueLeaf::Bool(b) => b as u32,
ReferenceValueLeaf::Date(date) => {
write_into(&mut self.node_data, date.into_timestamp_nanos())
}
ReferenceValueLeaf::IpAddr(num) => write_into(&mut self.node_data, num.to_u128()),
ReferenceValueLeaf::PreTokStr(pre_tok) => write_into(&mut self.node_data, *pre_tok),
};
ValueAddr { type_id, val_addr }
}
/// Adds a value and returns its address into `node_data`
fn add_value<'a, V: Value<'a>>(&mut self, value: V) -> ValueAddr {
let value = value.as_value();
let type_id = ValueType::from(&value);
match value {
ReferenceValue::Leaf(leaf) => self.add_value_leaf(leaf),
ReferenceValue::Array(elements) => {
// addresses of the elements in node_data
// Reusing a vec would be nicer, but it's not easy because of the recursion
// A global vec would work if every writer got its own discriminator
let mut addresses = Vec::new();
for elem in elements {
let value_addr = self.add_value(elem);
write_into(&mut addresses, value_addr);
}
ValueAddr {
type_id,
val_addr: write_bytes_into(&mut self.node_data, &addresses),
}
}
ReferenceValue::Object(entries) => {
// addresses of the elements in node_data
let mut addresses = Vec::new();
for (key, value) in entries {
let key_addr = self.add_value_leaf(ReferenceValueLeaf::Str(key));
let value_addr = self.add_value(value);
write_into(&mut addresses, key_addr);
write_into(&mut addresses, value_addr);
}
ValueAddr {
type_id,
val_addr: write_bytes_into(&mut self.node_data, &addresses),
}
}
}
}
/// Get CompactDocValue for address
fn get_compact_doc_value(&self, value_addr: ValueAddr) -> CompactDocValue<'_> {
CompactDocValue {
container: self,
value_addr,
}
}
/// get &[u8] reference from node_data
fn extract_bytes(&self, addr: Addr) -> &[u8] {
binary_deserialize_bytes(self.get_slice(addr))
}
/// get &str reference from node_data
fn extract_str(&self, addr: Addr) -> &str {
let data = self.extract_bytes(addr);
// Utf-8 checks would have a noticeable performance overhead here
unsafe { std::str::from_utf8_unchecked(data) }
}
/// deserialized owned value from node_data
fn read_from<T: BinarySerializable>(&self, addr: Addr) -> io::Result<T> {
let data_slice = &self.node_data[addr as usize..];
let mut cursor = std::io::Cursor::new(data_slice);
T::deserialize(&mut cursor)
}
/// get slice from address. The returned slice is open ended
fn get_slice(&self, addr: Addr) -> &[u8] {
&self.node_data[addr as usize..]
}
}
/// BinarySerializable alternative to read references
fn binary_deserialize_bytes(data: &[u8]) -> &[u8] {
let (len, bytes_read) = read_u32_vint_no_advance(data);
&data[bytes_read..bytes_read + len as usize]
}
/// Write bytes and return the position of the written data.
///
/// `BinarySerializable` alternative that writes a byte-slice reference
fn write_bytes_into(vec: &mut Vec<u8>, data: &[u8]) -> u32 {
let pos = vec.len() as u32;
let mut buf = [0u8; 8];
let len_vint_bytes = serialize_vint_u32(data.len() as u32, &mut buf);
vec.extend_from_slice(len_vint_bytes);
vec.extend_from_slice(data);
pos
}
/// Serialize and return the position
fn write_into<T: BinarySerializable>(vec: &mut Vec<u8>, value: T) -> u32 {
let pos = vec.len() as u32;
value.serialize(vec).unwrap();
pos
}
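All payloads end up in one append-only `Vec<u8>` and are addressed by their starting offset; variable-length values get a vint length prefix. The standalone sketch below (not part of the diff) illustrates that layout with a plain little-endian `u32` length prefix standing in for the vint, so the helper names and encoding here are simplifications:

```rust
// Sketch of the append-only buffer layout used by CompactDoc: every value is
// written at the end of one Vec<u8> and addressed by its starting offset.
// A fixed u32 length prefix stands in for tantivy's vint encoding.

fn write_bytes(buf: &mut Vec<u8>, data: &[u8]) -> u32 {
    let pos = buf.len() as u32; // the address handed back to the caller
    buf.extend_from_slice(&(data.len() as u32).to_le_bytes()); // simplified length prefix
    buf.extend_from_slice(data); // payload
    pos
}

fn read_bytes(buf: &[u8], addr: u32) -> &[u8] {
    let start = addr as usize;
    let len = u32::from_le_bytes(buf[start..start + 4].try_into().unwrap()) as usize;
    &buf[start + 4..start + 4 + len]
}

fn main() {
    let mut node_data = Vec::new();
    let title_addr = write_bytes(&mut node_data, b"title");
    let body_addr = write_bytes(&mut node_data, b"My first document");
    assert_eq!(read_bytes(&node_data, title_addr), b"title");
    assert_eq!(read_bytes(&node_data, body_addr), b"My first document");
}
```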
impl PartialEq for CompactDoc {
fn eq(&self, other: &Self) -> bool {
// super slow, but only here for tests
let convert_to_comparable_map = |doc: &CompactDoc| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in doc.field_values.iter() {
let value: OwnedValue = doc.get_compact_doc_value(field_value.value_addr).into();
let value = serde_json::to_string(&value).unwrap();
field_value_set
.entry(Field::from_field_id(field_value.field as u32))
.or_default()
.insert(value);
}
field_value_set
};
let self_field_values: HashMap<Field, HashSet<String>> = convert_to_comparable_map(self);
let other_field_values: HashMap<Field, HashSet<String>> = convert_to_comparable_map(other);
self_field_values.eq(&other_field_values)
}
}
impl Eq for CompactDoc {}
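Equality reduces each document to a `HashMap<Field, HashSet<String>>` of JSON-serialized values, so the comparison ignores insertion order (and duplicates) per field. A small illustration through the public API, assuming `TantivyDocument` is the `CompactDoc` alias and the usual `tantivy::schema` exports:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);

    let mut a = TantivyDocument::default();
    a.add_text(title, "x");
    a.add_text(title, "y");

    let mut b = TantivyDocument::default();
    b.add_text(title, "y");
    b.add_text(title, "x");

    // Per-field values are compared as sets of serialized values,
    // so insertion order does not affect equality.
    assert!(a == b);
}
```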
impl DocumentDeserialize for CompactDoc {
fn deserialize<'de, D>(mut deserializer: D) -> Result<Self, DeserializeError>
where D: DocumentDeserializer<'de> {
let mut doc = CompactDoc::default();
// TODO: Deserializing into OwnedValue is wasteful. The deserializer should be able to work
// on slices and referenced data.
while let Some((field, value)) = deserializer.next_field::<OwnedValue>()? {
doc.add_field_value(field, &value);
}
Ok(doc)
}
}
/// A value of a `CompactDoc`; it needs a reference to the container to extract its payload
#[derive(Debug, Clone, Copy)]
pub struct CompactDocValue<'a> {
container: &'a CompactDoc,
value_addr: ValueAddr,
}
impl PartialEq for CompactDocValue<'_> {
fn eq(&self, other: &Self) -> bool {
let value1: OwnedValue = (*self).into();
let value2: OwnedValue = (*other).into();
value1 == value2
}
}
impl<'a> From<CompactDocValue<'a>> for OwnedValue {
fn from(value: CompactDocValue) -> Self {
value.as_value().into()
}
}
impl<'a> Value<'a> for CompactDocValue<'a> {
type ArrayIter = CompactDocArrayIter<'a>;
type ObjectIter = CompactDocObjectIter<'a>;
fn as_value(&self) -> ReferenceValue<'a, Self> {
self.get_ref_value().unwrap()
}
}
impl<'a> CompactDocValue<'a> {
fn get_ref_value(&self) -> io::Result<ReferenceValue<'a, CompactDocValue<'a>>> {
let addr = self.value_addr.val_addr;
match self.value_addr.type_id {
ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
ValueType::Str => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Str(str_ref).into())
}
ValueType::Facet => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Facet(str_ref).into())
}
ValueType::Bytes => {
let data = self.container.extract_bytes(addr);
Ok(ReferenceValueLeaf::Bytes(data).into())
}
ValueType::U64 => self
.container
.read_from::<u64>(addr)
.map(ReferenceValueLeaf::U64)
.map(Into::into),
ValueType::I64 => self
.container
.read_from::<i64>(addr)
.map(ReferenceValueLeaf::I64)
.map(Into::into),
ValueType::F64 => self
.container
.read_from::<f64>(addr)
.map(ReferenceValueLeaf::F64)
.map(Into::into),
ValueType::Bool => Ok(ReferenceValueLeaf::Bool(addr != 0).into()),
ValueType::Date => self
.container
.read_from::<i64>(addr)
.map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
.map(Into::into),
ValueType::IpAddr => self
.container
.read_from::<u128>(addr)
.map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
.map(Into::into),
ValueType::PreTokStr => self
.container
.read_from::<PreTokenizedString>(addr)
.map(Into::into)
.map(ReferenceValueLeaf::PreTokStr)
.map(Into::into),
ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
self.container,
addr,
)?)),
ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
self.container,
addr,
)?)),
}
}
}
/// The address in the vec
type Addr = u32;
#[derive(Clone, Copy, Default)]
#[repr(packed)]
/// The value type and the address to its payload in the container.
struct ValueAddr {
type_id: ValueType,
/// This is the address to the value in the vec, except for bool and null, which are inlined
val_addr: Addr,
}
impl BinarySerializable for ValueAddr {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
self.type_id.serialize(writer)?;
VInt(self.val_addr as u64).serialize(writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let type_id = ValueType::deserialize(reader)?;
let val_addr = VInt::deserialize(reader)?.0 as u32;
Ok(ValueAddr { type_id, val_addr })
}
}
impl std::fmt::Debug for ValueAddr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let val_addr = self.val_addr;
f.write_fmt(format_args!("{:?} at {:?}", self.type_id, val_addr))
}
}
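`ValueAddr` packs a one-byte type tag next to a `u32` offset. Without `#[repr(packed)]` the `u32` field's alignment would pad the struct to 8 bytes; packing keeps each stored value at 5 bytes. A standalone sketch of that size difference (the local types are illustrative copies, not the real ones):

```rust
// Illustrative copies of the two layouts; not the real tantivy types.
#[allow(dead_code)]
#[derive(Clone, Copy)]
#[repr(u8)]
enum TypeTag { Null = 0 }

#[allow(dead_code)]
#[derive(Clone, Copy)]
struct Padded { type_id: TypeTag, val_addr: u32 }

#[allow(dead_code)]
#[derive(Clone, Copy)]
#[repr(packed)]
struct Packed { type_id: TypeTag, val_addr: u32 }

fn main() {
    // With the default layout the u8 tag is padded so the u32 stays 4-byte aligned.
    assert_eq!(std::mem::size_of::<Padded>(), 8);
    // #[repr(packed)] drops the padding: 1 byte tag + 4 byte address.
    assert_eq!(std::mem::size_of::<Packed>(), 5);
}
```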
/// An enum representing a value for tantivy to index.
///
/// Any changes need to be reflected in `BinarySerializable` for `ValueType`
///
/// We can't use [schema::Type] or [columnar::ColumnType] here, because they are missing
/// some items like Array and PreTokStr.
#[derive(Default, Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum ValueType {
/// A null value.
#[default]
Null = 0,
/// The str type is used for any text information.
Str = 1,
/// Unsigned 64-bits Integer `u64`
U64 = 2,
/// Signed 64-bits Integer `i64`
I64 = 3,
/// 64-bits Float `f64`
F64 = 4,
/// Date/time with nanoseconds precision
Date = 5,
/// Facet
Facet = 6,
/// Arbitrarily sized byte array
Bytes = 7,
/// IPv6 address. Internally there is no IPv4; such addresses need to be converted to `Ipv6Addr`.
IpAddr = 8,
/// Bool value
Bool = 9,
/// Pre-tokenized str type.
PreTokStr = 10,
/// Object
Object = 11,
/// Array of values.
Array = 12,
}
impl BinarySerializable for ValueType {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
(*self as u8).serialize(writer)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let num = u8::deserialize(reader)?;
let type_id = if (0..=12).contains(&num) {
unsafe { std::mem::transmute(num) }
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid value type id: {num}"),
));
};
Ok(type_id)
}
}
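The `deserialize` above relies on the `0..=12` range check to keep the `transmute` sound; that range has to be updated whenever a variant is added. A safe, equivalent pattern is an exhaustive match on the discriminant, sketched here with a trimmed-down enum:

```rust
// Safe alternative to the range-checked transmute: map the discriminant back
// to the enum with an exhaustive match. Trimmed-down enum for illustration.

#[derive(Debug, PartialEq, Clone, Copy)]
#[repr(u8)]
enum ValueType { Null = 0, Str = 1, U64 = 2 }

fn value_type_from_u8(num: u8) -> std::io::Result<ValueType> {
    match num {
        0 => Ok(ValueType::Null),
        1 => Ok(ValueType::Str),
        2 => Ok(ValueType::U64),
        _ => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("Invalid value type id: {num}"),
        )),
    }
}

fn main() {
    assert_eq!(value_type_from_u8(1).unwrap(), ValueType::Str);
    assert!(value_type_from_u8(42).is_err());
}
```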
impl<'a, V: Value<'a>> From<&ReferenceValue<'a, V>> for ValueType {
fn from(value: &ReferenceValue<'a, V>) -> Self {
match value {
ReferenceValue::Leaf(leaf) => leaf.into(),
ReferenceValue::Array(_) => ValueType::Array,
ReferenceValue::Object(_) => ValueType::Object,
}
}
}
impl<'a> From<&ReferenceValueLeaf<'a>> for ValueType {
fn from(value: &ReferenceValueLeaf<'a>) -> Self {
match value {
ReferenceValueLeaf::Null => ValueType::Null,
ReferenceValueLeaf::Str(_) => ValueType::Str,
ReferenceValueLeaf::U64(_) => ValueType::U64,
ReferenceValueLeaf::I64(_) => ValueType::I64,
ReferenceValueLeaf::F64(_) => ValueType::F64,
ReferenceValueLeaf::Bool(_) => ValueType::Bool,
ReferenceValueLeaf::Date(_) => ValueType::Date,
ReferenceValueLeaf::IpAddr(_) => ValueType::IpAddr,
ReferenceValueLeaf::PreTokStr(_) => ValueType::PreTokStr,
ReferenceValueLeaf::Facet(_) => ValueType::Facet,
ReferenceValueLeaf::Bytes(_) => ValueType::Bytes,
}
}
}
#[derive(Debug, Clone)]
/// Iterator over the key/value entries of an object value in the compact document
pub struct CompactDocObjectIter<'a> {
container: &'a CompactDoc,
node_addresses_slice: &'a [u8],
}
impl<'a> CompactDocObjectIter<'a> {
fn new(container: &'a CompactDoc, addr: Addr) -> io::Result<Self> {
// Objects are `&[ValueAddr]` serialized into bytes
let node_addresses_slice = container.extract_bytes(addr);
Ok(Self {
container,
node_addresses_slice,
})
}
}
impl<'a> Iterator for CompactDocObjectIter<'a> {
type Item = (&'a str, CompactDocValue<'a>);
fn next(&mut self) -> Option<Self::Item> {
if self.node_addresses_slice.is_empty() {
return None;
}
let key_addr = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let key = self.container.extract_str(key_addr.val_addr);
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value_addr: value,
};
Some((key, value))
}
}
#[derive(Debug, Clone)]
/// Iterator over the elements of an array value in the compact document
pub struct CompactDocArrayIter<'a> {
container: &'a CompactDoc,
node_addresses_slice: &'a [u8],
}
impl<'a> CompactDocArrayIter<'a> {
fn new(container: &'a CompactDoc, addr: Addr) -> io::Result<Self> {
// Arrays are &[ValueAddr] serialized into bytes
let node_addresses_slice = container.extract_bytes(addr);
Ok(Self {
container,
node_addresses_slice,
})
}
}
impl<'a> Iterator for CompactDocArrayIter<'a> {
type Item = CompactDocValue<'a>;
fn next(&mut self) -> Option<Self::Item> {
if self.node_addresses_slice.is_empty() {
return None;
}
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value_addr: value,
};
Some(value)
}
}
impl Document for CompactDoc {
type Value<'a> = CompactDocValue<'a>;
type FieldsValuesIter<'a> = FieldValueIterRef<'a>;
fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> {
FieldValueIterRef {
slice: self.field_values.iter(),
container: self,
}
}
}
/// A helper wrapper for creating an iterator over the field values
pub struct FieldValueIterRef<'a> {
slice: std::slice::Iter<'a, FieldValueAddr>,
container: &'a CompactDoc,
}
impl<'a> Iterator for FieldValueIterRef<'a> {
type Item = (Field, CompactDocValue<'a>);
fn next(&mut self) -> Option<Self::Item> {
self.slice.next().map(|field_value| {
(
Field::from_field_id(field_value.field as u32),
CompactDocValue::<'a> {
container: self.container,
value_addr: field_value.value_addr,
},
)
})
}
}
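With this change the accessors hand back lightweight `CompactDocValue` borrows rather than `&OwnedValue`, which is why the test hunks that follow convert explicitly with `OwnedValue::from`. A short sketch of the new calling convention, assuming the usual public exports (`Schema`, `TantivyDocument`, `OwnedValue`, `TEXT`):

```rust
use tantivy::schema::{OwnedValue, Schema, TEXT};
use tantivy::TantivyDocument;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);

    let mut doc = TantivyDocument::default();
    doc.add_text(title, "first");
    doc.add_text(title, "second");

    // get_first/get_all now return CompactDocValue borrows; convert to
    // OwnedValue when an owned, comparable value is needed.
    let first: OwnedValue = doc.get_first(title).unwrap().into();
    assert_eq!(first, OwnedValue::Str("first".to_string()));

    let all: Vec<OwnedValue> = doc.get_all(title).map(OwnedValue::from).collect();
    assert_eq!(all.len(), 2);
}
```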
/// Error that may happen when deserializing
@@ -264,7 +725,40 @@ mod tests {
let text_field = schema_builder.add_text_field("title", TEXT);
let mut doc = TantivyDocument::default();
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
assert_eq!(doc.field_values().count(), 1);
let schema = schema_builder.build();
let _val = doc.get_first(text_field).unwrap();
let _json = doc.to_named_doc(&schema);
}
#[test]
fn test_json_value() {
let json_str = r#"{
"toto": "titi",
"float": -0.2,
"bool": true,
"unsigned": 1,
"signed": -2,
"complexobject": {
"field.with.dot": 1
},
"date": "1985-04-12T23:20:50.52Z",
"my_arr": [2, 3, {"my_key": "two tokens"}, 4, {"nested_array": [2, 5, 6, [7, 8, {"a": [{"d": {"e":[99]}}, 9000]}, 9, 10], [5, 5]]}]
}"#;
let json_val: std::collections::BTreeMap<String, OwnedValue> =
serde_json::from_str(json_str).unwrap();
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let mut doc = TantivyDocument::default();
doc.add_object(json_field, json_val);
let schema = schema_builder.build();
let json = doc.to_json(&schema);
let actual_json: serde_json::Value = serde_json::from_str(&json).unwrap();
let expected_json: serde_json::Value = serde_json::from_str(json_str).unwrap();
assert_eq!(actual_json["json"][0], expected_json);
}
// TODO: Should this be re-added with the serialize method

View File

@@ -5,21 +5,39 @@
//! and don't care about some of the more specialised types or only want to customise
//! part of the document structure.
use std::collections::{btree_map, hash_map, BTreeMap, HashMap};
use std::iter::Empty;
use std::net::Ipv6Addr;
use common::DateTime;
use serde_json::Number;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use super::facet::Facet;
use super::ReferenceValueLeaf;
use crate::schema::document::{
ArrayAccess, DeserializeError, Document, DocumentDeserialize, DocumentDeserializer,
ObjectAccess, ReferenceValue, Value, ValueDeserialize, ValueDeserializer, ValueVisitor,
};
use crate::schema::Field;
use crate::tokenizer::PreTokenizedString;
// Serde compatibility support.
pub fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if first_byte.is_ascii_digit() {
return true;
}
}
false
}
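`can_be_rfc3339_date_time` is just a cheap first-byte filter: RFC 3339 timestamps always start with a digit, so most plain strings skip the expensive parse entirely, and a failed parse still falls back to indexing the value as text. A compilable sketch of that flow, assuming the `time` crate with its `parsing` feature; the helper is re-declared locally so the example stands alone:

```rust
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

// Local copy of the first-byte filter so the sketch is self-contained.
fn can_be_rfc3339_date_time(text: &str) -> bool {
    matches!(text.as_bytes().first(), Some(b) if b.is_ascii_digit())
}

fn main() {
    for text in ["1985-04-12T23:20:50.52Z", "titi", "12 monkeys"] {
        // Only digit-initial strings pay for the RFC 3339 parse; a failed
        // parse falls back to treating the value as plain text.
        let indexed_as = if can_be_rfc3339_date_time(text)
            && OffsetDateTime::parse(text, &Rfc3339).is_ok()
        {
            "Date"
        } else {
            "Str"
        };
        println!("{text:?} -> {indexed_as}");
    }
}
```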
impl<'a> Value<'a> for &'a serde_json::Value {
type ArrayIter = std::slice::Iter<'a, serde_json::Value>;
type ObjectIter = JsonObjectIter<'a>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
match self {
serde_json::Value::Null => ReferenceValueLeaf::Null.into(),
@@ -35,7 +53,19 @@ impl<'a> Value<'a> for &'a serde_json::Value {
panic!("Unsupported serde_json number {number}");
}
}
serde_json::Value::String(val) => ReferenceValueLeaf::Str(val).into(),
serde_json::Value::String(text) => {
if can_be_rfc3339_date_time(text) {
match OffsetDateTime::parse(text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(time::UtcOffset::UTC);
ReferenceValueLeaf::Date(DateTime::from_utc(dt_utc)).into()
}
Err(_) => ReferenceValueLeaf::Str(text).into(),
}
} else {
ReferenceValueLeaf::Str(text).into()
}
}
serde_json::Value::Array(elements) => ReferenceValue::Array(elements.iter()),
serde_json::Value::Object(object) => {
ReferenceValue::Object(JsonObjectIter(object.iter()))
@@ -44,6 +74,126 @@ impl<'a> Value<'a> for &'a serde_json::Value {
}
}
impl<'a> Value<'a> for &'a String {
type ArrayIter = Empty<&'a String>;
type ObjectIter = Empty<(&'a str, &'a String)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a Facet {
type ArrayIter = Empty<&'a Facet>;
type ObjectIter = Empty<(&'a str, &'a Facet)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Facet(self.encoded_str()))
}
}
impl<'a> Value<'a> for &'a u64 {
type ArrayIter = Empty<&'a u64>;
type ObjectIter = Empty<(&'a str, &'a u64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::U64(**self))
}
}
impl<'a> Value<'a> for &'a i64 {
type ArrayIter = Empty<&'a i64>;
type ObjectIter = Empty<(&'a str, &'a i64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::I64(**self))
}
}
impl<'a> Value<'a> for &'a f64 {
type ArrayIter = Empty<&'a f64>;
type ObjectIter = Empty<(&'a str, &'a f64)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::F64(**self))
}
}
impl<'a> Value<'a> for &'a bool {
type ArrayIter = Empty<&'a bool>;
type ObjectIter = Empty<(&'a str, &'a bool)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bool(**self))
}
}
impl<'a> Value<'a> for &'a str {
type ArrayIter = Empty<&'a str>;
type ObjectIter = Empty<(&'a str, &'a str)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a &'a str {
type ArrayIter = Empty<&'a &'a str>;
type ObjectIter = Empty<(&'a str, &'a &'a str)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Str(self))
}
}
impl<'a> Value<'a> for &'a [u8] {
type ArrayIter = Empty<&'a [u8]>;
type ObjectIter = Empty<(&'a str, &'a [u8])>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a &'a [u8] {
type ArrayIter = Empty<&'a &'a [u8]>;
type ObjectIter = Empty<(&'a str, &'a &'a [u8])>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a Vec<u8> {
type ArrayIter = Empty<&'a Vec<u8>>;
type ObjectIter = Empty<(&'a str, &'a Vec<u8>)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self))
}
}
impl<'a> Value<'a> for &'a DateTime {
type ArrayIter = Empty<&'a DateTime>;
type ObjectIter = Empty<(&'a str, &'a DateTime)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::Date(**self))
}
}
impl<'a> Value<'a> for &'a Ipv6Addr {
type ArrayIter = Empty<&'a Ipv6Addr>;
type ObjectIter = Empty<(&'a str, &'a Ipv6Addr)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::IpAddr(**self))
}
}
impl<'a> Value<'a> for &'a PreTokenizedString {
type ArrayIter = Empty<&'a PreTokenizedString>;
type ObjectIter = Empty<(&'a str, &'a PreTokenizedString)>;
#[inline]
fn as_value(&self) -> ReferenceValue<'a, Self> {
ReferenceValue::Leaf(ReferenceValueLeaf::PreTokStr(Box::new((*self).clone())))
}
}
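These impls are what let plain Rust references be passed anywhere a `V: Value<'a>` is expected, e.g. `Document::add_field_value`. A minimal sketch of what `as_value` yields for two of them, assuming `Value`, `ReferenceValue` and `ReferenceValueLeaf` are reachable under `tantivy::schema::document` as the re-exports in this diff suggest:

```rust
use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};

fn main() {
    // &str implements Value, so as_value() yields a borrowed Str leaf.
    let text: &str = "hello";
    match text.as_value() {
        ReferenceValue::Leaf(ReferenceValueLeaf::Str(s)) => assert_eq!(s, "hello"),
        _ => panic!("expected a Str leaf"),
    }

    // &u64 implements Value as well, yielding a copied U64 leaf.
    let num: u64 = 42;
    match (&num).as_value() {
        ReferenceValue::Leaf(ReferenceValueLeaf::U64(v)) => assert_eq!(v, 42),
        _ => panic!("expected a U64 leaf"),
    }
}
```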
impl ValueDeserialize for serde_json::Value {
fn deserialize<'de, D>(deserializer: D) -> Result<Self, DeserializeError>
where D: ValueDeserializer<'de> {

View File

@@ -172,7 +172,9 @@ pub use self::de::{
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess,
ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor,
};
pub use self::default_document::{DocParsingError, TantivyDocument};
pub use self::default_document::{
CompactDocArrayIter, CompactDocObjectIter, CompactDocValue, DocParsingError, TantivyDocument,
};
pub use self::owned_value::OwnedValue;
pub(crate) use self::se::BinaryDocumentSerializer;
pub use self::value::{ReferenceValue, ReferenceValueLeaf, Value};
@@ -233,7 +235,7 @@ pub trait Document: Send + Sync + 'static {
let field_name = schema.get_field_name(field);
let values: Vec<OwnedValue> = field_values
.into_iter()
.map(|val| val.as_value().into())
.map(|val| OwnedValue::from(val.as_value()))
.collect();
field_map.insert(field_name.to_string(), values);
}

View File

@@ -8,6 +8,7 @@ use serde::de::{MapAccess, SeqAccess};
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use super::existing_type_impls::can_be_rfc3339_date_time;
use super::ReferenceValueLeaf;
use crate::schema::document::{
ArrayAccess, DeserializeError, ObjectAccess, ReferenceValue, Value, ValueDeserialize,
@@ -375,16 +376,6 @@ impl From<BTreeMap<String, OwnedValue>> for OwnedValue {
}
}
fn can_be_rfc3339_date_time(text: &str) -> bool {
if let Some(&first_byte) = text.as_bytes().first() {
if first_byte.is_ascii_digit() {
return true;
}
}
false
}
impl From<serde_json::Value> for OwnedValue {
fn from(value: serde_json::Value) -> Self {
match value {
@@ -472,6 +463,7 @@ mod tests {
let mut doc = TantivyDocument::default();
doc.add_bytes(bytes_field, "".as_bytes());
let json_string = doc.to_json(&schema);
assert_eq!(json_string, r#"{"my_bytes":[""]}"#);
}

View File

@@ -25,6 +25,7 @@ where W: Write
/// Attempts to serialize a given document and write the output
/// to the writer.
#[inline]
pub(crate) fn serialize_doc<D>(&mut self, doc: &D) -> io::Result<()>
where D: Document {
let stored_field_values = || {
@@ -57,9 +58,8 @@ where W: Write
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unexpected number of entries written to serializer, expected {} entries, got \
{} entries",
num_field_values, actual_length,
"Unexpected number of entries written to serializer, expected \
{num_field_values} entries, got {actual_length} entries",
),
));
}
@@ -679,6 +679,7 @@ mod tests {
);
}
#[inline]
fn serialize_doc<D: Document>(doc: &D, schema: &Schema) -> Vec<u8> {
let mut writer = Vec::new();

View File

@@ -17,15 +17,6 @@ pub trait Value<'a>: Send + Sync + Debug {
/// Returns the field value represented by an enum which borrows it's data.
fn as_value(&self) -> ReferenceValue<'a, Self>;
#[inline]
/// Returns if the value is `null` or not.
fn is_null(&self) -> bool {
matches!(
self.as_value(),
ReferenceValue::Leaf(ReferenceValueLeaf::Null)
)
}
#[inline]
/// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
@@ -117,18 +108,6 @@ pub trait Value<'a>: Send + Sync + Debug {
None
}
}
#[inline]
/// Returns true if the Value is an array.
fn is_array(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
#[inline]
/// Returns true if the Value is an object.
fn is_object(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
}
/// An enum representing a leaf value for tantivy to index.
@@ -159,6 +138,69 @@ pub enum ReferenceValueLeaf<'a> {
PreTokStr(Box<PreTokenizedString>),
}
impl From<u64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: u64) -> Self {
ReferenceValueLeaf::U64(value)
}
}
impl From<i64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: i64) -> Self {
ReferenceValueLeaf::I64(value)
}
}
impl From<f64> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: f64) -> Self {
ReferenceValueLeaf::F64(value)
}
}
impl From<bool> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: bool) -> Self {
ReferenceValueLeaf::Bool(value)
}
}
impl<'a> From<&'a str> for ReferenceValueLeaf<'a> {
#[inline]
fn from(value: &'a str) -> Self {
ReferenceValueLeaf::Str(value)
}
}
impl<'a> From<&'a [u8]> for ReferenceValueLeaf<'a> {
#[inline]
fn from(value: &'a [u8]) -> Self {
ReferenceValueLeaf::Bytes(value)
}
}
impl From<DateTime> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: DateTime) -> Self {
ReferenceValueLeaf::Date(value)
}
}
impl From<Ipv6Addr> for ReferenceValueLeaf<'_> {
#[inline]
fn from(value: Ipv6Addr) -> Self {
ReferenceValueLeaf::IpAddr(value)
}
}
impl From<PreTokenizedString> for ReferenceValueLeaf<'_> {
#[inline]
fn from(val: PreTokenizedString) -> Self {
ReferenceValueLeaf::PreTokStr(Box::new(val))
}
}
impl<'a, T: Value<'a> + ?Sized> From<ReferenceValueLeaf<'a>> for ReferenceValue<'a, T> {
#[inline]
fn from(value: ReferenceValueLeaf<'a>) -> Self {

View File

@@ -568,21 +568,21 @@ mod tests {
let schema = schema_builder.build();
let doc = TantivyDocument::parse_json(&schema, r#"{"id": 100}"#).unwrap();
assert_eq!(
&OwnedValue::Str("100".to_string()),
doc.get_first(text_field).unwrap()
OwnedValue::Str("100".to_string()),
doc.get_first(text_field).unwrap().into()
);
let doc = TantivyDocument::parse_json(&schema, r#"{"id": true}"#).unwrap();
assert_eq!(
&OwnedValue::Str("true".to_string()),
doc.get_first(text_field).unwrap()
OwnedValue::Str("true".to_string()),
doc.get_first(text_field).unwrap().into()
);
// Not sure if this null coercion is the best approach
let doc = TantivyDocument::parse_json(&schema, r#"{"id": null}"#).unwrap();
assert_eq!(
&OwnedValue::Str("null".to_string()),
doc.get_first(text_field).unwrap()
OwnedValue::Str("null".to_string()),
doc.get_first(text_field).unwrap().into()
);
}
@@ -595,9 +595,18 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"i64": "100", "u64": "100", "f64": "100"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&OwnedValue::I64(100), doc.get_first(i64_field).unwrap());
assert_eq!(&OwnedValue::U64(100), doc.get_first(u64_field).unwrap());
assert_eq!(&OwnedValue::F64(100.0), doc.get_first(f64_field).unwrap());
assert_eq!(
OwnedValue::I64(100),
doc.get_first(i64_field).unwrap().into()
);
assert_eq!(
OwnedValue::U64(100),
doc.get_first(u64_field).unwrap().into()
);
assert_eq!(
OwnedValue::F64(100.0),
doc.get_first(f64_field).unwrap().into()
);
}
#[test]
@@ -607,11 +616,17 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"bool": "true"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&OwnedValue::Bool(true), doc.get_first(bool_field).unwrap());
assert_eq!(
OwnedValue::Bool(true),
doc.get_first(bool_field).unwrap().into()
);
let doc_json = r#"{"bool": "false"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
assert_eq!(&OwnedValue::Bool(false), doc.get_first(bool_field).unwrap());
assert_eq!(
OwnedValue::Bool(false),
doc.get_first(bool_field).unwrap().into()
);
}
#[test]
@@ -644,7 +659,7 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let date = doc.get_first(date_field).unwrap();
let date = OwnedValue::from(doc.get_first(date_field).unwrap());
// Time zone is converted to UTC
assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
}

View File

@@ -1,46 +0,0 @@
use crate::schema::{Field, OwnedValue};
/// `FieldValue` holds together a `Field` and its `Value`.
#[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct FieldValue {
pub field: Field,
pub value: OwnedValue,
}
impl FieldValue {
/// Constructor
pub fn new(field: Field, value: OwnedValue) -> FieldValue {
FieldValue { field, value }
}
/// Field accessor
pub fn field(&self) -> Field {
self.field
}
/// Value accessor
pub fn value(&self) -> &OwnedValue {
&self.value
}
}
impl From<FieldValue> for OwnedValue {
fn from(field_value: FieldValue) -> Self {
field_value.value
}
}
/// A helper wrapper for creating standard iterators
/// out of the fields iterator trait.
pub struct FieldValueIter<'a>(pub(crate) std::slice::Iter<'a, FieldValue>);
impl<'a> Iterator for FieldValueIter<'a> {
type Item = (Field, &'a OwnedValue);
fn next(&mut self) -> Option<Self::Item> {
self.0
.next()
.map(|field_value| (field_value.field, &field_value.value))
}
}

View File

@@ -114,7 +114,6 @@ pub(crate) mod term;
mod field_entry;
mod field_type;
mod field_value;
mod bytes_options;
mod date_time_options;
@@ -138,7 +137,6 @@ pub use self::facet_options::FacetOptions;
pub use self::field::Field;
pub use self::field_entry::FieldEntry;
pub use self::field_type::{FieldType, Type};
pub use self::field_value::FieldValue;
pub use self::flags::{COERCE, FAST, INDEXED, STORED};
pub use self::index_record_option::IndexRecordOption;
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};

View File

@@ -645,15 +645,15 @@ mod tests {
let doc =
TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap();
assert_eq!(
doc.get_all(title).collect::<Vec<_>>(),
doc.get_all(title).map(OwnedValue::from).collect::<Vec<_>>(),
vec![
&OwnedValue::from("title1".to_string()),
&OwnedValue::from("title2".to_string())
OwnedValue::from("title1".to_string()),
OwnedValue::from("title2".to_string())
]
);
assert_eq!(
doc.get_all(val).collect::<Vec<_>>(),
vec![&OwnedValue::from(14u64), &OwnedValue::from(-1i64)]
doc.get_all(val).map(OwnedValue::from).collect::<Vec<_>>(),
vec![OwnedValue::from(14u64), OwnedValue::from(-1i64)]
);
}
@@ -682,7 +682,7 @@ mod tests {
let schema = schema_builder.build();
{
let doc = TantivyDocument::parse_json(&schema, "{}").unwrap();
assert!(doc.field_values().is_empty());
assert!(doc.field_values().next().is_none());
}
{
let doc = TantivyDocument::parse_json(

View File

@@ -59,9 +59,8 @@ pub mod tests {
use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::AliveBitSet;
use crate::schema::document::Value;
use crate::schema::{
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT,
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, Value, STORED, TEXT,
};
use crate::{Index, IndexWriter, Term};
@@ -92,8 +91,8 @@ pub mod tests {
StoreWriter::new(writer, compressor, blocksize, separate_thread).unwrap();
for i in 0..num_docs {
let mut doc = TantivyDocument::default();
doc.add_field_value(field_body, LOREM.to_string());
doc.add_field_value(field_title, format!("Doc {i}"));
doc.add_text(field_body, LOREM);
doc.add_text(field_title, format!("Doc {i}"));
store_writer.store(&doc, &schema).unwrap();
}
store_writer.close().unwrap();
@@ -119,10 +118,11 @@ pub mod tests {
let store = StoreReader::open(store_file, 10)?;
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
store
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap(),
format!("Doc {i}")
@@ -131,7 +131,13 @@ pub mod tests {
for doc in store.iter::<TantivyDocument>(Some(&alive_bitset)) {
let doc = doc?;
let title_content = doc.get_first(field_title).unwrap().as_str().unwrap();
let title_content = doc
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap()
.to_string();
if !title_content.starts_with("Doc ") {
panic!("unexpected title_content {title_content}");
}

View File

@@ -403,8 +403,7 @@ mod tests {
use super::*;
use crate::directory::RamDirectory;
use crate::schema::document::Value;
use crate::schema::{Field, TantivyDocument};
use crate::schema::{Field, TantivyDocument, Value};
use crate::store::tests::write_lorem_ipsum_store;
use crate::store::Compressor;
use crate::Directory;
@@ -412,7 +411,7 @@ mod tests {
const BLOCK_SIZE: usize = 16_384;
fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_str())
doc.get_first(*field).and_then(|f| f.as_value().as_str())
}
#[test]

View File

@@ -93,7 +93,7 @@ fn open_fst_index(fst_file: FileSlice) -> io::Result<tantivy_fst::Map<OwnedBytes
let fst = Fst::new(bytes).map_err(|err| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Fst data is corrupted: {:?}", err),
format!("Fst data is corrupted: {err:?}"),
)
})?;
Ok(tantivy_fst::Map::from(fst))

View File

@@ -95,7 +95,7 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
#[test]
fn test_term_dictionary_stream() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
@@ -156,7 +156,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
#[test]
fn test_stream_range() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();

View File

@@ -96,7 +96,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str())
@@ -116,7 +116,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{}", facet));
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str()) // ok test