Compare commits

..

14 Commits

Author SHA1 Message Date
Pascal Seitz
2ce485b8cc skip estimate phase for merge multivalue index
precompute stats for merge multivalue index + disable Line encoding for
multivalue index. That combination allows to skip the first estimation
pass. This gives up to 2x on merge performance on multivalue indices.

This change may decrease compression as Line is
very good compressible for documents, which have a fixed amount of
values in each doc. The line codec should be replaced.

```
merge_multi_and_multi          Avg: 22.7880ms (-47.15%)    Median: 22.5469ms (-47.38%)    [22.3691ms .. 25.8392ms]
merge_dense_and_dense          Avg: 14.4398ms (+2.18%)     Median: 14.2465ms (+0.74%)     [14.1620ms .. 16.1270ms]
merge_sparse_and_sparse        Avg: 10.6559ms (+1.10%)     Median: 10.6318ms (+0.91%)     [10.5527ms .. 11.2848ms]
merge_sparse_and_dense         Avg: 12.4886ms (+1.52%)     Median: 12.4044ms (+0.84%)     [12.3261ms .. 13.9439ms]
merge_multi_and_dense          Avg: 25.6686ms (-45.56%)    Median: 25.4851ms (-45.84%)    [25.1618ms .. 27.6226ms]
merge_multi_and_sparse         Avg: 24.3278ms (-47.00%)    Median: 24.1917ms (-47.34%)    [23.7159ms .. 27.0513ms]
```
2024-06-11 20:22:00 +08:00
PSeitz
c3b92a5412 fix compiler warning, cleanup (#2393)
fix compiler warning for missing feature flag
remove unused variables
cleanup unused methods
2024-06-11 16:03:50 +08:00
PSeitz
2f55511064 extend indexwriter proptests (#2342)
* index random values in proptest

* add proptest with multiple docs
2024-06-11 16:02:57 +08:00
trinity-1686a
08b9fc0b31 fix de-escaping too much in query parser (#2427)
* fix de-escaping too much in query parser
2024-06-10 11:19:01 +02:00
PSeitz
714f363d43 add bench & test for columnar merging (#2428)
* add merge columnar proptest

* add columnar merge benchmark
2024-06-10 16:26:16 +08:00
PSeitz
93ff7365b0 reduce top hits aggregation memory consumption (#2426)
move request structure out of top hits aggregation collector and use from the
passed structure instead

full
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 425.9680ms (-21.38%)    Median: 415.1097ms (-23.56%)    [395.5303ms .. 484.6325ms]
dense
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 440.0817ms (-19.68%)    Median: 432.2286ms (-21.10%)    [403.5632ms .. 497.7541ms]
sparse
terms_many_with_top_hits    Memory: 13.1 MB (-49.31%)    Avg: 33.3568ms (-32.19%)    Median: 33.0834ms (-31.86%)    [32.5126ms .. 35.7397ms]
multivalue
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 414.2340ms (-25.44%)    Median: 413.4144ms (-25.64%)    [403.9919ms .. 430.3170ms]
2024-06-06 22:32:58 +08:00
Adam Reichold
8151925068 Panicking in spawned Rayon tasks will abort the process by default. (#2409) 2024-06-04 17:04:30 +09:00
dependabot[bot]
b960e40bc8 Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423)
Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version.
- [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases)
- [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0)

---
updated-dependencies:
- dependency-name: sketches-ddsketch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-06-04 15:50:23 +08:00
giovannicuccu
1095c9b073 Issue 1787 extended stats (#2247)
* first version of extended stats along with its tests

* using IntermediateExtendStats instead of IntermediateStats with all tests passing

* Created struct for request and response

* first test with extended_stats

* kahan summation and tests with approximate equality

* version ready for merge

* removed approx dependency

* refactor for using ExtendedStats only when needed

* interim version

* refined version with code formatted

* refactored a struct

* cosmetic refactor

* fix after merge

* fix format

* added extended_stat bench

* merge and new benchmark for extended stats

* split stat segment collectors

* wrapped intermediate extended stat with a box to limit memory usage

* Revert "wrapped intermediate extended stat with a box to limit memory usage"

This reverts commit 5b4aa9f393.

* some code reformat, commented kahan summation

* refactor after review

* refactor after code review

* fix after incorrectly restoring kahan summation

* modifications for code review + bug fix in merge_fruit

* refactor assert_nearly_equals macro

* update after code review

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
2024-06-04 14:25:17 +08:00
PSeitz
c0686515a9 update one_shot (#2420) 2024-05-31 11:07:35 +08:00
trinity-1686a
455156f51c improve query parser (#2416)
* support escape sequence in more place

and fix bug with singlequoted strings

* add query parser test for range query on default field
2024-05-30 17:29:27 +02:00
Meng Zhang
4143d31865 chore: fix build as the rev is gone (#2417) 2024-05-29 09:49:16 +08:00
Hamir Mahal
0c634adbe1 style: simplify strings with string interpolation (#2412)
* style: simplify strings with string interpolation

* fix: formatting
2024-05-27 09:16:47 +02:00
PSeitz
2e3641c2ae return CompactDocValue instead of trait (#2410)
The CompactDocValue is easier to handle than the trait in some cases like comparison
and conversion
2024-05-27 07:33:50 +02:00
61 changed files with 2299 additions and 532 deletions

View File

@@ -15,8 +15,7 @@ rust-version = "1.63"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "c10a3ba" }
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
@@ -64,7 +63,7 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"

View File

@@ -47,6 +47,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, extendedstats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
@@ -105,7 +106,12 @@ fn stats_f64(index: &Index) {
});
exec_term_with_agg(index, agg_req)
}
fn extendedstats_f64(index: &Index) {
let agg_req = json!({
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
@@ -349,7 +355,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.map(|num| format!("author{num}"))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);

View File

@@ -141,12 +141,12 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let parse_json = false;
// for parse_json in [false, true] {
let suffix = if parse_json {
format!("{}-with-json-parsing", suffix)
format!("{suffix}-with-json-parsing")
} else {
suffix.to_string()
};
let bench_name = format!("{}{}", prefix, suffix);
let bench_name = format!("{prefix}{suffix}");
group.bench_function(bench_name, |b| {
benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
});

View File

@@ -23,6 +23,12 @@ downcast-rs = "1.2.0"
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.8.1"
[[bench]]
name = "bench_merge"
harness = false
[features]
unstable = []

View File

@@ -0,0 +1,97 @@
#![feature(test)]
extern crate test;
use core::fmt;
use std::fmt::{Display, Formatter};
use binggan::{black_box, BenchRunner};
use tantivy_columnar::*;
enum Card {
Multi,
Sparse,
Dense,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::Multi => write!(f, "multi"),
Card::Sparse => write!(f, "sparse"),
Card::Dense => write!(f, "dense"),
}
}
}
const NUM_DOCS: u32 = 1_000_000;
fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
use tantivy_columnar::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
match card {
Card::Multi => {
columnar_writer.record_numerical(0, "price", 10u64);
columnar_writer.record_numerical(0, "price", 10u64);
}
_ => {}
}
for i in 0..num_docs {
match card {
Card::Multi | Card::Sparse => {
if i % 8 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
Card::Dense => {
if i % 6 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
}
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}
fn main() {
let mut inputs = Vec::new();
let mut add_combo = |card1: Card, card2: Card| {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar(card1, NUM_DOCS),
generate_columnar(card2, NUM_DOCS),
],
));
};
add_combo(Card::Multi, Card::Multi);
add_combo(Card::Dense, Card::Dense);
add_combo(Card::Sparse, Card::Sparse);
add_combo(Card::Sparse, Card::Dense);
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(
input_name,
columnar_readers,
move |columnar_readers: &Vec<ColumnarReader>| {
let mut out = vec![];
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
black_box(out);
},
);
}
group.run();
}

View File

@@ -73,14 +73,18 @@ fn detect_cardinality(
pub fn merge_column_index<'a>(
columns: &'a [ColumnIndex],
merge_row_order: &'a MergeRowOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => {
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
}
MergeRowOrder::Stack(stack_merge_order) => merge_column_index_stacked(
columns,
cardinality_after_merge,
stack_merge_order,
num_values,
),
MergeRowOrder::Shuffled(complex_merge_order) => {
merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
}
@@ -167,8 +171,12 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 3);
let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
@@ -200,8 +208,12 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order, 6);
let SerializableColumnIndex::Multivalued {
indices: start_index_iterable,
..
} = merged_column_index
else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();

View File

@@ -22,7 +22,10 @@ pub fn merge_column_index_shuffled<'a>(
Cardinality::Multivalued => {
let multivalue_start_index =
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(multivalue_start_index)
SerializableColumnIndex::Multivalued {
indices: multivalue_start_index,
stats: None,
}
}
}
}

View File

@@ -1,6 +1,8 @@
use std::iter;
use std::num::NonZeroU64;
use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_values::ColumnStats;
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -12,6 +14,7 @@ pub fn merge_column_index_stacked<'a>(
columns: &'a [ColumnIndex],
cardinality_after_merge: Cardinality,
stack_merge_order: &'a StackMergeOrder,
num_values: u32,
) -> SerializableColumnIndex<'a> {
match cardinality_after_merge {
Cardinality::Full => SerializableColumnIndex::Full,
@@ -27,7 +30,17 @@ pub fn merge_column_index_stacked<'a>(
columns,
stack_merge_order,
};
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(stacked_multivalued_index),
stats: Some(ColumnStats {
gcd: NonZeroU64::new(1).unwrap(),
// The values in the multivalue index are the positions of the values
min_value: 0,
max_value: num_values as u64,
// This is num docs, but it starts at 0 so we need +1
num_rows: stack_merge_order.num_rows() + 1,
}),
}
}
}
}

View File

@@ -6,20 +6,29 @@ use std::sync::Arc;
use common::OwnedBytes;
use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
load_u64_based_column_values, serialize_u64_based_column_values,
serialize_u64_with_codec_and_stats, CodecType, ColumnStats, ColumnValues,
};
use crate::iterable::Iterable;
use crate::{DocId, RowId};
pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>,
stats: Option<ColumnStats>,
output: &mut impl Write,
) -> io::Result<()> {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
if let Some(stats) = stats {
// TODO: Add something with higher compression that doesn't require a full scan upfront
let estimator = CodecType::Bitpacked.estimator();
assert!(!estimator.requires_full_scan());
serialize_u64_with_codec_and_stats(multivalued_index, estimator, stats, output)?;
} else {
serialize_u64_based_column_values(
multivalued_index,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
}
Ok(())
}
@@ -52,7 +61,7 @@ impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
let mut buffer = Vec::new();
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
serialize_multivalued_index(&start_offsets, None, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes).unwrap()
}

View File

@@ -196,6 +196,7 @@ impl Set<RowId> for OptionalIndex {
} = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta);
let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),

View File

@@ -6,6 +6,7 @@ use common::{CountingWriter, OwnedBytes};
use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex;
use crate::column_values::ColumnStats;
use crate::iterable::Iterable;
use crate::{Cardinality, RowId};
@@ -15,9 +16,12 @@ pub enum SerializableColumnIndex<'a> {
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
num_rows: RowId,
},
// TODO remove the Arc<dyn> apart from serialization this is not
// dynamic at all.
Multivalued(Box<dyn Iterable<RowId> + 'a>),
Multivalued {
/// Iterator emitting the indices for the index
indices: Box<dyn Iterable<RowId> + 'a>,
/// In the merge case we can precompute the column stats
stats: Option<ColumnStats>,
},
}
impl<'a> SerializableColumnIndex<'a> {
@@ -25,7 +29,7 @@ impl<'a> SerializableColumnIndex<'a> {
match self {
SerializableColumnIndex::Full => Cardinality::Full,
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
SerializableColumnIndex::Multivalued { .. } => Cardinality::Multivalued,
}
}
}
@@ -44,9 +48,10 @@ pub fn serialize_column_index(
non_null_row_ids,
num_rows,
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
SerializableColumnIndex::Multivalued(multivalued_index) => {
serialize_multivalued_index(&*multivalued_index, &mut output)?
}
SerializableColumnIndex::Multivalued {
indices: multivalued_index,
stats,
} => serialize_multivalued_index(&*multivalued_index, stats, &mut output)?,
}
let column_index_num_bytes = output.written_bytes() as u32;
Ok(column_index_num_bytes)

View File

@@ -32,7 +32,8 @@ pub use u128_based::{
};
pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
serialize_u64_based_column_values, serialize_u64_with_codec_and_stats, CodecType,
ALL_U64_CODEC_TYPES,
};
pub use vec_column::VecColumn;

View File

@@ -128,6 +128,9 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
bit_packer.close(wrt)?;
Ok(())
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Bitpacked
}
}
pub struct BitpackedCodec;

View File

@@ -163,6 +163,10 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
Ok(())
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::BlockwiseLinear
}
}
pub struct BlockwiseLinearCodec;

View File

@@ -153,6 +153,12 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
self.collect_before_line_estimation(value);
}
}
fn requires_full_scan(&self) -> bool {
true
}
fn codec_type(&self) -> super::CodecType {
super::CodecType::Linear
}
}
impl LinearCodecEstimator {

View File

@@ -37,7 +37,11 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
/// This method will be called for each element of the column during
/// `estimation`.
fn collect(&mut self, value: u64);
/// Finalizes the first pass phase.
/// Returns true if the estimator needs a full pass over the column before serialization
fn requires_full_scan(&self) -> bool {
false
}
fn codec_type(&self) -> CodecType;
fn finalize(&mut self) {}
/// Returns an accurate estimation of the number of bytes that will
/// be used to represent this column.
@@ -150,34 +154,45 @@ pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
wrt: &mut dyn Write,
) -> io::Result<()> {
let mut stats_collector = StatsCollector::default();
let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
Vec::with_capacity(codec_types.len());
let mut estimators: Vec<Box<dyn ColumnCodecEstimator>> = Vec::with_capacity(codec_types.len());
for &codec_type in codec_types {
estimators.push((codec_type, codec_type.estimator()));
estimators.push(codec_type.estimator());
}
for val in vals.boxed_iter() {
let val_u64 = val.to_u64();
stats_collector.collect(val_u64);
for (_, estimator) in &mut estimators {
for estimator in &mut estimators {
estimator.collect(val_u64);
}
}
for (_, estimator) in &mut estimators {
for estimator in &mut estimators {
estimator.finalize();
}
let stats = stats_collector.stats();
let (_, best_codec, best_codec_estimator) = estimators
let (_, best_codec) = estimators
.into_iter()
.flat_map(|(codec_type, estimator)| {
.flat_map(|estimator| {
let num_bytes = estimator.estimate(&stats)?;
Some((num_bytes, codec_type, estimator))
Some((num_bytes, estimator))
})
.min_by_key(|(num_bytes, _, _)| *num_bytes)
.min_by_key(|(num_bytes, _)| *num_bytes)
.ok_or_else(|| {
io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
})?;
best_codec.to_code().serialize(wrt)?;
best_codec_estimator.serialize(
serialize_u64_with_codec_and_stats(vals, best_codec, stats, wrt)?;
Ok(())
}
/// Serializes a given column of u64-mapped values.
/// The codec estimator needs to be collected fully for the Line codec before calling this.
pub fn serialize_u64_with_codec_and_stats<T: MonotonicallyMappableToU64>(
vals: &dyn Iterable<T>,
codec: Box<dyn ColumnCodecEstimator>,
stats: ColumnStats,
wrt: &mut dyn Write,
) -> io::Result<()> {
codec.codec_type().to_code().serialize(wrt)?;
codec.serialize(
&stats,
&mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
wrt,

View File

@@ -3,7 +3,7 @@ mod merge_mapping;
mod term_merger;
use std::collections::{BTreeMap, HashSet};
use std::io;
use std::io::{self};
use std::net::Ipv6Addr;
use std::sync::Arc;
@@ -156,8 +156,15 @@ fn merge_column(
column_values.push(None);
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = column_values
.iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..],
column_values: &column_values[..],
@@ -183,8 +190,15 @@ fn merge_column(
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = column_values
.iter()
.map(|vals| vals.as_ref().map(|idx| idx.num_vals()).unwrap_or(0))
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
let merge_column_values = MergedColumnValues {
column_indexes: &column_indexes[..],
column_values: &column_values,
@@ -214,8 +228,19 @@ fn merge_column(
}
}
}
let merged_column_index =
crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
let num_values: u32 = bytes_columns
.iter()
.map(|vals| {
vals.as_ref()
.map(|idx| idx.term_ord_column.values.num_vals())
.unwrap_or(0)
})
.sum();
let merged_column_index = crate::column_index::merge_column_index(
&column_indexes[..],
merge_row_order,
num_values,
);
merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
}
}

View File

@@ -644,7 +644,10 @@ fn send_to_serialize_column_mappable_to_u128<
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: Default::default(), // TODO: implement stats for u128
}
}
};
crate::column::serialize_column_mappable_to_u128(
@@ -699,7 +702,10 @@ fn send_to_serialize_column_mappable_to_u64(
if sort_values_within_row {
sort_values_within_row_in_place(multivalued_index, values);
}
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued {
indices: Box::new(multivalued_index),
stats: None,
}
}
};
crate::column::serialize_column_mappable_to_u64(

View File

@@ -738,35 +738,22 @@ proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
test_columnar_docs(columnar_docs);
}
}
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
fn test_columnar_docs(columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>) {
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
stack_merge_order,
&mut output,
)
.unwrap();
@@ -777,6 +764,24 @@ fn test_columnar_merging_empty_columnar() {
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
#[test]
fn test_columnar_merging_empty_columnar() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
vec![vec![], vec![vec![("c1", ColumnValue::Str("a"))]]];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_simple() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
vec![],
vec![vec![
("c1", ColumnValue::Numerical(0u64.into())),
("c1", ColumnValue::Numerical(0u64.into())),
]],
];
test_columnar_docs(columnar_docs);
}
#[test]
fn test_columnar_merging_number_columns() {
let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![
@@ -793,25 +798,7 @@ fn test_columnar_merging_number_columns() {
vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
],
];
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]);
crate::merge_columnar(
&columnar_readers_arr[..],
&[],
crate::MergeRowOrder::Stack(stack_merge_order),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
test_columnar_docs(columnar_docs);
}
// TODO add non trivial remap and merge

View File

@@ -22,3 +22,6 @@ serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
proptest = "1.0.0"
rand = "0.8.4"
[features]
unstable = [] # useful for benches.

View File

@@ -4,7 +4,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, Schema, INDEXED, STORED, STRING};
use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
@@ -64,6 +64,7 @@ fn main() -> tantivy::Result<()> {
assert!(retrieved_doc
.get_first(occurred_at)
.unwrap()
.as_value()
.as_datetime()
.is_some(),);
assert_eq!(

View File

@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))?;
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
println!("add doc {i} from thread 1 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(20));
}
Result::<(), TantivyError>::Ok(())
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
body => "Some great book description..."
))?
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
println!("add doc {i} from thread 2 - opstamp {opstamp}");
thread::sleep(Duration::from_millis(10));
}
Result::<(), TantivyError>::Ok(())

View File

@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::iter::once;
use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
// special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
/// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
)(inp)
}
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
fn interpret_escape(source: &str) -> String {
let mut res = String::with_capacity(source.len());
let mut in_escape = false;
let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
for c in source.chars() {
if in_escape {
if !require_escape(c) {
// we re-add the escape sequence
res.push('\\');
}
res.push(c);
in_escape = false;
} else if c == '\\' {
in_escape = true;
} else {
res.push(c);
}
}
res
}
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> {
fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res(
recognize(tuple((
satisfy(|c| {
!c.is_whitespace()
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
}),
many0(satisfy(|c: char| {
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
alt((
preceded(char('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
)),
many0(alt((
preceded(char('\\'), anychar),
satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
))),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
_ => Ok(s),
s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
s => Ok(Cow::Borrowed(s)),
},
)(inp)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| {
opt_i_err(
preceded(
multispace0,
recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c)
}))),
fn word_infallible(
delimiter: &str,
emit_error: bool,
) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
// emit error is set when receiving an unescaped `:` should emit an error
move |inp| {
map(
opt_i_err(
preceded(
multispace0,
recognize(many1(alt((
preceded(char::<&str, _>('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
)))),
),
"expected word",
),
"expected word",
|(opt_s, mut errors)| match opt_s {
Some(s) => {
if emit_error
&& (s
.as_bytes()
.windows(2)
.any(|window| window[0] != b'\\' && window[1] == b':')
|| s.starts_with(':'))
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if s.contains('\\') {
(Some(Cow::Owned(interpret_escape(s))), errors)
} else {
(Some(Cow::Borrowed(s)), errors)
}
}
None => (None, errors),
},
)(inp)
}
}
@@ -159,7 +216,7 @@ fn simple_term_infallible(
(value((), char('\'')), simple_quotes),
),
// numbers are parsed with words in this case, as we allow string starting with a -
map(word_infallible(delimiter), |(text, errors)| {
map(word_infallible(delimiter, true), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|((field_name, _, leaf), mut errors)| {
(
leaf.map(|leaf| {
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
&& field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
tuple_infallible((
opt_i(anychar),
space0_infallible,
word_infallible("]}"),
word_infallible("]}", false),
space1_infallible,
opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
"missing keyword TO",
),
word_infallible("]}"),
word_infallible("]}", false),
opt_i_err(one_of("]}"), "missing range delimiter"),
)),
|(
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs,
)| {
let lower_bound = match (lower_bound_kind, lower) {
let lower_bound = match (lower_bound_kind, lower.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
// if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
_ => unreachable!("precondition failed, range did not start with [ or {{"),
};
let upper_bound = match (upper_bound_kind, upper) {
let upper_bound = match (upper_bound_kind, upper.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(
(
value((), tag(">=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag(">")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
}
#[test]
@@ -1590,5 +1644,21 @@ mod test {
r#"myfield:'hello\"happy\'tax'"#,
r#""myfield":'hello"happy'tax'"#,
);
// we don't process escape sequence for chars which don't require it
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
#[test]
fn test_queries_with_colons() {
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
}
#[test]
fn test_invalid_field() {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
}

View File

@@ -34,7 +34,7 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
};
@@ -146,6 +146,11 @@ pub enum AggregationVariants {
/// extracted values.
#[serde(rename = "stats")]
Stats(StatsAggregation),
/// Computes a collection of estended statistics (`min`, `max`, `sum`, `count`, `avg`,
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
/// `std_deviation_sampling`) over the extracted values.
#[serde(rename = "extended_stats")]
ExtendedStats(ExtendedStatsAggregation),
/// Computes the sum of the extracted values.
#[serde(rename = "sum")]
Sum(SumAggregation),
@@ -170,6 +175,7 @@ impl AggregationVariants {
AggregationVariants::Max(max) => vec![max.field_name()],
AggregationVariants::Min(min) => vec![min.field_name()],
AggregationVariants::Stats(stats) => vec![stats.field_name()],
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
@@ -197,6 +203,12 @@ impl AggregationVariants {
_ => None,
}
}
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
match &self {
AggregationVariants::TopHits(top_hits) => Some(top_hits),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self {

View File

@@ -11,8 +11,8 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
SumAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
StatsAggregation, SumAggregation,
};
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
@@ -276,6 +276,10 @@ impl AggregationWithAccessor {
field: ref field_name,
..
})
| ExtendedStats(ExtendedStatsAggregation {
field: ref field_name,
..
})
| Sum(SumAggregation {
field: ref field_name,
..
@@ -335,8 +339,8 @@ fn get_missing_val(
}
_ => {
return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {:?} for field {} is not supported for column type {:?}",
missing, field_name, column_type
"Missing value {missing:?} for field {field_name} is not supported for column \
type {column_type:?}"
)));
}
};
@@ -403,7 +407,7 @@ fn get_dynamic_columns(
.iter()
.map(|h| h.open())
.collect::<io::Result<_>>()?;
assert!(!ff_fields.is_empty(), "field {} not found", field_name);
assert!(!ff_fields.is_empty(), "field {field_name} not found");
Ok(cols)
}

View File

@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
use super::metric::{
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
};
use super::{AggregationError, Key};
use crate::TantivyError;
@@ -88,6 +90,8 @@ pub enum MetricResult {
Min(SingleMetricResult),
/// Stats metric result.
Stats(Stats),
/// ExtendedStats metric result.
ExtendedStats(Box<ExtendedStats>),
/// Sum metric result.
Sum(SingleMetricResult),
/// Percentiles metric result.
@@ -104,6 +108,7 @@ impl MetricResult {
MetricResult::Max(max) => Ok(max.value),
MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value),
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),

View File

@@ -357,8 +357,7 @@ impl SegmentTermCollector {
) -> crate::Result<Self> {
if field_type == ColumnType::Bytes {
return Err(TantivyError::InvalidArgument(format!(
"terms aggregation is not supported for column type {:?}",
field_type
"terms aggregation is not supported for column type {field_type:?}"
)));
}
let term_buckets = TermBuckets::default();

View File

@@ -19,8 +19,8 @@ use super::bucket::{
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
};
use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
};
use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey};
@@ -215,6 +215,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(),
)),
ExtendedStats(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(),
)),
@@ -222,7 +225,7 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
),
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
),
}
}
@@ -282,6 +285,8 @@ pub enum IntermediateMetricResult {
Min(IntermediateMin),
/// Intermediate stats result.
Stats(IntermediateStats),
/// Intermediate stats result.
ExtendedStats(IntermediateExtendedStats),
/// Intermediate sum result.
Sum(IntermediateSum),
/// Intermediate top_hits result
@@ -306,6 +311,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
MetricResult::ExtendedStats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
@@ -346,6 +354,12 @@ impl IntermediateMetricResult {
) => {
stats_left.merge_fruits(stats_right);
}
(
IntermediateMetricResult::ExtendedStats(extended_stats_left),
IntermediateMetricResult::ExtendedStats(extended_stats_right),
) => {
extended_stats_left.merge_fruits(extended_stats_right);
}
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right);
}

File diff suppressed because it is too large Load Diff

View File

@@ -18,6 +18,7 @@
mod average;
mod count;
mod extended_stats;
mod max;
mod min;
mod percentiles;
@@ -29,6 +30,7 @@ use std::collections::HashMap;
pub use average::*;
pub use count::*;
pub use extended_stats::*;
pub use max::*;
pub use min::*;
pub use percentiles::*;

View File

@@ -1,3 +1,5 @@
use std::fmt::Debug;
use serde::{Deserialize, Serialize};
use super::*;
@@ -85,13 +87,15 @@ impl Stats {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {
/// The number of extracted values.
count: u64,
pub(crate) count: u64,
/// The sum of the extracted values.
sum: f64,
pub(crate) sum: f64,
/// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
pub(crate) delta: f64,
/// The min value.
min: f64,
pub(crate) min: f64,
/// The max value.
max: f64,
pub(crate) max: f64,
}
impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
Self {
count: 0,
sum: 0.0,
delta: 0.0,
min: f64::MAX,
max: f64::MIN,
}
@@ -109,7 +114,13 @@ impl IntermediateStats {
/// Merges the other stats intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateStats) {
self.count += other.count;
self.sum += other.sum;
// kahan algorithm for sum
let y = other.sum - (self.delta + other.delta);
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(other.min);
self.max = self.max.max(other.max);
}
@@ -141,9 +152,15 @@ impl IntermediateStats {
}
#[inline]
fn collect(&mut self, value: f64) {
pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
self.count += 1;
self.sum += value;
// kahan algorithm for sum
let y = value - self.delta;
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(value);
self.max = self.max.max(value);
}
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[cfg(test)]
mod tests {
use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations};

View File

@@ -1,7 +1,7 @@
use std::collections::HashMap;
use std::net::Ipv6Addr;
use columnar::{ColumnarReader, DynamicColumn};
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
use common::DateTime;
use regex::Regex;
@@ -131,8 +131,8 @@ impl<'de> Deserialize<'de> for KeyOrder {
))?;
if key_order.next().is_some() {
return Err(serde::de::Error::custom(format!(
"Expected exactly one key-value pair in sort parameter of top_hits, found {:?}",
key_order
"Expected exactly one key-value pair in sort parameter of top_hits, found \
{key_order:?}"
)));
}
Ok(Self { field, order })
@@ -144,27 +144,22 @@ fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
Regex::new(&sanitized.replace('*', ".*")).map_err(|e| {
crate::TantivyError::SchemaError(format!(
"Invalid regex '{}' in docvalue_fields: {}",
glob, e
))
crate::TantivyError::SchemaError(format!("Invalid regex '{glob}' in docvalue_fields: {e}"))
})
}
fn use_doc_value_fields_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation",
parameter
"The `{parameter}` parameter is not supported, only `docvalue_fields` is supported in \
`top_hits` aggregation"
)),
))
}
fn unsupported_err(parameter: &str) -> crate::Result<()> {
Err(crate::TantivyError::AggregationError(
AggregationError::InvalidRequest(format!(
"The `{}` parameter is not supported in the `top_hits` aggregation",
parameter
"The `{parameter}` parameter is not supported in the `top_hits` aggregation"
)),
))
}
@@ -217,8 +212,7 @@ impl TopHitsAggregation {
.collect::<Vec<_>>();
assert!(
!fields.is_empty(),
"No fields matched the glob '{}' in docvalue_fields",
field
"No fields matched the glob '{field}' in docvalue_fields"
);
Ok(fields)
})
@@ -254,7 +248,7 @@ impl TopHitsAggregation {
.map(|field| {
let accessors = accessors
.get(field)
.unwrap_or_else(|| panic!("field '{}' not found in accessors", field));
.unwrap_or_else(|| panic!("field '{field}' not found in accessors"));
let values: Vec<FastFieldValue> = accessors
.iter()
@@ -449,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
impl TopHitsTopNComputer {
/// Create a new TopHitsCollector
pub fn new(req: TopHitsAggregation) -> Self {
pub fn new(req: &TopHitsAggregation) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req,
req: req.clone(),
}
}
@@ -497,7 +491,6 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
accessor_idx: usize,
req: TopHitsAggregation,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
}
@@ -508,7 +501,6 @@ impl TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
) -> Self {
Self {
req: req.clone(),
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
segment_ordinal,
accessor_idx,
@@ -517,14 +509,13 @@ impl TopHitsSegmentCollector {
fn into_top_hits_collector(
self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregation,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec();
for res in top_results {
let doc_value_fields = self
.req
.get_document_field_data(value_accessors, res.doc.doc_id);
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect(
DocSortValuesAndFields {
sorts: res.feature,
@@ -536,34 +527,15 @@ impl TopHitsSegmentCollector {
top_hits_computer
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let intermediate_result =
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
fn collect(
/// TODO add a specialized variant for a single sort field
fn collect_with(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
req: &TopHitsAggregation,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
let sorts: Vec<DocValueAndOrder> = self
.req
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
@@ -588,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
);
Ok(())
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let intermediate_result = IntermediateMetricResult::TopHits(
self.into_top_hits_collector(value_accessors, tophits_req),
);
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
/// TODO: Consider a caching layer to reduce the call overhead
fn collect(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
self.collect_with(doc_id, tophits_req, accessors)?;
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
// TODO: Consider getting fields with the column block accessor.
for doc in docs {
self.collect(*doc, agg_with_accessor)?;
self.collect_with(*doc, tophits_req, accessors)?;
}
Ok(())
}

View File

@@ -158,15 +158,14 @@ use serde::de::{self, Visitor};
use serde::{Deserialize, Deserializer, Serialize};
fn parse_str_into_f64<E: de::Error>(value: &str) -> Result<f64, E> {
let parsed = value.parse::<f64>().map_err(|_err| {
de::Error::custom(format!("Failed to parse f64 from string: {:?}", value))
})?;
let parsed = value
.parse::<f64>()
.map_err(|_err| de::Error::custom(format!("Failed to parse f64 from string: {value:?}")))?;
// Check if the parsed value is NaN or infinity
if parsed.is_nan() || parsed.is_infinite() {
Err(de::Error::custom(format!(
"Value is not a valid f64 (NaN or Infinity): {:?}",
value
"Value is not a valid f64 (NaN or Infinity): {value:?}"
)))
} else {
Ok(parsed)

View File

@@ -11,12 +11,12 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation,
};
use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::TopHitsSegmentCollector;
use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result(
@@ -148,6 +148,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
*missing,
))),
ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
)),
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Sum,

View File

@@ -598,7 +598,7 @@ mod tests {
let mid = n % 4;
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
Facet::from(&format!("/top{top}/mid{mid}/leaf{leaf}"))
})
.collect();
for i in 0..num_facets * 10 {
@@ -737,7 +737,7 @@ mod tests {
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
@@ -785,7 +785,7 @@ mod tests {
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let facet = Facet::from(&format!("/facet/{c}"));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})

View File

@@ -871,7 +871,10 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
use crate::{
assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
SegmentReader,
};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();

View File

@@ -195,7 +195,7 @@ mod tests {
let (tx, rx) = crossbeam_channel::bounded::<()>(0);
let rx = Arc::new(rx);
let executor = Executor::multi_thread(3, "search-test").unwrap();
for i in 0..1000 {
for _ in 0..1000 {
let counter_clone: Arc<AtomicU64> = counter.clone();
let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
@@ -203,18 +203,18 @@ mod tests {
let rx_clone2 = rx.clone();
let fut = executor.spawn_blocking(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone.recv().unwrap();
let _ = rx_clone.recv();
});
futures.push(fut);
let other_fut = executor.spawn_blocking(move || {
other_counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone2.recv().unwrap();
let _ = rx_clone2.recv();
});
other_futures.push(other_fut);
}
// We execute 100 futures.
for i in 0..100 {
for _ in 0..100 {
tx.send(()).unwrap();
}
@@ -226,7 +226,7 @@ mod tests {
drop(other_futures);
// We execute 100 futures.
for i in 0..100 {
for _ in 0..100 {
tx.send(()).unwrap();
}

View File

@@ -338,14 +338,14 @@ mod tests {
let mut term = Term::from_field_json_path(field, "attributes.color", false);
term.append_type_and_str("red");
assert_eq!(
format!("{:?}", term),
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
);
let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
term.append_type_and_fast_value(400i64);
assert_eq!(
format!("{:?}", term),
format!("{term:?}"),
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
);
}

View File

@@ -566,7 +566,7 @@ mod tests {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let num_paths = 10;
let paths: Vec<PathBuf> = (0..num_paths)
.map(|i| PathBuf::from(&*format!("file_{}", i)))
.map(|i| PathBuf::from(&*format!("file_{i}")))
.collect();
{
for path in &paths {

View File

@@ -62,7 +62,7 @@ impl FacetReader {
#[cfg(test)]
mod tests {
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test]
@@ -88,7 +88,9 @@ mod tests {
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
let value = doc
.get_first(facet_field)
.and_then(|v| v.as_value().as_facet());
assert_eq!(value, None);
}

View File

@@ -252,9 +252,8 @@ impl IndexBuilder {
let field_type = entry.field_type().value_type();
if !supported_field_types.contains(&field_type) {
return Err(TantivyError::InvalidArgument(format!(
"Unsupported field type in sort_by_field: {:?}. Supported field types: \
{:?} ",
field_type, supported_field_types,
"Unsupported field type in sort_by_field: {field_type:?}. Supported field \
types: {supported_field_types:?} ",
)));
}
}

View File

@@ -318,14 +318,14 @@ impl SegmentReader {
if create_canonical {
// Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path = format!("{field_name}.{escaped_json_path}");
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
format!("{field_name}.{json_path}")
}
};
indexed_fields.extend(

View File

@@ -808,7 +808,7 @@ mod tests {
use proptest::prop_oneof;
use super::super::operation::UserOperation;
use crate::collector::TopDocs;
use crate::collector::{Count, TopDocs};
use crate::directory::error::LockError;
use crate::error::*;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
@@ -816,7 +816,7 @@ mod tests {
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
@@ -1572,20 +1572,74 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone)]
enum IndexingOp {
AddDoc { id: u64 },
DeleteDoc { id: u64 },
DeleteDocQuery { id: u64 },
AddMultipleDoc {
id: u64,
num_docs: u64,
value: IndexValue,
},
AddDoc {
id: u64,
value: IndexValue,
},
DeleteDoc {
id: u64,
},
DeleteDocQuery {
id: u64,
},
Commit,
Merge,
}
impl IndexingOp {
fn add(id: u64) -> Self {
IndexingOp::AddDoc {
id,
value: IndexValue::F64(id as f64),
}
}
}
use serde::Serialize;
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
enum IndexValue {
Str(String),
F64(f64),
U64(u64),
I64(i64),
}
impl Default for IndexValue {
fn default() -> Self {
IndexValue::F64(0.0)
}
}
fn value_strategy() -> impl Strategy<Value = IndexValue> {
prop_oneof![
any::<f64>().prop_map(IndexValue::F64),
any::<u64>().prop_map(IndexValue::U64),
any::<i64>().prop_map(IndexValue::I64),
any::<String>().prop_map(IndexValue::Str),
]
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..20u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
((0u64..20u64), (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
@@ -1595,7 +1649,17 @@ mod tests {
prop_oneof![
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
5 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDocQuery { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
50 => (0u64..100u64, value_strategy())
.prop_map(move |(id, value)| IndexingOp::AddDoc { id, value }),
50 => (0u64..100u64, (1u64..100), value_strategy()).prop_map(
move |(id, num_docs, value)| {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
}
}
),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
@@ -1604,19 +1668,27 @@ mod tests {
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new();
for &op in ops {
for op in ops {
match op {
IndexingOp::AddDoc { id } => {
*existing_ids.entry(id).or_insert(0) += 1;
deleted_ids.remove(&id);
IndexingOp::AddDoc { id, value: _ } => {
*existing_ids.entry(*id).or_insert(0) += 1;
deleted_ids.remove(id);
}
IndexingOp::AddMultipleDoc {
id,
num_docs,
value: _,
} => {
*existing_ids.entry(*id).or_insert(0) += num_docs;
deleted_ids.remove(id);
}
IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
deleted_ids.insert(*id);
}
IndexingOp::DeleteDocQuery { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
deleted_ids.insert(*id);
}
_ => {}
}
@@ -1626,16 +1698,19 @@ mod tests {
fn get_id_list(ops: &[IndexingOp]) -> Vec<u64> {
let mut id_list = Vec::new();
for &op in ops {
for op in ops {
match op {
IndexingOp::AddDoc { id } => {
id_list.push(id);
IndexingOp::AddDoc { id, value: _ } => {
id_list.push(*id);
}
IndexingOp::AddMultipleDoc { id, .. } => {
id_list.push(*id);
}
IndexingOp::DeleteDoc { id } => {
id_list.retain(|el| *el != id);
id_list.retain(|el| el != id);
}
IndexingOp::DeleteDocQuery { id } => {
id_list.retain(|el| *el != id);
id_list.retain(|el| el != id);
}
_ => {}
}
@@ -1716,42 +1791,59 @@ mod tests {
let ip_from_id = |id| Ipv6Addr::from_u128(id as u128);
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_is_full_doc(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
))?;
} else {
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()});
index_writer.add_document(doc!(id_field=>id,
json_field=>json,
bytes_field => id.to_le_bytes().as_slice(),
id_opt_field => id,
ip_field => ip,
ips_field => ip,
ips_field => ip,
multi_numbers=> id,
multi_numbers => id,
bool_field => (id % 2u64) != 0,
i64_field => id as i64,
f64_field => id as f64,
date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) != 0,
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
))?;
}
let add_docs = |index_writer: &mut IndexWriter,
id: u64,
value: IndexValue,
num: u64|
-> crate::Result<()> {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
let doc = if !id_is_full_doc(id) {
// every 3rd doc has no ip field
doc!(
id_field=>id,
)
} else {
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string(), "val": value});
doc!(id_field=>id,
json_field=>json,
bytes_field => id.to_le_bytes().as_slice(),
id_opt_field => id,
ip_field => ip,
ips_field => ip,
ips_field => ip,
multi_numbers=> id,
multi_numbers => id,
bool_field => (id % 2u64) != 0,
i64_field => id as i64,
f64_field => id as f64,
date_field => DateTime::from_timestamp_secs(id as i64),
multi_bools => (id % 2u64) != 0,
multi_bools => (id % 2u64) == 0,
text_field => id.to_string(),
facet_field => facet,
large_text_field => LOREM,
multi_text_fields => multi_text_field_text1,
multi_text_fields => multi_text_field_text2,
multi_text_fields => multi_text_field_text3,
)
};
for _ in 0..num {
index_writer.add_document(doc.clone())?;
}
Ok(())
};
for op in ops {
match op.clone() {
IndexingOp::AddMultipleDoc {
id,
num_docs,
value,
} => {
add_docs(&mut index_writer, id, value, num_docs)?;
}
IndexingOp::AddDoc { id, value } => {
add_docs(&mut index_writer, id, value, 1)?;
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1979,7 +2071,13 @@ mod tests {
.unwrap();
// test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
let id = doc
.unwrap()
.get_first(id_field)
.unwrap()
.as_value()
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
}
// test store random access
@@ -2026,18 +2124,22 @@ mod tests {
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
};
let count_search = |term: &str, field| {
let query = QueryParser::for_index(&index, vec![field])
.parse_query(term)
.unwrap();
searcher.search(&query, &Count).unwrap()
};
let do_search2 = |term: Term| {
let count_search2 = |term: Term| {
let query = TermQuery::new(term, IndexRecordOption::Basic);
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
searcher.search(&query, &Count).unwrap()
};
for (id, count) in &expected_ids_and_num_occurrences {
// skip expensive queries
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
let get_num_hits = |field| count_search(&existing_id.to_string(), field) as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) {
continue;
@@ -2047,29 +2149,31 @@ mod tests {
assert_eq!(get_num_hits(f64_field), count);
// Test multi text
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_with_values
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_with_values
);
if num_docs_with_values < 1000 {
assert_eq!(
do_search("\"test1 test2\"", multi_text_fields).len(),
num_docs_with_values
);
assert_eq!(
do_search("\"test2 test3\"", multi_text_fields).len(),
num_docs_with_values
);
}
// Test bytes
let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
// Test date
let term = Term::from_field_date(
date_field,
DateTime::from_timestamp_secs(existing_id as i64),
);
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
}
for deleted_id in deleted_ids {
let assert_field = |field| {
assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0);
assert_eq!(count_search(&deleted_id.to_string(), field) as u64, 0);
};
assert_field(text_field);
assert_field(f64_field);
@@ -2078,12 +2182,12 @@ mod tests {
// Test bytes
let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice());
assert_eq!(do_search2(term).len() as u64, 0);
assert_eq!(count_search2(term), 0);
// Test date
let term =
Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
assert_eq!(do_search2(term).len() as u64, 0);
assert_eq!(count_search2(term), 0);
}
// search ip address
//
@@ -2092,13 +2196,13 @@ mod tests {
if !id_is_full_doc(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
// Test incoming ip as ipv6
assert_eq!(do_search_ip_field(&format!("\"{ip_addr}\"")), count);
let term = Term::from_field_ip_addr(ip_field, ip_addr);
assert_eq!(do_search2(term).len() as u64, count);
assert_eq!(count_search2(term) as u64, count);
// Test incoming ip as ipv4
if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
@@ -2115,7 +2219,7 @@ mod tests {
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
let calc_expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
@@ -2131,18 +2235,17 @@ mod tests {
}
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
let expected_count = calc_expected_count(left_sample);
if !left_sample.is_empty() && expected_count < 1000 {
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
@@ -2154,19 +2257,19 @@ mod tests {
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let expected_count = calc_expected_count(right_sample);
if !right_sample.is_empty() && expected_count < 1000 {
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(count_search(&query, id_opt_field) as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
@@ -2191,7 +2294,7 @@ mod tests {
};
let ip = ip_from_id(existing_id);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let do_search_ip_field = |term: &str| count_search(term, ip_field) as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
@@ -2251,7 +2354,7 @@ mod tests {
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::add(id)).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
@@ -2259,8 +2362,8 @@ mod tests {
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 81 },
IndexingOp::AddDoc { id: 70 },
IndexingOp::add(81),
IndexingOp::add(70),
IndexingOp::DeleteDoc { id: 70 }
],
true,
@@ -2269,14 +2372,45 @@ mod tests {
.is_ok());
}
#[test]
fn test_simple_multiple_doc() {
assert!(test_operation_strategy(
&[
IndexingOp::AddMultipleDoc {
id: 7,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 92,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 30,
num_docs: 800,
value: IndexValue::U64(0),
},
IndexingOp::AddMultipleDoc {
id: 33,
num_docs: 800,
value: IndexValue::U64(0),
},
],
true,
false
)
.is_ok());
}
#[test]
fn test_ip_range_query_multivalue_bug() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 2 },
IndexingOp::add(2),
IndexingOp::Commit,
IndexingOp::AddDoc { id: 1 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(1),
IndexingOp::add(1),
IndexingOp::Commit,
IndexingOp::Merge
],
@@ -2290,11 +2424,11 @@ mod tests {
fn test_ff_num_ips_regression() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(13),
IndexingOp::add(1),
IndexingOp::Commit,
IndexingOp::DeleteDocQuery { id: 13 },
IndexingOp::AddDoc { id: 1 },
IndexingOp::add(1),
IndexingOp::Commit,
],
false,
@@ -2306,7 +2440,7 @@ mod tests {
#[test]
fn test_minimal_sort_force_end_merge() {
assert!(test_operation_strategy(
&[IndexingOp::AddDoc { id: 23 }, IndexingOp::AddDoc { id: 13 },],
&[IndexingOp::add(23), IndexingOp::add(13),],
false,
false
)
@@ -2367,8 +2501,8 @@ mod tests {
fn test_minimal_sort_force_end_merge_with_delete() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 23 },
IndexingOp::AddDoc { id: 13 },
IndexingOp::add(23),
IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 }
],
true,
@@ -2381,8 +2515,8 @@ mod tests {
fn test_minimal_no_sort_no_force_end_merge() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 23 },
IndexingOp::AddDoc { id: 13 },
IndexingOp::add(23),
IndexingOp::add(13),
IndexingOp::DeleteDoc { id: 13 }
],
false,
@@ -2393,7 +2527,7 @@ mod tests {
#[test]
fn test_minimal_sort_merge() {
assert!(test_operation_strategy(&[IndexingOp::AddDoc { id: 3 },], true, true).is_ok());
assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok());
}
use proptest::prelude::*;
@@ -2489,14 +2623,14 @@ mod tests {
fn test_delete_bug_reproduction_ip_addr() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 1 },
AddDoc { id: 2 },
IndexingOp::add(1),
IndexingOp::add(2),
Commit,
AddDoc { id: 3 },
IndexingOp::add(3),
DeleteDoc { id: 1 },
Commit,
Merge,
AddDoc { id: 4 },
IndexingOp::add(4),
Commit,
];
test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2505,7 +2639,13 @@ mod tests {
#[test]
fn test_merge_regression_1() {
use IndexingOp::*;
let ops = &[AddDoc { id: 15 }, Commit, AddDoc { id: 9 }, Commit, Merge];
let ops = &[
IndexingOp::add(15),
Commit,
IndexingOp::add(9),
Commit,
Merge,
];
test_operation_strategy(&ops[..], false, true).unwrap();
}
@@ -2513,9 +2653,9 @@ mod tests {
fn test_range_query_bug_1() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 9 },
AddDoc { id: 0 },
AddDoc { id: 13 },
IndexingOp::add(9),
IndexingOp::add(0),
IndexingOp::add(13),
Commit,
];
test_operation_strategy(&ops[..], false, true).unwrap();
@@ -2523,12 +2663,11 @@ mod tests {
#[test]
fn test_range_query_bug_2() {
use IndexingOp::*;
let ops = &[
AddDoc { id: 3 },
AddDoc { id: 6 },
AddDoc { id: 9 },
AddDoc { id: 10 },
IndexingOp::add(3),
IndexingOp::add(6),
IndexingOp::add(9),
IndexingOp::add(10),
];
test_operation_strategy(&ops[..], false, false).unwrap();
}
@@ -2550,7 +2689,7 @@ mod tests {
assert!(test_operation_strategy(
&[
IndexingOp::DeleteDoc { id: 0 },
IndexingOp::AddDoc { id: 6 },
IndexingOp::add(6),
IndexingOp::DeleteDocQuery { id: 11 },
IndexingOp::Commit,
IndexingOp::Merge,
@@ -2567,10 +2706,13 @@ mod tests {
fn test_bug_1617_2() {
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc { id: 13 },
IndexingOp::AddDoc {
id: 13,
value: Default::default()
},
IndexingOp::DeleteDoc { id: 13 },
IndexingOp::Commit,
IndexingOp::AddDoc { id: 30 },
IndexingOp::add(30),
IndexingOp::Commit,
IndexingOp::Merge,
],

View File

@@ -696,7 +696,7 @@ impl IndexMerger {
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let (_, doc_bytes) = doc_bytes_res?;
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(format!(
@@ -728,7 +728,7 @@ impl IndexMerger {
|| store_reader.decompressor() != store_writer.compressor().into()
{
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let (_, doc_bytes) = doc_bytes_res?;
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
@@ -787,6 +787,8 @@ impl IndexMerger {
mod tests {
use columnar::Column;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use schema::FAST;
use crate::collector::tests::{
@@ -794,10 +796,11 @@ mod tests {
};
use crate::collector::{Count, FacetCollector};
use crate::index::{Index, SegmentId};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT,
TextFieldIndexing, Value, INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{
@@ -909,15 +912,24 @@ mod tests {
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("af b")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
assert_eq!(
doc.get_first(text_field).unwrap().as_value().as_str(),
Some("a b c d")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
@@ -1522,6 +1534,112 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum IndexingOp {
ZeroVal,
OneVal { val: u64 },
TwoVal { val: u64 },
Commit,
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
]
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
assert!(test_merge_int_fields(&ops[..]).is_ok());
}
}
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
if ops.iter().all(|op| *op == IndexingOp::Commit) {
return Ok(());
}
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
.iter()
.filter(|op| *op != &IndexingOp::Commit)
.map(|op| match op {
IndexingOp::ZeroVal => vec![],
IndexingOp::OneVal { val } => vec![*val],
IndexingOp::TwoVal { val } => vec![*val, *val],
IndexingOp::Commit => unreachable!(),
})
.enumerate()
.map(|(id, val)| (id as u32, val))
.collect();
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc).unwrap();
};
for op in ops {
match op {
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
IndexingOp::Commit => {
index_writer.commit().expect("commit failed");
}
}
}
index_writer.commit().expect("commit failed");
}
{
let mut segment_ids = index.searchable_segment_ids()?;
segment_ids.sort();
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
reader.reload()?;
let mut vals: Vec<u64> = Vec::new();
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
vals.clear();
vals.extend(col.values_for_doc(doc));
assert_eq!(&vals[..], expected);
};
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
for (doc_id, vals) in column_expected.iter() {
test_vals(col, *doc_id, vals);
}
};
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let col = segment
.fast_fields()
.column_opt::<u64>("intvals")
.unwrap()
.unwrap();
test_col(&col, &expected_doc_and_vals);
}
Ok(())
}
#[test]
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();

View File

@@ -7,7 +7,7 @@ mod tests {
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
TextFieldIndexing, TextOptions, Value,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
@@ -280,13 +280,16 @@ mod tests {
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(),
doc.get_first(my_text_field).unwrap().as_value().as_str(),
Some("blubber")
);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
assert_eq!(
doc.get_first(int_field).unwrap().as_value().as_u64(),
Some(1000)
);
}
}

View File

@@ -216,7 +216,7 @@ mod tests_mmap {
let test_query = |query_str: &str| {
let query = parse_query.parse_query(query_str).unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1, "{}", query_str);
assert_eq!(num_docs, 1, "{query_str}");
};
test_query(format!("json.{field_name_out}:test1").as_str());
test_query(format!("json.a{field_name_out}:test2").as_str());
@@ -590,10 +590,10 @@ mod tests_mmap {
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if field name can be queried
for (indexed_field, val) in fields_and_vals.iter() {
let query_str = &format!("{}:{}", indexed_field, val);
let query_str = &format!("{indexed_field}:{val}");
let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val);
assert!(!count_docs.is_empty(), "{indexed_field}:{val}");
}
// Test if field name can be used for aggregation
for (field_name, val) in fields_and_vals.iter() {

View File

@@ -202,9 +202,8 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +219,14 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
} else if let Some(tok_str) = value.into_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else {
continue;
@@ -250,9 +248,8 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +262,8 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
for value in values {
let value = value.as_value();
num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +277,8 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +291,8 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -312,10 +304,8 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -327,10 +317,8 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -364,9 +352,8 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
@@ -500,8 +487,8 @@ mod tests {
use crate::postings::{Postings, TermInfo};
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
STRING, TEXT,
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
@@ -555,9 +542,12 @@ mod tests {
let doc = reader.get::<TantivyDocument>(0).unwrap();
assert_eq!(doc.field_values().count(), 2);
assert_eq!(doc.get_all(text_field).next().unwrap().as_str(), Some("A"));
assert_eq!(
doc.get_all(text_field).nth(1).unwrap().as_str(),
doc.get_all(text_field).next().unwrap().as_value().as_str(),
Some("A")
);
assert_eq!(
doc.get_all(text_field).nth(1).unwrap().as_value().as_str(),
Some("title")
);
}

View File

@@ -397,16 +397,20 @@ pub mod tests {
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
assert_nearly_equals!($left, $right, 0.0005);
}};
($left:expr, $right:expr, $epsilon:expr) => {{
match (&$left, &$right, &$epsilon) {
(left_val, right_val, epsilon_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
if diff > *epsilon_val {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
r#"assertion failed: `abs(left-right)>epsilon`
left: `{:?}`,
right: `{:?}`,
epsilon: `{:?}`"#,
&*left_val, &*right_val, &*epsilon_val
)
}
}

View File

@@ -138,8 +138,7 @@ impl FuzzyTermQuery {
if json_path_type != Type::Str {
return Err(InvalidArgument(format!(
"The fuzzy term query requires a string path type for a json term. Found \
{:?}",
json_path_type
{json_path_type:?}"
)));
}
}

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound;
use crate::query::Occur;
use crate::schema::{Field, Term, Type};
use crate::schema::{Term, Type};
use crate::Score;
#[derive(Clone)]
@@ -20,8 +20,6 @@ pub enum LogicalLiteral {
upper: Bound<Term>,
},
Set {
field: Field,
value_type: Type,
elements: Vec<Term>,
},
All,

View File

@@ -832,17 +832,11 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let (elements, errors) = elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.partition_result();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements,
field,
value_type,
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (

View File

@@ -185,7 +185,7 @@ mod test {
Err(crate::TantivyError::InvalidArgument(msg)) => {
assert!(msg.contains("error: unclosed group"))
}
res => panic!("unexpected result: {:?}", res),
res => panic!("unexpected result: {res:?}"),
}
}
}

View File

@@ -157,29 +157,24 @@ impl CompactDoc {
}
/// field_values accessor
pub fn field_values(
&self,
) -> impl Iterator<Item = (Field, ReferenceValue<'_, CompactDocValue<'_>>)> {
pub fn field_values(&self) -> impl Iterator<Item = (Field, CompactDocValue<'_>)> {
self.field_values.iter().map(|field_val| {
let field = Field::from_field_id(field_val.field as u32);
let val = self.extract_value(field_val.value_addr).unwrap();
let val = self.get_compact_doc_value(field_val.value_addr);
(field, val)
})
}
/// Returns all of the `ReferenceValue`s associated the given field
pub fn get_all(
&self,
field: Field,
) -> impl Iterator<Item = ReferenceValue<'_, CompactDocValue<'_>>> + '_ {
pub fn get_all(&self, field: Field) -> impl Iterator<Item = CompactDocValue<'_>> + '_ {
self.field_values
.iter()
.filter(move |field_value| Field::from_field_id(field_value.field as u32) == field)
.map(|val| self.extract_value(val.value_addr).unwrap())
.map(|val| self.get_compact_doc_value(val.value_addr))
}
/// Returns the first `ReferenceValue` associated the given field
pub fn get_first(&self, field: Field) -> Option<ReferenceValue<'_, CompactDocValue<'_>>> {
pub fn get_first(&self, field: Field) -> Option<CompactDocValue<'_>> {
self.get_all(field).next()
}
@@ -299,58 +294,11 @@ impl CompactDoc {
}
}
fn extract_value(
&self,
ref_value: ValueAddr,
) -> io::Result<ReferenceValue<'_, CompactDocValue<'_>>> {
match ref_value.type_id {
ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
ValueType::Str => {
let str_ref = self.extract_str(ref_value.val_addr);
Ok(ReferenceValueLeaf::Str(str_ref).into())
}
ValueType::Facet => {
let str_ref = self.extract_str(ref_value.val_addr);
Ok(ReferenceValueLeaf::Facet(str_ref).into())
}
ValueType::Bytes => {
let data = self.extract_bytes(ref_value.val_addr);
Ok(ReferenceValueLeaf::Bytes(data).into())
}
ValueType::U64 => self
.read_from::<u64>(ref_value.val_addr)
.map(ReferenceValueLeaf::U64)
.map(Into::into),
ValueType::I64 => self
.read_from::<i64>(ref_value.val_addr)
.map(ReferenceValueLeaf::I64)
.map(Into::into),
ValueType::F64 => self
.read_from::<f64>(ref_value.val_addr)
.map(ReferenceValueLeaf::F64)
.map(Into::into),
ValueType::Bool => Ok(ReferenceValueLeaf::Bool(ref_value.val_addr != 0).into()),
ValueType::Date => self
.read_from::<i64>(ref_value.val_addr)
.map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
.map(Into::into),
ValueType::IpAddr => self
.read_from::<u128>(ref_value.val_addr)
.map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
.map(Into::into),
ValueType::PreTokStr => self
.read_from::<PreTokenizedString>(ref_value.val_addr)
.map(Into::into)
.map(ReferenceValueLeaf::PreTokStr)
.map(Into::into),
ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
self,
ref_value.val_addr,
)?)),
ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
self,
ref_value.val_addr,
)?)),
/// Get CompactDocValue for address
fn get_compact_doc_value(&self, value_addr: ValueAddr) -> CompactDocValue<'_> {
CompactDocValue {
container: self,
value_addr,
}
}
@@ -410,7 +358,7 @@ impl PartialEq for CompactDoc {
let convert_to_comparable_map = |doc: &CompactDoc| {
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
for field_value in doc.field_values.iter() {
let value: OwnedValue = doc.extract_value(field_value.value_addr).unwrap().into();
let value: OwnedValue = doc.get_compact_doc_value(field_value.value_addr).into();
let value = serde_json::to_string(&value).unwrap();
field_value_set
.entry(Field::from_field_id(field_value.field as u32))
@@ -444,7 +392,19 @@ impl DocumentDeserialize for CompactDoc {
#[derive(Debug, Clone, Copy)]
pub struct CompactDocValue<'a> {
container: &'a CompactDoc,
value: ValueAddr,
value_addr: ValueAddr,
}
impl PartialEq for CompactDocValue<'_> {
fn eq(&self, other: &Self) -> bool {
let value1: OwnedValue = (*self).into();
let value2: OwnedValue = (*other).into();
value1 == value2
}
}
impl<'a> From<CompactDocValue<'a>> for OwnedValue {
fn from(value: CompactDocValue) -> Self {
value.as_value().into()
}
}
impl<'a> Value<'a> for CompactDocValue<'a> {
type ArrayIter = CompactDocArrayIter<'a>;
@@ -452,7 +412,67 @@ impl<'a> Value<'a> for CompactDocValue<'a> {
type ObjectIter = CompactDocObjectIter<'a>;
fn as_value(&self) -> ReferenceValue<'a, Self> {
self.container.extract_value(self.value).unwrap()
self.get_ref_value().unwrap()
}
}
impl<'a> CompactDocValue<'a> {
fn get_ref_value(&self) -> io::Result<ReferenceValue<'a, CompactDocValue<'a>>> {
let addr = self.value_addr.val_addr;
match self.value_addr.type_id {
ValueType::Null => Ok(ReferenceValueLeaf::Null.into()),
ValueType::Str => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Str(str_ref).into())
}
ValueType::Facet => {
let str_ref = self.container.extract_str(addr);
Ok(ReferenceValueLeaf::Facet(str_ref).into())
}
ValueType::Bytes => {
let data = self.container.extract_bytes(addr);
Ok(ReferenceValueLeaf::Bytes(data).into())
}
ValueType::U64 => self
.container
.read_from::<u64>(addr)
.map(ReferenceValueLeaf::U64)
.map(Into::into),
ValueType::I64 => self
.container
.read_from::<i64>(addr)
.map(ReferenceValueLeaf::I64)
.map(Into::into),
ValueType::F64 => self
.container
.read_from::<f64>(addr)
.map(ReferenceValueLeaf::F64)
.map(Into::into),
ValueType::Bool => Ok(ReferenceValueLeaf::Bool(addr != 0).into()),
ValueType::Date => self
.container
.read_from::<i64>(addr)
.map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts)))
.map(Into::into),
ValueType::IpAddr => self
.container
.read_from::<u128>(addr)
.map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num)))
.map(Into::into),
ValueType::PreTokStr => self
.container
.read_from::<PreTokenizedString>(addr)
.map(Into::into)
.map(ReferenceValueLeaf::PreTokStr)
.map(Into::into),
ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new(
self.container,
addr,
)?)),
ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new(
self.container,
addr,
)?)),
}
}
}
@@ -537,7 +557,7 @@ impl BinarySerializable for ValueType {
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid value type id: {}", num),
format!("Invalid value type id: {num}"),
));
};
Ok(type_id)
@@ -601,9 +621,9 @@ impl<'a> Iterator for CompactDocObjectIter<'a> {
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value,
value_addr: value,
};
return Some((key, value));
Some((key, value))
}
}
@@ -635,9 +655,9 @@ impl<'a> Iterator for CompactDocArrayIter<'a> {
let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?;
let value = CompactDocValue {
container: self.container,
value,
value_addr: value,
};
return Some(value);
Some(value)
}
}
@@ -668,7 +688,7 @@ impl<'a> Iterator for FieldValueIterRef<'a> {
Field::from_field_id(field_value.field as u32),
CompactDocValue::<'a> {
container: self.container,
value: field_value.value_addr,
value_addr: field_value.value_addr,
},
)
})

View File

@@ -58,9 +58,8 @@ where W: Write
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unexpected number of entries written to serializer, expected {} entries, got \
{} entries",
num_field_values, actual_length,
"Unexpected number of entries written to serializer, expected \
{num_field_values} entries, got {actual_length} entries",
),
));
}

View File

@@ -17,15 +17,6 @@ pub trait Value<'a>: Send + Sync + Debug {
/// Returns the field value represented by an enum which borrows it's data.
fn as_value(&self) -> ReferenceValue<'a, Self>;
#[inline]
/// Returns if the value is `null` or not.
fn is_null(&self) -> bool {
matches!(
self.as_value(),
ReferenceValue::Leaf(ReferenceValueLeaf::Null)
)
}
#[inline]
/// If the Value is a leaf, returns the associated leaf. Returns None otherwise.
fn as_leaf(&self) -> Option<ReferenceValueLeaf<'a>> {
@@ -117,18 +108,6 @@ pub trait Value<'a>: Send + Sync + Debug {
None
}
}
#[inline]
/// Returns true if the Value is an array.
fn is_array(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
#[inline]
/// Returns true if the Value is an object.
fn is_object(&self) -> bool {
matches!(self.as_value(), ReferenceValue::Object(_))
}
}
/// A enum representing a leaf value for tantivy to index.

View File

@@ -659,9 +659,9 @@ mod tests {
let schema = schema_builder.build();
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let date = doc.get_first(date_field).unwrap();
let date = OwnedValue::from(doc.get_first(date_field).unwrap());
// Time zone is converted to UTC
assert_eq!("Leaf(Date(2019-10-12T05:20:50.52Z))", format!("{date:?}"));
assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}"));
}
#[test]

View File

@@ -60,7 +60,7 @@ pub mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::AliveBitSet;
use crate::schema::{
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT,
self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, Value, STORED, TEXT,
};
use crate::{Index, IndexWriter, Term};
@@ -122,6 +122,7 @@ pub mod tests {
.get::<TantivyDocument>(i)?
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap(),
format!("Doc {i}")
@@ -133,6 +134,7 @@ pub mod tests {
let title_content = doc
.get_first(field_title)
.unwrap()
.as_value()
.as_str()
.unwrap()
.to_string();

View File

@@ -241,23 +241,12 @@ impl StoreReader {
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<D>> + 'b {
self.enumerate(alive_bitset)
.map(|res| res.map(|(_, doc)| doc))
}
/// A variant of [`iter`][Self::iter] which also yields document ID.
pub fn enumerate<'a: 'b, 'b, D: DocumentDeserialize>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<(DocId, D)>> + 'b {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let (doc_id, mut doc_bytes) = doc_bytes_res?;
let mut doc_bytes = doc_bytes_res?;
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
let doc = D::deserialize(deserializer).map_err(crate::TantivyError::from)?;
Ok((doc_id, doc))
D::deserialize(deserializer).map_err(crate::TantivyError::from)
})
}
@@ -267,7 +256,7 @@ impl StoreReader {
pub(crate) fn iter_raw<'a: 'b, 'b>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<(DocId, OwnedBytes)>> + 'b {
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
let last_doc_id = self
.block_checkpoints()
.last()
@@ -295,14 +284,14 @@ impl StoreReader {
let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
let res = if alive {
Some((doc_id, curr_block.clone(), doc_pos))
Some((curr_block.clone(), doc_pos))
} else {
None
};
doc_pos += 1;
res
})
.map(move |(doc_id, block, doc_pos)| {
.map(move |(block, doc_pos)| {
let block = block
.ok_or_else(|| {
DataCorruption::comment_only(
@@ -315,7 +304,7 @@ impl StoreReader {
})?;
let range = block_read_index(&block, doc_pos)?;
Ok((doc_id, block.slice(range)))
Ok(block.slice(range))
})
}
@@ -414,7 +403,7 @@ mod tests {
use super::*;
use crate::directory::RamDirectory;
use crate::schema::{Field, TantivyDocument};
use crate::schema::{Field, TantivyDocument, Value};
use crate::store::tests::write_lorem_ipsum_store;
use crate::store::Compressor;
use crate::Directory;
@@ -422,7 +411,7 @@ mod tests {
const BLOCK_SIZE: usize = 16_384;
fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_str())
doc.get_first(*field).and_then(|f| f.as_value().as_str())
}
#[test]

View File

@@ -93,7 +93,7 @@ fn open_fst_index(fst_file: FileSlice) -> io::Result<tantivy_fst::Map<OwnedBytes
let fst = Fst::new(bytes).map_err(|err| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Fst data is corrupted: {:?}", err),
format!("Fst data is corrupted: {err:?}"),
)
})?;
Ok(tantivy_fst::Map::from(fst))

View File

@@ -95,7 +95,7 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
#[test]
fn test_term_dictionary_stream() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
@@ -156,7 +156,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
#[test]
fn test_stream_range() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.map(|i| (format!("doc{i:0>6}"), i))
.collect();
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();

View File

@@ -96,7 +96,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str())
@@ -116,7 +116,7 @@ mod tests {
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{}", facet));
tokens.push(format!("{facet}"));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str()) // ok test