mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 17:22:54 +00:00

Compare commits: clippy-and ... 0.25.0 (8 commits)

| SHA1 |
|---|
| b4139bc353 |
| 8676a1f57b |
| 021ff2ad63 |
| 39e027667b |
| a1d65c3df3 |
| 2e4615c2d3 |
| 610091e2c4 |
| d4b090124c |

@@ -2,13 +2,17 @@ Tantivy 0.25
 ================================
 
 ## Bugfixes
-- fix union performance regression in tantivy 0.24 [#2663](https://github.com/quickwit-oss/tantivy/pull/2663)(@PSeitz-dd)
+- fix union performance regression in tantivy 0.24 [#2663](https://github.com/quickwit-oss/tantivy/pull/2663)(@PSeitz)
 - make zstd optional in sstable [#2633](https://github.com/quickwit-oss/tantivy/pull/2633)(@Parth)
+- Fix TopDocs::order_by_string_fast_field for asc order [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
 
 ## Features/Improvements
 - add docs/example and Vec<u32> values to sstable [#2660](https://github.com/quickwit-oss/tantivy/pull/2660)(@PSeitz)
 - Add string fast field support to `TopDocs`. [#2642](https://github.com/quickwit-oss/tantivy/pull/2642)(@stuhood)
 - update edition to 2024 [#2620](https://github.com/quickwit-oss/tantivy/pull/2620)(@PSeitz)
+- Allow optional spaces between the field name and the value in the query parser [#2678](https://github.com/quickwit-oss/tantivy/pull/2678)(@Darkheir)
+- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
+- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)
 
 Tantivy 0.24
 ================================

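To illustrate the query-parser change in [#2678](https://github.com/quickwit-oss/tantivy/pull/2678): the parser now accepts whitespace around the `:` that separates a field name from its value. A minimal sketch, assuming an in-RAM index with a single text field named `title` (the field name and index setup are illustrative, not taken from this compare):

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    // Hypothetical one-field schema, for illustration only.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let parser = QueryParser::for_index(&index, vec![title]);

    // With the 0.25 grammar, whitespace around ':' is part of the separator,
    // so all of these parse as a query against the `title` field.
    for query in ["title:hello", "title: hello", "title : hello"] {
        parser.parse_query(query).expect("query should parse");
    }
}
```
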
Cargo.toml — 16 changed lines

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.24.0"
+version = "0.25.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -57,13 +57,13 @@ measure_time = "0.9.0"
 arc-swap = "1.5.0"
 bon = "3.3.1"
 
-columnar = { version = "0.5", path = "./columnar", package = "tantivy-columnar" }
+columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
-sstable = { version = "0.5", path = "./sstable", package = "tantivy-sstable", optional = true }
+sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true }
-stacker = { version = "0.5", path = "./stacker", package = "tantivy-stacker" }
+stacker = { version = "0.6", path = "./stacker", package = "tantivy-stacker" }
-query-grammar = { version = "0.24.0", path = "./query-grammar", package = "tantivy-query-grammar" }
+query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tantivy-query-grammar" }
-tantivy-bitpacker = { version = "0.8", path = "./bitpacker" }
+tantivy-bitpacker = { version = "0.9", path = "./bitpacker" }
-common = { version = "0.9", path = "./common/", package = "tantivy-common" }
+common = { version = "0.10", path = "./common/", package = "tantivy-common" }
-tokenizer-api = { version = "0.5", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
+tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
 sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
 hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
 futures-util = { version = "0.3.28", optional = true }

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-bitpacker"
-version = "0.8.0"
+version = "0.9.0"
 edition = "2024"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"

@@ -1,7 +1,3 @@
-// manual divceil actually generates code that is not optimal (to accept the full range of u32) and
-// perf matters here.
-#![allow(clippy::manual_div_ceil)]
-
 use std::io;
 use std::ops::{Range, RangeInclusive};
 
@@ -140,9 +140,10 @@ impl BlockedBitpacker {
     pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
         // todo performance: we could decompress a whole block and cache it instead
         let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
-        (0..bitpacked_elems)
+        let iter = (0..bitpacked_elems)
             .map(move |idx| self.get(idx))
-            .chain(self.buffer.iter().cloned())
+            .chain(self.buffer.iter().cloned());
+        iter
     }
 }
 
@@ -1,5 +1,3 @@
-// #[allow(clippy::manual_div_ceil)]
-
 mod bitpacker;
 mod blocked_bitpacker;
 mod filter_vec;

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-columnar"
-version = "0.5.0"
+version = "0.6.0"
 edition = "2024"
 license = "MIT"
 homepage = "https://github.com/quickwit-oss/tantivy"
@@ -12,10 +12,10 @@ categories = ["database-implementations", "data-structures", "compression"]
 itertools = "0.14.0"
 fastdivide = "0.4.0"
 
-stacker = { version= "0.5", path = "../stacker", package="tantivy-stacker"}
+stacker = { version= "0.6", path = "../stacker", package="tantivy-stacker"}
-sstable = { version= "0.5", path = "../sstable", package = "tantivy-sstable" }
+sstable = { version= "0.6", path = "../sstable", package = "tantivy-sstable" }
-common = { version= "0.9", path = "../common", package = "tantivy-common" }
+common = { version= "0.10", path = "../common", package = "tantivy-common" }
-tantivy-bitpacker = { version= "0.8", path = "../bitpacker/" }
+tantivy-bitpacker = { version= "0.9", path = "../bitpacker/" }
 serde = "1.0.152"
 downcast-rs = "2.0.1"
 
@@ -33,6 +33,29 @@ harness = false
 name = "bench_access"
 harness = false
 
+[[bench]]
+name = "bench_first_vals"
+harness = false
+
+[[bench]]
+name = "bench_values_u64"
+harness = false
+
+[[bench]]
+name = "bench_values_u128"
+harness = false
+
+[[bench]]
+name = "bench_create_column_values"
+harness = false
+
+[[bench]]
+name = "bench_column_values_get"
+harness = false
+
+[[bench]]
+name = "bench_optional_index"
+harness = false
+
 [features]
-unstable = []
 zstd-compression = ["sstable/zstd-compression"]

@@ -19,7 +19,7 @@ fn main() {
 
     let mut add_card = |card1: Card| {
         inputs.push((
-            format!("{card1}"),
+            card1.to_string(),
            generate_columnar_and_open(card1, NUM_DOCS),
        ));
    };
@@ -50,6 +50,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
     let mut buffer = vec![None; BLOCK_SIZE];
     for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
         // fill docs
+        #[allow(clippy::needless_range_loop)]
         for idx in 0..BLOCK_SIZE {
             docs[idx] = idx as u32 + i;
         }

columnar/benches/bench_column_values_get.rs — new file (61 lines)

@@ -0,0 +1,61 @@
use std::sync::Arc;

use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::ColumnValues;
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};

fn get_data() -> Vec<u64> {
    let mut rng = StdRng::seed_from_u64(2u64);
    let mut data: Vec<_> = (100..55_000_u64)
        .map(|num| num + rng.r#gen::<u8>() as u64)
        .collect();
    data.push(99_000);
    data.insert(1000, 2000);
    data.insert(2000, 100);
    data.insert(3000, 4100);
    data.insert(4000, 100);
    data.insert(5000, 800);
    data
}

#[inline(never)]
fn value_iter() -> impl Iterator<Item = u64> {
    0..20_000
}

type Col = Arc<dyn ColumnValues<u64>>;

fn main() {
    let data = get_data();
    let inputs: Vec<(String, Col)> = vec![
        (
            "bitpacked".to_string(),
            serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Bitpacked]),
        ),
        (
            "linear".to_string(),
            serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Linear]),
        ),
        (
            "blockwise_linear".to_string(),
            serialize_and_load_u64_based_column_values(
                &data.as_slice(),
                &[CodecType::BlockwiseLinear],
            ),
        ),
    ];

    let mut group: InputGroup<Col> = InputGroup::new_with_inputs(inputs);

    group.register("fastfield_get", |col: &Col| {
        let mut sum = 0u64;
        for pos in value_iter() {
            sum = sum.wrapping_add(col.get_val(pos as u32));
        }
        black_box(sum);
    });

    group.run();
}

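The new and migrated benchmark files in this compare all follow the binggan pattern visible above: build named inputs, wrap them in an `InputGroup`, register closures that end in `black_box`, and call `run()`. A stripped-down sketch of that pattern with made-up input data (only the binggan calls mirror the files in this compare):

```rust
use binggan::{InputGroup, black_box};

fn main() {
    // Named inputs: binggan runs every registered benchmark once per input.
    let inputs: Vec<(String, Vec<u64>)> = vec![
        ("small".to_string(), (0..1_000u64).collect()),
        ("large".to_string(), (0..1_000_000u64).collect()),
    ];
    let mut group: InputGroup<Vec<u64>> = InputGroup::new_with_inputs(inputs);

    // Each closure receives a reference to the current input.
    group.register("sum", |data: &Vec<u64>| {
        let sum: u64 = data.iter().copied().sum();
        // Keep the result observable so the work is not optimized away.
        black_box(sum);
    });

    group.run();
}
```

Because each new target is declared with `harness = false` in columnar/Cargo.toml, these run as plain binaries under `cargo bench`, e.g. `cargo bench --bench bench_column_values_get`.
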
columnar/benches/bench_create_column_values.rs — new file (44 lines)

@@ -0,0 +1,44 @@
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::column_values::{CodecType, serialize_u64_based_column_values};

fn get_data() -> Vec<u64> {
    let mut rng = StdRng::seed_from_u64(2u64);
    let mut data: Vec<_> = (100..55_000_u64)
        .map(|num| num + rng.r#gen::<u8>() as u64)
        .collect();
    data.push(99_000);
    data.insert(1000, 2000);
    data.insert(2000, 100);
    data.insert(3000, 4100);
    data.insert(4000, 100);
    data.insert(5000, 800);
    data
}

fn main() {
    let data = get_data();
    let mut group: InputGroup<(CodecType, Vec<u64>)> = InputGroup::new_with_inputs(vec![
        (
            "bitpacked codec".to_string(),
            (CodecType::Bitpacked, data.clone()),
        ),
        (
            "linear codec".to_string(),
            (CodecType::Linear, data.clone()),
        ),
        (
            "blockwise linear codec".to_string(),
            (CodecType::BlockwiseLinear, data.clone()),
        ),
    ]);

    group.register("serialize column_values", |data| {
        let mut buffer = Vec::new();
        serialize_u64_based_column_values(&data.1.as_slice(), &[data.0], &mut buffer).unwrap();
        black_box(buffer.len());
    });

    group.run();
}

@@ -1,12 +1,9 @@
-#![feature(test)]
-extern crate test;
-
 use std::sync::Arc;
 
+use binggan::{InputGroup, black_box};
 use rand::prelude::*;
 use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
 use tantivy_columnar::*;
-use test::{Bencher, black_box};
 
 struct Columns {
     pub optional: Column,
@@ -68,88 +65,45 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
     serialize_and_load_u64_based_column_values(&column, &[codec_type])
 }
 
[The old nightly-bench harness (run_bench_on_column_full_scan, run_bench_on_column_block_fetch, run_bench_on_column_block_single_calls and the nine #[bench] wrappers calling them on the full/optional/multi columns) is replaced by a single binggan entry point:]
+fn main() {
+    let Columns {
+        optional,
+        full,
+        multi,
+    } = get_test_columns();
+
+    let inputs = vec![
+        ("full".to_string(), full),
+        ("optional".to_string(), optional),
+        ("multi".to_string(), multi),
+    ];
+
+    let mut group = InputGroup::new_with_inputs(inputs);
+
+    group.register("first_full_scan", |column| {
+        let mut sum = 0u64;
+        for i in 0..NUM_VALUES as u32 {
+            let val = column.first(i);
+            sum += val.unwrap_or(0);
+        }
+        black_box(sum);
+    });
+
+    group.register("first_block_fetch", |column| {
+        let mut block: Vec<Option<u64>> = vec![None; 64];
+        let fetch_docids = (0..64).collect::<Vec<_>>();
+        column.first_vals(&fetch_docids, &mut block);
+        black_box(block[0]);
+    });
+
+    group.register("first_block_single_calls", |column| {
+        let mut block: Vec<Option<u64>> = vec![None; 64];
+        let fetch_docids = (0..64).collect::<Vec<_>>();
+        for i in 0..fetch_docids.len() {
+            block[i] = column.first(fetch_docids[i]);
+        }
+        black_box(block[0]);
+    });
+
+    group.run();
+}

columnar/benches/bench_optional_index.rs — new file (106 lines)

@@ -0,0 +1,106 @@
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::column_index::{OptionalIndex, Set};

const TOTAL_NUM_VALUES: u32 = 1_000_000;

fn gen_optional_index(fill_ratio: f64) -> OptionalIndex {
    let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
    let vals: Vec<u32> = (0..TOTAL_NUM_VALUES)
        .map(|_| rng.gen_bool(fill_ratio))
        .enumerate()
        .filter(|(_pos, val)| *val)
        .map(|(pos, _)| pos as u32)
        .collect();
    OptionalIndex::for_test(TOTAL_NUM_VALUES, &vals)
}

fn random_range_iterator(
    start: u32,
    end: u32,
    avg_step_size: u32,
    avg_deviation: u32,
) -> impl Iterator<Item = u32> {
    let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
    let mut current = start;
    std::iter::from_fn(move || {
        current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
        if current >= end { None } else { Some(current) }
    })
}

fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
    let ratio = percent / 100.0;
    let step_size = (1f32 / ratio) as u32;
    let deviation = step_size - 1;
    random_range_iterator(0, num_values, step_size, deviation)
}

fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option<u32> {
    walk_over_data_from_positions(
        codec,
        random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0),
    )
}

fn walk_over_data_from_positions(
    codec: &OptionalIndex,
    positions: impl Iterator<Item = u32>,
) -> Option<u32> {
    let mut dense_idx: Option<u32> = None;
    for idx in positions {
        dense_idx = dense_idx.or(codec.rank_if_exists(idx));
    }
    dense_idx
}

fn main() {
    // Build separate inputs for each fill ratio.
    let inputs: Vec<(String, OptionalIndex)> = vec![
        ("fill=1%".to_string(), gen_optional_index(0.01)),
        ("fill=5%".to_string(), gen_optional_index(0.05)),
        ("fill=10%".to_string(), gen_optional_index(0.10)),
        ("fill=50%".to_string(), gen_optional_index(0.50)),
        ("fill=90%".to_string(), gen_optional_index(0.90)),
    ];

    let mut group: InputGroup<OptionalIndex> = InputGroup::new_with_inputs(inputs);

    // Translate orig->codec (rank_if_exists) with sampling
    group.register("orig_to_codec_10pct_hit", |codec: &OptionalIndex| {
        black_box(walk_over_data(codec, 100));
    });
    group.register("orig_to_codec_1pct_hit", |codec: &OptionalIndex| {
        black_box(walk_over_data(codec, 1000));
    });
    group.register("orig_to_codec_full_scan", |codec: &OptionalIndex| {
        black_box(walk_over_data_from_positions(codec, 0..TOTAL_NUM_VALUES));
    });

    // Translate codec->orig (select/select_batch) on sampled ranks
    fn bench_translate_codec_to_orig_util(codec: &OptionalIndex, percent_hit: f32) {
        let num_non_nulls = codec.num_non_nulls();
        let idxs: Vec<u32> = if percent_hit == 100.0f32 {
            (0..num_non_nulls).collect()
        } else {
            n_percent_step_iterator(percent_hit, num_non_nulls).collect()
        };
        let mut output = vec![0u32; idxs.len()];
        output.copy_from_slice(&idxs[..]);
        codec.select_batch(&mut output);
        black_box(output);
    }

    group.register("codec_to_orig_0.005pct_hit", |codec: &OptionalIndex| {
        bench_translate_codec_to_orig_util(codec, 0.005);
    });
    group.register("codec_to_orig_10pct_hit", |codec: &OptionalIndex| {
        bench_translate_codec_to_orig_util(codec, 10.0);
    });
    group.register("codec_to_orig_full_scan", |codec: &OptionalIndex| {
        bench_translate_codec_to_orig_util(codec, 100.0);
    });

    group.run();
}

@@ -1,15 +1,12 @@
-#![feature(test)]
-
 use std::ops::RangeInclusive;
 use std::sync::Arc;
 
+use binggan::{InputGroup, black_box};
 use common::OwnedBytes;
 use rand::rngs::StdRng;
 use rand::seq::SliceRandom;
 use rand::{Rng, SeedableRng, random};
 use tantivy_columnar::ColumnValues;
-use test::Bencher;
-extern crate test;
 
 // TODO does this make sense for IPv6 ?
 fn generate_random() -> Vec<u64> {
@@ -47,78 +44,77 @@ fn get_data_50percent_item() -> Vec<u128> {
     }
     data.push(SINGLE_ITEM);
     data.shuffle(&mut rng);
-    let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
-    data
+    data.iter().map(|el| *el as u128).collect::<Vec<_>>()
 }
 
[The five nightly #[bench] functions that followed (bench_intfastfield_getrange_u128_50percent_hit, _single_hit, _hit_all, bench_intfastfield_scan_all_fflookup_u128 and bench_intfastfield_jumpy_stride5_u128) are replaced by a single binggan `fn main()`: it builds an `Inputs { data, column_range, column_random }` struct from `get_data_50percent_item()`, `get_u128_column_from_data(&data)` and `get_u128_column_random()`, wraps it in an `InputGroup`, and registers "intfastfield_getrange_u128_50percent_hit", "intfastfield_getrange_u128_single_hit", "intfastfield_getrange_u128_hit_all", "intfastfield_scan_all_fflookup_u128" and "intfastfield_jumpy_stride5_u128", each closure now ending in `black_box(...)` instead of returning a value to `Bencher::iter`.]

@@ -1,13 +1,10 @@
-#![feature(test)]
-extern crate test;
-
 use std::ops::RangeInclusive;
 use std::sync::Arc;
 
+use binggan::{InputGroup, black_box};
 use rand::prelude::*;
 use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
 use tantivy_columnar::*;
-use test::Bencher;
 
 // Warning: this generates the same permutation at each call
 fn generate_permutation() -> Vec<u64> {
@@ -27,37 +24,11 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
     serialize_and_load_u64_based_column_values(&column, &[codec_type])
 }
 
[Removed here: the nightly benches bench_intfastfield_jumpy_veclookup and bench_intfastfield_jumpy_fflookup_bitpacked.]
 
 const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50;
 const SINGLE_ITEM: u64 = 90;
 const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
 const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49;
 
 fn get_data_50percent_item() -> Vec<u128> {
     let mut rng = StdRng::from_seed([1u8; 32]);
 
@@ -69,135 +40,122 @@ fn get_data_50percent_item() -> Vec<u128> {
     data.push(SINGLE_ITEM);
     data.shuffle(&mut rng);
-    let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
-    data
+    data.iter().map(|el| *el as u128).collect::<Vec<_>>()
 }
 
[The remaining nightly #[bench] functions (bench_intfastfield_getrange_u64_{50percent_hit,1percent_hit,single_hit,hit_all}, bench_intfastfield_stride7_{vec,fflookup} and bench_intfastfield_scan_all_{fflookup,fflookup_gcd,vec}) are replaced by binggan code: a `type VecCol = (Vec<u64>, Arc<dyn ColumnValues<u64>>);` alias; a `fn bench_access()` that builds "access" and "access_gcd" inputs from `generate_permutation()`/`generate_permutation_gcd()` serialized with `CodecType::Bitpacked` and registers "stride7_vec", "fullscan_vec", "stride7_column_values" and "fullscan_column_values"; a `fn bench_range()` that serializes `get_data_50percent_item()` as u64 and registers "fastfield_getrange_u64_50percent_hit", "fastfield_getrange_u64_1percent_hit", "fastfield_getrange_u64_single_hit" and "fastfield_getrange_u64_hit_all" via `get_row_ids_for_value_range`; and a `fn main()` that calls `bench_access()` then `bench_range()`. Every benchmark closure ends in `black_box(...)`.]

@@ -219,170 +219,3 @@ fn test_optional_index_for_tests() {
     assert!(!optional_index.contains(3));
     assert_eq!(optional_index.num_docs(), 4);
 }
 
[Removed here: the `#[cfg(all(test, feature = "unstable"))] mod bench` module (167 lines) containing the nightly OptionalIndex benchmarks — gen_bools, random_range_iterator, n_percent_step_iterator, walk_over_data, walk_over_data_from_positions, bench_translate_codec_to_orig_util and the bench_translate_orig_to_codec_* / bench_translate_codec_to_orig_* functions — now superseded by columnar/benches/bench_optional_index.rs above.]

@@ -1,139 +0,0 @@
[Deleted file (139 lines): the old in-tree column_values benchmark module — nightly #[bench] functions bench_fastfield_{bitpack,linearinterpol,multilinearinterpol}_{create,get,get_dynamic} built on get_data(), compute_stats(), get_reader_for_bench::<Codec>(), bench_get(), bench_get_dynamic() and bench_create(); superseded by the new binggan benches above.]

@@ -242,6 +242,3 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
|
|||||||
.get_row_ids_for_value_range(range, doc_id_range, positions)
|
.get_row_ids_for_value_range(range, doc_id_range, positions)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
|
||||||
mod bench;
|
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
#![allow(clippy::manual_div_ceil)]
|
|
||||||
|
|
||||||
mod column_type;
|
mod column_type;
|
||||||
mod format_version;
|
mod format_version;
|
||||||
mod merge;
|
mod merge;
|
||||||
|
|||||||
@@ -17,15 +17,10 @@
|
|||||||
//! column.
|
//! column.
|
||||||
//! - [column_values]: Stores the values of a column in a dense format.
|
//! - [column_values]: Stores the values of a column in a dense format.
|
||||||
|
|
||||||
// #![cfg_attr(all(feature = "unstable", test), feature(test))]
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate more_asserts;
|
extern crate more_asserts;
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
|
||||||
extern crate test;
|
|
||||||
|
|
||||||
use std::fmt::Display;
|
use std::fmt::Display;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-common"
-version = "0.9.0"
+version = "0.10.0"
 authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
 license = "MIT"
 edition = "2024"

@@ -9,7 +9,7 @@ use crate::ByteCount;
 pub struct TinySet(u64);
 
 impl fmt::Debug for TinySet {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         self.into_iter().collect::<Vec<u32>>().fmt(f)
     }
 }
@@ -182,7 +182,6 @@ pub struct BitSet {
     max_value: u32,
 }
 
-#[inline(always)]
 fn num_buckets(max_val: u32) -> u32 {
     (max_val + 63u32) / 64u32
 }

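Several hunks in this compare deal with `clippy::manual_div_ceil`, and `num_buckets` above is exactly that pattern: `(max_val + 63) / 64` rounds up to the number of 64-bit buckets. A standalone check of the equivalence with `u32::div_ceil` (illustrative only, not part of the diff):

```rust
fn main() {
    // The manual round-up matches div_ceil as long as `+ 63` does not overflow.
    for max_val in [0u32, 1, 63, 64, 65, 1_000_000, u32::MAX - 63] {
        assert_eq!((max_val + 63) / 64, max_val.div_ceil(64));
    }
    println!("(max_val + 63) / 64 == max_val.div_ceil(64) for all tested values");
}
```
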
@@ -1,6 +1,4 @@
-// manual divceil actually generates code that is not optimal (to accept the full range of u32) and
-// perf matters here.
-#![allow(clippy::len_without_is_empty, clippy::manual_div_ceil)]
+#![allow(clippy::len_without_is_empty)]
 
 use std::ops::Deref;
 

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-query-grammar"
-version = "0.24.0"
+version = "0.25.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]

@@ -36,7 +36,7 @@ fn field_name(inp: &str) -> IResult<&str, String> {
             alt((first_char, escape_sequence())),
             many0(alt((simple_char, escape_sequence(), char('\\')))),
         )),
-        char(':'),
+        tuple((multispace0, char(':'), multispace0)),
     ),
     |(first_char, next)| once(first_char).chain(next).collect(),
 )(inp)

|
|||||||
let (inp, (field_name, _, _, _)) =
|
let (inp, (field_name, _, _, _)) =
|
||||||
tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed");
|
tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed");
|
||||||
|
|
||||||
delimited_infallible(
|
let res = delimited_infallible(
|
||||||
nothing,
|
nothing,
|
||||||
map(ast_infallible, |(mut ast, errors)| {
|
map(ast_infallible, |(mut ast, errors)| {
|
||||||
ast.set_default_field(field_name.to_string());
|
ast.set_default_field(field_name.to_string());
|
||||||
(ast, errors)
|
(ast, errors)
|
||||||
}),
|
}),
|
||||||
opt_i_err(char(')'), "expected ')'"),
|
opt_i_err(char(')'), "expected ')'"),
|
||||||
)(inp)
|
)(inp);
|
||||||
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
|
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
|
||||||
@@ -1282,6 +1283,10 @@ mod test {
         super::field_name("~my~field:a"),
         Ok(("a", "~my~field".to_string()))
     );
+    assert_eq!(
+        super::field_name(".my.field.name : a"),
+        Ok(("a", ".my.field.name".to_string()))
+    );
     for special_char in SPECIAL_CHARS.iter() {
         let query = &format!("\\{special_char}my\\{special_char}field:a");
         assert_eq!(

@@ -1688,4 +1693,15 @@ mod test {
    fn test_invalid_field() {
        test_is_parse_err(r#"!bc:def"#, "!bc:def");
    }
 
+    #[test]
+    fn test_space_before_value() {
+        test_parse_query_to_ast_helper("field : a", r#""field":a"#);
+        test_parse_query_to_ast_helper("field: a", r#""field":a"#);
+        test_parse_query_to_ast_helper("field :a", r#""field":a"#);
+        test_parse_query_to_ast_helper(
+            "field : 'happy tax payer' AND other_field : 1",
+            r#"(+"field":'happy tax payer' +"other_field":1)"#,
+        );
+    }
 }
@@ -484,6 +484,7 @@ impl FacetCounts {
 #[cfg(test)]
 mod tests {
     use std::collections::BTreeSet;
+    use std::iter;
 
     use columnar::Dictionary;
     use rand::distributions::Uniform;
@@ -1293,6 +1293,220 @@ mod tests {
        assert_eq!(page_0, &page_2[..page_0.len()]);
    }
 
+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(20))]
+        /// Build multiple segments with equal-scoring docs and verify stable ordering
+        /// across pages when increasing limit or offset.
+        #[test]
+        fn proptest_stable_ordering_across_segments_with_pagination(
+            docs_per_segment in proptest::collection::vec(1usize..50, 2..5)
+        ) {
+            use crate::indexer::NoMergePolicy;
+
+            // Build an index with multiple segments; all docs will have the same score using AllQuery.
+            let mut schema_builder = Schema::builder();
+            let text = schema_builder.add_text_field("text", TEXT);
+            let schema = schema_builder.build();
+            let index = Index::create_in_ram(schema);
+            let mut writer = index.writer_for_tests().unwrap();
+            writer.set_merge_policy(Box::new(NoMergePolicy));
+
+            for num_docs in &docs_per_segment {
+                for _ in 0..*num_docs {
+                    writer.add_document(doc!(text => "x")).unwrap();
+                }
+                writer.commit().unwrap();
+            }
+
+            let reader = index.reader().unwrap();
+            let searcher = reader.searcher();
+
+            let total_docs: usize = docs_per_segment.iter().sum();
+            // Full result set, first assert all scores are identical.
+            let full_with_scores: Vec<(Score, DocAddress)> = searcher
+                .search(&AllQuery, &TopDocs::with_limit(total_docs))
+                .unwrap();
+            // Sanity: at least one document was returned.
+            prop_assert!(!full_with_scores.is_empty());
+            let first_score = full_with_scores[0].0;
+            prop_assert!(full_with_scores.iter().all(|(score, _)| *score == first_score));
+
+            // Keep only the addresses for the remaining checks.
+            let full: Vec<DocAddress> = full_with_scores
+                .into_iter()
+                .map(|(_score, addr)| addr)
+                .collect();
+
+            // Sanity: we actually created multiple segments and have documents.
+            prop_assert!(docs_per_segment.len() >= 2);
+            prop_assert!(total_docs >= 2);
+
+            // 1) Increasing limit should preserve prefix ordering.
+            for k in 1..=total_docs {
+                let page: Vec<DocAddress> = searcher
+                    .search(&AllQuery, &TopDocs::with_limit(k))
+                    .unwrap()
+                    .into_iter()
+                    .map(|(_score, addr)| addr)
+                    .collect();
+                prop_assert_eq!(page, full[..k].to_vec());
+            }
+
+            // 2) Offset + limit pages should always match the corresponding slice.
+            // For each offset, check three representative page sizes:
+            // - first page (size 1)
+            // - a middle page (roughly half of remaining)
+            // - the last page (size = remaining)
+            for offset in 0..total_docs {
+                let remaining = total_docs - offset;
+
+                let assert_page_eq = |limit: usize| -> proptest::test_runner::TestCaseResult {
+                    let page: Vec<DocAddress> = searcher
+                        .search(&AllQuery, &TopDocs::with_limit(limit).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    prop_assert_eq!(page, full[offset..offset + limit].to_vec());
+                    Ok(())
+                };
+
+                // Smallest page.
+                assert_page_eq(1)?;
+                // A middle-sized page (dedupes to 1 if remaining == 1).
+                assert_page_eq((remaining / 2).max(1))?;
+                // Largest page for this offset.
+                assert_page_eq(remaining)?;
+            }
+
+            // 3) Concatenating fixed-size pages by offset reproduces the full order.
+            for page_size in 1..=total_docs.min(5) {
+                let mut concat: Vec<DocAddress> = Vec::new();
+                let mut offset = 0;
+                while offset < total_docs {
+                    let size = page_size.min(total_docs - offset);
+                    let page: Vec<DocAddress> = searcher
+                        .search(&AllQuery, &TopDocs::with_limit(size).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    concat.extend(page);
+                    offset += size;
+                }
+                // Avoid moving `full` across loop iterations.
+                prop_assert_eq!(concat, full.clone());
+            }
+        }
+    }
+
+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(20))]
+        /// Build multiple segments with same-scoring term matches and verify stable ordering
+        /// across pages for a real scoring query (TermQuery with identical TF and fieldnorm).
+        #[test]
+        fn proptest_stable_ordering_across_segments_with_term_query_and_pagination(
+            docs_per_segment in proptest::collection::vec(1usize..50, 2..5)
+        ) {
+            use crate::indexer::NoMergePolicy;
+            use crate::schema::IndexRecordOption;
+            use crate::query::TermQuery;
+            use crate::Term;
+
+            // Build an index with multiple segments; each doc has exactly one token "x",
+            // ensuring equal BM25 scores across all matching docs (same TF=1 and fieldnorm=1).
+            let mut schema_builder = Schema::builder();
+            let text = schema_builder.add_text_field("text", TEXT);
+            let schema = schema_builder.build();
+            let index = Index::create_in_ram(schema);
+            let mut writer = index.writer_for_tests().unwrap();
+            writer.set_merge_policy(Box::new(NoMergePolicy));
+
+            for num_docs in &docs_per_segment {
+                for _ in 0..*num_docs {
+                    writer.add_document(doc!(text => "x")).unwrap();
+                }
+                writer.commit().unwrap();
+            }
+
+            let reader = index.reader().unwrap();
+            let searcher = reader.searcher();
+
+            let total_docs: usize = docs_per_segment.iter().sum();
+            let term = Term::from_field_text(text, "x");
+            let tq = TermQuery::new(term, IndexRecordOption::WithFreqs);
+
+            // Full result set, first assert all scores are identical across docs.
+            let full_with_scores: Vec<(Score, DocAddress)> = searcher
+                .search(&tq, &TopDocs::with_limit(total_docs))
+                .unwrap();
+            // Sanity: at least one document was returned.
+            prop_assert!(!full_with_scores.is_empty());
+            let first_score = full_with_scores[0].0;
+            prop_assert!(full_with_scores.iter().all(|(score, _)| *score == first_score));
+
+            // Keep only the addresses for the remaining checks.
+            let full: Vec<DocAddress> = full_with_scores
+                .into_iter()
+                .map(|(_score, addr)| addr)
+                .collect();
+
+            // Sanity: we actually created multiple segments and have documents.
+            prop_assert!(docs_per_segment.len() >= 2);
+            prop_assert!(total_docs >= 2);
+
+            // 1) Increasing limit should preserve prefix ordering.
+            for k in 1..=total_docs {
+                let page: Vec<DocAddress> = searcher
+                    .search(&tq, &TopDocs::with_limit(k))
+                    .unwrap()
+                    .into_iter()
+                    .map(|(_score, addr)| addr)
+                    .collect();
+                prop_assert_eq!(page, full[..k].to_vec());
+            }
+
+            // 2) Offset + limit pages should always match the corresponding slice.
+            // Check three representative page sizes for each offset: 1, ~half, and remaining.
+            for offset in 0..total_docs {
+                let remaining = total_docs - offset;
+
+                let assert_page_eq = |limit: usize| -> proptest::test_runner::TestCaseResult {
+                    let page: Vec<DocAddress> = searcher
+                        .search(&tq, &TopDocs::with_limit(limit).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    prop_assert_eq!(page, full[offset..offset + limit].to_vec());
+                    Ok(())
+                };
+
+                assert_page_eq(1)?;
+                assert_page_eq((remaining / 2).max(1))?;
+                assert_page_eq(remaining)?;
+            }
+
+            // 3) Concatenating fixed-size pages by offset reproduces the full order.
+            for page_size in 1..=total_docs.min(5) {
+                let mut concat: Vec<DocAddress> = Vec::new();
+                let mut offset = 0;
+                while offset < total_docs {
+                    let size = page_size.min(total_docs - offset);
+                    let page: Vec<DocAddress> = searcher
+                        .search(&tq, &TopDocs::with_limit(size).and_offset(offset))
+                        .unwrap()
+                        .into_iter()
+                        .map(|(_score, addr)| addr)
+                        .collect();
+                    concat.extend(page);
+                    offset += size;
+                }
+                prop_assert_eq!(concat, full.clone());
+            }
+        }
+    }
+
    #[test]
    #[should_panic]
    fn test_top_0() {
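Editor's note (illustration only, not part of the diff): the property tests above check that paging through equal-scoring hits with `TopDocs::with_limit(..).and_offset(..)` is stable across segments: growing the limit only appends results, and concatenating fixed-size pages reproduces the full ordering. The usage pattern they exercise looks roughly like this sketch:

    use tantivy::collector::TopDocs;
    use tantivy::query::Query;
    use tantivy::{DocAddress, Score, Searcher};

    // Fetch one page of results; with tied scores the ordering stays stable across pages.
    fn fetch_page(
        searcher: &Searcher,
        query: &dyn Query,
        page: usize,
        page_size: usize,
    ) -> tantivy::Result<Vec<(Score, DocAddress)>> {
        searcher.search(query, &TopDocs::with_limit(page_size).and_offset(page * page_size))
    }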
@@ -216,7 +216,7 @@ impl IndexBuilder {
 
    /// Opens or creates a new index in the provided directory
    pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
-        let dir = dir.into();
+        let dir: Box<dyn Directory> = dir.into();
        if !Index::exists(&*dir)? {
            return self.create(dir);
        }
@@ -494,7 +494,7 @@ impl Index {
            .into_iter()
            .map(|segment| SegmentReader::open(&segment)?.fields_metadata())
            .collect::<Result<_, _>>()?;
-        Ok(merge_field_meta_data(fields_metadata, &self.schema()))
+        Ok(merge_field_meta_data(fields_metadata))
    }
 
    /// Creates a new segment_meta (Advanced user only).
@@ -1,8 +1,7 @@
 use std::io;
 
 use common::json_path_writer::JSON_END_OF_PATH;
-use common::BinarySerializable;
-use fnv::FnvHashSet;
+use common::{BinarySerializable, ByteCount};
 #[cfg(feature = "quickwit")]
 use futures_util::{FutureExt, StreamExt, TryStreamExt};
 #[cfg(feature = "quickwit")]
@@ -36,6 +35,33 @@ pub struct InvertedIndexReader {
    total_num_tokens: u64,
 }
 
+/// Object that records the amount of space used by a field in an inverted index.
+pub(crate) struct InvertedIndexFieldSpace {
+    pub field_name: String,
+    pub field_type: Type,
+    pub postings_size: ByteCount,
+    pub positions_size: ByteCount,
+    pub num_terms: u64,
+}
+
+/// Returns None if the term is not a valid JSON path.
+fn extract_field_name_and_field_type_from_json_path(term: &[u8]) -> Option<(String, Type)> {
+    let index = term.iter().position(|&byte| byte == JSON_END_OF_PATH)?;
+    let field_type_code = term.get(index + 1).copied()?;
+    let field_type = Type::from_code(field_type_code)?;
+    // Let's flush the current field.
+    let field_name = String::from_utf8_lossy(&term[..index]).to_string();
+    Some((field_name, field_type))
+}
+
+impl InvertedIndexFieldSpace {
+    fn record(&mut self, term_info: &TermInfo) {
+        self.postings_size += ByteCount::from(term_info.posting_num_bytes() as u64);
+        self.positions_size += ByteCount::from(term_info.positions_num_bytes() as u64);
+        self.num_terms += 1;
+    }
+}
+
 impl InvertedIndexReader {
    pub(crate) fn new(
        termdict: TermDictionary,
|
|||||||
///
|
///
|
||||||
/// Notice: This requires a full scan and therefore **very expensive**.
|
/// Notice: This requires a full scan and therefore **very expensive**.
|
||||||
/// TODO: Move to sstable to use the index.
|
/// TODO: Move to sstable to use the index.
|
||||||
pub fn list_encoded_fields(&self) -> io::Result<Vec<(String, Type)>> {
|
pub(crate) fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
|
||||||
let mut stream = self.termdict.stream()?;
|
let mut stream = self.termdict.stream()?;
|
||||||
let mut fields = Vec::new();
|
let mut fields: Vec<InvertedIndexFieldSpace> = Vec::new();
|
||||||
let mut fields_set = FnvHashSet::default();
|
|
||||||
while let Some((term, _term_info)) = stream.next() {
|
let mut current_field_opt: Option<InvertedIndexFieldSpace> = None;
|
||||||
if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
|
// Current field bytes, including the JSON_END_OF_PATH.
|
||||||
if !fields_set.contains(&term[..index + 2]) {
|
let mut current_field_bytes: Vec<u8> = Vec::new();
|
||||||
fields_set.insert(term[..index + 2].to_vec());
|
|
||||||
let typ = Type::from_code(term[index + 1]).unwrap();
|
while let Some((term, term_info)) = stream.next() {
|
||||||
fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
|
if let Some(current_field) = &mut current_field_opt {
|
||||||
|
if term.starts_with(¤t_field_bytes) {
|
||||||
|
// We are still in the same field.
|
||||||
|
current_field.record(term_info);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This is a new field!
|
||||||
|
// Let's flush the current field.
|
||||||
|
fields.extend(current_field_opt.take());
|
||||||
|
current_field_bytes.clear();
|
||||||
|
|
||||||
|
// And create a new one.
|
||||||
|
let Some((field_name, field_type)) =
|
||||||
|
extract_field_name_and_field_type_from_json_path(term)
|
||||||
|
else {
|
||||||
|
error!(
|
||||||
|
"invalid term bytes encountered {term:?}. this only happens if the term \
|
||||||
|
dictionary is corrupted. please report"
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
let mut field_space = InvertedIndexFieldSpace {
|
||||||
|
field_name,
|
||||||
|
field_type,
|
||||||
|
postings_size: ByteCount::default(),
|
||||||
|
positions_size: ByteCount::default(),
|
||||||
|
num_terms: 0u64,
|
||||||
|
};
|
||||||
|
field_space.record(&term_info);
|
||||||
|
|
||||||
|
// We include the json type and the json end of path to make sure the prefix check
|
||||||
|
// is meaningful.
|
||||||
|
current_field_bytes.extend_from_slice(&term[..field_space.field_name.len() + 2]);
|
||||||
|
current_field_opt = Some(field_space);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We need to flush the last field as well.
|
||||||
|
fields.extend(current_field_opt.take());
|
||||||
|
|
||||||
Ok(fields)
|
Ok(fields)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
 use std::collections::HashMap;
-use std::ops::BitOrAssign;
 use std::sync::{Arc, RwLock};
 use std::{fmt, io};
 
+use common::{ByteCount, HasLen};
 use fnv::FnvHashMap;
 use itertools::Itertools;
 
@@ -304,12 +304,16 @@ impl SegmentReader {
        for (field, field_entry) in self.schema().fields() {
            let field_name = field_entry.name().to_string();
            let is_indexed = field_entry.is_indexed();
 
            if is_indexed {
                let is_json = field_entry.field_type().value_type() == Type::Json;
                if is_json {
+                    let term_dictionary_json_field_num_bytes: u64 = self
+                        .termdict_composite
+                        .open_read(field)
+                        .map(|file_slice| file_slice.len() as u64)
+                        .unwrap_or(0u64);
                    let inv_index = self.inverted_index(field)?;
-                    let encoded_fields_in_index = inv_index.list_encoded_fields()?;
+                    let encoded_fields_in_index = inv_index.list_encoded_json_fields()?;
                    let mut build_path = |field_name: &str, mut json_path: String| {
                        // In this case we need to map the potential fast field to the field name
                        // accepted by the query parser.
@@ -328,30 +332,65 @@ impl SegmentReader {
                            format!("{field_name}.{json_path}")
                        }
                    };
-                    indexed_fields.extend(
-                        encoded_fields_in_index
-                            .into_iter()
-                            .map(|(name, typ)| (build_path(&field_name, name), typ))
-                            .map(|(field_name, typ)| FieldMetadata {
-                                indexed: true,
-                                stored: false,
-                                field_name,
-                                fast: false,
-                                typ,
-                            }),
-                    );
+                    let total_num_terms = encoded_fields_in_index
+                        .iter()
+                        .map(|field_space| field_space.num_terms)
+                        .sum();
+                    indexed_fields.extend(encoded_fields_in_index.into_iter().map(|field_space| {
+                        let field_name = build_path(&field_name, field_space.field_name);
+                        // It is complex to attribute the exact amount of bytes required by specific
+                        // field in the json field. Instead, as a proxy, we
+                        // attribute the total amount of bytes for the entire json field,
+                        // proportionally to the number of terms in each
+                        // fields.
+                        let term_dictionary_size = (term_dictionary_json_field_num_bytes
+                            * field_space.num_terms)
+                            .checked_div(total_num_terms)
+                            .unwrap_or(0);
+                        FieldMetadata {
+                            postings_size: Some(field_space.postings_size),
+                            positions_size: Some(field_space.positions_size),
+                            term_dictionary_size: Some(ByteCount::from(term_dictionary_size)),
+                            fast_size: None,
+                            // The stored flag will be set at the end of this function!
+                            stored: field_entry.is_stored(),
+                            field_name,
+                            typ: field_space.field_type,
+                        }
+                    }));
                } else {
+                    let postings_size: ByteCount = self
+                        .postings_composite
+                        .open_read(field)
+                        .map(|posting_fileslice| posting_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
+                    let positions_size: ByteCount = self
+                        .positions_composite
+                        .open_read(field)
+                        .map(|positions_fileslice| positions_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
+                    let term_dictionary_size: ByteCount = self
+                        .termdict_composite
+                        .open_read(field)
+                        .map(|term_dictionary_fileslice| term_dictionary_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
                    indexed_fields.push(FieldMetadata {
-                        indexed: true,
-                        stored: false,
                        field_name: field_name.to_string(),
-                        fast: false,
                        typ: field_entry.field_type().value_type(),
+                        // The stored flag will be set at the end of this function!
+                        stored: field_entry.is_stored(),
+                        fast_size: None,
+                        term_dictionary_size: Some(term_dictionary_size),
+                        postings_size: Some(postings_size),
+                        positions_size: Some(positions_size),
                    });
                }
            }
        }
-        let mut fast_fields: Vec<FieldMetadata> = self
+        let fast_fields: Vec<FieldMetadata> = self
            .fast_fields()
            .columnar()
            .iter_columns()?
|
|||||||
.get(&field_name)
|
.get(&field_name)
|
||||||
.unwrap_or(&field_name)
|
.unwrap_or(&field_name)
|
||||||
.to_string();
|
.to_string();
|
||||||
|
let stored = is_field_stored(&field_name, &self.schema);
|
||||||
FieldMetadata {
|
FieldMetadata {
|
||||||
indexed: false,
|
|
||||||
stored: false,
|
|
||||||
field_name,
|
field_name,
|
||||||
fast: true,
|
|
||||||
typ: Type::from(handle.column_type()),
|
typ: Type::from(handle.column_type()),
|
||||||
|
stored,
|
||||||
|
fast_size: Some(handle.num_bytes()),
|
||||||
|
term_dictionary_size: None,
|
||||||
|
postings_size: None,
|
||||||
|
positions_size: None,
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
// Since the type is encoded differently in the fast field and in the inverted index,
|
let merged_field_metadatas: Vec<FieldMetadata> =
|
||||||
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
|
merge_field_meta_data(vec![indexed_fields, fast_fields]);
|
||||||
// If we are sure that the order is the same, we can remove this sort.
|
Ok(merged_field_metadatas)
|
||||||
indexed_fields.sort_unstable();
|
|
||||||
fast_fields.sort_unstable();
|
|
||||||
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
|
|
||||||
|
|
||||||
Ok(merged)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the segment id
|
/// Returns the segment id
|
||||||
@@ -443,20 +480,47 @@ pub struct FieldMetadata {
    // Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
    // field_name then typ.
    pub typ: Type,
-    /// Is the field indexed for search
-    pub indexed: bool,
    /// Is the field stored in the doc store
    pub stored: bool,
-    /// Is the field stored in the columnar storage
-    pub fast: bool,
+    /// Size occupied in the columnar storage (None if not fast)
+    pub fast_size: Option<ByteCount>,
+    /// term_dictionary
+    pub term_dictionary_size: Option<ByteCount>,
+    /// Size occupied in the index postings storage (None if not indexed)
+    pub postings_size: Option<ByteCount>,
+    /// Size occupied in the index postings storage (None if positions are not recorded)
+    pub positions_size: Option<ByteCount>,
 }
-impl BitOrAssign for FieldMetadata {
-    fn bitor_assign(&mut self, rhs: Self) {
-        assert!(self.field_name == rhs.field_name);
-        assert!(self.typ == rhs.typ);
-        self.indexed |= rhs.indexed;
+fn merge_options(left: Option<ByteCount>, right: Option<ByteCount>) -> Option<ByteCount> {
+    match (left, right) {
+        (Some(l), Some(r)) => Some(l + r),
+        (None, right) => right,
+        (left, None) => left,
+    }
+}
+
+impl FieldMetadata {
+    /// Returns true if and only if the field is indexed.
+    pub fn is_indexed(&self) -> bool {
+        self.postings_size.is_some()
+    }
+
+    /// Returns true if and only if the field is a fast field (i.e.: recorded in columnar format).
+    pub fn is_fast(&self) -> bool {
+        self.fast_size.is_some()
+    }
+
+    /// Merges two field metadata.
+    pub fn merge(&mut self, rhs: Self) {
+        assert_eq!(self.field_name, rhs.field_name);
+        assert_eq!(self.typ, rhs.typ);
        self.stored |= rhs.stored;
-        self.fast |= rhs.fast;
+        self.fast_size = merge_options(self.fast_size, rhs.fast_size);
+        self.term_dictionary_size =
+            merge_options(self.term_dictionary_size, rhs.term_dictionary_size);
+        self.postings_size = merge_options(self.postings_size, rhs.postings_size);
+        self.positions_size = merge_options(self.positions_size, rhs.positions_size);
    }
 }
 
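Editor's note (illustration only): `merge_options` above adds the two sizes when both segments report one, and otherwise keeps whichever side is present, so a field that is indexed in only one segment still ends up with `Some(..)` after the merge. A self-contained sketch of the same rule over plain `Option<u64>` standing in for `ByteCount`:

    fn merge_options(left: Option<u64>, right: Option<u64>) -> Option<u64> {
        match (left, right) {
            (Some(l), Some(r)) => Some(l + r),
            (None, right) => right,
            (left, None) => left,
        }
    }

    fn main() {
        assert_eq!(merge_options(Some(100), Some(80)), Some(180));
        assert_eq!(merge_options(None, Some(80)), Some(80));
        assert_eq!(merge_options(None, None), None);
    }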
@@ -469,23 +533,29 @@ fn is_field_stored(field_name: &str, schema: &Schema) -> bool {
 }
 
 /// Helper to merge the field metadata from multiple segments.
-pub fn merge_field_meta_data(
-    field_metadatas: Vec<Vec<FieldMetadata>>,
-    schema: &Schema,
-) -> Vec<FieldMetadata> {
+pub fn merge_field_meta_data(mut field_metadatas: Vec<Vec<FieldMetadata>>) -> Vec<FieldMetadata> {
+    // READ BEFORE REMOVING THIS!
+    //
+    // Because we replace field sep by `.`, fields are not always sorted.
+    // Also, to enforce such an implicit contract, we would have to add
+    // assert here.
+    //
+    // Sorting is linear time on pre-sorted data, so we are simply better off sorting data here.
+    for field_metadatas in &mut field_metadatas {
+        field_metadatas.sort_unstable();
+    }
    let mut merged_field_metadata = Vec::new();
    for (_key, mut group) in &field_metadatas
        .into_iter()
-        .kmerge_by(|left, right| left < right)
+        .kmerge()
        // TODO: Remove allocation
        .chunk_by(|el| (el.field_name.to_string(), el.typ))
    {
        let mut merged: FieldMetadata = group.next().unwrap();
        for el in group {
-            merged |= el;
+            merged.merge(el);
        }
-        // Currently is_field_stored is maybe too slow for the high cardinality case
-        merged.stored = is_field_stored(&merged.field_name, schema);
        merged_field_metadata.push(merged);
    }
    merged_field_metadata
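Editor's note (sketch, assuming an already-built `index`): with the per-field sizes added in this release, `Index::fields_metadata()` can be used to report where space goes per field. A minimal usage sketch:

    use tantivy::Index;

    fn print_field_sizes(index: &Index) -> tantivy::Result<()> {
        for field in index.fields_metadata()? {
            // All size fields are Option<ByteCount>; None means the field has no data of that kind.
            println!(
                "{} ({:?}): postings={:?} positions={:?} term_dict={:?} fast={:?}",
                field.field_name,
                field.typ,
                field.postings_size,
                field.positions_size,
                field.term_dictionary_size,
                field.fast_size,
            );
        }
        Ok(())
    }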
@@ -507,7 +577,7 @@ fn intersect_alive_bitset(
 }
 
 impl fmt::Debug for SegmentReader {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "SegmentReader({:?})", self.segment_id)
    }
 }
@@ -516,122 +586,168 @@ impl fmt::Debug for SegmentReader {
 mod test {
    use super::*;
    use crate::index::Index;
-    use crate::schema::{SchemaBuilder, Term, STORED, TEXT};
+    use crate::schema::{Term, STORED, TEXT};
    use crate::IndexWriter;
 
+    #[track_caller]
+    fn assert_merge(fields_metadatas: &[Vec<FieldMetadata>], expected: &[FieldMetadata]) {
+        use itertools::Itertools;
+        let num_els = fields_metadatas.len();
+        for permutation in fields_metadatas.iter().cloned().permutations(num_els) {
+            let res = merge_field_meta_data(permutation);
+            assert_eq!(&res, &expected);
+        }
+    }
+
    #[test]
-    fn test_merge_field_meta_data_same() {
-        let schema = SchemaBuilder::new().build();
+    fn test_merge_field_meta_data_same_field() {
        let field_metadata1 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
            stored: false,
-            fast: true,
+            term_dictionary_size: Some(ByteCount::from(100u64)),
+            postings_size: Some(ByteCount::from(1_000u64)),
+            positions_size: Some(ByteCount::from(2_000u64)),
+            fast_size: Some(ByteCount::from(1_000u64).into()),
        };
        let field_metadata2 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
            stored: false,
-            fast: true,
+            term_dictionary_size: Some(ByteCount::from(80u64)),
+            postings_size: Some(ByteCount::from(1_500u64)),
+            positions_size: Some(ByteCount::from(2_500u64)),
+            fast_size: Some(ByteCount::from(3_000u64).into()),
        };
-        let res = merge_field_meta_data(
-            vec![vec![field_metadata1.clone()], vec![field_metadata2]],
-            &schema,
+        let expected = FieldMetadata {
+            field_name: "a".to_string(),
+            typ: crate::schema::Type::Str,
+            stored: false,
+            term_dictionary_size: Some(ByteCount::from(180u64)),
+            postings_size: Some(ByteCount::from(2_500u64)),
+            positions_size: Some(ByteCount::from(4_500u64)),
+            fast_size: Some(ByteCount::from(4_000u64).into()),
+        };
+        assert_merge(
+            &[vec![field_metadata1.clone()], vec![field_metadata2]],
+            &[expected],
        );
-        assert_eq!(res, vec![field_metadata1]);
    }
 
+    #[track_caller]
    #[test]
    fn test_merge_field_meta_data_different() {
-        let schema = SchemaBuilder::new().build();
        let field_metadata1 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: false,
            stored: false,
-            fast: true,
+            fast_size: Some(1_000u64.into()),
+            term_dictionary_size: Some(100u64.into()),
+            postings_size: Some(2_000u64.into()),
+            positions_size: Some(4_000u64.into()),
        };
        let field_metadata2 = FieldMetadata {
            field_name: "b".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: false,
            stored: false,
-            fast: true,
+            fast_size: Some(1_002u64.into()),
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
        };
        let field_metadata3 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(101u64.into()),
+            postings_size: Some(2_001u64.into()),
+            positions_size: Some(4_001u64.into()),
            stored: false,
-            fast: false,
+            fast_size: None,
        };
-        let res = merge_field_meta_data(
-            vec![
+        let expected = vec![
+            FieldMetadata {
+                field_name: "a".to_string(),
+                typ: crate::schema::Type::Str,
+                stored: false,
+                term_dictionary_size: Some(201u64.into()),
+                postings_size: Some(4_001u64.into()),
+                positions_size: Some(8_001u64.into()),
+                fast_size: Some(1_000u64.into()),
+            },
+            FieldMetadata {
+                field_name: "b".to_string(),
+                typ: crate::schema::Type::Str,
+                stored: false,
+                term_dictionary_size: None,
+                postings_size: None,
+                positions_size: None,
+                fast_size: Some(1_002u64.into()),
+            },
+        ];
+        assert_merge(
+            &[
                vec![field_metadata1.clone(), field_metadata2.clone()],
                vec![field_metadata3],
            ],
-            &schema,
+            &expected,
        );
-        let field_metadata_expected1 = FieldMetadata {
-            field_name: "a".to_string(),
-            typ: crate::schema::Type::Str,
-            indexed: true,
-            stored: false,
-            fast: true,
-        };
-        assert_eq!(res, vec![field_metadata_expected1, field_metadata2.clone()]);
    }
 
    #[test]
    fn test_merge_field_meta_data_merge() {
-        use pretty_assertions::assert_eq;
        let get_meta_data = |name: &str, typ: Type| FieldMetadata {
            field_name: name.to_string(),
            typ,
-            indexed: false,
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
            stored: false,
-            fast: true,
+            fast_size: Some(1u64.into()),
        };
-        let schema = SchemaBuilder::new().build();
-        let mut metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
-        metas.sort();
-        let res = merge_field_meta_data(vec![vec![get_meta_data("e", Type::Str)], metas], &schema);
-        assert_eq!(
-            res,
-            vec![
+        let metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
+        assert_merge(
+            &[vec![get_meta_data("e", Type::Str)], metas],
+            &[
                get_meta_data("d", Type::Str),
                get_meta_data("e", Type::Str),
                get_meta_data("e", Type::U64),
-            ]
+            ],
        );
    }
 
    #[test]
    fn test_merge_field_meta_data_bitxor() {
        let field_metadata1 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: false,
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
            stored: false,
-            fast: true,
+            fast_size: Some(10u64.into()),
        };
        let field_metadata2 = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(10u64.into()),
+            postings_size: Some(11u64.into()),
+            positions_size: Some(12u64.into()),
            stored: false,
-            fast: false,
+            fast_size: None,
        };
        let field_metadata_expected = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(10u64.into()),
+            postings_size: Some(11u64.into()),
+            positions_size: Some(12u64.into()),
            stored: false,
-            fast: true,
+            fast_size: Some(10u64.into()),
        };
        let mut res1 = field_metadata1.clone();
-        res1 |= field_metadata2.clone();
+        res1.merge(field_metadata2.clone());
        let mut res2 = field_metadata2.clone();
-        res2 |= field_metadata1;
+        res2.merge(field_metadata1);
        assert_eq!(res1, field_metadata_expected);
        assert_eq!(res2, field_metadata_expected);
    }
@@ -662,6 +778,7 @@ mod test {
        assert_eq!(4, searcher.segment_reader(0).max_doc());
        Ok(())
    }
 
    #[test]
    fn test_alive_docs_iterator() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
@@ -61,6 +61,8 @@ type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
 #[cfg(test)]
 mod tests_mmap {
 
+    use common::ByteCount;
+
    use crate::aggregation::agg_req::Aggregations;
    use crate::aggregation::agg_result::AggregationResults;
    use crate::aggregation::AggregationCollector;
@@ -280,11 +282,14 @@ mod tests_mmap {
            field_name_out
        };
 
-        let mut fields = reader.searcher().segment_readers()[0]
+        let mut fields: Vec<(String, Type)> = reader.searcher().segment_readers()[0]
            .inverted_index(field)
            .unwrap()
-            .list_encoded_fields()
-            .unwrap();
+            .list_encoded_json_fields()
+            .unwrap()
+            .into_iter()
+            .map(|field_space| (field_space.field_name, field_space.field_type))
+            .collect();
        assert_eq!(fields.len(), 8);
        fields.sort();
        let mut expected_fields = vec![
@@ -385,7 +390,12 @@ mod tests_mmap {
        let reader = &searcher.segment_readers()[0];
        let inverted_index = reader.inverted_index(json_field).unwrap();
        assert_eq!(
-            inverted_index.list_encoded_fields().unwrap(),
+            inverted_index
+                .list_encoded_json_fields()
+                .unwrap()
+                .into_iter()
+                .map(|field_space| (field_space.field_name, field_space.field_type))
+                .collect::<Vec<_>>(),
            [
                ("k8s.container.name".to_string(), Type::Str),
                ("sub\u{1}a".to_string(), Type::I64),
@@ -402,19 +412,41 @@ mod tests_mmap {
    fn test_json_fields_metadata_expanded_dots_one_segment() {
        test_json_fields_metadata(true, true);
    }
 
    #[test]
    fn test_json_fields_metadata_expanded_dots_multi_segment() {
        test_json_fields_metadata(true, false);
    }
 
    #[test]
    fn test_json_fields_metadata_no_expanded_dots_one_segment() {
        test_json_fields_metadata(false, true);
    }
 
    #[test]
    fn test_json_fields_metadata_no_expanded_dots_multi_segment() {
        test_json_fields_metadata(false, false);
    }
 
+    #[track_caller]
+    fn assert_size_eq(lhs: Option<ByteCount>, rhs: Option<ByteCount>) {
+        let ignore_actual_values = |size_opt: Option<ByteCount>| size_opt.map(|val| val > 0);
+        assert_eq!(ignore_actual_values(lhs), ignore_actual_values(rhs));
+    }
+
+    #[track_caller]
+    fn assert_field_metadata_eq_but_ignore_field_size(
+        expected: &FieldMetadata,
+        actual: &FieldMetadata,
+    ) {
+        assert_eq!(&expected.field_name, &actual.field_name);
+        assert_eq!(&expected.typ, &actual.typ);
+        assert_eq!(&expected.stored, &actual.stored);
+        assert_size_eq(expected.postings_size, actual.postings_size);
+        assert_size_eq(expected.positions_size, actual.positions_size);
+        assert_size_eq(expected.fast_size, actual.fast_size);
+    }
+
    fn test_json_fields_metadata(expanded_dots: bool, one_segment: bool) {
        use pretty_assertions::assert_eq;
        let mut schema_builder = Schema::builder();
@@ -453,81 +485,101 @@ mod tests_mmap {
        assert_eq!(searcher.num_docs(), 3);
 
        let fields_metadata = index.fields_metadata().unwrap();
-        assert_eq!(
-            fields_metadata,
-            [
-                FieldMetadata {
-                    field_name: "empty".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::U64
-                },
-                FieldMetadata {
-                    field_name: if expanded_dots {
-                        "json.shadow.k8s.container.name".to_string()
-                    } else {
-                        "json.shadow.k8s\\.container\\.name".to_string()
-                    },
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.sub.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.sub.b".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.b".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.val".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "numbers".to_string(),
-                    indexed: false,
-                    stored: false,
-                    fast: true,
-                    typ: Type::U64
-                }
-            ]
-        );
+        let expected_fields = &[
+            FieldMetadata {
+                field_name: "empty".to_string(),
+                stored: true,
+                typ: Type::U64,
+                term_dictionary_size: Some(0u64.into()),
+                fast_size: Some(1u64.into()),
+                postings_size: Some(0u64.into()),
+                positions_size: Some(0u64.into()),
+            },
+            FieldMetadata {
+                field_name: if expanded_dots {
+                    "json.shadow.k8s.container.name".to_string()
+                } else {
+                    "json.shadow.k8s\\.container\\.name".to_string()
+                },
+                stored: true,
+                typ: Type::Str,
+                term_dictionary_size: Some(1u64.into()),
+                fast_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.sub.a".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.sub.b".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.a".to_string(),
+                stored: true,
+                typ: Type::I64,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.a".to_string(),
+                typ: Type::Str,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.b".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.val".to_string(),
+                typ: Type::Str,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "numbers".to_string(),
+                stored: false,
+                typ: Type::U64,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: None,
+                postings_size: None,
+                positions_size: None,
+            },
+        ];
+        assert_eq!(fields_metadata.len(), expected_fields.len());
+        for (expected, value) in expected_fields.iter().zip(fields_metadata.iter()) {
+            assert_field_metadata_eq_but_ignore_field_size(expected, value);
+        }
        let query_parser = QueryParser::for_index(&index, vec![]);
        // Test if returned field name can be queried
-        for indexed_field in fields_metadata.iter().filter(|meta| meta.indexed) {
+        for indexed_field in fields_metadata.iter().filter(|meta| meta.is_indexed()) {
            let val = if indexed_field.typ == Type::Str {
                "a"
            } else {
@@ -543,7 +595,10 @@ mod tests_mmap {
            }
        }
        // Test if returned field name can be used for aggregation
-        for fast_field in fields_metadata.iter().filter(|meta| meta.fast) {
+        for fast_field in fields_metadata
+            .iter()
+            .filter(|field_metadata| field_metadata.is_fast())
+        {
            let agg_req_str = json!(
            {
                "termagg": {
@@ -165,7 +165,7 @@ mod macros;
 mod future_result;
 
 // Re-exports
-pub use common::DateTime;
+pub use common::{ByteCount, DateTime};
 pub use {columnar, query_grammar, time};
 
 pub use crate::error::TantivyError;
@@ -40,6 +40,9 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
 
 #[cfg(test)]
 pub(crate) mod tests {
 
+    use std::iter;
+
    use proptest::prelude::*;
    use proptest::sample::select;
 
@@ -302,6 +302,7 @@ fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
 mod tests {
    use std::cmp::Ordering;
    use std::collections::BinaryHeap;
+    use std::iter;
 
    use proptest::prelude::*;
 
@@ -1790,6 +1790,15 @@ mod test {
        }
    }
 
+    #[test]
+    fn test_space_before_value() {
+        test_parse_query_to_logical_ast_helper(
+            "title: a",
+            r#"Term(field=0, type=Str, "a")"#,
+            false,
+        );
+    }
+
    #[test]
    fn test_escaped_field() {
        let mut schema_builder = Schema::builder();
@@ -41,6 +41,7 @@
 //! use tantivy::schema::document::{DeserializeError, DocumentDeserialize, DocumentDeserializer};
 //!
 //! /// Our custom document to let us use a map of `serde_json::Values`.
+//! #[allow(dead_code)]
 //! pub struct MyCustomDocument {
 //!     // Tantivy provides trait implementations for common `serde_json` types.
 //!     fields: BTreeMap<Field, serde_json::Value>
@@ -1561,6 +1561,8 @@ fn to_ascii(text: &str, output: &mut String) {
 
 #[cfg(test)]
 mod tests {
+    use std::iter;
+
    use super::to_ascii;
    use crate::tokenizer::{AsciiFoldingFilter, RawTokenizer, SimpleTokenizer, TextAnalyzer};
 
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-sstable"
-version = "0.5.0"
+version = "0.6.0"
 edition = "2024"
 license = "MIT"
 homepage = "https://github.com/quickwit-oss/tantivy"
@@ -10,10 +10,10 @@ categories = ["database-implementations", "data-structures", "compression"]
 description = "sstables for tantivy"
 
 [dependencies]
-common = {version= "0.9", path="../common", package="tantivy-common"}
+common = {version= "0.10", path="../common", package="tantivy-common"}
 futures-util = "0.3.30"
 itertools = "0.14.0"
-tantivy-bitpacker = { version= "0.8", path="../bitpacker" }
+tantivy-bitpacker = { version= "0.9", path="../bitpacker" }
 tantivy-fst = "0.5"
 # experimental gives us access to Decompressor::upper_bound
 zstd = { version = "0.13", optional = true, features = ["experimental"] }
@@ -308,9 +308,10 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
            }
        }
        _ => {
-            return Err(io::Error::other(format!(
-                "Unsupported sstable version, expected one of [2, 3], found {version}"
-            )));
+            return Err(io::Error::new(
+                io::ErrorKind::Other,
+                format!("Unsupported sstable version, expected one of [2, 3], found {version}"),
+            ));
        }
    };
 
@@ -696,9 +697,10 @@ mod tests {
    fn read_bytes(&self, range: Range<usize>) -> std::io::Result<OwnedBytes> {
        let allowed_range = self.allowed_range.lock().unwrap();
        if !allowed_range.contains(&range.start) || !allowed_range.contains(&(range.end - 1)) {
-            return Err(std::io::Error::other(format!(
-                "invalid range, allowed {allowed_range:?}, requested {range:?}"
-            )));
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                format!("invalid range, allowed {allowed_range:?}, requested {range:?}"),
+            ));
        }
 
        Ok(self.bytes.slice(range))
@@ -1,5 +1,3 @@
-#![allow(clippy::manual_div_ceil)]
-
 //! `tantivy_sstable` is a crate that provides a sorted string table data structure.
 //!
 //! It is used in `tantivy` to store the term dictionary.
@@ -394,7 +394,7 @@ impl SSTableIndexBuilder {
 
 fn fst_error_to_io_error(error: tantivy_fst::Error) -> io::Error {
    match error {
-        tantivy_fst::Error::Fst(fst_error) => io::Error::other(fst_error),
+        tantivy_fst::Error::Fst(fst_error) => io::Error::new(io::ErrorKind::Other, fst_error),
        tantivy_fst::Error::Io(ioerror) => ioerror,
    }
 }
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-stacker"
-version = "0.5.0"
+version = "0.6.0"
 edition = "2024"
 license = "MIT"
 homepage = "https://github.com/quickwit-oss/tantivy"
@@ -9,7 +9,7 @@ description = "term hashmap used for indexing"
 
 [dependencies]
 murmurhash32 = "0.3"
-common = { version = "0.9", path = "../common/", package = "tantivy-common" }
+common = { version = "0.10", path = "../common/", package = "tantivy-common" }
 ahash = { version = "0.8.11", default-features = false, optional = true }
 rand_distr = "0.4.3"
 
@@ -10,7 +10,8 @@ pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) {
 #[track_caller]
 fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
    panic!(
-        "source slice length ({src_len}) does not match destination slice length ({dst_len})",
+        "source slice length ({}) does not match destination slice length ({})",
+        src_len, dst_len,
    );
 }
 
@@ -1,3 +1,5 @@
+#![cfg_attr(all(feature = "unstable", test), feature(test))]
+
 #[cfg(all(test, feature = "unstable"))]
 extern crate test;
 
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-tokenizer-api"
-version = "0.5.0"
+version = "0.6.0"
 license = "MIT"
 edition = "2021"
 description = "Tokenizer API of tantivy"