Compare commits


1 Commit

Author: Paul Masurel | SHA1: dc0aa3d734 | Message: Clippy and cleanups | Date: 2025-08-01 11:54:29 +09:00
79 changed files with 1084 additions and 2514 deletions

View File

@@ -2,30 +2,14 @@ Tantivy 0.25
================================
## Bugfixes
- fix union performance regression in tantivy 0.24 [#2663](https://github.com/quickwit-oss/tantivy/pull/2663)(@PSeitz)
- fix union performance regression in tantivy 0.24 [#2663](https://github.com/quickwit-oss/tantivy/pull/2663)(@PSeitz-dd)
- make zstd optional in sstable [#2633](https://github.com/quickwit-oss/tantivy/pull/2633)(@Parth)
- Fix TopDocs::order_by_string_fast_field for asc order [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
## Features/Improvements
- add docs/example and Vec<u32> values to sstable [#2660](https://github.com/quickwit-oss/tantivy/pull/2660)(@PSeitz)
- Add string fast field support to `TopDocs`. [#2642](https://github.com/quickwit-oss/tantivy/pull/2642)(@stuhood)
- update edition to 2024 [#2620](https://github.com/quickwit-oss/tantivy/pull/2620)(@PSeitz)
- Allow optional spaces between the field name and the value in the query parser [#2678](https://github.com/quickwit-oss/tantivy/pull/2678)(@Darkheir)
- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)
Tantivy 0.24.2
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.24.1
================================
- Fix: bump required rust version to 1.81
Tantivy 0.24
================================
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.
@@ -108,14 +92,6 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)
Tantivy 0.22.1
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.22
================================
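The TopNComputer fix recorded in the 0.24.2 and 0.22.1 entries above changes what `Order::Asc` returns from the fast-field-ordered collectors. A minimal sketch of the affected call pattern follows; the `price` field and schema are hypothetical, not taken from this diff:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::{DocAddress, Index, Order};

// Hypothetical: `price` must be declared as a u64 FAST field in the schema.
fn top10_cheapest(index: &Index) -> tantivy::Result<Vec<(u64, DocAddress)>> {
    let searcher = index.reader()?.searcher();
    // `Order::Asc` is the case fixed by #2672: before the fix, TopNComputer
    // could keep the wrong documents when comparing in reverse order.
    let collector = TopDocs::with_limit(10).order_by_u64_field("price", Order::Asc);
    searcher.search(&AllQuery, &collector)
}
```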

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.25.0"
version = "0.24.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -57,13 +57,13 @@ measure_time = "0.9.0"
arc-swap = "1.5.0"
bon = "3.3.1"
columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.6", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.25.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.9", path = "./bitpacker" }
common = { version = "0.10", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.6", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
columnar = { version = "0.5", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.5", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.5", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.24.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.8", path = "./bitpacker" }
common = { version = "0.9", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.5", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true }
@@ -167,12 +167,3 @@ harness = false
[[bench]]
name = "agg_bench"
harness = false
[[bench]]
name = "exists_json"
harness = false
[[bench]]
name = "and_or_queries"
harness = false

View File

@@ -1,4 +1,4 @@
# Releasing a new Tantivy Version
# Release a new Tantivy Version
## Steps
@@ -10,29 +10,12 @@
6. Set git tag with new version
[`cargo-release`](https://github.com/crate-ci/cargo-release) will help us with steps 1-5:
In conjunction with `cargo-release`, steps 1-4 (I'm not sure whether the change detection works):
Set new packages to version 0.0.0
Replace prev-tag-name
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag
cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
```
Pass `--no-tag`, or it will create tags for all the subpackages.
cargo release will _not_ ignore unchanged packages, but it will print warnings for them.
e.g. "warning: updating ownedbytes to 0.10.0 despite no changes made since tag 0.24"
We need to manually ignore these unchanged packages
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag --exclude tokenizer-api
```
Add `--execute` to actually publish the packages, otherwise it will only print the commands that would be run.
### Tag Version
```bash
git tag 0.25.0
git push upstream tag 0.25.0
```
Pass --no-tag, or it will create tags for all the subpackages.

View File

@@ -1,224 +0,0 @@
// Benchmarks boolean conjunction queries using binggan.
//
// What's measured:
// - OR and AND queries with varying selectivity (only `Term` queries at the leaves for now)
// - Nested AND/OR combinations (on multiple fields)
// - No-scoring path using the Count collector (focus on iterator/skip performance)
// - Top-K retrieval (k=10) using the TopDocs collector
//
// Corpus model:
// - Synthetic docs; each token a/b/c is independently included per doc
// - If none of a/b/c are included, emit a neutral filler token to keep doc length similar
//
// Notes:
// - After optimization, when scoring is disabled Tantivy reads doc-only postings
// (IndexRecordOption::Basic), avoiding frequency decoding overhead.
// - This bench isolates boolean iteration speed and intersection/union cost.
// - Use `cargo bench --bench boolean_conjunction` to run.
use binggan::{black_box, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, ReloadPolicy, Searcher};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
query_parser: QueryParser,
}
impl BenchIndex {
#[inline(always)]
fn count_query(&self, query_str: &str) -> usize {
let query = self.query_parser.parse_query(query_str).unwrap();
self.searcher.search(&query, &Count).unwrap()
}
#[inline(always)]
fn topk_len(&self, query_str: &str, k: usize) -> usize {
let query = self.query_parser.parse_query(query_str).unwrap();
self.searcher
.search(&query, &TopDocs::with_limit(k))
.unwrap()
.len()
}
}
/// Build a single index containing both fields (title, body) and
/// return two BenchIndex views:
/// - single_field: QueryParser defaults to only "body"
/// - multi_field: QueryParser defaults to ["title", "body"]
fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (BenchIndex, BenchIndex) {
// Unified schema (two text fields)
let mut schema_builder = Schema::builder();
let f_title = schema_builder.add_text_field("title", TEXT);
let f_body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
// Populate: spread each present token 90/10 to body/title
{
let mut writer = index.writer(500_000_000).unwrap();
for _ in 0..num_docs {
let has_a = rng.gen_bool(p_a as f64);
let has_b = rng.gen_bool(p_b as f64);
let has_c = rng.gen_bool(p_c as f64);
let mut title_tokens: Vec<&str> = Vec::new();
let mut body_tokens: Vec<&str> = Vec::new();
if has_a {
if rng.gen_bool(0.1) {
title_tokens.push("a");
} else {
body_tokens.push("a");
}
}
if has_b {
if rng.gen_bool(0.1) {
title_tokens.push("b");
} else {
body_tokens.push("b");
}
}
if has_c {
if rng.gen_bool(0.1) {
title_tokens.push("c");
} else {
body_tokens.push("c");
}
}
if title_tokens.is_empty() && body_tokens.is_empty() {
body_tokens.push("z");
}
writer
.add_document(doc!(
f_title=>title_tokens.join(" "),
f_body=>body_tokens.join(" ")
))
.unwrap();
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
// Build two query parsers with different default fields.
let qp_single = QueryParser::for_index(&index, vec![f_body]);
let qp_multi = QueryParser::for_index(&index, vec![f_title, f_body]);
let single_view = BenchIndex {
index: index.clone(),
searcher: searcher.clone(),
query_parser: qp_single,
};
let multi_view = BenchIndex {
index,
searcher,
query_parser: qp_multi,
};
(single_view, multi_view)
}
fn main() {
// Prepare corpora with varying selectivity. Build one index per corpus
// and derive two views (single-field vs multi-field) from it.
let scenarios = vec![
(
"N=1M, p(a)=5%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.05,
0.01,
0.15,
),
(
"N=1M, p(a)=1%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.01,
0.01,
0.15,
),
];
let mut runner = BenchRunner::new();
for (label, n, pa, pb, pc) in scenarios {
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
// Single-field group: default field is body only
{
let mut group = runner.new_group();
group.set_name(format!("single_field — {}", label));
group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b"))
});
group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b +c"))
});
group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b", 10))
});
group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b +c", 10))
});
// OR queries
group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b"))
});
group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b OR c"))
});
group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b", 10))
});
group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b OR c", 10))
});
group.run();
}
// Multi-field group: default fields are [title, body]
{
let mut group = runner.new_group();
group.set_name(format!("multi_field — {}", label));
group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b"))
});
group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b +c"))
});
group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b", 10))
});
group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b +c", 10))
});
// OR queries
group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b"))
});
group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b OR c"))
});
group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b", 10))
});
group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b OR c", 10))
});
group.run();
}
}
}

View File

@@ -1,69 +0,0 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
let doc_count: usize = 500_000;
let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
let indices: Vec<(String, Index)> = subfield_counts
.iter()
.map(|&sub_fields| {
(
format!("subfields={sub_fields}"),
build_index_with_json_subfields(doc_count, sub_fields),
)
})
.collect();
let mut group = InputGroup::new_with_inputs(indices);
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
group.config().num_iter_group = Some(1);
group.config().num_iter_bench = Some(1);
group.register("exists_json", exists_json_union);
group.run();
}
fn exists_json_union(index: &Index) {
let reader = index.reader().expect("reader");
let searcher = reader.searcher();
let query = ExistsQuery::new("json".to_string(), true);
let count = searcher.search(&query, &Count).expect("exists search");
// Prevents optimizer from eliding the search
black_box(count);
}
fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
// Schema: single JSON field stored as FAST to support ExistsQuery.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).expect("create index");
{
let mut index_writer = index
.writer_with_num_threads(1, 200_000_000)
.expect("writer");
for i in 0..num_docs {
let sub = i % num_subfields;
// Only one subpath set per document; rotate subpaths so that
// no single subpath is full, but the union covers all docs.
let v = json!({ format!("field_{sub}"): i as u64 });
index_writer
.add_document(doc!(json_field => v))
.expect("add_document");
}
index_writer.commit().expect("commit");
}
index
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.9.0"
version = "0.8.0"
edition = "2024"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

View File

@@ -1,3 +1,7 @@
// The `div_ceil` suggested by `clippy::manual_div_ceil` actually generates code that is not
// optimal (it must accept the full range of u32), and perf matters here.
#![allow(clippy::manual_div_ceil)]
use std::io;
use std::ops::{Range, RangeInclusive};
@@ -48,7 +52,7 @@ impl BitPacker {
pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = self.mini_buffer_written.div_ceil(8);
let num_bytes = (self.mini_buffer_written + 7) / 8;
let bytes = self.mini_buffer.to_le_bytes();
output.write_all(&bytes[..num_bytes])?;
self.mini_buffer_written = 0;
@@ -138,7 +142,7 @@ impl BitUnpacker {
// We use `usize` here to avoid overflow issues.
let end_bit_read = (end_idx as usize) * self.num_bits;
let end_byte_read = end_bit_read.div_ceil(8);
let end_byte_read = (end_bit_read + 7) / 8;
assert!(
end_byte_read <= data.len(),
"Requested index is out of bounds."

View File

@@ -140,7 +140,6 @@ impl BlockedBitpacker {
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
// todo performance: we could decompress a whole block and cache it instead
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
(0..bitpacked_elems)
.map(move |idx| self.get(idx))
.chain(self.buffer.iter().cloned())

View File

@@ -1,3 +1,5 @@
// #[allow(clippy::manual_div_ceil)]
mod bitpacker;
mod blocked_bitpacker;
mod filter_vec;

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-columnar"
version = "0.6.0"
version = "0.5.0"
edition = "2024"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
@@ -12,10 +12,10 @@ categories = ["database-implementations", "data-structures", "compression"]
itertools = "0.14.0"
fastdivide = "0.4.0"
stacker = { version= "0.6", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.6", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.10", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.9", path = "../bitpacker/" }
stacker = { version= "0.5", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.5", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.9", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.8", path = "../bitpacker/" }
serde = "1.0.152"
downcast-rs = "2.0.1"
@@ -33,29 +33,6 @@ harness = false
name = "bench_access"
harness = false
[[bench]]
name = "bench_first_vals"
harness = false
[[bench]]
name = "bench_values_u64"
harness = false
[[bench]]
name = "bench_values_u128"
harness = false
[[bench]]
name = "bench_create_column_values"
harness = false
[[bench]]
name = "bench_column_values_get"
harness = false
[[bench]]
name = "bench_optional_index"
harness = false
[features]
unstable = []
zstd-compression = ["sstable/zstd-compression"]

View File

@@ -19,7 +19,7 @@ fn main() {
let mut add_card = |card1: Card| {
inputs.push((
card1.to_string(),
format!("{card1}"),
generate_columnar_and_open(card1, NUM_DOCS),
));
};
@@ -50,7 +50,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
let mut buffer = vec![None; BLOCK_SIZE];
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
// fill docs
#[allow(clippy::needless_range_loop)]
for idx in 0..BLOCK_SIZE {
docs[idx] = idx as u32 + i;
}

View File

@@ -1,61 +0,0 @@
use std::sync::Arc;
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::ColumnValues;
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55_000_u64)
.map(|num| num + rng.r#gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
#[inline(never)]
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
type Col = Arc<dyn ColumnValues<u64>>;
fn main() {
let data = get_data();
let inputs: Vec<(String, Col)> = vec![
(
"bitpacked".to_string(),
serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Bitpacked]),
),
(
"linear".to_string(),
serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Linear]),
),
(
"blockwise_linear".to_string(),
serialize_and_load_u64_based_column_values(
&data.as_slice(),
&[CodecType::BlockwiseLinear],
),
),
];
let mut group: InputGroup<Col> = InputGroup::new_with_inputs(inputs);
group.register("fastfield_get", |col: &Col| {
let mut sum = 0u64;
for pos in value_iter() {
sum = sum.wrapping_add(col.get_val(pos as u32));
}
black_box(sum);
});
group.run();
}

View File

@@ -1,44 +0,0 @@
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::column_values::{CodecType, serialize_u64_based_column_values};
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55_000_u64)
.map(|num| num + rng.r#gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
fn main() {
let data = get_data();
let mut group: InputGroup<(CodecType, Vec<u64>)> = InputGroup::new_with_inputs(vec![
(
"bitpacked codec".to_string(),
(CodecType::Bitpacked, data.clone()),
),
(
"linear codec".to_string(),
(CodecType::Linear, data.clone()),
),
(
"blockwise linear codec".to_string(),
(CodecType::BlockwiseLinear, data.clone()),
),
]);
group.register("serialize column_values", |data| {
let mut buffer = Vec::new();
serialize_u64_based_column_values(&data.1.as_slice(), &[data.0], &mut buffer).unwrap();
black_box(buffer.len());
});
group.run();
}

View File

@@ -1,9 +1,12 @@
#![feature(test)]
extern crate test;
use std::sync::Arc;
use binggan::{InputGroup, black_box};
use rand::prelude::*;
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
use tantivy_columnar::*;
use test::{Bencher, black_box};
struct Columns {
pub optional: Column,
@@ -65,45 +68,88 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
serialize_and_load_u64_based_column_values(&column, &[codec_type])
}
fn main() {
let Columns {
optional,
full,
multi,
} = get_test_columns();
let inputs = vec![
("full".to_string(), full),
("optional".to_string(), optional),
("multi".to_string(), multi),
];
let mut group = InputGroup::new_with_inputs(inputs);
group.register("first_full_scan", |column| {
fn run_bench_on_column_full_scan(b: &mut Bencher, column: Column) {
let num_iter = black_box(NUM_VALUES);
b.iter(|| {
let mut sum = 0u64;
for i in 0..NUM_VALUES as u32 {
for i in 0..num_iter as u32 {
let val = column.first(i);
sum += val.unwrap_or(0);
}
black_box(sum);
sum
});
group.register("first_block_fetch", |column| {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
}
fn run_bench_on_column_block_fetch(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
column.first_vals(&fetch_docids, &mut block);
black_box(block[0]);
block[0]
});
group.register("first_block_single_calls", |column| {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
}
fn run_bench_on_column_block_single_calls(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
for i in 0..fetch_docids.len() {
block[i] = column.first(fetch_docids[i]);
}
black_box(block[0]);
block[0]
});
group.run();
}
/// Column first method
#[bench]
fn bench_get_first_on_full_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_optional_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_multi_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_full_scan(b, column);
}
/// Block fetch column accessor
#[bench]
fn bench_get_block_first_on_optional_column(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_optional_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_single_calls(b, column);
}

View File

@@ -1,106 +0,0 @@
use binggan::{InputGroup, black_box};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy_columnar::column_index::{OptionalIndex, Set};
const TOTAL_NUM_VALUES: u32 = 1_000_000;
fn gen_optional_index(fill_ratio: f64) -> OptionalIndex {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let vals: Vec<u32> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.map(|(pos, _)| pos as u32)
.collect();
OptionalIndex::for_test(TOTAL_NUM_VALUES, &vals)
}
fn random_range_iterator(
start: u32,
end: u32,
avg_step_size: u32,
avg_deviation: u32,
) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start;
std::iter::from_fn(move || {
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
if current >= end { None } else { Some(current) }
})
}
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)
}
fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option<u32> {
walk_over_data_from_positions(
codec,
random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0),
)
}
fn walk_over_data_from_positions(
codec: &OptionalIndex,
positions: impl Iterator<Item = u32>,
) -> Option<u32> {
let mut dense_idx: Option<u32> = None;
for idx in positions {
dense_idx = dense_idx.or(codec.rank_if_exists(idx));
}
dense_idx
}
fn main() {
// Build separate inputs for each fill ratio.
let inputs: Vec<(String, OptionalIndex)> = vec![
("fill=1%".to_string(), gen_optional_index(0.01)),
("fill=5%".to_string(), gen_optional_index(0.05)),
("fill=10%".to_string(), gen_optional_index(0.10)),
("fill=50%".to_string(), gen_optional_index(0.50)),
("fill=90%".to_string(), gen_optional_index(0.90)),
];
let mut group: InputGroup<OptionalIndex> = InputGroup::new_with_inputs(inputs);
// Translate orig->codec (rank_if_exists) with sampling
group.register("orig_to_codec_10pct_hit", |codec: &OptionalIndex| {
black_box(walk_over_data(codec, 100));
});
group.register("orig_to_codec_1pct_hit", |codec: &OptionalIndex| {
black_box(walk_over_data(codec, 1000));
});
group.register("orig_to_codec_full_scan", |codec: &OptionalIndex| {
black_box(walk_over_data_from_positions(codec, 0..TOTAL_NUM_VALUES));
});
// Translate codec->orig (select/select_batch) on sampled ranks
fn bench_translate_codec_to_orig_util(codec: &OptionalIndex, percent_hit: f32) {
let num_non_nulls = codec.num_non_nulls();
let idxs: Vec<u32> = if percent_hit == 100.0f32 {
(0..num_non_nulls).collect()
} else {
n_percent_step_iterator(percent_hit, num_non_nulls).collect()
};
let mut output = vec![0u32; idxs.len()];
output.copy_from_slice(&idxs[..]);
codec.select_batch(&mut output);
black_box(output);
}
group.register("codec_to_orig_0.005pct_hit", |codec: &OptionalIndex| {
bench_translate_codec_to_orig_util(codec, 0.005);
});
group.register("codec_to_orig_10pct_hit", |codec: &OptionalIndex| {
bench_translate_codec_to_orig_util(codec, 10.0);
});
group.register("codec_to_orig_full_scan", |codec: &OptionalIndex| {
bench_translate_codec_to_orig_util(codec, 100.0);
});
group.run();
}

View File

@@ -1,12 +1,15 @@
#![feature(test)]
use std::ops::RangeInclusive;
use std::sync::Arc;
use binggan::{InputGroup, black_box};
use common::OwnedBytes;
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::{Rng, SeedableRng, random};
use tantivy_columnar::ColumnValues;
use test::Bencher;
extern crate test;
// TODO: does this make sense for IPv6?
fn generate_random() -> Vec<u64> {
@@ -44,77 +47,78 @@ fn get_data_50percent_item() -> Vec<u128> {
}
data.push(SINGLE_ITEM);
data.shuffle(&mut rng);
data.iter().map(|el| *el as u128).collect::<Vec<_>>()
let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
data
}
fn main() {
#[bench]
fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let column_range = get_u128_column_from_data(&data);
let column_random = get_u128_column_random();
let column = get_u128_column_from_data(&data);
struct Inputs {
data: Vec<u128>,
column_range: Arc<dyn ColumnValues<u128>>,
column_random: Arc<dyn ColumnValues<u128>>,
}
let inputs = Inputs {
data,
column_range,
column_random,
};
let mut group: InputGroup<Inputs> =
InputGroup::new_with_inputs(vec![("u128 benches".to_string(), inputs)]);
group.register(
"intfastfield_getrange_u128_50percent_hit",
|inp: &Inputs| {
let mut positions = Vec::new();
inp.column_range.get_row_ids_for_value_range(
*FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
0..inp.data.len() as u32,
&mut positions,
);
black_box(positions.len());
},
);
group.register("intfastfield_getrange_u128_single_hit", |inp: &Inputs| {
b.iter(|| {
let mut positions = Vec::new();
inp.column_range.get_row_ids_for_value_range(
column.get_row_ids_for_value_range(
*FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
0..data.len() as u32,
&mut positions,
);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let column = get_u128_column_from_data(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(
*SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128,
0..inp.data.len() as u32,
0..data.len() as u32,
&mut positions,
);
black_box(positions.len());
positions
});
}
group.register("intfastfield_getrange_u128_hit_all", |inp: &Inputs| {
#[bench]
fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
let data = get_data_50percent_item();
let column = get_u128_column_from_data(&data);
b.iter(|| {
let mut positions = Vec::new();
inp.column_range.get_row_ids_for_value_range(
0..=u128::MAX,
0..inp.data.len() as u32,
&mut positions,
);
black_box(positions.len());
column.get_row_ids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
positions
});
}
// U128 RANGE END
group.register("intfastfield_scan_all_fflookup_u128", |inp: &Inputs| {
#[bench]
fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {
let column = get_u128_column_random();
b.iter(|| {
let mut a = 0u128;
for i in 0u64..inp.column_random.num_vals() as u64 {
a += inp.column_random.get_val(i as u32);
for i in 0u64..column.num_vals() as u64 {
a += column.get_val(i as u32);
}
black_box(a);
a
});
}
group.register("intfastfield_jumpy_stride5_u128", |inp: &Inputs| {
let n = inp.column_random.num_vals();
#[bench]
fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) {
let column = get_u128_column_random();
b.iter(|| {
let n = column.num_vals();
let mut a = 0u128;
for i in (0..n / 5).map(|val| val * 5) {
a += inp.column_random.get_val(i);
a += column.get_val(i);
}
black_box(a);
a
});
group.run();
}

View File

@@ -1,10 +1,13 @@
#![feature(test)]
extern crate test;
use std::ops::RangeInclusive;
use std::sync::Arc;
use binggan::{InputGroup, black_box};
use rand::prelude::*;
use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values};
use tantivy_columnar::*;
use test::Bencher;
// Warning: this generates the same permutation at each call
fn generate_permutation() -> Vec<u64> {
@@ -24,11 +27,37 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn Colu
serialize_and_load_u64_based_column_values(&column, &[codec_type])
}
#[bench]
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = permutation[a as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_jumpy_fflookup_bitpacked(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = column.get_val(a as u32);
}
a
});
}
const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50;
const SINGLE_ITEM: u64 = 90;
const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49;
fn get_data_50percent_item() -> Vec<u128> {
let mut rng = StdRng::from_seed([1u8; 32]);
@@ -40,122 +69,135 @@ fn get_data_50percent_item() -> Vec<u128> {
data.push(SINGLE_ITEM);
data.shuffle(&mut rng);
data.iter().map(|el| *el as u128).collect::<Vec<_>>()
let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
data
}
type VecCol = (Vec<u64>, Arc<dyn ColumnValues<u64>>);
// U64 RANGE START
#[bench]
fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(
FIFTY_PERCENT_RANGE,
0..data.len() as u32,
&mut positions,
);
positions
});
}
fn bench_access() {
#[bench]
fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(
ONE_PERCENT_ITEM_RANGE,
0..data.len() as u32,
&mut positions,
);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..data.len() as u32, &mut positions);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
b.iter(|| {
let mut positions = Vec::new();
column.get_row_ids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
positions
});
}
// U64 RANGE END
#[bench]
fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
let permutation = generate_permutation();
let column_perm: Arc<dyn ColumnValues<u64>> =
serialize_and_load(&permutation, CodecType::Bitpacked);
let permutation_gcd = generate_permutation_gcd();
let column_perm_gcd: Arc<dyn ColumnValues<u64>> =
serialize_and_load(&permutation_gcd, CodecType::Bitpacked);
let mut group: InputGroup<VecCol> = InputGroup::new_with_inputs(vec![
(
"access".to_string(),
(permutation.clone(), column_perm.clone()),
),
(
"access_gcd".to_string(),
(permutation_gcd.clone(), column_perm_gcd.clone()),
),
]);
group.register("stride7_vec", |inp: &VecCol| {
let n = inp.0.len();
let n = permutation.len();
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += inp.0[i];
a += permutation[i as usize];
}
black_box(a);
a
});
}
group.register("fullscan_vec", |inp: &VecCol| {
let mut a = 0u64;
for i in 0..inp.0.len() {
a += inp.0[i];
}
black_box(a);
});
group.register("stride7_column_values", |inp: &VecCol| {
let n = inp.1.num_vals() as usize;
let mut a = 0u64;
#[bench]
fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
b.iter(|| {
let mut a = 0;
for i in (0..n / 7).map(|val| val * 7) {
a += inp.1.get_val(i as u32);
a += column.get_val(i as u32);
}
black_box(a);
a
});
}
group.register("fullscan_column_values", |inp: &VecCol| {
#[bench]
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
let column_ref = column.as_ref();
b.iter(|| {
let mut a = 0u64;
for i in 0u32..n as u32 {
a += column_ref.get_val(i);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
let permutation = generate_permutation_gcd();
let n = permutation.len();
let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
b.iter(|| {
let mut a = 0u64;
let n = inp.1.num_vals() as usize;
for i in 0..n {
a += inp.1.get_val(i as u32);
a += column.get_val(i as u32);
}
black_box(a);
a
});
group.run();
}
fn bench_range() {
let data_50 = get_data_50percent_item();
let data_u64 = data_50.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column_data: Arc<dyn ColumnValues<u64>> =
serialize_and_load(&data_u64, CodecType::Bitpacked);
let mut group: InputGroup<Arc<dyn ColumnValues<u64>>> =
InputGroup::new_with_inputs(vec![("dist_50pct_item".to_string(), column_data.clone())]);
group.register(
"fastfield_getrange_u64_50percent_hit",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(FIFTY_PERCENT_RANGE, 0..col.num_vals(), &mut positions);
black_box(positions.len());
},
);
group.register(
"fastfield_getrange_u64_1percent_hit",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(
ONE_PERCENT_ITEM_RANGE,
0..col.num_vals(),
&mut positions,
);
black_box(positions.len());
},
);
group.register(
"fastfield_getrange_u64_single_hit",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..col.num_vals(), &mut positions);
black_box(positions.len());
},
);
group.register(
"fastfield_getrange_u64_hit_all",
|col: &Arc<dyn ColumnValues<u64>>| {
let mut positions = Vec::new();
col.get_row_ids_for_value_range(0..=u64::MAX, 0..col.num_vals(), &mut positions);
black_box(positions.len());
},
);
group.run();
}
fn main() {
bench_access();
bench_range();
#[bench]
fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let mut a = 0u64;
for i in 0..permutation.len() {
a += permutation[i as usize] as u64;
}
a
});
}

View File

@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
ColumnIndex::Full => Box::new(doc_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_non_null_docs()
.iter_docs()
.map(move |row| row + doc_range.start),
),
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
multivalued_index
.optional_index
.iter_non_null_docs()
.iter_docs()
.map(move |row| row + doc_range.start),
),
},
@@ -105,11 +105,10 @@ fn get_num_values_iterator<'a>(
) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index {
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
ColumnIndex::Full => Box::new(std::iter::repeat_n(1u32, num_docs as usize)),
ColumnIndex::Optional(optional_index) => Box::new(std::iter::repeat_n(
1u32,
optional_index.num_non_nulls() as usize,
)),
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
ColumnIndex::Optional(optional_index) => {
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
}
ColumnIndex::Multivalued(multivalued_index) => Box::new(
multivalued_index
.get_start_index_column()
@@ -178,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
ColumnIndex::Full => Box::new(columnar_row_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_non_null_docs()
.iter_docs()
.map(move |row_id: RowId| columnar_row_range.start + row_id),
),
ColumnIndex::Multivalued(_) => {

View File

@@ -215,32 +215,6 @@ impl MultiValueIndex {
}
}
/// Returns an iterator over document ids that have at least one value.
pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => {
let mut doc: DocId = 0u32;
let num_docs = idx.num_docs();
Box::new(std::iter::from_fn(move || {
// This is not the most efficient way to do this, but it's legacy code.
while doc < num_docs {
let cur = doc;
doc += 1;
let start = idx.start_index_column.get_val(cur);
let end = idx.start_index_column.get_val(cur + 1);
if end > start {
return Some(cur);
}
}
None
}))
}
MultiValueIndex::MultiValueIndexV2(idx) => {
Box::new(idx.optional_index.iter_non_null_docs())
}
}
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// docids. Positions are converted inplace to docids.
///

View File

@@ -1,4 +1,4 @@
use std::io;
use std::io::{self, Write};
use std::sync::Arc;
mod set;
@@ -11,7 +11,7 @@ use set_block::{
};
use crate::iterable::Iterable;
use crate::{DocId, RowId};
use crate::{DocId, InvalidData, RowId};
/// The threshold for the number of elements after which we switch to dense block encoding.
///
@@ -88,7 +88,7 @@ pub struct OptionalIndex {
impl Iterable<u32> for &OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_non_null_docs())
Box::new(self.iter_docs())
}
}
@@ -280,9 +280,8 @@ impl OptionalIndex {
self.num_non_null_docs
}
pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize. We could iterate over the blocks directly.
// We use the dense value ids and retrieve the doc ids via select.
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize
let mut select_batch = self.select_cursor();
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
}
@@ -335,6 +334,38 @@ enum Block<'a> {
Sparse(SparseBlock<'a>),
}
#[derive(Debug, Copy, Clone)]
enum OptionalIndexCodec {
Dense = 0,
Sparse = 1,
}
impl OptionalIndexCodec {
fn to_code(self) -> u8 {
self as u8
}
fn try_from_code(code: u8) -> Result<Self, InvalidData> {
match code {
0 => Ok(Self::Dense),
1 => Ok(Self::Sparse),
_ => Err(InvalidData),
}
}
}
impl BinarySerializable for OptionalIndexCodec {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&[self.to_code()])
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let optional_codec_code = u8::deserialize(reader)?;
let optional_codec = Self::try_from_code(optional_codec_code)?;
Ok(optional_codec)
}
}
fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -> io::Result<()> {
let is_sparse = is_sparse(block_els.len() as u32);
if is_sparse {
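A back-of-envelope sketch of the dense/sparse break-even behind that threshold, under assumptions this diff does not spell out (blocks cover 2^16 ids, a sparse block stores one u16 per element, a dense block stores a plain bitmap; the real encoding adds some rank metadata on top):

```rust
const ELEMENTS_PER_BLOCK: u32 = 1 << 16;

// Sparse: one u16 id per present element.
fn sparse_block_bytes(num_els: u32) -> u32 {
    num_els * 2
}

// Dense: a fixed bitmap, one bit per possible id.
fn dense_block_bytes() -> u32 {
    ELEMENTS_PER_BLOCK / 8 // 8192 bytes regardless of fill
}

fn main() {
    // Sparse wins below roughly 4096 elements per block, dense above.
    assert!(sparse_block_bytes(4095) < dense_block_bytes());
    assert!(sparse_block_bytes(4097) > dense_block_bytes());
}
```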

View File

@@ -164,11 +164,7 @@ fn test_optional_index_large() {
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
assert_eq!(optional_index.num_docs(), num_rows);
assert!(
optional_index
.iter_non_null_docs()
.eq(row_ids.iter().copied())
);
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
}
#[test]
@@ -223,3 +219,170 @@ fn test_optional_index_for_tests() {
assert!(!optional_index.contains(3));
assert_eq!(optional_index.num_docs(), 4);
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::*;
const TOTAL_NUM_VALUES: u32 = 1_000_000;
fn gen_bools(fill_ratio: f64) -> OptionalIndex {
let mut out = Vec::new();
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.map(|(pos, _)| pos as RowId)
.collect();
serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
open_optional_index(OwnedBytes::new(out)).unwrap()
}
fn random_range_iterator(
start: u32,
end: u32,
avg_step_size: u32,
avg_deviation: u32,
) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start;
std::iter::from_fn(move || {
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
if current >= end { None } else { Some(current) }
})
}
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)
}
fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option<u32> {
walk_over_data_from_positions(
codec,
random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0),
)
}
fn walk_over_data_from_positions(
codec: &OptionalIndex,
positions: impl Iterator<Item = u32>,
) -> Option<u32> {
let mut dense_idx: Option<u32> = None;
for idx in positions {
dense_idx = dense_idx.or(codec.rank_if_exists(idx));
}
dense_idx
}
#[bench]
fn bench_translate_orig_to_codec_1percent_filled_10percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_5percent_filled_10percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_5percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 1000));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_1percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_10percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_90percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_10percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_50percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.5f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_90percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_0comma005percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.01f64, 0.005f32, bench);
}
#[bench]
fn bench_translate_codec_to_orig_10percent_filled_0comma005percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.1f64, 0.005f32, bench);
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_10percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.01f64, 10f32, bench);
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_full_scan(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.01f64, 100f32, bench);
}
fn bench_translate_codec_to_orig_util(
percent_filled: f64,
percent_hit: f32,
bench: &mut Bencher,
) {
let codec = gen_bools(percent_filled);
let num_non_nulls = codec.num_non_nulls();
let idxs: Vec<u32> = if percent_hit == 100.0f32 {
(0..num_non_nulls).collect()
} else {
n_percent_step_iterator(percent_hit, num_non_nulls).collect()
};
let mut output = vec![0u32; idxs.len()];
bench.iter(|| {
output.copy_from_slice(&idxs[..]);
codec.select_batch(&mut output);
});
}
#[bench]
fn bench_translate_codec_to_orig_90percent_filled_0comma005percent_hit(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.9f64, 0.005, bench);
}
#[bench]
fn bench_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
bench_translate_codec_to_orig_util(0.9f64, 100.0f32, bench);
}
}

View File

@@ -0,0 +1,139 @@
use std::sync::Arc;
use common::OwnedBytes;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::{self, Bencher};
use super::*;
use crate::column_values::u64_based::*;
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55000_u64)
.map(|num| num + rng.r#gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
let mut stats_collector = StatsCollector::default();
for val in vals {
stats_collector.collect(val);
}
stats_collector.stats()
}
#[inline(never)]
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
let mut codec_serializer = Codec::estimator();
for val in data {
codec_serializer.collect(*val);
}
codec_serializer
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
.unwrap();
Codec::load(OwnedBytes::new(bytes)).unwrap()
}
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = get_reader_for_bench::<Codec>(data);
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
#[inline(never)]
fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn ColumnValues>) {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = col.get_val(pos as u32);
sum = sum.wrapping_add(val);
}
sum
});
}
fn bench_get_dynamic<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = Arc::new(get_reader_for_bench::<Codec>(data));
bench_get_dynamic_helper(b, col);
}
fn bench_create<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let stats = compute_stats(data.iter().cloned());
let mut bytes = Vec::new();
b.iter(|| {
bytes.clear();
let mut codec_serializer = Codec::estimator();
for val in data.iter().take(1024) {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
});
}
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
}

View File

@@ -242,3 +242,6 @@ impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnV
.get_row_ids_for_value_range(range, doc_id_range, positions)
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench;

View File

@@ -185,10 +185,10 @@ impl CompactSpaceBuilder {
let mut covered_space = Vec::with_capacity(self.blanks.len());
// beginning of the blanks
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start)
&& *first_blank_start != 0
{
covered_space.push(0..=first_blank_start - 1);
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
if *first_blank_start != 0 {
covered_space.push(0..=first_blank_start - 1);
}
}
// Between the blanks
@@ -202,10 +202,10 @@ impl CompactSpaceBuilder {
covered_space.extend(between_blanks);
// end of the blanks
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end)
&& *last_blank_end != u128::MAX
{
covered_space.push(last_blank_end + 1..=u128::MAX);
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
if *last_blank_end != u128::MAX {
covered_space.push(last_blank_end + 1..=u128::MAX);
}
}
if covered_space.is_empty() {

View File

@@ -105,7 +105,7 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let num_bits_per_value = num_bits(stats);
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64)).div_ceil(8))
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
}
fn serialize(

View File

@@ -117,7 +117,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
Some(
stats.num_bytes()
+ linear_params.num_bytes()
+ (num_bits as u64 * stats.num_rows as u64).div_ceil(8),
+ (num_bits as u64 * stats.num_rows as u64 + 7) / 8,
)
}

View File

@@ -367,7 +367,7 @@ fn is_empty_after_merge(
ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() {
for doc in optional_index.iter_docs() {
if alive_bitset.contains(doc) {
return false;
}

View File

@@ -1,3 +1,5 @@
#![allow(clippy::manual_div_ceil)]
mod column_type;
mod format_version;
mod merge;

View File

@@ -244,7 +244,7 @@ impl SymbolValue for UnorderedId {
fn compute_num_bytes_for_u64(val: u64) -> usize {
let msb = (64u32 - val.leading_zeros()) as usize;
msb.div_ceil(8)
(msb + 7) / 8
}
fn encode_zig_zag(n: i64) -> u64 {
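A quick standalone check of what `compute_num_bytes_for_u64` returns (either formula in the hunk gives the same result):

```rust
fn compute_num_bytes_for_u64(val: u64) -> usize {
    let msb = (64u32 - val.leading_zeros()) as usize;
    msb.div_ceil(8)
}

fn main() {
    assert_eq!(compute_num_bytes_for_u64(0), 0); // no bits set
    assert_eq!(compute_num_bytes_for_u64(255), 1); // 8 bits fit in 1 byte
    assert_eq!(compute_num_bytes_for_u64(256), 2); // 9 bits need 2 bytes
    assert_eq!(compute_num_bytes_for_u64(u64::MAX), 8);
}
```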

View File

@@ -17,10 +17,15 @@
//! column.
//! - [column_values]: Stores the values of a column in a dense format.
// #![cfg_attr(all(feature = "unstable", test), feature(test))]
#[cfg(test)]
#[macro_use]
extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
use std::fmt::Display;
use std::io;

View File

@@ -1,5 +1,3 @@
use std::str::FromStr;
use common::DateTime;
use crate::InvalidData;
@@ -11,23 +9,6 @@ pub enum NumericalValue {
F64(f64),
}
impl FromStr for NumericalValue {
type Err = ();
fn from_str(s: &str) -> Result<Self, ()> {
if let Ok(val_i64) = s.parse::<i64>() {
return Ok(val_i64.into());
}
if let Ok(val_u64) = s.parse::<u64>() {
return Ok(val_u64.into());
}
if let Ok(val_f64) = s.parse::<f64>() {
return Ok(NumericalValue::from(val_f64).normalize());
}
Err(())
}
}
impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType {
match self {
@@ -45,7 +26,7 @@ impl NumericalValue {
if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64)
} else {
NumericalValue::U64(val)
NumericalValue::F64(val as f64)
}
}
NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -160,7 +141,6 @@ impl Coerce for DateTime {
#[cfg(test)]
mod tests {
use super::NumericalType;
use crate::NumericalValue;
#[test]
fn test_numerical_type_code() {
@@ -173,58 +153,4 @@ mod tests {
}
assert_eq!(num_numerical_type, 3);
}
#[test]
fn test_parse_numerical() {
assert_eq!(
"123".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(123)
);
assert_eq!(
"18446744073709551615".parse::<NumericalValue>().unwrap(),
NumericalValue::U64(18446744073709551615u64)
);
assert_eq!(
"1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(1i64)
);
assert_eq!(
"1.1".parse::<NumericalValue>().unwrap(),
NumericalValue::F64(1.1f64)
);
assert_eq!(
"-1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(-1i64)
);
}
#[test]
fn test_normalize_numerical() {
assert_eq!(
NumericalValue::from(1u64).normalize(),
NumericalValue::I64(1i64),
);
let limit_val = i64::MAX as u64 + 1u64;
assert_eq!(
NumericalValue::from(limit_val).normalize(),
NumericalValue::U64(limit_val),
);
assert_eq!(
NumericalValue::from(-1i64).normalize(),
NumericalValue::I64(-1i64),
);
assert_eq!(
NumericalValue::from(-2.0f64).normalize(),
NumericalValue::I64(-2i64),
);
assert_eq!(
NumericalValue::from(-2.1f64).normalize(),
NumericalValue::F64(-2.1f64),
);
let large_float = 2.0f64.powf(70.0f64);
assert_eq!(
NumericalValue::from(large_float).normalize(),
NumericalValue::F64(large_float),
);
}
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.10.0"
version = "0.9.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2024"

View File

@@ -9,7 +9,7 @@ use crate::ByteCount;
pub struct TinySet(u64);
impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.into_iter().collect::<Vec<u32>>().fmt(f)
}
}
@@ -182,8 +182,9 @@ pub struct BitSet {
max_value: u32,
}
#[inline(always)]
fn num_buckets(max_val: u32) -> u32 {
max_val.div_ceil(64u32)
(max_val + 63u32) / 64u32
}
impl BitSet {
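Each bucket is one `TinySet`, i.e. one `u64` of presence bits, so `num_buckets` is just a ceiling division by 64. A small worked example (assuming `max_val` counts the representable ids, as the field name suggests):

```rust
fn num_buckets(max_val: u32) -> u32 {
    (max_val + 63u32) / 64u32
}

fn main() {
    assert_eq!(num_buckets(0), 0); // no ids, no buckets
    assert_eq!(num_buckets(64), 1); // ids 0..64 fit in a single u64
    assert_eq!(num_buckets(65), 2); // one extra id spills into a second bucket
    assert_eq!(num_buckets(130), 3);
}
```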

View File

@@ -1,4 +1,6 @@
#![allow(clippy::len_without_is_empty)]
// The manual div-ceil pattern is kept on purpose: `div_ceil` actually generates code
// that is not optimal (it has to accept the full range of u32), and perf matters here.
#![allow(clippy::len_without_is_empty, clippy::manual_div_ceil)]
use std::ops::Deref;

View File

@@ -29,7 +29,6 @@ impl BinarySerializable for VIntU128 {
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes();
let mut result = 0u128;
let mut shift = 0u64;
@@ -53,7 +52,7 @@ impl BinarySerializable for VIntU128 {
}
}
/// Wrapper over a `u64` that serializes as a variable int.
/// Wrapper over a `u64` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VInt(pub u64);
@@ -197,7 +196,6 @@ impl BinarySerializable for VInt {
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes();
let mut result = 0u64;
let mut shift = 0u64;
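The two deserializers above share the usual varint shape: seven payload bits per byte, plus a marker bit saying whether more bytes follow. A standalone sketch of the decoding loop (LEB128-style continuation bit; the exact tantivy wire convention may differ):

```rust
use std::io::{self, Read};

/// Decode a variable-length u64: 7 data bits per byte,
/// high bit set means "more bytes follow".
fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
    let mut result = 0u64;
    let mut shift = 0u32;
    loop {
        let mut buf = [0u8; 1];
        reader.read_exact(&mut buf)?;
        let byte = buf[0];
        result |= u64::from(byte & 0x7F) << shift;
        if byte & 0x80 == 0 {
            return Ok(result);
        }
        shift += 7;
    }
}

fn main() -> io::Result<()> {
    // 300 = 0b1_0010_1100 encodes as [0xAC, 0x02].
    let bytes = [0xACu8, 0x02];
    assert_eq!(read_vint(&mut &bytes[..])?, 300);
    Ok(())
}
```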

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.25.0"
version = "0.24.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -15,5 +15,3 @@ edition = "2024"
nom = "7"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
ordered-float = "5.0.0"
fnv = "1.0.7"

View File

@@ -117,22 +117,6 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
}
}
pub(crate) fn terminated_infallible<I, O1, O2, F, G>(
mut first: F,
mut second: G,
) -> impl FnMut(I) -> JResult<I, O1>
where
F: nom::Parser<I, (O1, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
{
move |input: I| {
let (input, (o1, mut err)) = first.parse(input)?;
let (input, (_, mut err2)) = second.parse(input)?;
err.append(&mut err2);
Ok((input, (o1, err)))
}
}
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
mut first: F,
mut second: G,

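The removed `terminated_infallible` mirrors nom's `terminated` for parsers that never fail and instead accumulate errors on the side. A simplified, nom-free sketch of that shape (the boxed-closure alias is a stand-in for the real combinator types):

```rust
type ErrorList = Vec<String>;

/// An "infallible" parser always succeeds, returning the remaining input,
/// a value, and whatever errors it accumulated along the way.
type Infallible<'a, O> = Box<dyn Fn(&'a str) -> (&'a str, (O, ErrorList)) + 'a>;

/// Run `first` then `second`, keep the first value, merge both error lists.
fn terminated_infallible<'a, O1: 'a, O2: 'a>(
    first: Infallible<'a, O1>,
    second: Infallible<'a, O2>,
) -> Infallible<'a, O1> {
    Box::new(move |input| {
        let (input, (o1, mut errs)) = first(input);
        let (input, (_, mut errs2)) = second(input);
        errs.append(&mut errs2);
        (input, (o1, errs))
    })
}

fn main() {
    let word: Infallible<'_, String> = Box::new(|s| {
        let end = s.find(' ').unwrap_or(s.len());
        (&s[end..], (s[..end].to_string(), ErrorList::new()))
    });
    let space: Infallible<'_, ()> = Box::new(|s| match s.strip_prefix(' ') {
        Some(rest) => (rest, ((), ErrorList::new())),
        None => (s, ((), vec!["expected space".to_string()])),
    });
    let parser = terminated_infallible(word, space);
    let (rest, (value, errs)) = parser("hello world");
    assert_eq!(value, "hello");
    assert_eq!(rest, "world");
    assert!(errs.is_empty());
}
```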
View File

@@ -31,17 +31,7 @@ pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
#[cfg(test)]
mod tests {
use crate::{UserInputAst, parse_query, parse_query_lenient};
#[test]
fn test_deduplication() {
let ast: UserInputAst = parse_query("a a").unwrap();
let json = serde_json::to_string(&ast).unwrap();
assert_eq!(
json,
r#"{"type":"bool","clauses":[[null,{"type":"literal","field_name":null,"phrase":"a","delimiter":"none","slop":0,"prefix":false}]]}"#
);
}
use crate::{parse_query, parse_query_lenient};
#[test]
fn test_parse_query_serialization() {

View File

@@ -1,7 +1,6 @@
use std::borrow::Cow;
use std::iter::once;
use fnv::FnvHashSet;
use nom::IResult;
use nom::branch::alt;
use nom::bytes::complete::tag;
@@ -37,7 +36,7 @@ fn field_name(inp: &str) -> IResult<&str, String> {
alt((first_char, escape_sequence())),
many0(alt((simple_char, escape_sequence(), char('\\')))),
)),
tuple((multispace0, char(':'), multispace0)),
char(':'),
),
|(first_char, next)| once(first_char).chain(next).collect(),
)(inp)
@@ -69,7 +68,7 @@ fn interpret_escape(source: &str) -> String {
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, Cow<'_, str>> {
fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res(
recognize(tuple((
alt((
@@ -367,10 +366,7 @@ fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// something (a field name) got parsed before
alt((
map(
tuple((
opt(field_name),
alt((range, set, exists, regex, term_or_phrase)),
)),
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
),
term_group,
@@ -392,10 +388,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
value((), peek(one_of("{[><"))),
map(range_infallible, |(range, errs)| (Some(range), errs)),
),
(
value((), peek(one_of("/"))),
map(regex_infallible, |(regex, errs)| (Some(regex), errs)),
),
),
delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing),
),
@@ -696,61 +688,6 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
}
}
fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
map(
terminated(
delimited(
char('/'),
many1(alt((preceded(char('\\'), char('/')), none_of("/")))),
char('/'),
),
peek(alt((multispace1, eof))),
),
|elements| UserInputLeaf::Regex {
field: None,
pattern: elements.into_iter().collect::<String>(),
},
)(inp)
}
fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
match terminated_infallible(
delimited_infallible(
opt_i_err(char('/'), "missing delimiter /"),
opt_i(many1(alt((preceded(char('\\'), char('/')), none_of("/"))))),
opt_i_err(char('/'), "missing delimiter /"),
),
opt_i_err(
peek(alt((multispace1, eof))),
"expected whitespace or end of input",
),
)(inp)
{
Ok((rest, (elements_part, errors))) => {
let pattern = match elements_part {
Some(elements_part) => elements_part.into_iter().collect(),
None => String::new(),
};
let res = UserInputLeaf::Regex {
field: None,
pattern,
};
Ok((rest, (res, errors)))
}
Err(e) => {
let errs = vec![LenientErrorInternal {
pos: inp.len(),
message: e.to_string(),
}];
let res = UserInputLeaf::Regex {
field: None,
pattern: String::new(),
};
Ok((inp, (res, errs)))
}
}
}
fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
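The removed `regex` parser accepts `/pattern/`, where a slash inside the pattern is written `\/`, and requires whitespace or end of input after the closing delimiter. A standalone sketch of the tokenization rule (hand-rolled, not the nom implementation; edge cases may differ):

```rust
/// Extract the body of a `/.../` literal, honoring `\/` escapes.
/// Returns the pattern and the unconsumed remainder.
fn parse_regex_literal(input: &str) -> Option<(String, &str)> {
    let rest = input.strip_prefix('/')?;
    let mut pattern = String::new();
    let mut chars = rest.char_indices();
    while let Some((i, c)) = chars.next() {
        match c {
            // `\/` keeps the slash in the pattern; other escapes pass through.
            '\\' => match chars.next() {
                Some((_, '/')) => pattern.push('/'),
                Some((_, other)) => {
                    pattern.push('\\');
                    pattern.push(other);
                }
                None => return None,
            },
            '/' => return Some((pattern, &rest[i + 1..])),
            other => pattern.push(other),
        }
    }
    None // unterminated literal
}

fn main() {
    assert_eq!(
        parse_regex_literal("/joh?n(ath[oa]n)/ rest"),
        Some(("joh?n(ath[oa]n)".to_string(), " rest"))
    );
}
```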
@@ -815,7 +752,7 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
tuple((leaf, fallible(boost))),
|(leaf, boost_opt)| match boost_opt {
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => {
UserInputAst::Boost(Box::new(leaf), boost.into())
UserInputAst::Boost(Box::new(leaf), boost)
}
_ => leaf,
},
@@ -827,7 +764,7 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
tuple_infallible((leaf_infallible, boost)),
|((leaf, boost_opt), error)| match boost_opt {
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => (
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost.into())),
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost)),
error,
),
_ => (leaf, error),
@@ -1078,25 +1015,12 @@ pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>
(rewrite_ast(res), errors)
}
/// Removes unnecessary child clauses in the AST
///
/// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {
if let UserInputAst::Clause(sub_clauses) = &mut input {
// call rewrite_ast recursively on child clauses if applicable
let mut new_clauses = Vec::with_capacity(sub_clauses.len());
for (occur, clause) in sub_clauses.drain(..) {
let rewritten_clause = rewrite_ast(clause);
new_clauses.push((occur, rewritten_clause));
}
*sub_clauses = new_clauses;
// remove duplicate child clauses
// e.g. (+a +b) OR (+c +d) OR (+a +b) => (+a +b) OR (+c +d)
let mut seen = FnvHashSet::default();
sub_clauses.retain(|term| seen.insert(term.clone()));
// Removes unnecessary child clauses in the AST
//
// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
for term in sub_clauses {
if let UserInputAst::Clause(terms) = &mut input {
for term in terms {
rewrite_ast_clause(term);
}
}
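The removed deduplication step is the classic retain-with-seen-set idiom: it keeps the first occurrence of each clause and preserves order. In miniature (plain `HashSet` standing in for `FnvHashSet`):

```rust
use std::collections::HashSet;

fn main() {
    // e.g. (+a +b) OR (+c +d) OR (+a +b) => (+a +b) OR (+c +d)
    let mut clauses = vec!["(+a +b)", "(+c +d)", "(+a +b)"];
    let mut seen = HashSet::new();
    clauses.retain(|clause| seen.insert(*clause));
    assert_eq!(clauses, vec!["(+a +b)", "(+c +d)"]);
}
```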
@@ -1358,10 +1282,6 @@ mod test {
super::field_name("~my~field:a"),
Ok(("a", "~my~field".to_string()))
);
assert_eq!(
super::field_name(".my.field.name : a"),
Ok(("a", ".my.field.name".to_string()))
);
for special_char in SPECIAL_CHARS.iter() {
let query = &format!("\\{special_char}my\\{special_char}field:a");
assert_eq!(
@@ -1768,72 +1688,4 @@ mod test {
fn test_invalid_field() {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
#[test]
fn test_regex_parser() {
let r = parse_to_ast(r#"a:/joh?n(ath[oa]n)/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "joh?n(ath[oa]n)");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
let r = parse_to_ast(r#"a:/\\/cgi-bin\\/luci.*/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "\\/cgi-bin\\/luci.*");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
}
#[test]
fn test_regex_parser_lenient() {
let literal = |query| literal_infallible(query).unwrap().1;
let (res, errs) = literal(r#"a:/joh?n(ath[oa]n)/"#);
let expected = UserInputLeaf::Regex {
field: Some("a".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert!(errs.is_empty(), "Expected no errors, got: {errs:?}");
let (res, errs) = literal("title:/joh?n(ath[oa]n)");
let expected = UserInputLeaf::Regex {
field: Some("title".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert_eq!(errs.len(), 1, "Expected 1 error, got: {errs:?}");
assert_eq!(
errs[0].message, "missing delimiter /",
"Unexpected error message",
);
}
#[test]
fn test_space_before_value() {
test_parse_query_to_ast_helper("field : a", r#""field":a"#);
test_parse_query_to_ast_helper("field: a", r#""field":a"#);
test_parse_query_to_ast_helper("field :a", r#""field":a"#);
test_parse_query_to_ast_helper(
"field : 'happy tax payer' AND other_field : 1",
r#"(+"field":'happy tax payer' +"other_field":1)"#,
);
}
}

View File

@@ -5,7 +5,7 @@ use serde::Serialize;
use crate::Occur;
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[derive(PartialEq, Clone, Serialize)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum UserInputLeaf {
@@ -23,10 +23,6 @@ pub enum UserInputLeaf {
Exists {
field: String,
},
Regex {
field: Option<String>,
pattern: String,
},
}
impl UserInputLeaf {
@@ -50,7 +46,6 @@ impl UserInputLeaf {
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
UserInputLeaf::Regex { field: _, pattern } => UserInputLeaf::Regex { field, pattern },
}
}
@@ -108,19 +103,11 @@ impl Debug for UserInputLeaf {
UserInputLeaf::Exists { field } => {
write!(formatter, "$exists(\"{field}\")")
}
UserInputLeaf::Regex { field, pattern } => {
if let Some(field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
}
// TODO properly escape pattern (in case of \")
write!(formatter, "/{pattern}/")
}
}
}
}
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize)]
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum Delimiter {
SingleQuotes,
@@ -128,7 +115,7 @@ pub enum Delimiter {
None,
}
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[derive(PartialEq, Clone, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct UserInputLiteral {
pub field_name: Option<String>,
@@ -167,7 +154,7 @@ impl fmt::Debug for UserInputLiteral {
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone, Serialize)]
#[derive(PartialEq, Debug, Clone, Serialize)]
#[serde(tag = "type", content = "value")]
#[serde(rename_all = "snake_case")]
pub enum UserInputBound {
@@ -204,11 +191,11 @@ impl UserInputBound {
}
}
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[derive(PartialEq, Clone, Serialize)]
#[serde(into = "UserInputAstSerde")]
pub enum UserInputAst {
Clause(Vec<(Option<Occur>, UserInputAst)>),
Boost(Box<UserInputAst>, ordered_float::OrderedFloat<f64>),
Boost(Box<UserInputAst>, f64),
Leaf(Box<UserInputLeaf>),
}
@@ -230,10 +217,9 @@ impl From<UserInputAst> for UserInputAstSerde {
fn from(ast: UserInputAst) -> Self {
match ast {
UserInputAst::Clause(clause) => UserInputAstSerde::Bool { clauses: clause },
UserInputAst::Boost(underlying, boost) => UserInputAstSerde::Boost {
underlying,
boost: boost.into_inner(),
},
UserInputAst::Boost(underlying, boost) => {
UserInputAstSerde::Boost { underlying, boost }
}
UserInputAst::Leaf(leaf) => UserInputAstSerde::Leaf(leaf),
}
}
@@ -392,7 +378,7 @@ mod tests {
#[test]
fn test_boost_serialization() {
let inner_ast = UserInputAst::Leaf(Box::new(UserInputLeaf::All));
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5.into());
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5);
let json = serde_json::to_string(&boost_ast).unwrap();
assert_eq!(
json,
@@ -419,7 +405,7 @@ mod tests {
}))),
),
])),
2.5.into(),
2.5,
);
let json = serde_json::to_string(&boost_ast).unwrap();
assert_eq!(

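Context for the `f64` ↔ `OrderedFloat<f64>` swap above: `f64` is neither `Eq` nor `Hash` because `NaN != NaN`, so an AST node holding a raw `f64` cannot be deduplicated through a hash set. A minimal sketch of the wrapper's effect, assuming the `ordered-float` crate:

```rust
use std::collections::HashSet;

use ordered_float::OrderedFloat;

// With a plain `f64` field, deriving Eq and Hash would be rejected,
// since f64 only implements PartialEq.
#[derive(PartialEq, Eq, Hash, Debug)]
struct Boost(OrderedFloat<f64>);

fn main() {
    let mut set = HashSet::new();
    set.insert(Boost(2.5.into()));
    set.insert(Boost(2.5.into()));
    assert_eq!(set.len(), 1); // duplicates collapse
}
```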
View File

@@ -301,7 +301,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
let bounds = self.bounds;
let interval = self.interval;
let offset = self.offset;
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
let get_bucket_pos = |val| (get_bucket_pos_f64(val, interval, offset) as i64);
bucket_agg_accessor
.column_block_accessor

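For reference, `get_bucket_pos_f64` presumably computes the usual histogram bucketing `floor((val - offset) / interval)`; the cast to `i64` then allows negative bucket positions for values below the offset. A worked sketch (the helper's name is from the diff, its body is an assumption):

```rust
fn get_bucket_pos_f64(val: f64, interval: f64, offset: f64) -> f64 {
    ((val - offset) / interval).floor()
}

fn main() {
    let get_bucket_pos = |val| (get_bucket_pos_f64(val, 10.0, 0.0) as i64);
    assert_eq!(get_bucket_pos(0.0), 0);
    assert_eq!(get_bucket_pos(9.9), 0);
    assert_eq!(get_bucket_pos(10.0), 1);
    assert_eq!(get_bucket_pos(-0.5), -1); // negative bucket below the offset
}
```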
View File

@@ -1293,220 +1293,6 @@ mod tests {
assert_eq!(page_0, &page_2[..page_0.len()]);
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
/// Build multiple segments with equal-scoring docs and verify stable ordering
/// across pages when increasing limit or offset.
#[test]
fn proptest_stable_ordering_across_segments_with_pagination(
docs_per_segment in proptest::collection::vec(1usize..50, 2..5)
) {
use crate::indexer::NoMergePolicy;
// Build an index with multiple segments; all docs will have the same score using AllQuery.
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.set_merge_policy(Box::new(NoMergePolicy));
for num_docs in &docs_per_segment {
for _ in 0..*num_docs {
writer.add_document(doc!(text => "x")).unwrap();
}
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let total_docs: usize = docs_per_segment.iter().sum();
// Full result set, first assert all scores are identical.
let full_with_scores: Vec<(Score, DocAddress)> = searcher
.search(&AllQuery, &TopDocs::with_limit(total_docs))
.unwrap();
// Sanity: at least one document was returned.
prop_assert!(!full_with_scores.is_empty());
let first_score = full_with_scores[0].0;
prop_assert!(full_with_scores.iter().all(|(score, _)| *score == first_score));
// Keep only the addresses for the remaining checks.
let full: Vec<DocAddress> = full_with_scores
.into_iter()
.map(|(_score, addr)| addr)
.collect();
// Sanity: we actually created multiple segments and have documents.
prop_assert!(docs_per_segment.len() >= 2);
prop_assert!(total_docs >= 2);
// 1) Increasing limit should preserve prefix ordering.
for k in 1..=total_docs {
let page: Vec<DocAddress> = searcher
.search(&AllQuery, &TopDocs::with_limit(k))
.unwrap()
.into_iter()
.map(|(_score, addr)| addr)
.collect();
prop_assert_eq!(page, full[..k].to_vec());
}
// 2) Offset + limit pages should always match the corresponding slice.
// For each offset, check three representative page sizes:
// - first page (size 1)
// - a middle page (roughly half of remaining)
// - the last page (size = remaining)
for offset in 0..total_docs {
let remaining = total_docs - offset;
let assert_page_eq = |limit: usize| -> proptest::test_runner::TestCaseResult {
let page: Vec<DocAddress> = searcher
.search(&AllQuery, &TopDocs::with_limit(limit).and_offset(offset))
.unwrap()
.into_iter()
.map(|(_score, addr)| addr)
.collect();
prop_assert_eq!(page, full[offset..offset + limit].to_vec());
Ok(())
};
// Smallest page.
assert_page_eq(1)?;
// A middle-sized page (dedupes to 1 if remaining == 1).
assert_page_eq((remaining / 2).max(1))?;
// Largest page for this offset.
assert_page_eq(remaining)?;
}
// 3) Concatenating fixed-size pages by offset reproduces the full order.
for page_size in 1..=total_docs.min(5) {
let mut concat: Vec<DocAddress> = Vec::new();
let mut offset = 0;
while offset < total_docs {
let size = page_size.min(total_docs - offset);
let page: Vec<DocAddress> = searcher
.search(&AllQuery, &TopDocs::with_limit(size).and_offset(offset))
.unwrap()
.into_iter()
.map(|(_score, addr)| addr)
.collect();
concat.extend(page);
offset += size;
}
// Avoid moving `full` across loop iterations.
prop_assert_eq!(concat, full.clone());
}
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
/// Build multiple segments with same-scoring term matches and verify stable ordering
/// across pages for a real scoring query (TermQuery with identical TF and fieldnorm).
#[test]
fn proptest_stable_ordering_across_segments_with_term_query_and_pagination(
docs_per_segment in proptest::collection::vec(1usize..50, 2..5)
) {
use crate::indexer::NoMergePolicy;
use crate::schema::IndexRecordOption;
use crate::query::TermQuery;
use crate::Term;
// Build an index with multiple segments; each doc has exactly one token "x",
// ensuring equal BM25 scores across all matching docs (same TF=1 and fieldnorm=1).
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.set_merge_policy(Box::new(NoMergePolicy));
for num_docs in &docs_per_segment {
for _ in 0..*num_docs {
writer.add_document(doc!(text => "x")).unwrap();
}
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let total_docs: usize = docs_per_segment.iter().sum();
let term = Term::from_field_text(text, "x");
let tq = TermQuery::new(term, IndexRecordOption::WithFreqs);
// Full result set, first assert all scores are identical across docs.
let full_with_scores: Vec<(Score, DocAddress)> = searcher
.search(&tq, &TopDocs::with_limit(total_docs))
.unwrap();
// Sanity: at least one document was returned.
prop_assert!(!full_with_scores.is_empty());
let first_score = full_with_scores[0].0;
prop_assert!(full_with_scores.iter().all(|(score, _)| *score == first_score));
// Keep only the addresses for the remaining checks.
let full: Vec<DocAddress> = full_with_scores
.into_iter()
.map(|(_score, addr)| addr)
.collect();
// Sanity: we actually created multiple segments and have documents.
prop_assert!(docs_per_segment.len() >= 2);
prop_assert!(total_docs >= 2);
// 1) Increasing limit should preserve prefix ordering.
for k in 1..=total_docs {
let page: Vec<DocAddress> = searcher
.search(&tq, &TopDocs::with_limit(k))
.unwrap()
.into_iter()
.map(|(_score, addr)| addr)
.collect();
prop_assert_eq!(page, full[..k].to_vec());
}
// 2) Offset + limit pages should always match the corresponding slice.
// Check three representative page sizes for each offset: 1, ~half, and remaining.
for offset in 0..total_docs {
let remaining = total_docs - offset;
let assert_page_eq = |limit: usize| -> proptest::test_runner::TestCaseResult {
let page: Vec<DocAddress> = searcher
.search(&tq, &TopDocs::with_limit(limit).and_offset(offset))
.unwrap()
.into_iter()
.map(|(_score, addr)| addr)
.collect();
prop_assert_eq!(page, full[offset..offset + limit].to_vec());
Ok(())
};
assert_page_eq(1)?;
assert_page_eq((remaining / 2).max(1))?;
assert_page_eq(remaining)?;
}
// 3) Concatenating fixed-size pages by offset reproduces the full order.
for page_size in 1..=total_docs.min(5) {
let mut concat: Vec<DocAddress> = Vec::new();
let mut offset = 0;
while offset < total_docs {
let size = page_size.min(total_docs - offset);
let page: Vec<DocAddress> = searcher
.search(&tq, &TopDocs::with_limit(size).and_offset(offset))
.unwrap()
.into_iter()
.map(|(_score, addr)| addr)
.collect();
concat.extend(page);
offset += size;
}
prop_assert_eq!(concat, full.clone());
}
}
}
#[test]
#[should_panic]
fn test_top_0() {

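The removed proptests exercise the guarantee that paging with `and_offset` slices one globally stable ordering. The API usage they rely on, condensed into a minimal runnable form:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer: IndexWriter = index.writer(50_000_000)?;
    for _ in 0..25 {
        writer.add_document(doc!(text => "x"))?;
    }
    writer.commit()?;
    let searcher = index.reader()?.searcher();
    // Third page of size 10: only ranks 20..25 remain.
    let page = searcher.search(&AllQuery, &TopDocs::with_limit(10).and_offset(20))?;
    assert_eq!(page.len(), 5);
    Ok(())
}
```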
View File

@@ -1,4 +1,3 @@
use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -153,7 +152,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value::<u64>(val);
term_buffer.append_type_and_fast_value(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
@@ -167,30 +166,12 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::F64(val) => {
if !val.is_finite() {
return;
};
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
// Normalize here is important.
// In the inverted index, we coerce all numerical values to their canonical
// representation.
//
// (We do the same thing on the query side)
match NumericalValue::F64(val).normalize() {
NumericalValue::I64(val_i64) => {
term_buffer.append_type_and_fast_value::<i64>(val_i64);
}
NumericalValue::U64(val_u64) => {
term_buffer.append_type_and_fast_value::<u64>(val_u64);
}
NumericalValue::F64(val_f64) => {
term_buffer.append_type_and_fast_value::<f64>(val_f64);
}
}
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Bool(val) => {
@@ -260,8 +241,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(
term: &Term,
text: &str,
mut term: Term,
phrase: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
assert_eq!(
@@ -273,50 +254,31 @@ pub fn convert_to_fast_value_and_append_to_json_term(
0,
"JSON value bytes should be empty"
);
try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
.or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
.or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
}
fn try_convert_to_datetime_and_append_to_json_term(
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
}
term.append_type_and_fast_value(dt);
return Some(term);
}
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(dt);
Some(term_clone)
}
fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
let mut term_clone = term.clone();
// Parse actually returns normalized values already today, but let's
// not rely on that hidden contract.
match numerical_value.normalize() {
NumericalValue::I64(i64_value) => {
term_clone.append_type_and_fast_value::<i64>(i64_value);
}
NumericalValue::U64(u64_value) => {
term_clone.append_type_and_fast_value::<u64>(u64_value);
}
NumericalValue::F64(f64_value) => {
term_clone.append_type_and_fast_value::<f64>(f64_value);
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
term.append_type_and_fast_value(i64_val);
return Some(term);
}
Some(term_clone)
}
fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
let val = str::parse::<bool>(text).ok()?;
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(val);
Some(term_clone)
if let Ok(u64_val) = str::parse::<u64>(phrase) {
term.append_type_and_fast_value(u64_val);
return Some(term);
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
term.append_type_and_fast_value(f64_val);
return Some(term);
}
if let Ok(bool_val) = str::parse::<bool>(phrase) {
term.append_type_and_fast_value(bool_val);
return Some(term);
}
None
}
/// Splits a json path supplied to the query parser in such a way that

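The fallback chain above (i64, then u64, then f64, then bool) is what decides how an unquoted value in a JSON-path query gets typed. A standalone sketch of that precedence, with a hypothetical enum standing in for the typed terms:

```rust
#[derive(Debug, PartialEq)]
enum FastValue {
    I64(i64),
    U64(u64),
    F64(f64),
    Bool(bool),
}

/// Mirror the parse order above: i64 first, u64 only for values beyond
/// i64::MAX, then f64, then bool.
fn coerce(text: &str) -> Option<FastValue> {
    if let Ok(v) = text.parse::<i64>() {
        return Some(FastValue::I64(v));
    }
    if let Ok(v) = text.parse::<u64>() {
        return Some(FastValue::U64(v));
    }
    if let Ok(v) = text.parse::<f64>() {
        return Some(FastValue::F64(v));
    }
    text.parse::<bool>().ok().map(FastValue::Bool)
}

fn main() {
    assert_eq!(coerce("123"), Some(FastValue::I64(123)));
    assert_eq!(coerce("18446744073709551615"), Some(FastValue::U64(u64::MAX)));
    assert_eq!(coerce("1.5"), Some(FastValue::F64(1.5)));
    assert_eq!(coerce("true"), Some(FastValue::Bool(true)));
    assert_eq!(coerce("abc"), None);
}
```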
View File

@@ -484,8 +484,10 @@ impl Directory for MmapDirectory {
.map_err(LockError::wrap_io_error)?;
if lock.is_blocking {
file.lock_exclusive().map_err(LockError::wrap_io_error)?;
} else if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
return Err(LockError::LockBusy);
} else {
if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
return Err(LockError::LockBusy);
}
}
// dropping the file handle will release the lock.
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {

View File

@@ -216,7 +216,7 @@ impl IndexBuilder {
/// Opens or creates a new index in the provided directory
pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
let dir: Box<dyn Directory> = dir.into();
let dir = dir.into();
if !Index::exists(&*dir)? {
return self.create(dir);
}
@@ -494,7 +494,7 @@ impl Index {
.into_iter()
.map(|segment| SegmentReader::open(&segment)?.fields_metadata())
.collect::<Result<_, _>>()?;
Ok(merge_field_meta_data(fields_metadata))
Ok(merge_field_meta_data(fields_metadata, &self.schema()))
}
/// Creates a new segment_meta (Advanced user only).

View File

@@ -1,7 +1,8 @@
use std::io;
use common::json_path_writer::JSON_END_OF_PATH;
use common::{BinarySerializable, ByteCount};
use common::BinarySerializable;
use fnv::FnvHashSet;
#[cfg(feature = "quickwit")]
use futures_util::{FutureExt, StreamExt, TryStreamExt};
#[cfg(feature = "quickwit")]
@@ -35,33 +36,6 @@ pub struct InvertedIndexReader {
total_num_tokens: u64,
}
/// Object that records the amount of space used by a field in an inverted index.
pub(crate) struct InvertedIndexFieldSpace {
pub field_name: String,
pub field_type: Type,
pub postings_size: ByteCount,
pub positions_size: ByteCount,
pub num_terms: u64,
}
/// Returns None if the term is not a valid JSON path.
fn extract_field_name_and_field_type_from_json_path(term: &[u8]) -> Option<(String, Type)> {
let index = term.iter().position(|&byte| byte == JSON_END_OF_PATH)?;
let field_type_code = term.get(index + 1).copied()?;
let field_type = Type::from_code(field_type_code)?;
// Let's flush the current field.
let field_name = String::from_utf8_lossy(&term[..index]).to_string();
Some((field_name, field_type))
}
impl InvertedIndexFieldSpace {
fn record(&mut self, term_info: &TermInfo) {
self.postings_size += ByteCount::from(term_info.posting_num_bytes() as u64);
self.positions_size += ByteCount::from(term_info.positions_num_bytes() as u64);
self.num_terms += 1;
}
}
impl InvertedIndexReader {
pub(crate) fn new(
termdict: TermDictionary,
@@ -107,56 +81,20 @@ impl InvertedIndexReader {
///
/// Notice: This requires a full scan and is therefore **very expensive**.
/// TODO: Move to sstable to use the index.
pub(crate) fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
pub fn list_encoded_fields(&self) -> io::Result<Vec<(String, Type)>> {
let mut stream = self.termdict.stream()?;
let mut fields: Vec<InvertedIndexFieldSpace> = Vec::new();
let mut current_field_opt: Option<InvertedIndexFieldSpace> = None;
// Current field bytes, including the JSON_END_OF_PATH.
let mut current_field_bytes: Vec<u8> = Vec::new();
while let Some((term, term_info)) = stream.next() {
if let Some(current_field) = &mut current_field_opt {
if term.starts_with(&current_field_bytes) {
// We are still in the same field.
current_field.record(term_info);
continue;
let mut fields = Vec::new();
let mut fields_set = FnvHashSet::default();
while let Some((term, _term_info)) = stream.next() {
if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
if !fields_set.contains(&term[..index + 2]) {
fields_set.insert(term[..index + 2].to_vec());
let typ = Type::from_code(term[index + 1]).unwrap();
fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
}
}
// This is a new field!
// Let's flush the current field.
fields.extend(current_field_opt.take());
current_field_bytes.clear();
// And create a new one.
let Some((field_name, field_type)) =
extract_field_name_and_field_type_from_json_path(term)
else {
error!(
"invalid term bytes encountered {term:?}. this only happens if the term \
dictionary is corrupted. please report"
);
continue;
};
let mut field_space = InvertedIndexFieldSpace {
field_name,
field_type,
postings_size: ByteCount::default(),
positions_size: ByteCount::default(),
num_terms: 0u64,
};
field_space.record(term_info);
// We include the json type and the json end of path to make sure the prefix check
// is meaningful.
current_field_bytes.extend_from_slice(&term[..field_space.field_name.len() + 2]);
current_field_opt = Some(field_space);
}
// We need to flush the last field as well.
fields.extend(current_field_opt.take());
Ok(fields)
}
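Both versions above key their grouping on the same term layout: path bytes, then `JSON_END_OF_PATH`, then a one-byte type code, then the value. A sketch of that split (the separator value is a stand-in, not the real constant):

```rust
const JSON_END_OF_PATH: u8 = 0; // stand-in value for the real separator byte

/// Split a JSON term into (field path, type code); None if no separator.
fn split_json_term(term: &[u8]) -> Option<(String, u8)> {
    let index = term.iter().position(|&byte| byte == JSON_END_OF_PATH)?;
    let type_code = term.get(index + 1).copied()?;
    Some((String::from_utf8_lossy(&term[..index]).to_string(), type_code))
}

fn main() {
    // path, separator, type code 's', value bytes
    let term = b"k8s.container.name\0sjohn";
    assert_eq!(
        split_json_term(term),
        Some(("k8s.container.name".to_string(), b's'))
    );
}
```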

View File

@@ -1,8 +1,8 @@
use std::collections::HashMap;
use std::ops::BitOrAssign;
use std::sync::{Arc, RwLock};
use std::{fmt, io};
use common::{ByteCount, HasLen};
use fnv::FnvHashMap;
use itertools::Itertools;
@@ -304,16 +304,12 @@ impl SegmentReader {
for (field, field_entry) in self.schema().fields() {
let field_name = field_entry.name().to_string();
let is_indexed = field_entry.is_indexed();
if is_indexed {
let is_json = field_entry.field_type().value_type() == Type::Json;
if is_json {
let term_dictionary_json_field_num_bytes: u64 = self
.termdict_composite
.open_read(field)
.map(|file_slice| file_slice.len() as u64)
.unwrap_or(0u64);
let inv_index = self.inverted_index(field)?;
let encoded_fields_in_index = inv_index.list_encoded_json_fields()?;
let encoded_fields_in_index = inv_index.list_encoded_fields()?;
let mut build_path = |field_name: &str, mut json_path: String| {
// In this case we need to map the potential fast field to the field name
// accepted by the query parser.
@@ -332,65 +328,30 @@ impl SegmentReader {
format!("{field_name}.{json_path}")
}
};
let total_num_terms = encoded_fields_in_index
.iter()
.map(|field_space| field_space.num_terms)
.sum();
indexed_fields.extend(encoded_fields_in_index.into_iter().map(|field_space| {
let field_name = build_path(&field_name, field_space.field_name);
// It is complex to attribute the exact number of bytes required by a specific
// field inside the json field. Instead, as a proxy, we
// attribute the total number of bytes for the entire json field
// proportionally to the number of terms in each
// field.
let term_dictionary_size = (term_dictionary_json_field_num_bytes
* field_space.num_terms)
.checked_div(total_num_terms)
.unwrap_or(0);
FieldMetadata {
postings_size: Some(field_space.postings_size),
positions_size: Some(field_space.positions_size),
term_dictionary_size: Some(ByteCount::from(term_dictionary_size)),
fast_size: None,
// The stored flag will be set at the end of this function!
stored: field_entry.is_stored(),
field_name,
typ: field_space.field_type,
}
}));
indexed_fields.extend(
encoded_fields_in_index
.into_iter()
.map(|(name, typ)| (build_path(&field_name, name), typ))
.map(|(field_name, typ)| FieldMetadata {
indexed: true,
stored: false,
field_name,
fast: false,
typ,
}),
);
} else {
let postings_size: ByteCount = self
.postings_composite
.open_read(field)
.map(|posting_fileslice| posting_fileslice.len())
.unwrap_or(0)
.into();
let positions_size: ByteCount = self
.positions_composite
.open_read(field)
.map(|positions_fileslice| positions_fileslice.len())
.unwrap_or(0)
.into();
let term_dictionary_size: ByteCount = self
.termdict_composite
.open_read(field)
.map(|term_dictionary_fileslice| term_dictionary_fileslice.len())
.unwrap_or(0)
.into();
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: field_entry.field_type().value_type(),
// The stored flag will be set at the end of this function!
stored: field_entry.is_stored(),
fast_size: None,
term_dictionary_size: Some(term_dictionary_size),
postings_size: Some(postings_size),
positions_size: Some(positions_size),
});
}
}
}
let fast_fields: Vec<FieldMetadata> = self
let mut fast_fields: Vec<FieldMetadata> = self
.fast_fields()
.columnar()
.iter_columns()?
@@ -402,21 +363,23 @@ impl SegmentReader {
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
let stored = is_field_stored(&field_name, &self.schema);
FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
stored,
fast_size: Some(handle.num_bytes()),
term_dictionary_size: None,
postings_size: None,
positions_size: None,
}
})
.collect();
let merged_field_metadatas: Vec<FieldMetadata> =
merge_field_meta_data(vec![indexed_fields, fast_fields]);
Ok(merged_field_metadatas)
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
indexed_fields.sort_unstable();
fast_fields.sort_unstable();
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
Ok(merged)
}
/// Returns the segment id
@@ -480,47 +443,20 @@ pub struct FieldMetadata {
// Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
// field_name then typ.
pub typ: Type,
/// Is the field indexed for search
pub indexed: bool,
/// Is the field stored in the doc store
pub stored: bool,
/// Size occupied in the columnar storage (None if not fast)
pub fast_size: Option<ByteCount>,
/// term_dictionary
pub term_dictionary_size: Option<ByteCount>,
/// Size occupied in the index postings storage (None if not indexed)
pub postings_size: Option<ByteCount>,
/// Size occupied in the index positions storage (None if positions are not recorded)
pub positions_size: Option<ByteCount>,
/// Is the field stored in the columnar storage
pub fast: bool,
}
fn merge_options(left: Option<ByteCount>, right: Option<ByteCount>) -> Option<ByteCount> {
match (left, right) {
(Some(l), Some(r)) => Some(l + r),
(None, right) => right,
(left, None) => left,
}
}
impl FieldMetadata {
/// Returns true if and only if the field is indexed.
pub fn is_indexed(&self) -> bool {
self.postings_size.is_some()
}
/// Returns true if and only if the field is a fast field (i.e.: recorded in columnar format).
pub fn is_fast(&self) -> bool {
self.fast_size.is_some()
}
/// Merges two field metadata.
pub fn merge(&mut self, rhs: Self) {
assert_eq!(self.field_name, rhs.field_name);
assert_eq!(self.typ, rhs.typ);
impl BitOrAssign for FieldMetadata {
fn bitor_assign(&mut self, rhs: Self) {
assert!(self.field_name == rhs.field_name);
assert!(self.typ == rhs.typ);
self.indexed |= rhs.indexed;
self.stored |= rhs.stored;
self.fast_size = merge_options(self.fast_size, rhs.fast_size);
self.term_dictionary_size =
merge_options(self.term_dictionary_size, rhs.term_dictionary_size);
self.postings_size = merge_options(self.postings_size, rhs.postings_size);
self.positions_size = merge_options(self.positions_size, rhs.positions_size);
self.fast |= rhs.fast;
}
}
@@ -533,29 +469,23 @@ fn is_field_stored(field_name: &str, schema: &Schema) -> bool {
}
/// Helper to merge the field metadata from multiple segments.
pub fn merge_field_meta_data(mut field_metadatas: Vec<Vec<FieldMetadata>>) -> Vec<FieldMetadata> {
// READ BEFORE REMOVING THIS!
//
// Because we replace field sep by `.`, fields are not always sorted.
// Also, to enforce such an implicit contract, we would have to add
// an assert here.
//
// Sorting is linear time on pre-sorted data, so we are simply better off sorting data here.
for field_metadatas in &mut field_metadatas {
field_metadatas.sort_unstable();
}
pub fn merge_field_meta_data(
field_metadatas: Vec<Vec<FieldMetadata>>,
schema: &Schema,
) -> Vec<FieldMetadata> {
let mut merged_field_metadata = Vec::new();
for (_key, mut group) in &field_metadatas
.into_iter()
.kmerge()
.kmerge_by(|left, right| left < right)
// TODO: Remove allocation
.chunk_by(|el| (el.field_name.to_string(), el.typ))
{
let mut merged: FieldMetadata = group.next().unwrap();
for el in group {
merged.merge(el);
merged |= el;
}
// Currently, is_field_stored may be too slow for the high-cardinality case
merged.stored = is_field_stored(&merged.field_name, schema);
merged_field_metadata.push(merged);
}
merged_field_metadata
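The merge strategy is: k-way merge the per-segment sorted lists, then fold adjacent runs sharing a key into one entry. A reduced sketch of the same itertools pipeline over plain tuples:

```rust
use itertools::Itertools;

fn main() {
    // (field_name, size) pairs, one pre-sorted list per segment.
    let per_segment = vec![vec![("a", 10u64), ("b", 5)], vec![("a", 7), ("c", 3)]];
    let mut merged: Vec<(&str, u64)> = Vec::new();
    // kmerge lazily merges the sorted lists into one sorted stream;
    // chunk_by then groups adjacent entries sharing a field name.
    for (name, group) in &per_segment.into_iter().kmerge().chunk_by(|&(name, _)| name) {
        merged.push((name, group.map(|(_, size)| size).sum()));
    }
    assert_eq!(merged, vec![("a", 17), ("b", 5), ("c", 3)]);
}
```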
@@ -577,7 +507,7 @@ fn intersect_alive_bitset(
}
impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id)
}
}
@@ -586,168 +516,122 @@ impl fmt::Debug for SegmentReader {
mod test {
use super::*;
use crate::index::Index;
use crate::schema::{Term, STORED, TEXT};
use crate::schema::{SchemaBuilder, Term, STORED, TEXT};
use crate::IndexWriter;
#[track_caller]
fn assert_merge(fields_metadatas: &[Vec<FieldMetadata>], expected: &[FieldMetadata]) {
use itertools::Itertools;
let num_els = fields_metadatas.len();
for permutation in fields_metadatas.iter().cloned().permutations(num_els) {
let res = merge_field_meta_data(permutation);
assert_eq!(&res, &expected);
}
}
#[test]
fn test_merge_field_meta_data_same_field() {
fn test_merge_field_meta_data_same() {
let schema = SchemaBuilder::new().build();
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
term_dictionary_size: Some(ByteCount::from(100u64)),
postings_size: Some(ByteCount::from(1_000u64)),
positions_size: Some(ByteCount::from(2_000u64)),
fast_size: Some(ByteCount::from(1_000u64).into()),
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
term_dictionary_size: Some(ByteCount::from(80u64)),
postings_size: Some(ByteCount::from(1_500u64)),
positions_size: Some(ByteCount::from(2_500u64)),
fast_size: Some(ByteCount::from(3_000u64).into()),
fast: true,
};
let expected = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
stored: false,
term_dictionary_size: Some(ByteCount::from(180u64)),
postings_size: Some(ByteCount::from(2_500u64)),
positions_size: Some(ByteCount::from(4_500u64)),
fast_size: Some(ByteCount::from(4_000u64).into()),
};
assert_merge(
&[vec![field_metadata1.clone()], vec![field_metadata2]],
&[expected],
let res = merge_field_meta_data(
vec![vec![field_metadata1.clone()], vec![field_metadata2]],
&schema,
);
assert_eq!(res, vec![field_metadata1]);
}
#[track_caller]
#[test]
fn test_merge_field_meta_data_different() {
let schema = SchemaBuilder::new().build();
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast_size: Some(1_000u64.into()),
term_dictionary_size: Some(100u64.into()),
postings_size: Some(2_000u64.into()),
positions_size: Some(4_000u64.into()),
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "b".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast_size: Some(1_002u64.into()),
term_dictionary_size: None,
postings_size: None,
positions_size: None,
fast: true,
};
let field_metadata3 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
term_dictionary_size: Some(101u64.into()),
postings_size: Some(2_001u64.into()),
positions_size: Some(4_001u64.into()),
indexed: true,
stored: false,
fast_size: None,
fast: false,
};
let expected = vec![
FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
stored: false,
term_dictionary_size: Some(201u64.into()),
postings_size: Some(4_001u64.into()),
positions_size: Some(8_001u64.into()),
fast_size: Some(1_000u64.into()),
},
FieldMetadata {
field_name: "b".to_string(),
typ: crate::schema::Type::Str,
stored: false,
term_dictionary_size: None,
postings_size: None,
positions_size: None,
fast_size: Some(1_002u64.into()),
},
];
assert_merge(
&[
let res = merge_field_meta_data(
vec![
vec![field_metadata1.clone(), field_metadata2.clone()],
vec![field_metadata3],
],
&expected,
&schema,
);
let field_metadata_expected1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
assert_eq!(res, vec![field_metadata_expected1, field_metadata2.clone()]);
}
#[test]
fn test_merge_field_meta_data_merge() {
use pretty_assertions::assert_eq;
let get_meta_data = |name: &str, typ: Type| FieldMetadata {
field_name: name.to_string(),
typ,
term_dictionary_size: None,
postings_size: None,
positions_size: None,
indexed: false,
stored: false,
fast_size: Some(1u64.into()),
fast: true,
};
let metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
assert_merge(
&[vec![get_meta_data("e", Type::Str)], metas],
&[
let schema = SchemaBuilder::new().build();
let mut metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
metas.sort();
let res = merge_field_meta_data(vec![vec![get_meta_data("e", Type::Str)], metas], &schema);
assert_eq!(
res,
vec![
get_meta_data("d", Type::Str),
get_meta_data("e", Type::Str),
get_meta_data("e", Type::U64),
],
]
);
}
#[test]
fn test_merge_field_meta_data_bitxor() {
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
term_dictionary_size: None,
postings_size: None,
positions_size: None,
indexed: false,
stored: false,
fast_size: Some(10u64.into()),
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
term_dictionary_size: Some(10u64.into()),
postings_size: Some(11u64.into()),
positions_size: Some(12u64.into()),
indexed: true,
stored: false,
fast_size: None,
fast: false,
};
let field_metadata_expected = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
term_dictionary_size: Some(10u64.into()),
postings_size: Some(11u64.into()),
positions_size: Some(12u64.into()),
indexed: true,
stored: false,
fast_size: Some(10u64.into()),
fast: true,
};
let mut res1 = field_metadata1.clone();
res1.merge(field_metadata2.clone());
res1 |= field_metadata2.clone();
let mut res2 = field_metadata2.clone();
res2.merge(field_metadata1);
res2 |= field_metadata1;
assert_eq!(res1, field_metadata_expected);
assert_eq!(res2, field_metadata_expected);
}
@@ -778,7 +662,6 @@ mod test {
assert_eq!(4, searcher.segment_reader(0).max_doc());
Ok(())
}
#[test]
fn test_alive_docs_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -615,7 +615,7 @@ impl<D: Document> IndexWriter<D> {
/// It is also possible to add a payload to the `commit`
/// using this API.
/// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, D>> {
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.

View File

@@ -61,8 +61,6 @@ type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
#[cfg(test)]
mod tests_mmap {
use common::ByteCount;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::AggregationCollector;
@@ -282,14 +280,11 @@ mod tests_mmap {
field_name_out
};
let mut fields: Vec<(String, Type)> = reader.searcher().segment_readers()[0]
let mut fields = reader.searcher().segment_readers()[0]
.inverted_index(field)
.unwrap()
.list_encoded_json_fields()
.unwrap()
.into_iter()
.map(|field_space| (field_space.field_name, field_space.field_type))
.collect();
.list_encoded_fields()
.unwrap();
assert_eq!(fields.len(), 8);
fields.sort();
let mut expected_fields = vec![
@@ -390,12 +385,7 @@ mod tests_mmap {
let reader = &searcher.segment_readers()[0];
let inverted_index = reader.inverted_index(json_field).unwrap();
assert_eq!(
inverted_index
.list_encoded_json_fields()
.unwrap()
.into_iter()
.map(|field_space| (field_space.field_name, field_space.field_type))
.collect::<Vec<_>>(),
inverted_index.list_encoded_fields().unwrap(),
[
("k8s.container.name".to_string(), Type::Str),
("sub\u{1}a".to_string(), Type::I64),
@@ -412,41 +402,19 @@ mod tests_mmap {
fn test_json_fields_metadata_expanded_dots_one_segment() {
test_json_fields_metadata(true, true);
}
#[test]
fn test_json_fields_metadata_expanded_dots_multi_segment() {
test_json_fields_metadata(true, false);
}
#[test]
fn test_json_fields_metadata_no_expanded_dots_one_segment() {
test_json_fields_metadata(false, true);
}
#[test]
fn test_json_fields_metadata_no_expanded_dots_multi_segment() {
test_json_fields_metadata(false, false);
}
#[track_caller]
fn assert_size_eq(lhs: Option<ByteCount>, rhs: Option<ByteCount>) {
let ignore_actual_values = |size_opt: Option<ByteCount>| size_opt.map(|val| val > 0);
assert_eq!(ignore_actual_values(lhs), ignore_actual_values(rhs));
}
#[track_caller]
fn assert_field_metadata_eq_but_ignore_field_size(
expected: &FieldMetadata,
actual: &FieldMetadata,
) {
assert_eq!(&expected.field_name, &actual.field_name);
assert_eq!(&expected.typ, &actual.typ);
assert_eq!(&expected.stored, &actual.stored);
assert_size_eq(expected.postings_size, actual.postings_size);
assert_size_eq(expected.positions_size, actual.positions_size);
assert_size_eq(expected.fast_size, actual.fast_size);
}
fn test_json_fields_metadata(expanded_dots: bool, one_segment: bool) {
use pretty_assertions::assert_eq;
let mut schema_builder = Schema::builder();
@@ -485,101 +453,81 @@ mod tests_mmap {
assert_eq!(searcher.num_docs(), 3);
let fields_metadata = index.fields_metadata().unwrap();
let expected_fields = &[
FieldMetadata {
field_name: "empty".to_string(),
stored: true,
typ: Type::U64,
term_dictionary_size: Some(0u64.into()),
fast_size: Some(1u64.into()),
postings_size: Some(0u64.into()),
positions_size: Some(0u64.into()),
},
FieldMetadata {
field_name: if expanded_dots {
"json.shadow.k8s.container.name".to_string()
} else {
"json.shadow.k8s\\.container\\.name".to_string()
assert_eq!(
fields_metadata,
[
FieldMetadata {
field_name: "empty".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::U64
},
stored: true,
typ: Type::Str,
term_dictionary_size: Some(1u64.into()),
fast_size: Some(1u64.into()),
postings_size: Some(1u64.into()),
positions_size: Some(1u64.into()),
},
FieldMetadata {
field_name: "json.shadow.sub.a".to_string(),
typ: Type::I64,
stored: true,
fast_size: Some(1u64.into()),
term_dictionary_size: Some(1u64.into()),
postings_size: Some(1u64.into()),
positions_size: Some(1u64.into()),
},
FieldMetadata {
field_name: "json.shadow.sub.b".to_string(),
typ: Type::I64,
stored: true,
fast_size: Some(1u64.into()),
term_dictionary_size: Some(1u64.into()),
postings_size: Some(1u64.into()),
positions_size: Some(1u64.into()),
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
stored: true,
typ: Type::I64,
fast_size: Some(1u64.into()),
term_dictionary_size: Some(1u64.into()),
postings_size: Some(1u64.into()),
positions_size: Some(1u64.into()),
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
typ: Type::Str,
stored: true,
fast_size: Some(1u64.into()),
term_dictionary_size: Some(1u64.into()),
postings_size: Some(1u64.into()),
positions_size: Some(1u64.into()),
},
FieldMetadata {
field_name: "json.shadow.suber.b".to_string(),
typ: Type::I64,
stored: true,
fast_size: Some(1u64.into()),
term_dictionary_size: Some(1u64.into()),
postings_size: Some(1u64.into()),
positions_size: Some(1u64.into()),
},
FieldMetadata {
field_name: "json.shadow.val".to_string(),
typ: Type::Str,
stored: true,
fast_size: Some(1u64.into()),
term_dictionary_size: Some(1u64.into()),
postings_size: Some(1u64.into()),
positions_size: Some(1u64.into()),
},
FieldMetadata {
field_name: "numbers".to_string(),
stored: false,
typ: Type::U64,
fast_size: Some(1u64.into()),
term_dictionary_size: None,
postings_size: None,
positions_size: None,
},
];
assert_eq!(fields_metadata.len(), expected_fields.len());
for (expected, value) in expected_fields.iter().zip(fields_metadata.iter()) {
assert_field_metadata_eq_but_ignore_field_size(expected, value);
}
FieldMetadata {
field_name: if expanded_dots {
"json.shadow.k8s.container.name".to_string()
} else {
"json.shadow.k8s\\.container\\.name".to_string()
},
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "json.shadow.sub.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.sub.b".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "json.shadow.suber.b".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.val".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "numbers".to_string(),
indexed: false,
stored: false,
fast: true,
typ: Type::U64
}
]
);
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if returned field name can be queried
for indexed_field in fields_metadata.iter().filter(|meta| meta.is_indexed()) {
for indexed_field in fields_metadata.iter().filter(|meta| meta.indexed) {
let val = if indexed_field.typ == Type::Str {
"a"
} else {
@@ -595,10 +543,7 @@ mod tests_mmap {
}
}
// Test if returned field name can be used for aggregation
for fast_field in fields_metadata
.iter()
.filter(|field_metadata| field_metadata.is_fast())
{
for fast_field in fields_metadata.iter().filter(|meta| meta.fast) {
let agg_req_str = json!(
{
"termagg": {

View File

@@ -55,7 +55,7 @@
//! // between indexing threads.
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
//!
//! // Let's index a document!
//! // Let's index one document!
//! index_writer.add_document(doc!(
//! title => "The Old Man and the Sea",
//! body => "He was an old man who fished alone in a skiff in \
@@ -165,7 +165,7 @@ mod macros;
mod future_result;
// Re-exports
pub use common::{ByteCount, DateTime};
pub use common::DateTime;
pub use {columnar, query_grammar, time};
pub use crate::error::TantivyError;
@@ -370,8 +370,6 @@ macro_rules! fail_point {
/// Common test utilities.
#[cfg(test)]
pub mod tests {
use std::collections::BTreeMap;
use common::{BinarySerializable, FixedSize};
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
use rand::distributions::{Bernoulli, Uniform};
@@ -384,7 +382,7 @@ pub mod tests {
use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy;
use crate::postings::Postings;
use crate::query::{BooleanQuery, QueryParser};
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
@@ -1225,49 +1223,4 @@ pub mod tests {
);
assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
}
#[test]
fn test_json_number_ambiguity() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::I64(1i64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::U64(1u64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.num_docs(), 3);
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1.0").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
}
}

View File

@@ -40,9 +40,6 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
#[cfg(test)]
pub(crate) mod tests {
use std::iter;
use proptest::prelude::*;
use proptest::sample::select;

View File

@@ -227,6 +227,19 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array()
}
/// Returns a full block, regardless of whether the block is complete or incomplete
/// (as happens for the last block of the posting list).
///
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
@@ -262,36 +275,22 @@ impl BlockSegmentPostings {
///
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
pub fn seek(&mut self, target_doc: DocId) -> usize {
// Move to the block that might contain our document.
self.seek_block(target_doc);
pub fn seek(&mut self, target_doc: DocId) {
self.shallow_seek(target_doc);
self.load_block();
// At this point we are on the block that might contain our document.
let doc = self.doc_decoder.seek_within_block(target_doc);
// The last block is not full and padded with TERMINATED,
// so we are guaranteed to have at least one value (real or padding)
// that is >= target_doc.
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target_doc`.
// If all docs are smaller than target, the current block is incomplete and padded
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
doc
}
pub(crate) fn position_offset(&self) -> u64 {
self.skip_reader.position_offset()
}
/// Dangerous API! This call seeks the next block on the skip list,
/// Dangerous API! This calls seek on the skip list,
/// but does not `.load_block()` afterwards.
///
/// `.load_block()` needs to be called manually afterwards.
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
self.block_loaded = false;

View File

@@ -151,11 +151,9 @@ impl BlockDecoder {
&self.output[..self.output_len]
}
/// Return in-block index of first value >= `target`.
/// Uses the padded buffer to enable branchless search.
#[inline]
pub(crate) fn seek_within_block(&self, target: u32) -> usize {
crate::postings::branchless_binary_search(&self.output, target)
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
&self.output
}
#[inline]

View File

@@ -4,7 +4,7 @@ use crate::docset::DocSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{BlockSegmentPostings, Postings};
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings};
use crate::{DocId, TERMINATED};
/// `SegmentPostings` represents the inverted list or postings associated with
@@ -175,11 +175,26 @@ impl DocSet for SegmentPostings {
return self.doc();
}
// Delegate block-local search to BlockSegmentPostings::seek, which returns
// the in-block index of the first doc >= target.
self.cur = self.block_cursor.seek(target);
let doc = self.doc();
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);
// The last block is not full and padded with the value TERMINATED,
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
// that is greater or equal to the target.
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target`.
// If all docs are smaller than target, the current block is incomplete and padded
// with the value `TERMINATED`.
//
// After the search, the cursor points to the first TERMINATED value.
let doc = output[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
}
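
For reference, a minimal branchless lower-bound search over such a padded block could look like the following. This is a sketch of the technique that `branchless_binary_search` names, under the assumption of 128-element TERMINATED-padded blocks, not the crate's exact code:

```rust
const COMPRESSION_BLOCK_SIZE: usize = 128; // tantivy's block size

/// Index of the first element >= `target`. The block is sorted and padded
/// with `TERMINATED` (u32::MAX), so a hit is guaranteed for any target.
fn branchless_binary_search(block: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
    let mut base = 0usize;
    let mut len = COMPRESSION_BLOCK_SIZE;
    while len > 1 {
        let half = len / 2;
        // The compiler typically lowers this to a conditional move rather
        // than a branch, so there is nothing for the CPU to mispredict.
        if block[base + half - 1] < target {
            base += half;
        }
        len -= half;
    }
    base + usize::from(block[base] < target)
}
```
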

View File

@@ -75,7 +75,7 @@ impl InvertedIndexSerializer {
field: Field,
total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> {
) -> io::Result<FieldSerializer> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -126,7 +126,7 @@ impl<'a> FieldSerializer<'a> {
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
let average_fieldnorm = fieldnorm_reader
.as_ref()
.map(|ff_reader| total_num_tokens as Score / ff_reader.num_docs() as Score)
.map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
.unwrap_or(0.0);
let postings_serializer = PostingsSerializer::new(
postings_write,

View File

@@ -1,3 +1,5 @@
use serde::{Deserialize, Serialize};
use crate::fieldnorm::FieldNormReader;
use crate::query::Explanation;
use crate::schema::Field;
@@ -66,6 +68,12 @@ fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] {
cache
}
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct Bm25Params {
pub idf: Score,
pub avg_fieldnorm: Score,
}
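
For context, `idf` and `avg_fieldnorm` plug into the classic BM25 term score. A hedged, illustrative sketch with tantivy's usual constants (k1 = 1.2, b = 0.75); field names mirror `Bm25Params` above:

```rust
// Illustrative formula only, not the crate's cached implementation.
fn bm25_term_score(idf: f32, avg_fieldnorm: f32, term_freq: f32, fieldnorm: f32) -> f32 {
    const K1: f32 = 1.2;
    const B: f32 = 0.75;
    // Length normalization: longer-than-average fields are penalized.
    let norm = K1 * (1.0 - B + B * fieldnorm / avg_fieldnorm);
    idf * term_freq * (K1 + 1.0) / (term_freq + norm)
}
```
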
/// A struct used for computing BM25 scores.
#[derive(Clone)]
pub struct Bm25Weight {

View File

@@ -167,7 +167,7 @@ pub fn block_wand(
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.seek_block(pivot_doc);
scorer.shallow_seek(pivot_doc);
scorer.block_max_score()
})
.sum();
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
return;
}
doc = last_doc_in_block + 1;
scorer.seek_block(doc);
scorer.shallow_seek(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
}
}
doc += 1;
scorer.seek_block(doc);
scorer.shallow_seek(doc);
}
}
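
The `shallow_seek` rename above is the heart of block-max WAND: move only the skip list, read the block max score, and decode the block only when its upper bound can beat the current threshold. A hedged sketch of the pattern, using the method names visible in this diff:

```rust
fn try_block(scorer: &mut TermScorer, doc: DocId, threshold: Score) -> bool {
    // Cheap: advances the skip list without decompressing anything.
    scorer.shallow_seek(doc);
    if scorer.block_max_score() <= threshold {
        return false; // the whole block is pruned, undecoded
    }
    // Only now pay for the full seek, which loads the block.
    scorer.seek(doc);
    true
}
```
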

View File

@@ -1,15 +1,12 @@
use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use common::BitSet;
use super::{ConstScorer, EmptyScorer};
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::query::all_query::AllScorer;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::Type;
use crate::{DocId, Score, TantivyError};
@@ -116,49 +113,13 @@ impl Weight for ExistsWeight {
non_empty_columns.push(column)
}
}
if non_empty_columns.is_empty() {
return Ok(Box::new(EmptyScorer));
}
// If any column is full, all docs match.
let max_doc = reader.max_doc();
if non_empty_columns
.iter()
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
{
let all_scorer = AllScorer::new(max_doc);
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
}
// If we have a single dynamic column, use ExistsDocSet
// NOTE: A lower number may be better for very sparse columns
if non_empty_columns.len() < 4 {
// TODO: we can optimize more here since in most cases we will have only one index
if !non_empty_columns.is_empty() {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
return Ok(Box::new(ConstScorer::new(docset, boost)));
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
Ok(Box::new(EmptyScorer))
}
// If we have many dynamic columns, precompute a bitset of matching docs
let mut doc_bitset = BitSet::with_max_value(max_doc);
for column in &non_empty_columns {
match column.column_index() {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => {
// Handled by AllScorer return above.
}
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
ColumnIndex::Multivalued(multi_idx) => {
for doc in multi_idx.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
}
}
let docset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -333,43 +294,6 @@ mod tests {
Ok(())
}
#[test]
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
// Build docs where no single subpath exists for all docs, but the union does.
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
// only subpath `a`
index_writer.add_document(doc!(json => json!({"a": i})))?;
} else {
// only subpath `b`
index_writer.add_document(doc!(json => json!({"b": i})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// No single subpath is full
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
// Root exists with subpaths disabled is zero
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
// Root exists with subpaths enabled should match all docs via union
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -104,7 +104,7 @@ mod tests {
let query = query_parser.parse_query("a a a a a").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, false); 1], terms);
assert_eq!(vec![(&term_a, false); 5], terms);
}
{
let query = query_parser.parse_query("a -b").unwrap();

View File

@@ -1,11 +1,8 @@
use std::fmt;
use std::ops::Bound;
use std::sync::Arc;
use tantivy_fst::Regex;
use crate::query::Occur;
use crate::schema::{Field, Term};
use crate::schema::Term;
use crate::Score;
#[derive(Clone)]
@@ -24,10 +21,6 @@ pub enum LogicalLiteral {
elements: Vec<Term>,
},
All,
Regex {
pattern: Arc<Regex>,
field: Field,
},
}
pub enum LogicalAst {
@@ -45,7 +38,6 @@ impl LogicalAst {
}
}
// TODO: Move to rewrite_ast in query_grammar
pub fn simplify(self) -> LogicalAst {
match self {
LogicalAst::Clause(clauses) => {
@@ -155,10 +147,6 @@ impl fmt::Debug for LogicalLiteral {
write!(formatter, "]")
}
LogicalLiteral::All => write!(formatter, "*"),
LogicalLiteral::Regex {
ref pattern,
ref field,
} => write!(formatter, "Regex({field:?}, {pattern:?})"),
}
}
}

View File

@@ -2,14 +2,12 @@ use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};
use std::sync::Arc;
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use itertools::Itertools;
use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use rustc_hash::FxHashMap;
use tantivy_fst::Regex;
use super::logical_ast::*;
use crate::index::Index;
@@ -17,7 +15,7 @@ use crate::json_utils::convert_to_fast_value_and_append_to_json_term;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
PhraseQuery, Query, RegexQuery, TermQuery, TermSetQuery,
PhraseQuery, Query, TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -208,7 +206,6 @@ pub struct QueryParser {
tokenizer_manager: TokenizerManager,
boost: FxHashMap<Field, Score>,
fuzzy: FxHashMap<Field, Fuzzy>,
regexes_allowed: bool,
}
#[derive(Clone)]
@@ -263,7 +260,6 @@ impl QueryParser {
conjunction_by_default: false,
boost: Default::default(),
fuzzy: Default::default(),
regexes_allowed: false,
}
}
@@ -324,11 +320,6 @@ impl QueryParser {
);
}
/// Allow regexes in queries
pub fn allow_regexes(&mut self) {
self.regexes_allowed = true;
}
/// Parse a query
///
/// Note that `parse_query` returns an error if the input
@@ -495,17 +486,24 @@ impl QueryParser {
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(ref json_options) => {
let mut term = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
let get_term_with_path = || {
Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
)
};
if let Some(term) =
// Try to convert the phrase to a fast value
convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
convert_to_fast_value_and_append_to_json_term(
get_term_with_path(),
phrase,
false,
)
{
Ok(term)
} else {
let mut term = get_term_with_path();
term.append_type_and_str(phrase);
Ok(term)
}
@@ -672,7 +670,7 @@ impl QueryParser {
}
UserInputAst::Boost(ast, boost) => {
let (ast, errors) = self.compute_logical_ast_with_occur_lenient(*ast);
(ast.boost(boost.into_inner() as Score), errors)
(ast.boost(boost as Score), errors)
}
UserInputAst::Leaf(leaf) => {
let (ast, errors) = self.compute_logical_ast_from_leaf_lenient(*leaf);
@@ -862,51 +860,6 @@ impl QueryParser {
"Range query need to target a specific field.".to_string(),
)],
),
UserInputLeaf::Regex { field, pattern } => {
if !self.regexes_allowed {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex queries are not allowed.".to_string(),
)],
);
}
let full_path = try_tuple!(field.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Regex query need to target a specific field.".to_string(),
)
}));
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
if !json_path.is_empty() {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query does not support json paths.".to_string(),
)],
);
}
if !matches!(
self.schema.get_field_entry(field).field_type(),
FieldType::Str(_)
) {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query only supported on text fields".to_string(),
)],
);
}
let pattern = try_tuple!(Regex::new(&pattern).map_err(|e| {
QueryParserError::UnsupportedQuery(format!("Invalid regex: {e}"))
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Regex {
pattern: Arc::new(pattern),
field,
}));
(Some(logical_ast), Vec::new())
}
}
}
}
@@ -949,9 +902,6 @@ fn convert_literal_to_query(
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
LogicalLiteral::Regex { pattern, field } => {
Box::new(RegexQuery::from_regex(pattern, field))
}
}
}
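
With the parser-level regex literal removed, regex search should still be available by building the query programmatically. A sketch, assuming `RegexQuery::from_pattern` from tantivy's public API:

```rust
use tantivy::query::RegexQuery;
use tantivy::schema::Field;

// Hand-built equivalent of the removed `title:/.*b/` parser syntax.
fn title_regex(title: Field) -> tantivy::Result<RegexQuery> {
    RegexQuery::from_pattern(".*b", title)
}
```
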
@@ -1021,7 +971,7 @@ fn generate_literals_for_json_object(
// Try to convert the phrase to a fast value
if let Some(term) =
convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
{
logical_literals.push(LogicalLiteral::Term(term));
}
@@ -1150,15 +1100,11 @@ mod test {
query: &str,
default_conjunction: bool,
default_fields: &[&'static str],
allow_regexes: bool,
) -> Result<LogicalAst, QueryParserError> {
let mut query_parser = make_query_parser_with_default_fields(default_fields);
if default_conjunction {
query_parser.set_conjunction_by_default();
}
if allow_regexes {
query_parser.allow_regexes();
}
query_parser.parse_query_to_logical_ast(query)
}
@@ -1170,7 +1116,6 @@ mod test {
query,
default_conjunction,
&["title", "text"],
true,
)
}
@@ -1185,7 +1130,6 @@ mod test {
query,
default_conjunction,
default_fields,
true,
)
.unwrap();
let query_str = format!("{query:?}");
@@ -1846,15 +1790,6 @@ mod test {
}
}
#[test]
fn test_space_before_value() {
test_parse_query_to_logical_ast_helper(
"title: a",
r#"Term(field=0, type=Str, "a")"#,
false,
);
}
#[test]
fn test_escaped_field() {
let mut schema_builder = Schema::builder();
@@ -2049,66 +1984,4 @@ mod test {
Err(QueryParserError::ExpectedInt(_))
);
}
#[test]
pub fn test_deduplication() {
let query = "be be";
test_parse_query_to_logical_ast_helper(
query,
"(Term(field=0, type=Str, \"be\") Term(field=1, type=Str, \"be\"))",
false,
);
}
#[test]
pub fn test_regex() {
let expected_regex = tantivy_fst::Regex::new(r".*b").unwrap();
test_parse_query_to_logical_ast_helper(
"title:/.*b/",
format!("Regex(Field(0), {:#?})", expected_regex).as_str(),
false,
);
// Invalid field
let err = parse_query_to_logical_ast("float:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query only supported on text fields"
);
// No field specified
let err = parse_query_to_logical_ast("/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query need to target a specific field."
);
// Regex on a json path
let err = parse_query_to_logical_ast("title.subpath:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query does not support json paths."
);
// Invalid regex
let err = parse_query_to_logical_ast("title:/[A-Z*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Invalid regex: regex parse error:\n [A-Z*b\n ^\nerror: \
unclosed character class"
);
// Regexes not allowed
let err = parse_query_to_logical_ast_with_default_fields(
"title:/.*b/",
false,
&["title", "text"],
false,
)
.unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex queries are not allowed."
);
}
}

View File

@@ -12,14 +12,10 @@ pub use self::range_query_fastfield::*;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::Str
| Type::U64
| Type::I64
| Type::F64
| Type::Bool
| Type::Date
| Type::Json
| Type::IpAddr => true,
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::Json => {
true
}
Type::IpAddr => true,
Type::Facet | Type::Bytes => false,
}
}

View File

@@ -258,7 +258,7 @@ fn search_on_json_numerical_field(
let bounds = match typ.numerical_type().unwrap() {
NumericalType::I64 => {
let bounds = bounds.map_bound(|term| term.as_i64().unwrap());
let bounds = bounds.map_bound(|term| (term.as_i64().unwrap()));
match actual_column_type {
NumericalType::I64 => bounds.map_bound(|&term| term.to_u64()),
NumericalType::U64 => {
@@ -282,7 +282,7 @@ fn search_on_json_numerical_field(
}
}
NumericalType::U64 => {
let bounds = bounds.map_bound(|term| term.as_u64().unwrap());
let bounds = bounds.map_bound(|term| (term.as_u64().unwrap()));
match actual_column_type {
NumericalType::U64 => bounds.map_bound(|&term| term.to_u64()),
NumericalType::I64 => {
@@ -306,7 +306,7 @@ fn search_on_json_numerical_field(
}
}
NumericalType::F64 => {
let bounds = bounds.map_bound(|term| term.as_f64().unwrap());
let bounds = bounds.map_bound(|term| (term.as_f64().unwrap()));
match actual_column_type {
NumericalType::U64 => transform_from_f64_bounds::<u64>(&bounds),
NumericalType::I64 => transform_from_f64_bounds::<i64>(&bounds),
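
The `to_u64` calls above rely on an order-preserving mapping so that heterogeneous numeric bounds can be compared on a single u64 column. The usual trick, and to my understanding the one tantivy's `common` crate uses, flips the sign bit:

```rust
/// Order-preserving i64 -> u64: i64::MIN maps to 0, i64::MAX to u64::MAX.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

#[test]
fn i64_order_survives() {
    assert!(i64_to_u64(-1) < i64_to_u64(0));
    assert!(i64_to_u64(i64::MIN) < i64_to_u64(i64::MAX));
}
```
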

View File

@@ -11,7 +11,7 @@ mod tests {
use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, FAST, STRING, TEXT};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};
#[test]
@@ -212,232 +212,4 @@ mod tests {
}
Ok(())
}
#[test]
fn test_term_query_fallback_to_fastfield() -> crate::Result<()> {
use crate::collector::Count;
use crate::schema::FAST;
// Create a FAST-only numeric field (not indexed)
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// TermQuery should fall back to a fastfield range query and match correctly.
let tq_10 = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
let tq_20 = TermQuery::new(
Term::from_field_u64(num_field, 20u64),
IndexRecordOption::Basic,
);
let tq_30 = TermQuery::new(
Term::from_field_u64(num_field, 30u64),
IndexRecordOption::Basic,
);
let count_10 = searcher.search(&tq_10, &Count)?;
let count_20 = searcher.search(&tq_20, &Count)?;
let count_30 = searcher.search(&tq_30, &Count)?;
assert_eq!(count_10, 2);
assert_eq!(count_20, 1);
assert_eq!(count_30, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_text_fast_only() -> crate::Result<()> {
use crate::collector::Count;
// FAST-only text field (not indexed)
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.add_document(doc!(text_field => "world"))?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_hello = TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
);
let tq_world = TermQuery::new(
Term::from_field_text(text_field, "world"),
IndexRecordOption::Basic,
);
let tq_missing = TermQuery::new(
Term::from_field_text(text_field, "nope"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_hello, &Count)?, 2);
assert_eq!(searcher.search(&tq_world, &Count)?, 1);
assert_eq!(searcher.search(&tq_missing, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_json_fast_only() -> crate::Result<()> {
use crate::collector::Count;
use crate::fastfield::FastValue;
use crate::schema::FAST;
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "x"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 20, "b": "y"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "z"})))?;
index_writer.commit()?;
}
fn json_term_fast<T: FastValue>(field: Field, path: &str, v: T) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_fast_value(v);
term
}
fn json_term_str(field: Field, path: &str, v: &str) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_str(v);
term
}
let searcher = index.reader()?.searcher();
// numeric path match
let tq_a10 = TermQuery::new(
json_term_fast(json_field, "a", 10u64),
IndexRecordOption::Basic,
);
let tq_a20 = TermQuery::new(
json_term_fast(json_field, "a", 20u64),
IndexRecordOption::Basic,
);
let tq_a30 = TermQuery::new(
json_term_fast(json_field, "a", 30u64),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_a10, &Count)?, 2);
assert_eq!(searcher.search(&tq_a20, &Count)?, 1);
assert_eq!(searcher.search(&tq_a30, &Count)?, 0);
// string path match
let tq_bx = TermQuery::new(
json_term_str(json_field, "b", "x"),
IndexRecordOption::Basic,
);
let tq_by = TermQuery::new(
json_term_str(json_field, "b", "y"),
IndexRecordOption::Basic,
);
let tq_bm = TermQuery::new(
json_term_str(json_field, "b", "missing"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_bx, &Count)?, 1);
assert_eq!(searcher.search(&tq_by, &Count)?, 1);
assert_eq!(searcher.search(&tq_bm, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_ip_fast_only() -> crate::Result<()> {
use std::net::IpAddr;
use std::str::FromStr;
use crate::collector::Count;
use crate::schema::{IntoIpv6Addr, FAST};
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let ip1 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
let ip2 = IpAddr::from_str("127.0.0.2").unwrap().into_ipv6_addr();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.add_document(doc!(ip_field => ip2))?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_ip1 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip1),
IndexRecordOption::Basic,
);
let tq_ip2 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip2),
IndexRecordOption::Basic,
);
let ip3 = IpAddr::from_str("127.0.0.3").unwrap().into_ipv6_addr();
let tq_ip3 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip3),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_ip1, &Count)?, 2);
assert_eq!(searcher.search(&tq_ip2, &Count)?, 1);
assert_eq!(searcher.search(&tq_ip3, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_fastfield_with_scores_errors() -> crate::Result<()> {
use crate::collector::TopDocs;
// FAST-only numeric field (not indexed) should error when scoring is required
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
// Using TopDocs requires scoring; since the field is not indexed,
// TermQuery cannot score and should return a SchemaError.
let res = searcher.search(&tq, &TopDocs::with_limit(1));
assert!(matches!(res, Err(crate::TantivyError::SchemaError(_))));
Ok(())
}
}

View File

@@ -1,10 +1,8 @@
use std::fmt;
use std::ops::Bound;
use super::term_weight::TermWeight;
use crate::query::bm25::Bm25Weight;
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{EnableScoring, Explanation, Query, RangeQuery, Weight};
use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::schema::IndexRecordOption;
use crate::Term;
@@ -101,7 +99,7 @@ impl TermQuery {
EnableScoring::Enabled {
statistics_provider,
..
} => Bm25Weight::for_terms(statistics_provider, std::slice::from_ref(&self.term))?,
} => Bm25Weight::for_terms(statistics_provider, &[self.term.clone()])?,
EnableScoring::Disabled { .. } => {
Bm25Weight::new(Explanation::new("<no score>", 1.0f32), 1.0f32)
}
@@ -124,24 +122,6 @@ impl TermQuery {
impl Query for TermQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
// If the field is not indexed but is a suitable fast field, fall back to a range query
// on the fast field matching exactly this term.
//
// Note: This is considerably slower since it requires scanning the entire fast field.
// TODO: The range query would gain from having a single-value optimization
let schema = enable_scoring.schema();
let field_entry = schema.get_field_entry(self.term.field());
if !field_entry.is_indexed()
&& field_entry.is_fast()
&& is_type_valid_for_fastfield_range_query(self.term.typ())
&& !enable_scoring.is_scoring_enabled()
{
let range_query = RangeQuery::new(
Bound::Included(self.term.clone()),
Bound::Included(self.term.clone()),
);
return range_query.weight(enable_scoring);
}
Ok(Box::new(self.specialized_weight(enable_scoring)?))
}
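
The removed fallback boils down to an exact match expressed as a degenerate range: a `RangeQuery` whose two inclusive bounds are the same term. A sketch, using the `RangeQuery::new` signature visible in this diff:

```rust
use std::ops::Bound;

// One term, matched exactly, via a [term, term] range over the fast field.
fn exact_match_range(term: Term) -> RangeQuery {
    RangeQuery::new(Bound::Included(term.clone()), Bound::Included(term))
}
```
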
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {

View File

@@ -25,8 +25,8 @@ impl TermScorer {
}
}
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
self.postings.block_cursor.seek_block(target_doc);
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
self.postings.block_cursor.shallow_seek(target_doc);
}
#[cfg(test)]
@@ -175,7 +175,7 @@ mod tests {
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek_block(1289);
term_scorer.shallow_seek(1289);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1289);
assert_eq!(term_scorer.doc(), 1290);
@@ -242,9 +242,9 @@ mod tests {
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
docs.seek_block(135);
docs.shallow_seek(135);
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
docs.seek_block(256);
docs.shallow_seek(256);
// the block is not loaded yet.
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
assert_eq!(256, docs.seek(256));
@@ -275,7 +275,7 @@ mod tests {
{
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
for d in docs {
term_scorer.seek_block(d);
term_scorer.shallow_seek(d);
block_max_scores_b.push(term_scorer.block_max_score());
}
}

View File

@@ -5,10 +5,8 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::Scorer;
use crate::{DocId, Score};
// The buffered union looks ahead within a fixed-size sliding window
// of upcoming document IDs (the "horizon").
const HORIZON_NUM_TINYBITSETS: usize = HORIZON as usize / 64;
const HORIZON: u32 = 64u32 * 64u32;
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
// `drain_filter` is not stable yet.
// This function is similar, except that it is not unstable, and
@@ -29,26 +27,12 @@ where P: FnMut(&mut T) -> bool {
/// Creates a `DocSet` that iterates through the union of two or more `DocSet`s.
pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
/// Active scorers (those already at `TERMINATED` have been filtered out).
docsets: Vec<TScorer>,
/// Sliding window presence map for upcoming docs.
///
/// There are `HORIZON_NUM_TINYBITSETS` buckets, each covering
/// a span of 64 doc IDs. Bucket `i` represents the range
/// `[window_start_doc + i*64, window_start_doc + (i+1)*64)`.
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
// Index of the current TinySet bucket within the sliding window.
bucket_idx: usize,
/// Per-doc score combiners for the current window.
///
/// These accumulators merge contributions from all scorers that
/// hit the same doc within the buffered window.
scores: Box<[TScoreCombiner; HORIZON as usize]>,
/// Start doc ID (inclusive) of the current sliding window.
window_start_doc: DocId,
/// Current doc ID of the union.
cursor: usize,
offset: DocId,
doc: DocId,
/// Combined score for current `doc` as produced by `TScoreCombiner`.
score: Score,
}
@@ -90,8 +74,8 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
scores: Box::new([score_combiner_fn(); HORIZON as usize]),
bucket_idx: HORIZON_NUM_TINYBITSETS,
window_start_doc: 0,
cursor: HORIZON_NUM_TINYBITSETS,
offset: 0,
doc: 0,
score: 0.0,
};
@@ -105,10 +89,8 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
fn refill(&mut self) -> bool {
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
// Reset the sliding window to start at the smallest doc
// across all scorers and prebuffer within the horizon.
self.window_start_doc = min_doc;
self.bucket_idx = 0;
self.offset = min_doc;
self.cursor = 0;
self.doc = min_doc;
refill(
&mut self.docsets,
@@ -123,16 +105,16 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
}
fn advance_buffered(&mut self) -> bool {
while self.bucket_idx < HORIZON_NUM_TINYBITSETS {
if let Some(val) = self.bitsets[self.bucket_idx].pop_lowest() {
let delta = val + (self.bucket_idx as u32) * 64;
self.doc = self.window_start_doc + delta;
while self.cursor < HORIZON_NUM_TINYBITSETS {
if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
let delta = val + (self.cursor as u32) * 64;
self.doc = self.offset + delta;
let score_combiner = &mut self.scores[delta as usize];
self.score = score_combiner.score();
score_combiner.clear();
return true;
} else {
self.bucket_idx += 1;
self.cursor += 1;
}
}
false
@@ -162,19 +144,19 @@ where
if self.doc >= target {
return self.doc;
}
let gap = target - self.window_start_doc;
let gap = target - self.offset;
if gap < HORIZON {
// Our value is within the buffered horizon.
// Skipping to corresponding bucket.
let new_bucket_idx = gap as usize / 64;
for obsolete_tinyset in &mut self.bitsets[self.bucket_idx..new_bucket_idx] {
// Skipping to corresponding bucket.
let new_cursor = gap as usize / 64;
for obsolete_tinyset in &mut self.bitsets[self.cursor..new_cursor] {
obsolete_tinyset.clear();
}
for score_combiner in &mut self.scores[self.bucket_idx * 64..new_bucket_idx * 64] {
for score_combiner in &mut self.scores[self.cursor * 64..new_cursor * 64] {
score_combiner.clear();
}
self.bucket_idx = new_bucket_idx;
self.cursor = new_cursor;
// Advancing until we reach the end of the bucket
// or we reach a doc greater than or equal to the target.
@@ -229,7 +211,7 @@ where
if self.doc == TERMINATED {
return 0;
}
let mut count = self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS]
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
.iter()
.map(|bitset| bitset.len())
.sum::<u32>()
@@ -243,7 +225,7 @@ where
bitset.clear();
}
}
self.bucket_idx = HORIZON_NUM_TINYBITSETS;
self.cursor = HORIZON_NUM_TINYBITSETS;
count
}
}
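
To make the `offset`/`cursor` arithmetic above concrete: the window spans HORIZON = 64 × 64 = 4096 doc IDs, split into 64 `TinySet`s of 64 bits each. A sketch mirroring the names in this diff:

```rust
const HORIZON: u32 = 64 * 64;

/// Returns (bucket index, bit within the TinySet) for `doc`,
/// or None if the doc lies beyond the buffered horizon.
fn window_slot(offset: u32, doc: u32) -> Option<(usize, u32)> {
    let delta = doc.checked_sub(offset)?; // docs before the window never come back
    if delta >= HORIZON {
        return None; // outside the window: the union must refill
    }
    Some(((delta / 64) as usize, delta % 64))
}
```
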

View File

@@ -41,7 +41,6 @@
//! use tantivy::schema::document::{DeserializeError, DocumentDeserialize, DocumentDeserializer};
//!
//! /// Our custom document to let us use a map of `serde_json::Values`.
//! #[allow(dead_code)]
//! pub struct MyCustomDocument {
//! // Tantivy provides trait implementations for common `serde_json` types.
//! fields: BTreeMap<Field, serde_json::Value>
@@ -80,7 +79,6 @@
//! }
//!
//! /// Our custom iterator just helps us to avoid some messy generics.
//! #[allow(dead_code)]
//! pub struct MyCustomIter<'a>(btree_map::Iter<'a, Field, serde_json::Value>);
//! impl<'a> Iterator for MyCustomIter<'a> {
//! // Here we can see our field-value pairs being produced by the iterator.

View File

@@ -1561,7 +1561,6 @@ fn to_ascii(text: &str, output: &mut String) {
#[cfg(test)]
mod tests {
use super::to_ascii;
use crate::tokenizer::{AsciiFoldingFilter, RawTokenizer, SimpleTokenizer, TextAnalyzer};

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-sstable"
version = "0.6.0"
version = "0.5.0"
edition = "2024"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
@@ -10,10 +10,10 @@ categories = ["database-implementations", "data-structures", "compression"]
description = "sstables for tantivy"
[dependencies]
common = {version= "0.10", path="../common", package="tantivy-common"}
common = {version= "0.9", path="../common", package="tantivy-common"}
futures-util = "0.3.30"
itertools = "0.14.0"
tantivy-bitpacker = { version= "0.9", path="../bitpacker" }
tantivy-bitpacker = { version= "0.8", path="../bitpacker" }
tantivy-fst = "0.5"
# experimental gives us access to Decompressor::upper_bound
zstd = { version = "0.13", optional = true, features = ["experimental"] }

View File

@@ -608,12 +608,12 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// Returns a range builder to stream all of the terms
/// within an interval.
pub fn range(&self) -> StreamerBuilder<'_, TSSTable> {
pub fn range(&self) -> StreamerBuilder<TSSTable> {
StreamerBuilder::new(self, AlwaysMatch)
}
/// Returns a range builder filtered with a prefix.
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<'_, TSSTable> {
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<TSSTable> {
let lower_bound = prefix.as_ref();
let mut upper_bound = lower_bound.to_vec();
for idx in (0..upper_bound.len()).rev() {
@@ -632,7 +632,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
}
/// A stream of all the sorted terms.
pub fn stream(&self) -> io::Result<Streamer<'_, TSSTable>> {
pub fn stream(&self) -> io::Result<Streamer<TSSTable>> {
self.range().into_stream()
}
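
The loop truncated above computes the exclusive upper bound of a prefix range. The standard trick is to increment the last non-0xFF byte of the prefix and drop everything after it; a hedged sketch:

```rust
/// Smallest byte string strictly greater than every key starting with `prefix`.
/// Returns None when the prefix is all 0xFF, i.e. the range has no upper bound.
fn prefix_upper_bound(prefix: &[u8]) -> Option<Vec<u8>> {
    let mut upper = prefix.to_vec();
    for idx in (0..upper.len()).rev() {
        if upper[idx] != u8::MAX {
            upper[idx] += 1;
            upper.truncate(idx + 1);
            return Some(upper);
        }
    }
    None
}
```
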
@@ -696,10 +696,9 @@ mod tests {
fn read_bytes(&self, range: Range<usize>) -> std::io::Result<OwnedBytes> {
let allowed_range = self.allowed_range.lock().unwrap();
if !allowed_range.contains(&range.start) || !allowed_range.contains(&(range.end - 1)) {
return Err(std::io::Error::new(
std::io::ErrorKind::Other,
format!("invalid range, allowed {allowed_range:?}, requested {range:?}"),
));
return Err(std::io::Error::other(format!(
"invalid range, allowed {allowed_range:?}, requested {range:?}"
)));
}
Ok(self.bytes.slice(range))

View File

@@ -1,3 +1,5 @@
#![allow(clippy::manual_div_ceil)]
//! `tantivy_sstable` is a crate that provides a sorted string table data structure.
//!
//! It is used in `tantivy` to store the term dictionary.

View File

@@ -54,14 +54,14 @@ pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
}
}
for _ in 0..len - 1 {
if let Some(mut head) = heap.peek_mut()
&& head.0.key() == writer.last_inserted_key()
{
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
if let Some(mut head) = heap.peek_mut() {
if head.0.key() == writer.last_inserted_key() {
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
continue;
}
continue;
}
break;
}

View File

@@ -438,7 +438,7 @@ impl BlockAddrBlockMetadata {
let ordinal_addr = range_start_addr + self.range_start_nbits as usize;
let range_end_addr = range_start_addr + num_bits;
if (range_end_addr + self.range_start_nbits as usize).div_ceil(8) > data.len() {
if (range_end_addr + self.range_start_nbits as usize + 7) / 8 > data.len() {
return None;
}
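
Both spellings in this hunk compute the byte count needed for a bit count, i.e. ceil(bits / 8); `div_ceil` (stable for integers since Rust 1.73) just states it directly:

```rust
fn bytes_for_bits(bits: usize) -> usize {
    bits.div_ceil(8) // same value as (bits + 7) / 8
}

#[test]
fn thirteen_bits_need_two_bytes() {
    assert_eq!(bytes_for_bits(13), 2);
    assert_eq!((13usize + 7) / 8, 2);
}
```
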

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-stacker"
version = "0.6.0"
version = "0.5.0"
edition = "2024"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
@@ -9,7 +9,7 @@ description = "term hashmap used for indexing"
[dependencies]
murmurhash32 = "0.3"
common = { version = "0.10", path = "../common/", package = "tantivy-common" }
common = { version = "0.9", path = "../common/", package = "tantivy-common" }
ahash = { version = "0.8.11", default-features = false, optional = true }
rand_distr = "0.4.3"

View File

@@ -10,8 +10,7 @@ pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) {
#[track_caller]
fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
panic!(
"source slice length ({}) does not match destination slice length ({})",
src_len, dst_len,
"source slice length ({src_len}) does not match destination slice length ({dst_len})",
);
}

View File

@@ -1,5 +1,3 @@
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#[cfg(all(test, feature = "unstable"))]
extern crate test;

View File

@@ -274,12 +274,13 @@ impl SharedArenaHashMap {
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return None;
} else if kv.hash == hash
&& let Some(val_addr) =
} else if kv.hash == hash {
if let Some(val_addr) =
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
{
let v = memory_arena.read(val_addr);
return Some(v);
{
let v = memory_arena.read(val_addr);
return Some(v);
}
}
}
}
@@ -333,14 +334,15 @@ impl SharedArenaHashMap {
self.set_bucket(hash, key_addr, bucket);
return val;
}
if kv.hash == hash
&& let Some(val_addr) =
if kv.hash == hash {
if let Some(val_addr) =
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
{
let v = memory_arena.read(val_addr);
let new_v = updater(Some(v));
memory_arena.write_at(val_addr, new_v);
return new_v;
{
let v = memory_arena.read(val_addr);
let new_v = updater(Some(v));
memory_arena.write_at(val_addr, new_v);
return new_v;
}
}
// This allows fetching the next bucket before the loop jmp
bucket = probe.next_probe();

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-tokenizer-api"
version = "0.6.0"
version = "0.5.0"
license = "MIT"
edition = "2021"
description = "Tokenizer API of tantivy"