mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 00:20:42 +00:00
Compare commits
16 Commits
paul.masur
...
paul.masur
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c81f16838f | ||
|
|
63da5a21b2 | ||
|
|
54cd5bba98 | ||
|
|
d27ca164a9 | ||
|
|
2f5a48e8b1 | ||
|
|
ae0ab907fe | ||
|
|
7d62e084e7 | ||
|
|
322286ee16 | ||
|
|
73ad18fa1e | ||
|
|
89f0cef807 | ||
|
|
2e16243f9a | ||
|
|
e015abab8e | ||
|
|
73c711ec74 | ||
|
|
cb037c8079 | ||
|
|
ed3453606b | ||
|
|
e9641f99c5 |
4
.github/workflows/coverage.yml
vendored
4
.github/workflows/coverage.yml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
coverage:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
- name: Install Rust
|
||||
run: rustup toolchain install nightly-2025-12-01 --profile minimal --component llvm-tools-preview
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
@@ -21,7 +21,7 @@ jobs:
|
||||
- name: Generate code coverage
|
||||
run: cargo +nightly-2025-12-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
uses: codecov/codecov-action@v6
|
||||
continue-on-error: true
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
|
||||
|
||||
2
.github/workflows/long_running.yml
vendored
2
.github/workflows/long_running.yml
vendored
@@ -19,7 +19,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
|
||||
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@@ -20,7 +20,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Install nightly
|
||||
uses: actions-rs/toolchain@v1
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
name: test-${{ matrix.features.label}}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@v1
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
Tantivy 0.26.1
|
||||
================================
|
||||
|
||||
## Performance
|
||||
- Fix quadratic runtime in nested term and composite aggregations: memory accounting scanned all parent buckets on every collect instead of just the current parent (@PSeitz @fulmicoton)
|
||||
|
||||
Tantivy 0.26 (Unreleased)
|
||||
================================
|
||||
|
||||
|
||||
14
Cargo.toml
14
Cargo.toml
@@ -92,7 +92,7 @@ postcard = { version = "1.0.4", features = [
|
||||
], default-features = false }
|
||||
|
||||
[target.'cfg(not(windows))'.dev-dependencies]
|
||||
criterion = { version = "0.5", default-features = false }
|
||||
criterion = { version = "0.8", default-features = false }
|
||||
|
||||
[dev-dependencies.fail]
|
||||
version = "0.5.0"
|
||||
@@ -201,3 +201,15 @@ harness = false
|
||||
[[bench]]
|
||||
name = "regex_all_terms"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "query_parser_nested"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "intersection_bench"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "nested_terms_agg_bench"
|
||||
harness = false
|
||||
|
||||
@@ -63,6 +63,8 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
||||
register!(group, terms_all_unique_with_avg_sub_agg);
|
||||
register!(group, terms_many_with_avg_sub_agg);
|
||||
register!(group, terms_status_with_avg_sub_agg);
|
||||
register!(group, terms_status_with_terms_zipf_1000_sub_agg);
|
||||
register!(group, terms_zipf_1000_with_terms_status_sub_agg);
|
||||
register!(group, terms_status_with_histogram);
|
||||
register!(group, terms_zipf_1000);
|
||||
register!(group, terms_zipf_1000_with_histogram);
|
||||
@@ -270,6 +272,30 @@ fn terms_all_unique_with_avg_sub_agg(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_status_with_terms_zipf_1000_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_few_terms_status" },
|
||||
"aggs": {
|
||||
"nested_terms": { "terms": { "field": "text_1000_terms_zipf" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_zipf_1000_with_terms_status_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_1000_terms_zipf" },
|
||||
"aggs": {
|
||||
"nested_terms": { "terms": { "field": "text_few_terms_status" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_status_with_histogram(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
|
||||
149
benches/intersection_bench.rs
Normal file
149
benches/intersection_bench.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
// Benchmarks top-K intersection of term scorers (block_wand_intersection).
|
||||
//
|
||||
// What's measured:
|
||||
// - Conjunctive queries (+a +b, +a +b +c) with top-10 by score
|
||||
// - Varying doc-frequency balance between terms (balanced, skewed, very skewed)
|
||||
// - Realistic term frequencies (geometric distribution, mostly low)
|
||||
// - 1M-doc single segment
|
||||
//
|
||||
// Run with: cargo bench --bench intersection_bench
|
||||
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use rand::prelude::*;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Schema, TEXT};
|
||||
use tantivy::{doc, Index, ReloadPolicy, Searcher};
|
||||
|
||||
const NUM_DOCS: usize = 1_000_000;
|
||||
|
||||
struct BenchIndex {
|
||||
searcher: Searcher,
|
||||
query_parser: QueryParser,
|
||||
}
|
||||
|
||||
/// Generate term frequency from a geometric-like distribution.
|
||||
/// Most values are 1, a few are 2-3, rarely higher.
|
||||
/// p controls the decay: higher p → more weight on tf=1.
|
||||
fn random_term_freq(rng: &mut StdRng, p: f64) -> u32 {
|
||||
let mut tf = 1u32;
|
||||
while tf < 10 && rng.random_bool(1.0 - p) {
|
||||
tf += 1;
|
||||
}
|
||||
tf
|
||||
}
|
||||
|
||||
/// Build an index with three terms (a, b, c) with given doc-frequency probabilities.
|
||||
/// Each term occurrence has a realistic term frequency (geometric distribution).
|
||||
/// Field length is padded with filler tokens to create varied fieldnorms.
|
||||
fn build_index(p_a: f64, p_b: f64, p_c: f64) -> BenchIndex {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let body = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut rng = StdRng::from_seed([42u8; 32]);
|
||||
|
||||
{
|
||||
let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
|
||||
for _ in 0..NUM_DOCS {
|
||||
let mut tokens: Vec<String> = Vec::new();
|
||||
|
||||
if rng.random_bool(p_a) {
|
||||
let tf = random_term_freq(&mut rng, 0.7);
|
||||
for _ in 0..tf {
|
||||
tokens.push("aaa".to_string());
|
||||
}
|
||||
}
|
||||
if rng.random_bool(p_b) {
|
||||
let tf = random_term_freq(&mut rng, 0.7);
|
||||
for _ in 0..tf {
|
||||
tokens.push("bbb".to_string());
|
||||
}
|
||||
}
|
||||
if rng.random_bool(p_c) {
|
||||
let tf = random_term_freq(&mut rng, 0.7);
|
||||
for _ in 0..tf {
|
||||
tokens.push("ccc".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// Pad with filler to create varied field lengths (5-30 tokens).
|
||||
let filler_count = rng.random_range(5u32..30u32);
|
||||
for _ in 0..filler_count {
|
||||
tokens.push("filler".to_string());
|
||||
}
|
||||
|
||||
let text = tokens.join(" ");
|
||||
writer.add_document(doc!(body => text)).unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
}
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![body]);
|
||||
|
||||
BenchIndex {
|
||||
searcher,
|
||||
query_parser,
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Scenarios: (label, p_a, p_b, p_c)
|
||||
//
|
||||
// "balanced": all terms ~10% → intersection ~1% of docs
|
||||
// "skewed": one common (50%), one rare (2%) → intersection ~1%
|
||||
// "very_skewed": one very common (80%), one very rare (0.5%) → intersection ~0.4%
|
||||
// "three_balanced": three terms ~20% each → intersection ~0.8%
|
||||
// "three_skewed": 50% / 10% / 2% → intersection ~0.1%
|
||||
let scenarios: Vec<(&str, f64, f64, f64)> = vec![
|
||||
("balanced_10%_10%", 0.10, 0.10, 0.0),
|
||||
("skewed_50%_2%", 0.50, 0.02, 0.0),
|
||||
("very_skewed_80%_0.5%", 0.80, 0.005, 0.0),
|
||||
("three_balanced_20%_20%_20%", 0.20, 0.20, 0.20),
|
||||
("three_skewed_50%_10%_2%", 0.50, 0.10, 0.02),
|
||||
];
|
||||
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
for (label, p_a, p_b, p_c) in &scenarios {
|
||||
let bench_index = build_index(*p_a, *p_b, *p_c);
|
||||
|
||||
let mut group = runner.new_group();
|
||||
group.set_name(format!("intersection — {label}"));
|
||||
|
||||
// Two-term intersection
|
||||
if *p_a > 0.0 && *p_b > 0.0 {
|
||||
let query_str = "+aaa +bbb";
|
||||
let query = bench_index.query_parser.parse_query(query_str).unwrap();
|
||||
let searcher = bench_index.searcher.clone();
|
||||
group.register(format!("{query_str} top10"), move |_| {
|
||||
let collector = TopDocs::with_limit(10).order_by_score();
|
||||
black_box(searcher.search(&query, &collector).unwrap());
|
||||
1usize
|
||||
});
|
||||
}
|
||||
|
||||
// Three-term intersection
|
||||
if *p_c > 0.0 {
|
||||
let query_str = "+aaa +bbb +ccc";
|
||||
let query = bench_index.query_parser.parse_query(query_str).unwrap();
|
||||
let searcher = bench_index.searcher.clone();
|
||||
group.register(format!("{query_str} top10"), move |_| {
|
||||
let collector = TopDocs::with_limit(10).order_by_score();
|
||||
black_box(searcher.search(&query, &collector).unwrap());
|
||||
1usize
|
||||
});
|
||||
}
|
||||
|
||||
group.run();
|
||||
}
|
||||
}
|
||||
121
benches/nested_terms_agg_bench.rs
Normal file
121
benches/nested_terms_agg_bench.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use binggan::plugins::PeakMemAllocPlugin;
|
||||
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||
use rand::rngs::StdRng;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::SeedableRng;
|
||||
use rand_distr::Distribution;
|
||||
use serde_json::json;
|
||||
use tantivy::aggregation::agg_req::Aggregations;
|
||||
use tantivy::aggregation::AggregationCollector;
|
||||
use tantivy::query::AllQuery;
|
||||
use tantivy::schema::{Schema, FAST, STRING};
|
||||
use tantivy::{doc, Index};
|
||||
|
||||
#[global_allocator]
|
||||
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
||||
|
||||
const NUM_UNIQUE_IPS: u64 = 2_000_000;
|
||||
const NUM_ASNS: u64 = 22_600;
|
||||
const NUM_PATHS: u64 = 500_000;
|
||||
|
||||
const INDEX_DIR: &str = "bench-index";
|
||||
|
||||
fn main() {
|
||||
let index = get_or_build_index().unwrap();
|
||||
let inputs = vec![("nested_terms_cardinality", index)];
|
||||
let mut group = InputGroup::new_with_inputs(inputs);
|
||||
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
|
||||
|
||||
group.register("terms_ip_50_terms_asn_10_cardinality_path", |index| {
|
||||
let agg_req = json!({
|
||||
"by_ip": {
|
||||
"terms": { "field": "ip", "size": 50 },
|
||||
"aggs": {
|
||||
"by_asn": {
|
||||
"terms": { "field": "asn", "size": 10 },
|
||||
"aggs": {
|
||||
"path_cardinality": {
|
||||
"cardinality": { "field": "path" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
});
|
||||
|
||||
group.run();
|
||||
}
|
||||
|
||||
fn execute_agg(index: &Index, agg_req: serde_json::Value) {
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
|
||||
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
black_box(searcher.search(&AllQuery, &collector).unwrap());
|
||||
}
|
||||
|
||||
fn get_or_build_index() -> tantivy::Result<Index> {
|
||||
let index_path = Path::new(INDEX_DIR);
|
||||
if index_path.exists() {
|
||||
return Index::open_in_dir(index_path);
|
||||
}
|
||||
std::fs::create_dir_all(index_path)?;
|
||||
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ip_field = schema_builder.add_text_field("ip", STRING | FAST);
|
||||
let asn_field = schema_builder.add_text_field("asn", STRING | FAST);
|
||||
let path_field = schema_builder.add_text_field("path", STRING | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_dir(index_path, schema)?;
|
||||
|
||||
let mut rng = StdRng::from_seed([42u8; 32]);
|
||||
let zipf = rand_distr::Zipf::new(NUM_UNIQUE_IPS as f64, 1.1).unwrap();
|
||||
|
||||
// Pre-assign a stable asn to each ip.
|
||||
let mut ip_to_asn: HashMap<u64, u64> = HashMap::with_capacity(NUM_UNIQUE_IPS as usize);
|
||||
|
||||
// Generate all rows: long-tail ip distribution, each ip gets a deterministic asn,
|
||||
// and every ip in 0..NUM_UNIQUE_IPS appears at least once.
|
||||
let mut rows: Vec<(u64, u64)> = Vec::with_capacity(NUM_UNIQUE_IPS as usize * 2);
|
||||
|
||||
// Guarantee every ip appears at least once.
|
||||
for ip_val in 0..NUM_UNIQUE_IPS {
|
||||
let asn_val = ip_val % NUM_ASNS;
|
||||
ip_to_asn.insert(ip_val, asn_val);
|
||||
rows.push((ip_val, asn_val));
|
||||
}
|
||||
|
||||
// Add extra rows drawn from the Zipf distribution to create the long-tail effect.
|
||||
let extra_rows = NUM_UNIQUE_IPS;
|
||||
for _ in 0..extra_rows {
|
||||
let ip_val = (zipf.sample(&mut rng) as u64 - 1).min(NUM_UNIQUE_IPS - 1);
|
||||
let asn_val = *ip_to_asn
|
||||
.entry(ip_val)
|
||||
.or_insert_with(|| ip_val % NUM_ASNS);
|
||||
rows.push((ip_val, asn_val));
|
||||
}
|
||||
|
||||
// Shuffle so that document order is not sorted by ip.
|
||||
rows.shuffle(&mut rng);
|
||||
|
||||
let uniform_path = rand_distr::Uniform::new(0u64, NUM_PATHS).unwrap();
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
|
||||
for (ip_val, asn_val) in &rows {
|
||||
let path_val: u64 = uniform_path.sample(&mut rng);
|
||||
index_writer.add_document(doc!(
|
||||
ip_field => format!("ip_{ip_val}"),
|
||||
asn_field => format!("asn_{asn_val}"),
|
||||
path_field => format!("path_{path_val}"),
|
||||
))?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
35
benches/query_parser_nested.rs
Normal file
35
benches/query_parser_nested.rs
Normal file
@@ -0,0 +1,35 @@
|
||||
// Benchmark for the query grammar parsing deeply nested queries.
|
||||
//
|
||||
// Regression guard for https://github.com/quickwit-oss/tantivy/issues/2498:
|
||||
// at depth 20/21 the old parser took 0.87 s / 1.72 s respectively because
|
||||
// `ast()` retried `occur_leaf` on backtrack, giving O(2^n) time. With the
|
||||
// fix parsing is linear and completes in microseconds.
|
||||
//
|
||||
// Run with: `cargo bench --bench query_parser_nested`.
|
||||
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use tantivy::query_grammar::parse_query;
|
||||
|
||||
fn nested_query(depth: usize, leading_plus: bool) -> String {
|
||||
let leading = "(".repeat(depth);
|
||||
let trailing = ")".repeat(depth);
|
||||
let prefix = if leading_plus { "+" } else { "" };
|
||||
format!("{prefix}{leading}title:test{trailing}")
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
for depth in [20, 21] {
|
||||
for leading_plus in [false, true] {
|
||||
let query = nested_query(depth, leading_plus);
|
||||
let label = format!(
|
||||
"parse_nested_depth_{depth}_{}",
|
||||
if leading_plus { "plus" } else { "plain" },
|
||||
);
|
||||
runner.bench_function(&label, move |_| {
|
||||
black_box(parse_query(black_box(&query)).unwrap());
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1045,18 +1045,43 @@ fn operand_leaf(inp: &str) -> IResult<&str, (Option<BinaryOperand>, Option<Occur
|
||||
}
|
||||
|
||||
fn ast(inp: &str) -> IResult<&str, UserInputAst> {
|
||||
let boolean_expr = map_res(
|
||||
separated_pair(occur_leaf, multispace1, many1(operand_leaf)),
|
||||
|(left, right)| aggregate_binary_expressions(left, right),
|
||||
);
|
||||
let single_leaf = map(occur_leaf, |(occur, ast)| {
|
||||
if occur == Some(Occur::MustNot) {
|
||||
ast.unary(Occur::MustNot)
|
||||
} else {
|
||||
ast
|
||||
}
|
||||
});
|
||||
delimited(multispace0, alt((boolean_expr, single_leaf)), multispace0)(inp)
|
||||
// Parse `occur_leaf` once, then conditionally extend into a boolean
|
||||
// expression. The previous implementation used `alt((boolean_expr,
|
||||
// single_leaf))` which, when the input was a single leaf with no
|
||||
// following operand, would parse `occur_leaf` once for `boolean_expr`,
|
||||
// fail at `multispace1`, backtrack, then re-parse `occur_leaf` for
|
||||
// `single_leaf`. With recursively-nested groups like `(+(+(+a)))`, that
|
||||
// doubling at every level produced O(2^n) parse time. Parsing once and
|
||||
// peeking ahead for the operand keeps it O(n).
|
||||
delimited(
|
||||
multispace0,
|
||||
|inp| {
|
||||
let (rest, first) = occur_leaf(inp)?;
|
||||
// Only fall back on `Err::Error` (recoverable), mirroring
|
||||
// `alt`'s behaviour. `Err::Failure` and `Err::Incomplete`
|
||||
// must propagate so cut points and streaming needs are not
|
||||
// accidentally swallowed if they are ever introduced in the
|
||||
// operand parsers.
|
||||
match preceded(multispace1, many1(operand_leaf))(rest) {
|
||||
Ok((rest, more)) => {
|
||||
let combined = aggregate_binary_expressions(first, more)
|
||||
.map_err(|_| nom::Err::Error(Error::new(inp, ErrorKind::MapRes)))?;
|
||||
Ok((rest, combined))
|
||||
}
|
||||
Err(nom::Err::Error(_)) => {
|
||||
let (occur, ast) = first;
|
||||
let single = if occur == Some(Occur::MustNot) {
|
||||
ast.unary(Occur::MustNot)
|
||||
} else {
|
||||
ast
|
||||
};
|
||||
Ok((rest, single))
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
},
|
||||
multispace0,
|
||||
)(inp)
|
||||
}
|
||||
|
||||
fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
|
||||
@@ -1891,4 +1916,23 @@ mod test {
|
||||
r#"(+"field":'happy tax payer' +"other_field":1)"#,
|
||||
);
|
||||
}
|
||||
|
||||
// Regression test for https://github.com/quickwit-oss/tantivy/issues/2498:
|
||||
// deeply nested parenthesized queries used to take O(2^n) time because the
|
||||
// top-level `ast()` parser tried `boolean_expr` first and re-parsed the
|
||||
// inner `occur_leaf` when it backtracked to `single_leaf`. Depth 60 would
|
||||
// take ~10^18 operations under the regression; with the fix it parses
|
||||
// instantly. We use `test_parse_query_to_ast_helper` so this test would
|
||||
// never finish if the regression returned.
|
||||
#[test]
|
||||
fn test_parse_deeply_nested_query() {
|
||||
let depth = 60;
|
||||
let leading: String = "(".repeat(depth);
|
||||
let trailing: String = ")".repeat(depth);
|
||||
let query = format!("{leading}title:test{trailing}");
|
||||
test_parse_query_to_ast_helper(&query, r#""title":test"#);
|
||||
|
||||
let query_with_plus = format!("+{leading}title:test{trailing}");
|
||||
test_parse_query_to_ast_helper(&query_with_plus, r#""title":test"#);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -985,8 +985,12 @@ fn build_terms_or_cardinality_nodes(
|
||||
let str_col = str_dict_column
|
||||
.as_ref()
|
||||
.expect("str_dict_column must exist for string column");
|
||||
allowed_term_ids =
|
||||
build_allowed_term_ids_for_str(str_col, &req.include, &req.exclude)?;
|
||||
allowed_term_ids = build_allowed_term_ids_for_str(
|
||||
str_col,
|
||||
&req.include,
|
||||
&req.exclude,
|
||||
missing.is_some(),
|
||||
)?;
|
||||
};
|
||||
let idx_in_req_data = data.push_term_req_data(TermsAggReqData {
|
||||
accessor,
|
||||
@@ -1025,16 +1029,21 @@ fn build_terms_or_cardinality_nodes(
|
||||
|
||||
/// Builds a single BitSet of allowed term ordinals for a string dictionary column according to
|
||||
/// include/exclude parameters.
|
||||
///
|
||||
/// When `reserve_missing_sentinel` is true, the bitset will have 1 additional slot for the missing
|
||||
/// term ordinal
|
||||
fn build_allowed_term_ids_for_str(
|
||||
str_col: &StrColumn,
|
||||
include: &Option<IncludeExcludeParam>,
|
||||
exclude: &Option<IncludeExcludeParam>,
|
||||
reserve_missing_sentinel: bool,
|
||||
) -> crate::Result<Option<BitSet>> {
|
||||
let mut allowed: Option<BitSet> = None;
|
||||
let num_terms = str_col.dictionary().num_terms() as u32;
|
||||
let missing_sentinel_adjustment = if reserve_missing_sentinel { 1 } else { 0 };
|
||||
let allowed_capacity = str_col.dictionary().num_terms() as u32 + missing_sentinel_adjustment;
|
||||
if let Some(include) = include {
|
||||
// add matches
|
||||
allowed = Some(BitSet::with_max_value(num_terms));
|
||||
allowed = Some(BitSet::with_max_value(allowed_capacity));
|
||||
let allowed = allowed.as_mut().unwrap();
|
||||
for_each_matching_term_ord(str_col, include, |ord| allowed.insert(ord))?;
|
||||
};
|
||||
@@ -1042,7 +1051,7 @@ fn build_allowed_term_ids_for_str(
|
||||
if let Some(exclude) = exclude {
|
||||
if allowed.is_none() {
|
||||
// Start with all terms allowed
|
||||
allowed = Some(BitSet::with_max_value_and_full(num_terms));
|
||||
allowed = Some(BitSet::with_max_value_and_full(allowed_capacity));
|
||||
}
|
||||
let allowed = allowed.as_mut().unwrap();
|
||||
for_each_matching_term_ord(str_col, exclude, |ord| allowed.remove(ord))?;
|
||||
|
||||
@@ -152,7 +152,7 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let mem_pre = self.get_memory_consumption();
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let composite_agg_data = agg_data.take_composite_req_data(self.accessor_idx);
|
||||
|
||||
for doc in docs {
|
||||
@@ -172,7 +172,7 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
|
||||
let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||
let mem_delta = self.get_memory_consumption(parent_bucket_id) - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
agg_data.context.limits.add_memory_consumed(mem_delta)?;
|
||||
}
|
||||
@@ -202,11 +202,8 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
}
|
||||
|
||||
impl SegmentCompositeCollector {
|
||||
fn get_memory_consumption(&self) -> u64 {
|
||||
self.parent_buckets
|
||||
.iter()
|
||||
.map(|m| m.memory_consumption())
|
||||
.sum()
|
||||
fn get_memory_consumption(&self, parent_bucket_id: BucketId) -> u64 {
|
||||
self.parent_buckets[parent_bucket_id as usize].memory_consumption()
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
|
||||
@@ -283,6 +283,11 @@ impl SegmentHistogramBucketEntry {
|
||||
struct HistogramBuckets {
|
||||
pub buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
|
||||
}
|
||||
impl HistogramBuckets {
|
||||
fn memory_consumption(&self) -> u64 {
|
||||
self.buckets.capacity() as u64 * std::mem::size_of::<SegmentHistogramBucketEntry>() as u64
|
||||
}
|
||||
}
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
@@ -324,7 +329,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let req = agg_data.take_histogram_req_data(self.accessor_idx);
|
||||
let mem_pre = self.get_memory_consumption();
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize].buckets;
|
||||
|
||||
let bounds = req.bounds;
|
||||
@@ -358,12 +363,9 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
}
|
||||
agg_data.put_back_histogram_req_data(self.accessor_idx, req);
|
||||
|
||||
let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||
let mem_delta = self.get_memory_consumption(parent_bucket_id) - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(mem_delta as u64)?;
|
||||
agg_data.context.limits.add_memory_consumed(mem_delta)?;
|
||||
}
|
||||
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
@@ -395,11 +397,10 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
}
|
||||
|
||||
impl SegmentHistogramCollector {
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
let self_mem = std::mem::size_of::<Self>();
|
||||
let buckets_mem = self.parent_buckets.len() * std::mem::size_of::<HistogramBuckets>();
|
||||
self_mem + buckets_mem
|
||||
fn get_memory_consumption(&self, parent_bucket_id: BucketId) -> u64 {
|
||||
self.parent_buckets[parent_bucket_id as usize].memory_consumption()
|
||||
}
|
||||
|
||||
/// Converts the collector result into a intermediate bucket result.
|
||||
fn add_intermediate_bucket_result(
|
||||
&mut self,
|
||||
|
||||
@@ -809,7 +809,7 @@ impl<TermMap: TermAggregationMap, B: SubAggBuffer> SegmentAggregationCollector
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
// let mem_pre = self.get_memory_consumption();
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
|
||||
let req_data = &mut self.terms_req_data;
|
||||
|
||||
@@ -853,16 +853,13 @@ impl<TermMap: TermAggregationMap, B: SubAggBuffer> SegmentAggregationCollector
|
||||
}
|
||||
}
|
||||
|
||||
// let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||
// if mem_delta > 0 {
|
||||
// agg_data
|
||||
// .context
|
||||
// .limits
|
||||
// .add_memory_consumed(mem_delta as u64)?;
|
||||
// }
|
||||
|
||||
// After commenting out -> 6000ms -> 36ms
|
||||
|
||||
let mem_delta = self.get_memory_consumption(parent_bucket_id) - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(mem_delta as u64)?;
|
||||
}
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
@@ -949,11 +946,9 @@ where
|
||||
TermMap: TermAggregationMap,
|
||||
B: SubAggBuffer,
|
||||
{
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
self.parent_buckets
|
||||
.iter()
|
||||
.map(|b| b.get_memory_consumption())
|
||||
.sum()
|
||||
#[inline]
|
||||
fn get_memory_consumption(&self, parent_bucket_id: BucketId) -> usize {
|
||||
self.parent_buckets[parent_bucket_id as usize].get_memory_consumption()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -1228,7 +1223,6 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
|
||||
mod tests {
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
use std::time::Instant;
|
||||
|
||||
use common::DateTime;
|
||||
use time::{Date, Month};
|
||||
@@ -1242,10 +1236,9 @@ mod tests {
|
||||
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
||||
};
|
||||
use crate::aggregation::{AggregationLimitsGuard, DistributedAggregationCollector};
|
||||
use crate::collector::{Collector, default_collect_segment_impl};
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::{AllQuery, EnableScoring, Query};
|
||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, INDEXED, STRING, TEXT};
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
#[test]
|
||||
@@ -2940,102 +2933,100 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_terms_double_nesting() {
|
||||
fn prep_index_with_n_unique_terms_plus_one_null(n: u64) -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let outer_field = schema_builder.add_text_field("outer_term", STRING | FAST);
|
||||
let inner_field = schema_builder.add_text_field("inner_term", STRING | FAST);
|
||||
let id_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let title_field = schema_builder.add_text_field("title", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
// set to one thread to guarantee all docs end up in the same segment
|
||||
let mut writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
|
||||
let outer_values = (0..10_000)
|
||||
.map(|i| format!("outer_{i}"))
|
||||
.collect::<Vec<_>>();
|
||||
let inner_values = ["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 200_000_000).unwrap();
|
||||
for doc_id in 0..1_000_000u64 {
|
||||
let outer_val = &outer_values[doc_id as usize % outer_values.len()];
|
||||
let inner_val = inner_values[doc_id as usize % inner_values.len()];
|
||||
index_writer.add_document(doc!(
|
||||
outer_field => outer_val.as_str(),
|
||||
inner_field => inner_val,
|
||||
)).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
writer.add_document(doc!(
|
||||
id_field => 0u64,
|
||||
))?;
|
||||
for i in 1u64..=n {
|
||||
let title = format!("foo{i}");
|
||||
writer.add_document(doc!(
|
||||
id_field => i,
|
||||
title_field => title,
|
||||
))?;
|
||||
}
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"outer": {
|
||||
"terms": { "field": "outer_term", "size": 10 },
|
||||
"aggs": {
|
||||
"inner": {
|
||||
"terms": { "field": "inner_term" }
|
||||
|
||||
writer.commit()?;
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn null_bitset_bounds_check_regression() -> crate::Result<()> {
|
||||
// include cases
|
||||
for i in 0..=4 {
|
||||
let index = prep_index_with_n_unique_terms_plus_one_null(i * 64)?;
|
||||
let normal_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_bool": {
|
||||
"terms": {
|
||||
"field": "title",
|
||||
"missing": "__NULL__",
|
||||
"size": 1000,
|
||||
}
|
||||
}
|
||||
}))?;
|
||||
let include_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_bool": {
|
||||
"terms": {
|
||||
"field": "title",
|
||||
"include": "foo(.*)",
|
||||
"missing": "__NULL__",
|
||||
"size": 1000,
|
||||
}
|
||||
}
|
||||
}))?;
|
||||
let exclude_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_bool": {
|
||||
"terms": {
|
||||
"field": "title",
|
||||
"exclude": "foo(.*)",
|
||||
"missing": "__NULL__",
|
||||
"size": 1000,
|
||||
}
|
||||
}
|
||||
}))?;
|
||||
|
||||
let normal_res = exec_request(normal_req, &index)?;
|
||||
let normal_buckets = normal_res["my_bool"]["buckets"].as_array().unwrap();
|
||||
assert_eq!(
|
||||
normal_buckets.len(),
|
||||
(i * 64) as usize + 1,
|
||||
"The normal request should return all 'foo' buckets, plus the missing term bucket",
|
||||
);
|
||||
|
||||
let include_res = exec_request(include_req, &index)?;
|
||||
eprintln!("include_res: {include_res:?}");
|
||||
let include_buckets = include_res["my_bool"]["buckets"].as_array().unwrap();
|
||||
assert_eq!(
|
||||
include_buckets.len(),
|
||||
(i * 64) as usize,
|
||||
"The include request should return all 'foo' buckets, and not the missing term \
|
||||
bucket",
|
||||
);
|
||||
assert!(include_buckets
|
||||
.iter()
|
||||
.all(|b| b["key"].as_str().unwrap().starts_with("foo")));
|
||||
|
||||
let exclude_res = exec_request(exclude_req, &index)?;
|
||||
let exclude_buckets = exclude_res["my_bool"]["buckets"].as_array().unwrap();
|
||||
if i != 0 {
|
||||
// TODO: Remove this if after fixing exclude + missing bug
|
||||
assert_eq!(
|
||||
exclude_buckets.len(),
|
||||
1,
|
||||
"The exclude request should exclude all 'foo' buckets, and only the missing \
|
||||
term bucket",
|
||||
);
|
||||
assert_eq!(exclude_buckets[0]["key"], "__NULL__");
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let collector =
|
||||
crate::aggregation::AggregationCollector::from_aggs(agg_req, Default::default());
|
||||
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let all_weight = AllQuery.weight(EnableScoring::disabled_from_schema(&schema)).unwrap();
|
||||
let mut segment_collector = collector.for_segment(0u32, segment_reader).unwrap();
|
||||
let start = Instant::now();
|
||||
default_collect_segment_impl(&mut segment_collector, &*all_weight, segment_reader, false).unwrap();
|
||||
dbg!(start.elapsed());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_terms_simple_nesting() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let outer_field = schema_builder.add_text_field("outer_term", STRING | FAST);
|
||||
let inner_field = schema_builder.add_text_field("inner_term", STRING | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let outer_values = (0..10_000)
|
||||
.map(|i| format!("outer_{i}"))
|
||||
.collect::<Vec<_>>();
|
||||
let inner_values = ["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 200_000_000).unwrap();
|
||||
for doc_id in 0..1_000_000u64 {
|
||||
let outer_val = &outer_values[doc_id as usize % outer_values.len()];
|
||||
let inner_val = inner_values[doc_id as usize % inner_values.len()];
|
||||
index_writer.add_document(doc!(
|
||||
outer_field => outer_val.as_str(),
|
||||
inner_field => inner_val,
|
||||
)).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"outer": {
|
||||
"terms": { "field": "outer_term", "size": 10 },
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let collector =
|
||||
crate::aggregation::AggregationCollector::from_aggs(agg_req, Default::default());
|
||||
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let all_weight = AllQuery.weight(EnableScoring::disabled_from_schema(&schema)).unwrap();
|
||||
let mut segment_collector = collector.for_segment(0u32, segment_reader).unwrap();
|
||||
let start = Instant::now();
|
||||
default_collect_segment_impl(&mut segment_collector, &*all_weight, segment_reader, false).unwrap();
|
||||
dbg!(start.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,6 +389,13 @@ impl SegmentCollector for FacetSegmentCollector {
|
||||
}
|
||||
let mut facet = vec![];
|
||||
let (facet_ord, facet_depth) = self.unique_facet_ords[collapsed_facet_ord];
|
||||
// u64::MAX is used as a sentinel for unmapped ordinals (e.g. when a
|
||||
// document has the exact registered facet, not a child of it).
|
||||
// Passing it to ord_to_term would resolve to the last dictionary
|
||||
// entry and produce a spurious facet from an unrelated branch.
|
||||
if facet_ord == u64::MAX {
|
||||
continue;
|
||||
}
|
||||
// TODO handle errors.
|
||||
if facet_dict.ord_to_term(facet_ord, &mut facet).is_ok() {
|
||||
if let Some((end_collapsed_facet, _)) = facet
|
||||
@@ -814,6 +821,63 @@ mod tests {
|
||||
assert!(!super::is_child_facet(&b"foo\0bar"[..], &b"foo"[..]));
|
||||
assert!(!super::is_child_facet(&b"foo"[..], &b"foobar\0baz"[..]));
|
||||
}
|
||||
|
||||
// Regression test for https://github.com/quickwit-oss/tantivy/issues/2494
|
||||
// When a document has the exact registered facet path (not just a child),
|
||||
// harvest() must not turn the unmapped sentinel into a spurious root entry.
|
||||
#[test]
|
||||
fn test_facet_collector_wrong_root() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
let facets: Vec<&str> = vec![
|
||||
"/science-fiction/asimov",
|
||||
"/science-fiction/clarke",
|
||||
"/science-fiction/dick",
|
||||
"/science-fiction/herbert",
|
||||
"/science-fiction/orwell",
|
||||
// This exact match on the registered facet is the bug trigger:
|
||||
// its ordinal maps to the sentinel (u64::MAX, 0) in the collapse
|
||||
// mapping, which without the fix resolves to an unrelated term.
|
||||
"/fantasy/epic-fantasy",
|
||||
"/fantasy/epic-fantasy/tolkien",
|
||||
"/fantasy/epic-fantasy/martin",
|
||||
];
|
||||
for facet_str in &facets {
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from(*facet_str)
|
||||
))?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let term = Term::from_facet(facet_field, &Facet::from("/fantasy/epic-fantasy"));
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field("facet");
|
||||
facet_collector.add_facet("/fantasy/epic-fantasy");
|
||||
let counts: FacetCounts = searcher.search(&query, &facet_collector)?;
|
||||
|
||||
let result: Vec<(String, u64)> = counts
|
||||
.get("/")
|
||||
.map(|(facet, count)| (facet.to_string(), count))
|
||||
.collect();
|
||||
|
||||
// Only children of /fantasy/epic-fantasy should appear, not /science-fiction
|
||||
assert_eq!(
|
||||
result,
|
||||
vec![
|
||||
("/fantasy/epic-fantasy/martin".to_string(), 1),
|
||||
("/fantasy/epic-fantasy/tolkien".to_string(), 1),
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -249,6 +249,12 @@ impl BlockSegmentPostings {
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// Returns the decoded term-frequency buffer for the current block.
|
||||
#[inline]
|
||||
pub(crate) fn freq_output_array(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
@@ -298,6 +304,11 @@ impl BlockSegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn has_remaining_docs(&self) -> bool {
|
||||
self.skip_reader.has_remaining_docs()
|
||||
}
|
||||
|
||||
pub(crate) fn block_is_loaded(&self) -> bool {
|
||||
self.block_loaded
|
||||
}
|
||||
|
||||
@@ -146,6 +146,11 @@ impl SkipReader {
|
||||
skip_reader
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn has_remaining_docs(&self) -> bool {
|
||||
self.remaining_docs != 0
|
||||
}
|
||||
|
||||
pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) {
|
||||
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
|
||||
0
|
||||
|
||||
464
src/query/boolean_query/block_wand_intersection.rs
Normal file
464
src/query/boolean_query/block_wand_intersection.rs
Normal file
@@ -0,0 +1,464 @@
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::Scorer;
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
|
||||
/// Block-max pruning for top-K over intersection of term scorers.
|
||||
///
|
||||
/// Uses the least-frequent term as "leader" to define 128-doc processing windows.
|
||||
/// For each window, the sum of block_max_scores is compared to the current threshold;
|
||||
/// if the block can't beat it, the entire block is skipped.
|
||||
///
|
||||
/// Within non-skipped blocks, individual documents are pruned by checking whether
|
||||
/// leader_score + sum(secondary block_max_scores) can exceed the threshold before
|
||||
/// performing the expensive intersection membership check (seeking into secondary scorers).
|
||||
///
|
||||
/// # Preconditions
|
||||
/// - `scorers` has at least 2 elements
|
||||
/// - All scorers read frequencies (`FreqReadingOption::ReadFreq`)
|
||||
pub(crate) fn block_wand_intersection(
|
||||
mut scorers: Vec<TermScorer>,
|
||||
mut threshold: Score,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) {
|
||||
assert!(scorers.len() >= 2);
|
||||
|
||||
// Sort by cost (ascending). scorers[0] becomes the "leader" (rarest term).
|
||||
scorers.sort_by_key(TermScorer::size_hint);
|
||||
|
||||
let (leader, secondaries) = scorers.split_first_mut().unwrap();
|
||||
|
||||
// Precompute global max scores for early termination checks.
|
||||
let leader_max_score: Score = leader.max_score();
|
||||
let secondaries_global_max_sum: Score = secondaries.iter().map(TermScorer::max_score).sum();
|
||||
|
||||
// Early exit: no document can possibly beat the threshold.
|
||||
if leader_max_score + secondaries_global_max_sum <= threshold {
|
||||
return;
|
||||
}
|
||||
|
||||
// Borrow fieldnorm reader and BM25 weight before the main loop.
|
||||
// These are immutable references to disjoint fields from block_cursor,
|
||||
// but Rust's borrow checker can't see through method calls, so we
|
||||
// extract them once upfront.
|
||||
let fieldnorm_reader = leader.fieldnorm_reader().clone();
|
||||
let bm25_weight = leader.bm25_weight().clone();
|
||||
|
||||
let mut doc = leader.doc();
|
||||
|
||||
let mut secondary_block_max_scores: Box<[f32]> =
|
||||
vec![0.0f32; secondaries.len()].into_boxed_slice();
|
||||
let mut secondary_suffix_block_max: Box<[f32]> =
|
||||
vec![0.0f32; secondaries.len()].into_boxed_slice();
|
||||
|
||||
while doc < TERMINATED {
|
||||
// --- Phase 1: Block-level pruning ---
|
||||
//
|
||||
// Position all skip readers on the block containing `doc`.
|
||||
// seek_block is cheap: it only advances the skip reader, no block decompression.
|
||||
leader.seek_block(doc);
|
||||
let leader_block_max: Score = leader.block_max_score();
|
||||
|
||||
// Compute the window end as the minimum last_doc_in_block across all scorers.
|
||||
// This ensures the block_max values are valid for all docs in [doc, window_end].
|
||||
// Different scorers have independently aligned blocks, so we must use the
|
||||
// smallest window where all block_max values hold.
|
||||
let mut window_end: DocId = leader.last_doc_in_block();
|
||||
|
||||
let mut secondary_block_max_sum: Score = 0.0;
|
||||
let num_secondaries = secondaries.len();
|
||||
for (idx, secondary) in secondaries.iter_mut().enumerate() {
|
||||
secondary.block_cursor().seek_block(doc);
|
||||
if !secondary.block_cursor().has_remaining_docs() {
|
||||
return;
|
||||
}
|
||||
window_end = window_end.min(secondary.last_doc_in_block());
|
||||
let bms = secondary.block_max_score();
|
||||
secondary_block_max_scores[idx] = bms;
|
||||
secondary_block_max_sum += bms;
|
||||
}
|
||||
|
||||
if leader_block_max + secondary_block_max_sum <= threshold {
|
||||
// The entire window cannot beat the threshold. Skip past it.
|
||||
doc = window_end + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Phase 2: Batch processing within the window ---
|
||||
//
|
||||
// Score-first approach: decode the leader's block, filter by threshold,
|
||||
// then check intersection membership only for survivors. This avoids expensive
|
||||
// secondary seeks for docs that can't beat the threshold.
|
||||
let block_cursor = leader.block_cursor();
|
||||
// seek loads the block and returns the in-block index of the first doc >= `doc`.
|
||||
let start_idx = block_cursor.seek(doc);
|
||||
|
||||
// Use the branchless binary search on the doc decoder to find the first
|
||||
// index past window_end.
|
||||
let end_idx = block_cursor
|
||||
.doc_decoder
|
||||
.seek_within_block(window_end + 1)
|
||||
.min(block_cursor.block_len());
|
||||
|
||||
let block_docs = &block_cursor.doc_decoder.output_array()[start_idx..end_idx];
|
||||
let block_freqs = &block_cursor.freq_output_array()[start_idx..end_idx];
|
||||
|
||||
// Pass 1: Batch-compute leader BM25 scores and branchlessly filter
|
||||
// candidates that can't beat the threshold.
|
||||
//
|
||||
// The trick: always write to the buffer at `num_candidates`, then
|
||||
// conditionally advance the count. The compiler can turn this into
|
||||
// a cmov instead of a branch, avoiding misprediction costs.
|
||||
let score_threshold = threshold - secondary_block_max_sum;
|
||||
let mut candidate_doc_ids = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
let mut candidate_scores = [0.0f32; COMPRESSION_BLOCK_SIZE];
|
||||
let mut num_candidates = 0usize;
|
||||
|
||||
for (candidate_doc, term_freq) in
|
||||
block_docs.iter().copied().zip(block_freqs.iter().copied())
|
||||
{
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(candidate_doc);
|
||||
let leader_score = bm25_weight.score(fieldnorm_id, term_freq);
|
||||
candidate_doc_ids[num_candidates] = candidate_doc;
|
||||
candidate_scores[num_candidates] = leader_score;
|
||||
num_candidates += (leader_score > score_threshold) as usize;
|
||||
}
|
||||
|
||||
// Precompute suffix sums: suffix[i] = sum of block_max for secondaries[i+1..].
|
||||
// Used in Phase 2 to prune candidates that can't beat threshold even with
|
||||
// remaining secondaries contributing their block_max.
|
||||
if num_candidates == 0 {
|
||||
doc = window_end + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut running = 0.0f32;
|
||||
for idx in (0..num_secondaries).rev() {
|
||||
secondary_suffix_block_max[idx] = running;
|
||||
running += secondary_block_max_scores[idx];
|
||||
}
|
||||
|
||||
// Pass 2: Check intersection membership only for survivors.
|
||||
// score_threshold may be stale (threshold can increase from callbacks),
|
||||
// but that's conservative — we may check a few extra candidates, never miss one.
|
||||
'next_candidate: for candidate_idx in 0..num_candidates {
|
||||
let candidate_doc = candidate_doc_ids[candidate_idx];
|
||||
let mut total_score: Score = candidate_scores[candidate_idx];
|
||||
|
||||
for (secondary_idx, secondary) in secondaries.iter_mut().enumerate() {
|
||||
// If a previous candidate already advanced this secondary past
|
||||
// candidate_doc, the candidate can't be in the intersection.
|
||||
if secondary.doc() > candidate_doc {
|
||||
continue 'next_candidate;
|
||||
}
|
||||
let seek_result = secondary.seek(candidate_doc);
|
||||
if seek_result != candidate_doc {
|
||||
continue 'next_candidate;
|
||||
}
|
||||
total_score += secondary.score();
|
||||
|
||||
// Prune: even if all remaining secondaries score at their block max,
|
||||
// can we still beat the threshold?
|
||||
if total_score + secondary_suffix_block_max[secondary_idx] <= threshold {
|
||||
continue 'next_candidate;
|
||||
}
|
||||
}
|
||||
|
||||
// All secondaries matched.
|
||||
if total_score > threshold {
|
||||
threshold = callback(candidate_doc, total_score);
|
||||
|
||||
if leader_max_score + secondaries_global_max_sum <= threshold {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
doc = window_end + 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{Bm25Weight, Scorer};
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
|
||||
struct Float(Score);
|
||||
|
||||
impl Eq for Float {}
|
||||
|
||||
impl PartialEq for Float {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Float {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Float {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)
|
||||
}
|
||||
}
|
||||
|
||||
fn nearly_equals(left: Score, right: Score) -> bool {
|
||||
(left - right).abs() < 0.0001 * (left + right).abs()
|
||||
}
|
||||
|
||||
/// Run block_wand_intersection and collect (doc, score) pairs above threshold.
|
||||
fn compute_checkpoints_block_wand_intersection(
|
||||
term_scorers: Vec<TermScorer>,
|
||||
top_k: usize,
|
||||
) -> Vec<(DocId, Score)> {
|
||||
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(top_k);
|
||||
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
|
||||
let mut limit: Score = 0.0;
|
||||
|
||||
let callback = &mut |doc, score| {
|
||||
heap.push(Float(score));
|
||||
if heap.len() > top_k {
|
||||
heap.pop().unwrap();
|
||||
}
|
||||
if heap.len() == top_k {
|
||||
limit = heap.peek().unwrap().0;
|
||||
}
|
||||
if !nearly_equals(score, limit) {
|
||||
checkpoints.push((doc, score));
|
||||
}
|
||||
limit
|
||||
};
|
||||
|
||||
super::block_wand_intersection(term_scorers, Score::MIN, callback);
|
||||
checkpoints
|
||||
}
|
||||
|
||||
/// Naive baseline: intersect by iterating all docs.
|
||||
fn compute_checkpoints_naive_intersection(
|
||||
mut term_scorers: Vec<TermScorer>,
|
||||
top_k: usize,
|
||||
) -> Vec<(DocId, Score)> {
|
||||
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(top_k);
|
||||
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
|
||||
let mut limit = Score::MIN;
|
||||
|
||||
// Sort by cost to use the cheapest as driver.
|
||||
term_scorers.sort_by_key(|s| s.cost());
|
||||
|
||||
let (leader, secondaries) = term_scorers.split_first_mut().unwrap();
|
||||
|
||||
let mut doc = leader.doc();
|
||||
while doc != TERMINATED {
|
||||
let mut all_match = true;
|
||||
for secondary in secondaries.iter_mut() {
|
||||
let secondary_doc = secondary.doc();
|
||||
let seek_result = if secondary_doc <= doc {
|
||||
secondary.seek(doc)
|
||||
} else {
|
||||
secondary_doc
|
||||
};
|
||||
if seek_result != doc {
|
||||
all_match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if all_match {
|
||||
let score: Score =
|
||||
leader.score() + secondaries.iter_mut().map(|s| s.score()).sum::<Score>();
|
||||
|
||||
if score > limit {
|
||||
heap.push(Float(score));
|
||||
if heap.len() > top_k {
|
||||
heap.pop().unwrap();
|
||||
}
|
||||
if heap.len() == top_k {
|
||||
limit = heap.peek().unwrap().0;
|
||||
}
|
||||
if !nearly_equals(score, limit) {
|
||||
checkpoints.push((doc, score));
|
||||
}
|
||||
}
|
||||
}
|
||||
doc = leader.advance();
|
||||
}
|
||||
checkpoints
|
||||
}
|
||||
|
||||
const MAX_TERM_FREQ: u32 = 100u32;
|
||||
|
||||
fn posting_list(max_doc: u32) -> BoxedStrategy<Vec<(DocId, u32)>> {
|
||||
(1..max_doc + 1)
|
||||
.prop_flat_map(move |doc_freq| {
|
||||
(
|
||||
proptest::bits::bitset::sampled(doc_freq as usize, 0..max_doc as usize),
|
||||
proptest::collection::vec(1u32..MAX_TERM_FREQ, doc_freq as usize),
|
||||
)
|
||||
})
|
||||
.prop_map(|(docset, term_freqs)| {
|
||||
docset
|
||||
.iter()
|
||||
.map(|doc| doc as u32)
|
||||
.zip(term_freqs.iter().cloned())
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
#[expect(clippy::type_complexity)]
|
||||
fn gen_term_scorers(num_scorers: usize) -> BoxedStrategy<(Vec<Vec<(DocId, u32)>>, Vec<u32>)> {
|
||||
(1u32..100u32)
|
||||
.prop_flat_map(move |max_doc: u32| {
|
||||
(
|
||||
proptest::collection::vec(posting_list(max_doc), num_scorers),
|
||||
proptest::collection::vec(2u32..10u32 * MAX_TERM_FREQ, max_doc as usize),
|
||||
)
|
||||
})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
fn test_block_wand_intersection_aux(posting_lists: &[Vec<(DocId, u32)>], fieldnorms: &[u32]) {
|
||||
// Repeat docs 64 times to create multi-block scenarios, matching block_wand.rs test
|
||||
// strategy.
|
||||
const REPEAT: usize = 64;
|
||||
let fieldnorms_expanded: Vec<u32> = fieldnorms
|
||||
.iter()
|
||||
.cloned()
|
||||
.flat_map(|fieldnorm| std::iter::repeat_n(fieldnorm, REPEAT))
|
||||
.collect();
|
||||
|
||||
let postings_lists_expanded: Vec<Vec<(DocId, u32)>> = posting_lists
|
||||
.iter()
|
||||
.map(|posting_list| {
|
||||
posting_list
|
||||
.iter()
|
||||
.cloned()
|
||||
.flat_map(|(doc, term_freq)| {
|
||||
(0_u32..REPEAT as u32).map(move |offset| {
|
||||
(
|
||||
doc * (REPEAT as u32) + offset,
|
||||
if offset == 0 { term_freq } else { 1 },
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<(DocId, u32)>>()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let total_fieldnorms: u64 = fieldnorms_expanded
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|fieldnorm| fieldnorm as u64)
|
||||
.sum();
|
||||
let average_fieldnorm = (total_fieldnorms as Score) / (fieldnorms_expanded.len() as Score);
|
||||
let max_doc = fieldnorms_expanded.len();
|
||||
|
||||
let make_scorers = || -> Vec<TermScorer> {
|
||||
postings_lists_expanded
|
||||
.iter()
|
||||
.map(|postings| {
|
||||
let bm25_weight = Bm25Weight::for_one_term(
|
||||
postings.len() as u64,
|
||||
max_doc as u64,
|
||||
average_fieldnorm,
|
||||
);
|
||||
TermScorer::create_for_test(postings, &fieldnorms_expanded[..], bm25_weight)
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
for top_k in 1..4 {
|
||||
let checkpoints_optimized =
|
||||
compute_checkpoints_block_wand_intersection(make_scorers(), top_k);
|
||||
let checkpoints_naive = compute_checkpoints_naive_intersection(make_scorers(), top_k);
|
||||
assert_eq!(
|
||||
checkpoints_optimized.len(),
|
||||
checkpoints_naive.len(),
|
||||
"Mismatch in checkpoint count for top_k={top_k}"
|
||||
);
|
||||
for (&(left_doc, left_score), &(right_doc, right_score)) in
|
||||
checkpoints_optimized.iter().zip(checkpoints_naive.iter())
|
||||
{
|
||||
assert_eq!(left_doc, right_doc);
|
||||
assert!(
|
||||
nearly_equals(left_score, right_score),
|
||||
"Score mismatch for doc {left_doc}: {left_score} vs {right_score}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
||||
#[test]
|
||||
fn test_block_wand_intersection_two_scorers(
|
||||
(posting_lists, fieldnorms) in gen_term_scorers(2)
|
||||
) {
|
||||
test_block_wand_intersection_aux(&posting_lists[..], &fieldnorms[..]);
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
||||
#[test]
|
||||
fn test_block_wand_intersection_three_scorers(
|
||||
(posting_lists, fieldnorms) in gen_term_scorers(3)
|
||||
) {
|
||||
test_block_wand_intersection_aux(&posting_lists[..], &fieldnorms[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_wand_intersection_disjoint() {
|
||||
// Two posting lists with no overlap — intersection is empty.
|
||||
let fieldnorms: Vec<u32> = vec![10; 200];
|
||||
let average_fieldnorm = 10.0;
|
||||
let postings_a: Vec<(DocId, u32)> = (0..100).map(|d| (d, 1)).collect();
|
||||
let postings_b: Vec<(DocId, u32)> = (100..200).map(|d| (d, 1)).collect();
|
||||
|
||||
let scorer_a = TermScorer::create_for_test(
|
||||
&postings_a,
|
||||
&fieldnorms,
|
||||
Bm25Weight::for_one_term(100, 200, average_fieldnorm),
|
||||
);
|
||||
let scorer_b = TermScorer::create_for_test(
|
||||
&postings_b,
|
||||
&fieldnorms,
|
||||
Bm25Weight::for_one_term(100, 200, average_fieldnorm),
|
||||
);
|
||||
|
||||
let checkpoints = compute_checkpoints_block_wand_intersection(vec![scorer_a, scorer_b], 10);
|
||||
assert!(checkpoints.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_wand_intersection_all_overlap() {
|
||||
// Two posting lists with full overlap.
|
||||
let fieldnorms: Vec<u32> = vec![10; 50];
|
||||
let average_fieldnorm = 10.0;
|
||||
let postings: Vec<(DocId, u32)> = (0..50).map(|d| (d, 3)).collect();
|
||||
|
||||
let make_scorer = || {
|
||||
TermScorer::create_for_test(
|
||||
&postings,
|
||||
&fieldnorms,
|
||||
Bm25Weight::for_one_term(50, 50, average_fieldnorm),
|
||||
)
|
||||
};
|
||||
|
||||
let checkpoints_opt =
|
||||
compute_checkpoints_block_wand_intersection(vec![make_scorer(), make_scorer()], 5);
|
||||
let checkpoints_naive =
|
||||
compute_checkpoints_naive_intersection(vec![make_scorer(), make_scorer()], 5);
|
||||
assert_eq!(checkpoints_opt.len(), checkpoints_naive.len());
|
||||
}
|
||||
}
|
||||
@@ -50,7 +50,7 @@ fn block_max_was_too_low_advance_one_scorer(
|
||||
scorers: &mut [TermScorerWithMaxScore],
|
||||
pivot_len: usize,
|
||||
) {
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
debug_assert!(scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
let mut scorer_to_seek = pivot_len - 1;
|
||||
let mut global_max_score = scorers[scorer_to_seek].max_score;
|
||||
let mut doc_to_seek_after = scorers[scorer_to_seek].last_doc_in_block();
|
||||
@@ -76,7 +76,7 @@ fn block_max_was_too_low_advance_one_scorer(
|
||||
scorers[scorer_to_seek].seek(doc_to_seek_after);
|
||||
|
||||
restore_ordering(scorers, scorer_to_seek);
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
debug_assert!(scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
}
|
||||
|
||||
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
|
||||
@@ -90,7 +90,7 @@ fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) {
|
||||
}
|
||||
term_scorers.swap(i, i - 1);
|
||||
}
|
||||
debug_assert!(is_sorted(term_scorers.iter().map(|scorer| scorer.doc())));
|
||||
debug_assert!(term_scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
}
|
||||
|
||||
// Attempts to advance all term_scorers between `&term_scorers[0..before_len]` to the pivot.
|
||||
@@ -150,17 +150,21 @@ pub fn block_wand(
|
||||
mut threshold: Score,
|
||||
callback: &mut dyn FnMut(u32, Score) -> Score,
|
||||
) {
|
||||
scorers.retain(|scorer| scorer.doc() < TERMINATED);
|
||||
if scorers.len() == 1 {
|
||||
let scorer = scorers.pop().unwrap();
|
||||
return block_wand_single_scorer(scorer, threshold, callback);
|
||||
}
|
||||
let mut scorers: Vec<TermScorerWithMaxScore> = scorers
|
||||
.iter_mut()
|
||||
.map(TermScorerWithMaxScore::from)
|
||||
.collect();
|
||||
scorers.sort_by_key(|scorer| scorer.doc());
|
||||
// At this point we need to ensure that the scorers are sorted!
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
scorers.sort_by_key(|scorer| scorer.doc());
|
||||
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
|
||||
find_pivot_doc(&scorers[..], threshold)
|
||||
{
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
debug_assert!(scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
debug_assert_ne!(pivot_doc, TERMINATED);
|
||||
debug_assert!(before_pivot_len < pivot_len);
|
||||
|
||||
@@ -228,7 +232,7 @@ pub fn block_wand_single_scorer(
|
||||
loop {
|
||||
// We position the scorer on a block that can reach
|
||||
// the threshold.
|
||||
while scorer.block_max_score() < threshold {
|
||||
while scorer.block_max_score() <= threshold {
|
||||
let last_doc_in_block = scorer.last_doc_in_block();
|
||||
if last_doc_in_block == TERMINATED {
|
||||
return;
|
||||
@@ -286,18 +290,6 @@ impl DerefMut for TermScorerWithMaxScore<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
|
||||
if let Some(first) = it.next() {
|
||||
let mut prev = first;
|
||||
for doc in it {
|
||||
if doc < prev {
|
||||
return false;
|
||||
}
|
||||
prev = doc;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::cmp::Ordering;
|
||||
@@ -9,13 +9,14 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::{
|
||||
intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
|
||||
RequiredOptionalScorer, Scorer, Weight,
|
||||
intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation,
|
||||
Intersection, Occur, RequiredOptionalScorer, Scorer, Weight,
|
||||
};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
enum SpecializedScorer {
|
||||
TermUnion(Vec<TermScorer>),
|
||||
TermIntersection(Vec<TermScorer>),
|
||||
Other(Box<dyn Scorer>),
|
||||
}
|
||||
|
||||
@@ -93,6 +94,13 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
|
||||
Box::new(union_scorer)
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|s| Box::new(s) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
intersect_scorers(boxed_scorers, num_docs)
|
||||
}
|
||||
SpecializedScorer::Other(scorer) => scorer,
|
||||
}
|
||||
}
|
||||
@@ -297,14 +305,43 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
// Result depends entirely on MUST + any removed AllScorers.
|
||||
let combined_all_scorer_count = must_special_scorer_counts.num_all_scorers
|
||||
+ should_special_scorer_counts.num_all_scorers;
|
||||
let boxed_scorer: Box<dyn Scorer> = effective_must_scorer(
|
||||
must_scorers,
|
||||
combined_all_scorer_count,
|
||||
reader.max_doc(),
|
||||
num_docs,
|
||||
)
|
||||
.unwrap_or_else(|| Box::new(EmptyScorer));
|
||||
SpecializedScorer::Other(boxed_scorer)
|
||||
|
||||
// Try to detect a pure TermScorer intersection for block-max optimization.
|
||||
// Preconditions: no removed AllScorers, at least 2 scorers, all TermScorer
|
||||
// with frequency reading enabled.
|
||||
if combined_all_scorer_count == 0
|
||||
&& must_scorers.len() >= 2
|
||||
&& must_scorers.iter().all(|s| s.is::<TermScorer>())
|
||||
{
|
||||
let term_scorers: Vec<TermScorer> = must_scorers
|
||||
.into_iter()
|
||||
.map(|s| *(s.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
|
||||
.collect();
|
||||
if term_scorers
|
||||
.iter()
|
||||
.all(|s| s.freq_reading_option() == FreqReadingOption::ReadFreq)
|
||||
{
|
||||
SpecializedScorer::TermIntersection(term_scorers)
|
||||
} else {
|
||||
let must_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|s| Box::new(s) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
let boxed_scorer: Box<dyn Scorer> =
|
||||
effective_must_scorer(must_scorers, 0, reader.max_doc(), num_docs)
|
||||
.unwrap_or_else(|| Box::new(EmptyScorer));
|
||||
SpecializedScorer::Other(boxed_scorer)
|
||||
}
|
||||
} else {
|
||||
let boxed_scorer: Box<dyn Scorer> = effective_must_scorer(
|
||||
must_scorers,
|
||||
combined_all_scorer_count,
|
||||
reader.max_doc(),
|
||||
num_docs,
|
||||
)
|
||||
.unwrap_or_else(|| Box::new(EmptyScorer));
|
||||
SpecializedScorer::Other(boxed_scorer)
|
||||
}
|
||||
}
|
||||
(ShouldScorersCombinationMethod::Optional(should_scorer), must_scorers) => {
|
||||
// Optional SHOULD: contributes to scoring but not required for matching.
|
||||
@@ -463,15 +500,21 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
callback: &mut dyn FnMut(DocId, Score),
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
|
||||
let num_docs = reader.num_docs();
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer = BufferedUnionScorer::build(
|
||||
term_scorers,
|
||||
&self.score_combiner_fn,
|
||||
reader.num_docs(),
|
||||
);
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
|
||||
for_each_scorer(&mut union_scorer, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|term_scorer| Box::new(term_scorer) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
let mut intersection = intersect_scorers(boxed_scorers, num_docs);
|
||||
for_each_scorer(intersection.as_mut(), callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_scorer(scorer.as_mut(), callback);
|
||||
}
|
||||
@@ -485,17 +528,23 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
callback: &mut dyn FnMut(&[DocId]),
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
|
||||
let num_docs = reader.num_docs();
|
||||
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
|
||||
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer = BufferedUnionScorer::build(
|
||||
term_scorers,
|
||||
&self.score_combiner_fn,
|
||||
reader.num_docs(),
|
||||
);
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
|
||||
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|term_scorer| Box::new(term_scorer) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
let mut intersection = intersect_scorers(boxed_scorers, num_docs);
|
||||
for_each_docset_buffered(intersection.as_mut(), &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
|
||||
}
|
||||
@@ -524,6 +573,9 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
super::block_wand(term_scorers, threshold, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
super::block_wand_intersection(term_scorers, threshold, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
mod block_wand;
|
||||
mod block_wand_intersection;
|
||||
mod block_wand_union;
|
||||
mod boolean_query;
|
||||
mod boolean_weight;
|
||||
|
||||
pub(crate) use self::block_wand::{block_wand, block_wand_single_scorer};
|
||||
pub(crate) use self::block_wand_intersection::block_wand_intersection;
|
||||
pub(crate) use self::block_wand_union::{block_wand, block_wand_single_scorer};
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::boolean_weight::BooleanWeight;
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::docset::DocSet;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::{FreqReadingOption, Postings, SegmentPostings};
|
||||
use crate::postings::{BlockSegmentPostings, FreqReadingOption, Postings, SegmentPostings};
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::{Explanation, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
@@ -95,6 +95,21 @@ impl TermScorer {
|
||||
pub fn last_doc_in_block(&self) -> DocId {
|
||||
self.postings.block_cursor.skip_reader().last_doc_in_block()
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to the underlying block cursor.
|
||||
pub(crate) fn block_cursor(&mut self) -> &mut BlockSegmentPostings {
|
||||
&mut self.postings.block_cursor
|
||||
}
|
||||
|
||||
/// Returns a reference to the fieldnorm reader for batch lookups.
|
||||
pub(crate) fn fieldnorm_reader(&self) -> &FieldNormReader {
|
||||
&self.fieldnorm_reader
|
||||
}
|
||||
|
||||
/// Returns a reference to the BM25 weight for batch score computation.
|
||||
pub(crate) fn bm25_weight(&self) -> &Bm25Weight {
|
||||
&self.similarity_weight
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for TermScorer {
|
||||
|
||||
@@ -23,7 +23,7 @@ zstd-compression = ["zstd"]
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1"
|
||||
criterion = { version = "0.5", default-features = false }
|
||||
criterion = { version = "0.8", default-features = false }
|
||||
names = "0.14"
|
||||
rand = "0.9"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user