mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-17 16:00:42 +00:00
Compare commits
1 Commits
seek_dange
...
paul.masur
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d5e2709b1b |
15
.github/workflows/coverage.yml
vendored
15
.github/workflows/coverage.yml
vendored
@@ -4,9 +4,6 @@ on:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# Ensures that we cancel running jobs for the same PR / same workflow.
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
@@ -15,20 +12,16 @@ concurrency:
|
||||
jobs:
|
||||
coverage:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install Rust
|
||||
run: rustup toolchain install nightly-2025-12-01 --profile minimal --component llvm-tools-preview
|
||||
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
|
||||
- uses: taiki-e/install-action@e4b3a0453201addddc06d3a72db90326aad87084 # cargo-llvm-cov
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- uses: taiki-e/install-action@cargo-llvm-cov
|
||||
- name: Generate code coverage
|
||||
run: cargo +nightly-2025-12-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f # v7.0.0
|
||||
uses: codecov/codecov-action@v3
|
||||
continue-on-error: true
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
|
||||
|
||||
10
.github/workflows/long_running.yml
vendored
10
.github/workflows/long_running.yml
vendored
@@ -8,9 +8,6 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# Ensures that we cancel running jobs for the same PR / same workflow.
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
@@ -21,13 +18,10 @@ jobs:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
profile: minimal
|
||||
|
||||
49
.github/workflows/scorecard.yml
vendored
49
.github/workflows/scorecard.yml
vendored
@@ -1,49 +0,0 @@
|
||||
name: OpenSSF Scorecard
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 0 * * 0'
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecards analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
# Needed to publish results
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: 'Checkout code'
|
||||
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: 'Run analysis'
|
||||
uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
repo_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts.
|
||||
- name: 'Upload artifact'
|
||||
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: 'Upload to code-scanning'
|
||||
uses: github/codeql-action/upload-sarif@87557b9c84dde89fdd9b10e88954ac2f4248e463 # v4.36.1
|
||||
with:
|
||||
sarif_file: results.sarif
|
||||
28
.github/workflows/test.yml
vendored
28
.github/workflows/test.yml
vendored
@@ -9,9 +9,6 @@ on:
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# Ensures that we cancel running jobs for the same PR / same workflow.
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
@@ -22,27 +19,23 @@ jobs:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
checks: write
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install nightly
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: nightly
|
||||
profile: minimal
|
||||
components: rustfmt
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
profile: minimal
|
||||
components: clippy
|
||||
|
||||
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
|
||||
- name: Check Formatting
|
||||
run: cargo +nightly fmt --all -- --check
|
||||
@@ -54,7 +47,7 @@ jobs:
|
||||
- name: Check Bench Compilation
|
||||
run: cargo +nightly bench --no-run --profile=dev --all-features
|
||||
|
||||
- uses: actions-rs/clippy-check@b5b5f21f4797c02da247df37026fcd0a5024aa4d # v1.0.7
|
||||
- uses: actions-rs/clippy-check@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@@ -64,9 +57,6 @@ jobs:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
features:
|
||||
@@ -77,17 +67,17 @@ jobs:
|
||||
name: test-${{ matrix.features.label}}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
- uses: taiki-e/install-action@56cc9adf3a3e2c23eafb56e8acaf9d0373cb845a # nextest
|
||||
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
|
||||
- uses: taiki-e/install-action@nextest
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
|
||||
@@ -1,9 +1,3 @@
|
||||
Tantivy 0.26.1
|
||||
================================
|
||||
|
||||
## Performance
|
||||
- Fix quadratic runtime in nested term and composite aggregations: memory accounting scanned all parent buckets on every collect instead of just the current parent (@PSeitz @fulmicoton)
|
||||
|
||||
Tantivy 0.26 (Unreleased)
|
||||
================================
|
||||
|
||||
|
||||
14
Cargo.toml
14
Cargo.toml
@@ -65,7 +65,7 @@ tantivy-bitpacker = { version = "0.10", path = "./bitpacker" }
|
||||
common = { version = "0.11", path = "./common/", package = "tantivy-common" }
|
||||
tokenizer-api = { version = "0.7", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
|
||||
sketches-ddsketch = { version = "0.4", features = ["use_serde"] }
|
||||
datasketches = { version = "0.3.0", features = ["hll"] }
|
||||
datasketches = { git = "https://github.com/fulmicoton-dd/datasketches-rust", rev = "7635fb8" }
|
||||
futures-util = { version = "0.3.28", optional = true }
|
||||
futures-channel = { version = "0.3.28", optional = true }
|
||||
fnv = "1.0.7"
|
||||
@@ -75,7 +75,7 @@ typetag = "0.2.21"
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.17.0"
|
||||
binggan = "0.16.1"
|
||||
rand = "0.9"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.9"
|
||||
@@ -92,7 +92,7 @@ postcard = { version = "1.0.4", features = [
|
||||
], default-features = false }
|
||||
|
||||
[target.'cfg(not(windows))'.dev-dependencies]
|
||||
criterion = { version = "0.8", default-features = false }
|
||||
criterion = { version = "0.5", default-features = false }
|
||||
|
||||
[dev-dependencies.fail]
|
||||
version = "0.5.0"
|
||||
@@ -201,11 +201,3 @@ harness = false
|
||||
[[bench]]
|
||||
name = "regex_all_terms"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "query_parser_nested"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "intersection_bench"
|
||||
harness = false
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
[](https://docs.rs/crate/tantivy/)
|
||||
[](https://github.com/quickwit-oss/tantivy/actions/workflows/test.yml)
|
||||
[](https://codecov.io/gh/quickwit-oss/tantivy)
|
||||
[](https://scorecard.dev/viewer/?uri=github.com/quickwit-oss/tantivy)
|
||||
[](https://discord.gg/MT27AG5EVE)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://crates.io/crates/tantivy)
|
||||
|
||||
@@ -63,12 +63,7 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
||||
register!(group, terms_all_unique_with_avg_sub_agg);
|
||||
register!(group, terms_many_with_avg_sub_agg);
|
||||
register!(group, terms_status_with_avg_sub_agg);
|
||||
register!(group, terms_status_with_terms_zipf_1000_sub_agg);
|
||||
register!(group, terms_zipf_1000_with_terms_status_sub_agg);
|
||||
register!(group, terms_status_with_histogram);
|
||||
register!(group, terms_status_with_date_histogram);
|
||||
register!(group, terms_status_with_date_histogram_hard_bounds);
|
||||
register!(group, terms_status_with_date_histogram_and_sibling_terms);
|
||||
register!(group, terms_zipf_1000);
|
||||
register!(group, terms_zipf_1000_with_histogram);
|
||||
register!(group, terms_zipf_1000_with_avg_sub_agg);
|
||||
@@ -82,12 +77,8 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
||||
register!(group, composite_histogram_calendar);
|
||||
|
||||
register!(group, cardinality_agg);
|
||||
register!(group, cardinality_agg_high_card);
|
||||
register!(group, cardinality_agg_low_card);
|
||||
register!(group, terms_status_with_cardinality_agg);
|
||||
register!(group, terms_100_buckets_with_cardinality_agg);
|
||||
register!(group, terms_many_with_single_term_order_by_card);
|
||||
register!(group, terms_many_with_single_term_2_order_by_card);
|
||||
|
||||
register!(group, range_agg);
|
||||
register!(group, range_agg_with_avg_sub_agg);
|
||||
@@ -175,32 +166,6 @@ fn cardinality_agg(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
// Full-scan cardinality on a near-1M-cardinality string field.
|
||||
// Hits the dense (PagedBitset) path: every doc has a unique term,
|
||||
// so the bucket promotes from FxHashSet shortly into the scan.
|
||||
fn cardinality_agg_high_card(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "text_all_unique_terms"
|
||||
},
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
// Full-scan cardinality on a tiny-cardinality string field (7 distinct
|
||||
// values). Stays on the FxHashSet path — the promotion threshold is
|
||||
// never crossed. Validates no regression on the sparse path.
|
||||
fn cardinality_agg_low_card(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "text_few_terms_status"
|
||||
},
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_status_with_cardinality_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
@@ -233,58 +198,6 @@ fn terms_100_buckets_with_cardinality_agg(index: &Index) {
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_many_with_single_term_order_by_card(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_many_terms" },
|
||||
"aggs": {
|
||||
"nested_terms": {
|
||||
"terms": {
|
||||
"field": "single_term",
|
||||
"order": { "cardinality": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"cardinality": {
|
||||
"cardinality": { "field": "text_few_terms" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
// Two-level terms ordered by cardinality at each level: a high-card outer terms
|
||||
// (text_many_terms) ordered by a cardinality sub-agg, with a nested low-card terms
|
||||
// (text_few_terms_status) also ordered by a cardinality sub-agg, plus an avg.
|
||||
fn terms_many_with_single_term_2_order_by_card(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"by_ip": {
|
||||
"terms": {
|
||||
"field": "text_many_terms",
|
||||
"order": { "card_few_terms": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"card_few_terms": {
|
||||
"cardinality": { "field": "text_few_terms" }
|
||||
},
|
||||
"nested_terms": {
|
||||
"terms": {
|
||||
"field": " single_term",
|
||||
"order": { "distinct_path2": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"avg_botscore": { "avg": { "field": "score" } },
|
||||
"distinct_path2": { "cardinality": { "field": "text_few_terms" } }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_7(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": { "terms": { "field": "text_few_terms_status" } },
|
||||
@@ -357,30 +270,6 @@ fn terms_all_unique_with_avg_sub_agg(index: &Index) {
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
fn terms_status_with_terms_zipf_1000_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_few_terms_status" },
|
||||
"aggs": {
|
||||
"nested_terms": { "terms": { "field": "text_1000_terms_zipf" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_zipf_1000_with_terms_status_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_1000_terms_zipf" },
|
||||
"aggs": {
|
||||
"nested_terms": { "terms": { "field": "text_few_terms_status" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_status_with_histogram(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
@@ -393,57 +282,6 @@ fn terms_status_with_histogram(index: &Index) {
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_status_with_date_histogram(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_few_terms_status" },
|
||||
"aggs": {
|
||||
"over_time": { "date_histogram": { "field": "timestamp", "fixed_interval": "1h" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
/// Same fused terms × date_histogram, but with `hard_bounds`. The timestamps span 0..120h; the
|
||||
/// bounds drop only the first and last hour (ms: 1h=3_600_000, 119h=428_400_000), so almost every
|
||||
/// doc is in-bounds. This exercises the collector's hard-bounds path: `bounds.contains` runs per
|
||||
/// doc (the `all_docs_in_bounds` short-circuit is off) and the rare out-of-bounds doc takes the
|
||||
/// `term_counts` branch.
|
||||
fn terms_status_with_date_histogram_hard_bounds(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_few_terms_status" },
|
||||
"aggs": {
|
||||
"over_time": {
|
||||
"date_histogram": {
|
||||
"field": "timestamp",
|
||||
"fixed_interval": "1h",
|
||||
"hard_bounds": { "min": 3_600_000, "max": 428_400_000 }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
/// Same fused terms × date_histogram, but with a sibling terms aggregation next to it. The fused
|
||||
/// fast path should still trigger for `my_texts` (sibling aggregations are independent top-level
|
||||
/// aggregations, so they don't change its eligibility).
|
||||
fn terms_status_with_date_histogram_and_sibling_terms(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
"terms": { "field": "text_few_terms_status" },
|
||||
"aggs": {
|
||||
"over_time": { "date_histogram": { "field": "timestamp", "fixed_interval": "1h" } }
|
||||
}
|
||||
},
|
||||
"other_texts": { "terms": { "field": "text_few_terms" } }
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn terms_zipf_1000_with_histogram(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"my_texts": {
|
||||
@@ -745,8 +583,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
|
||||
let single_term = schema_builder.add_text_field("single_term", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let json_field = schema_builder.add_json_field("json", FAST);
|
||||
let text_field_all_unique_terms =
|
||||
schema_builder.add_text_field("text_all_unique_terms", STRING | FAST);
|
||||
@@ -810,8 +647,6 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
||||
index_writer.add_document(doc!(
|
||||
json_field => json!({"mixed_type": 10.0}),
|
||||
json_field => json!({"mixed_type": 10.0}),
|
||||
single_term => "single_term",
|
||||
single_term => "single_term",
|
||||
text_field => "cool",
|
||||
text_field => "cool",
|
||||
text_field_all_unique_terms => "cool",
|
||||
@@ -837,9 +672,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
||||
doc_with_value /= 20;
|
||||
}
|
||||
let _val_max = 1_000_000.0;
|
||||
const SPAN_MS: i64 = 120 * 3600 * 1000; // 120 hours in ms
|
||||
const NOISE_MS: i64 = 2 * 3600 * 1000; // ±2h noise
|
||||
for i in 0..doc_with_value {
|
||||
for _ in 0..doc_with_value {
|
||||
let val: f64 = rng.random_range(0.0..1_000_000.0);
|
||||
let json = if rng.random_bool(0.1) {
|
||||
// 10% are numeric values
|
||||
@@ -847,11 +680,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
||||
} else {
|
||||
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
|
||||
};
|
||||
let base_ms = (i as i64 * SPAN_MS) / doc_with_value as i64;
|
||||
let noise_ms = rng.random_range(-NOISE_MS..NOISE_MS);
|
||||
let ts_ms = (base_ms + noise_ms).clamp(0, SPAN_MS);
|
||||
index_writer.add_document(doc!(
|
||||
single_term => "single_term",
|
||||
text_field => "cool",
|
||||
json_field => json,
|
||||
text_field_all_unique_terms => format!("unique_term_{}", rng.random::<u64>()),
|
||||
@@ -862,7 +691,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
||||
score_field => val as u64,
|
||||
score_field_f64 => lg_norm.sample(&mut rng),
|
||||
score_field_i64 => val as i64,
|
||||
date_field => DateTime::from_timestamp_millis(ts_ms),
|
||||
date_field => DateTime::from_timestamp_millis((val * 1_000_000.) as i64),
|
||||
))?;
|
||||
if cardinality == Cardinality::OptionalSparse {
|
||||
for _ in 0..20 {
|
||||
|
||||
@@ -110,31 +110,43 @@ fn main() {
|
||||
// Prepare corpora with varying scenarios
|
||||
let scenarios = vec![
|
||||
(
|
||||
"dense and 0.1% a".to_string(),
|
||||
5_000_000,
|
||||
0.001,
|
||||
"dense and 99% a".to_string(),
|
||||
10_000_000,
|
||||
0.99,
|
||||
"dense",
|
||||
0,
|
||||
9,
|
||||
),
|
||||
("dense and 1% a".to_string(), 5_000_000, 0.01, "dense", 0, 9),
|
||||
("dense and 10% a".to_string(), 5_000_000, 0.1, "dense", 0, 9),
|
||||
(
|
||||
"sparse and 50% a".to_string(),
|
||||
5_000_000,
|
||||
"dense and 99% a".to_string(),
|
||||
10_000_000,
|
||||
0.99,
|
||||
"dense",
|
||||
990,
|
||||
999,
|
||||
),
|
||||
(
|
||||
"sparse and 99% a".to_string(),
|
||||
10_000_000,
|
||||
0.99,
|
||||
"sparse",
|
||||
0,
|
||||
9,
|
||||
),
|
||||
(
|
||||
"sparse and 99% a".to_string(),
|
||||
10_000_000,
|
||||
0.99,
|
||||
"sparse",
|
||||
9_999_990,
|
||||
9_999_999,
|
||||
),
|
||||
];
|
||||
|
||||
let mut runner = BenchRunner::new();
|
||||
for (scenario_id, num_docs, p_title_a, num_rand_distribution, range_low, range_high) in
|
||||
scenarios
|
||||
{
|
||||
for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios {
|
||||
// Build index for this scenario
|
||||
let bench_index = build_shared_indices(num_docs, p_title_a, num_rand_distribution);
|
||||
let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution);
|
||||
|
||||
// Create benchmark group
|
||||
let mut group = runner.new_group();
|
||||
@@ -146,7 +158,7 @@ fn main() {
|
||||
let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"];
|
||||
|
||||
// Define the three terms we want to test with
|
||||
let terms = ["a"];
|
||||
let terms = ["a", "b", "z"];
|
||||
|
||||
// Generate all combinations of terms and field names
|
||||
let mut queries = Vec::new();
|
||||
@@ -191,7 +203,7 @@ fn run_benchmark_tasks(
|
||||
bench_index,
|
||||
query_str,
|
||||
DocSetCollector,
|
||||
"all_results",
|
||||
"all results",
|
||||
);
|
||||
|
||||
// Test top 100 by the field (if it's a FAST field)
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
// Benchmarks top-K intersection of term scorers (block_wand_intersection).
|
||||
//
|
||||
// What's measured:
|
||||
// - Conjunctive queries (+a +b, +a +b +c) with top-10 by score
|
||||
// - Varying doc-frequency balance between terms (balanced, skewed, very skewed)
|
||||
// - Realistic term frequencies (geometric distribution, mostly low)
|
||||
// - 1M-doc single segment
|
||||
//
|
||||
// Run with: cargo bench --bench intersection_bench
|
||||
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use rand::prelude::*;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Schema, TEXT};
|
||||
use tantivy::{doc, Index, ReloadPolicy, Searcher};
|
||||
|
||||
const NUM_DOCS: usize = 1_000_000;
|
||||
|
||||
struct BenchIndex {
|
||||
searcher: Searcher,
|
||||
query_parser: QueryParser,
|
||||
}
|
||||
|
||||
/// Generate term frequency from a geometric-like distribution.
|
||||
/// Most values are 1, a few are 2-3, rarely higher.
|
||||
/// p controls the decay: higher p → more weight on tf=1.
|
||||
fn random_term_freq(rng: &mut StdRng, p: f64) -> u32 {
|
||||
let mut tf = 1u32;
|
||||
while tf < 10 && rng.random_bool(1.0 - p) {
|
||||
tf += 1;
|
||||
}
|
||||
tf
|
||||
}
|
||||
|
||||
/// Build an index with three terms (a, b, c) with given doc-frequency probabilities.
|
||||
/// Each term occurrence has a realistic term frequency (geometric distribution).
|
||||
/// Field length is padded with filler tokens to create varied fieldnorms.
|
||||
fn build_index(p_a: f64, p_b: f64, p_c: f64) -> BenchIndex {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let body = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut rng = StdRng::from_seed([42u8; 32]);
|
||||
|
||||
{
|
||||
let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
|
||||
for _ in 0..NUM_DOCS {
|
||||
let mut tokens: Vec<String> = Vec::new();
|
||||
|
||||
if rng.random_bool(p_a) {
|
||||
let tf = random_term_freq(&mut rng, 0.7);
|
||||
for _ in 0..tf {
|
||||
tokens.push("aaa".to_string());
|
||||
}
|
||||
}
|
||||
if rng.random_bool(p_b) {
|
||||
let tf = random_term_freq(&mut rng, 0.7);
|
||||
for _ in 0..tf {
|
||||
tokens.push("bbb".to_string());
|
||||
}
|
||||
}
|
||||
if rng.random_bool(p_c) {
|
||||
let tf = random_term_freq(&mut rng, 0.7);
|
||||
for _ in 0..tf {
|
||||
tokens.push("ccc".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// Pad with filler to create varied field lengths (5-30 tokens).
|
||||
let filler_count = rng.random_range(5u32..30u32);
|
||||
for _ in 0..filler_count {
|
||||
tokens.push("filler".to_string());
|
||||
}
|
||||
|
||||
let text = tokens.join(" ");
|
||||
writer.add_document(doc!(body => text)).unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
}
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![body]);
|
||||
|
||||
BenchIndex {
|
||||
searcher,
|
||||
query_parser,
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Scenarios: (label, p_a, p_b, p_c)
|
||||
//
|
||||
// "balanced": all terms ~10% → intersection ~1% of docs
|
||||
// "skewed": one common (50%), one rare (2%) → intersection ~1%
|
||||
// "very_skewed": one very common (80%), one very rare (0.5%) → intersection ~0.4%
|
||||
// "three_balanced": three terms ~20% each → intersection ~0.8%
|
||||
// "three_skewed": 50% / 10% / 2% → intersection ~0.1%
|
||||
let scenarios: Vec<(&str, f64, f64, f64)> = vec![
|
||||
("balanced_10%_10%", 0.10, 0.10, 0.0),
|
||||
("skewed_50%_2%", 0.50, 0.02, 0.0),
|
||||
("very_skewed_80%_0.5%", 0.80, 0.005, 0.0),
|
||||
("three_balanced_20%_20%_20%", 0.20, 0.20, 0.20),
|
||||
("three_skewed_50%_10%_2%", 0.50, 0.10, 0.02),
|
||||
];
|
||||
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
for (label, p_a, p_b, p_c) in &scenarios {
|
||||
let bench_index = build_index(*p_a, *p_b, *p_c);
|
||||
|
||||
let mut group = runner.new_group();
|
||||
group.set_name(format!("intersection — {label}"));
|
||||
|
||||
// Two-term intersection
|
||||
if *p_a > 0.0 && *p_b > 0.0 {
|
||||
let query_str = "+aaa +bbb";
|
||||
let query = bench_index.query_parser.parse_query(query_str).unwrap();
|
||||
let searcher = bench_index.searcher.clone();
|
||||
group.register(format!("{query_str} top10"), move |_| {
|
||||
let collector = TopDocs::with_limit(10).order_by_score();
|
||||
black_box(searcher.search(&query, &collector).unwrap());
|
||||
1usize
|
||||
});
|
||||
}
|
||||
|
||||
// Three-term intersection
|
||||
if *p_c > 0.0 {
|
||||
let query_str = "+aaa +bbb +ccc";
|
||||
let query = bench_index.query_parser.parse_query(query_str).unwrap();
|
||||
let searcher = bench_index.searcher.clone();
|
||||
group.register(format!("{query_str} top10"), move |_| {
|
||||
let collector = TopDocs::with_limit(10).order_by_score();
|
||||
black_box(searcher.search(&query, &collector).unwrap());
|
||||
1usize
|
||||
});
|
||||
}
|
||||
|
||||
group.run();
|
||||
}
|
||||
}
|
||||
@@ -1,35 +0,0 @@
|
||||
// Benchmark for the query grammar parsing deeply nested queries.
|
||||
//
|
||||
// Regression guard for https://github.com/quickwit-oss/tantivy/issues/2498:
|
||||
// at depth 20/21 the old parser took 0.87 s / 1.72 s respectively because
|
||||
// `ast()` retried `occur_leaf` on backtrack, giving O(2^n) time. With the
|
||||
// fix parsing is linear and completes in microseconds.
|
||||
//
|
||||
// Run with: `cargo bench --bench query_parser_nested`.
|
||||
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use tantivy::query_grammar::parse_query;
|
||||
|
||||
fn nested_query(depth: usize, leading_plus: bool) -> String {
|
||||
let leading = "(".repeat(depth);
|
||||
let trailing = ")".repeat(depth);
|
||||
let prefix = if leading_plus { "+" } else { "" };
|
||||
format!("{prefix}{leading}title:test{trailing}")
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
for depth in [20, 21] {
|
||||
for leading_plus in [false, true] {
|
||||
let query = nested_query(depth, leading_plus);
|
||||
let label = format!(
|
||||
"parse_nested_depth_{depth}_{}",
|
||||
if leading_plus { "plus" } else { "plain" },
|
||||
);
|
||||
runner.bench_function(&label, move |_| {
|
||||
black_box(parse_query(black_box(&query)).unwrap());
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -18,10 +18,5 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] }
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.17.0"
|
||||
rand = "0.9"
|
||||
proptest = "1"
|
||||
|
||||
[[bench]]
|
||||
name = "bench"
|
||||
harness = false
|
||||
|
||||
@@ -1,110 +1,65 @@
|
||||
use std::cell::RefCell;
|
||||
#![feature(test)]
|
||||
|
||||
use binggan::{BenchRunner, black_box};
|
||||
use rand::rng;
|
||||
use rand::seq::IteratorRandom;
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
|
||||
extern crate test;
|
||||
|
||||
fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut buffer = Vec::new();
|
||||
for _ in 0..num_els {
|
||||
bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
}
|
||||
buffer
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::rng;
|
||||
use rand::seq::IteratorRandom;
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
|
||||
use test::Bencher;
|
||||
|
||||
const N: usize = 100_000;
|
||||
const MAX_VAL: u64 = 1_000;
|
||||
const BIT_WIDTH: u8 = 10; // 2^10 = 1024 > MAX_VAL
|
||||
|
||||
fn create_packed_data() -> (BitUnpacker, Vec<u8>) {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut data = Vec::new();
|
||||
for i in 0..N as u64 {
|
||||
let val = i * MAX_VAL / N as u64;
|
||||
bitpacker.write(val, BIT_WIDTH, &mut data).unwrap();
|
||||
}
|
||||
bitpacker.close(&mut data).unwrap();
|
||||
(BitUnpacker::new(BIT_WIDTH), data)
|
||||
}
|
||||
|
||||
fn bench_bitpacking() {
|
||||
let mut runner = BenchRunner::new();
|
||||
let bit_width = 3;
|
||||
let num_els = 1_000_000u32;
|
||||
let bit_unpacker = BitUnpacker::new(bit_width);
|
||||
let data = create_bitpacked_data(bit_width, num_els);
|
||||
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut rng(), 100_000);
|
||||
runner.bench_function("bitpacking_read", move |_| {
|
||||
let mut out = 0u64;
|
||||
for &idx in &idxs {
|
||||
out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
|
||||
#[inline(never)]
|
||||
fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut buffer = Vec::new();
|
||||
for _ in 0..num_els {
|
||||
// the values do not matter.
|
||||
bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_blocked_bitpacker() {
|
||||
let mut runner = BenchRunner::new();
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
buffer
|
||||
}
|
||||
runner.bench_function("blockedbitp_read", move |_| {
|
||||
let mut out = 0u64;
|
||||
for val in 0..=21500 {
|
||||
out = out.wrapping_add(blocked_bitpacker.get(val));
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
runner.bench_function("blockedbitp_create", |_| {
|
||||
|
||||
#[bench]
|
||||
fn bench_bitpacking_read(b: &mut Bencher) {
|
||||
let bit_width = 3;
|
||||
let num_els = 1_000_000u32;
|
||||
let bit_unpacker = BitUnpacker::new(bit_width);
|
||||
let data = create_bitpacked_data(bit_width, num_els);
|
||||
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut rng(), 100_000);
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for &idx in &idxs {
|
||||
out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
black_box(blocked_bitpacker);
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_filter_vec() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
let (unpacker, data) = create_packed_data();
|
||||
let positions = RefCell::new(Vec::with_capacity(N));
|
||||
runner.bench_function("filter_vec_dense", move |_| {
|
||||
unpacker.get_ids_for_value_range(
|
||||
250..=750,
|
||||
0..N as u32,
|
||||
&data,
|
||||
&mut positions.borrow_mut(),
|
||||
);
|
||||
black_box(positions.borrow().len());
|
||||
});
|
||||
|
||||
let (unpacker, data) = create_packed_data();
|
||||
let positions = RefCell::new(Vec::with_capacity(N));
|
||||
runner.bench_function("filter_vec_sparse", move |_| {
|
||||
unpacker.get_ids_for_value_range(0..=50, 0..N as u32, &data, &mut positions.borrow_mut());
|
||||
black_box(positions.borrow().len());
|
||||
});
|
||||
|
||||
let (unpacker, data) = create_packed_data();
|
||||
let positions = RefCell::new(Vec::with_capacity(N));
|
||||
runner.bench_function("filter_vec_full", move |_| {
|
||||
unpacker.get_ids_for_value_range(
|
||||
0..=MAX_VAL,
|
||||
0..N as u32,
|
||||
&data,
|
||||
&mut positions.borrow_mut(),
|
||||
);
|
||||
black_box(positions.borrow().len());
|
||||
});
|
||||
}
|
||||
|
||||
fn main() {
|
||||
bench_bitpacking();
|
||||
bench_blocked_bitpacker();
|
||||
bench_filter_vec();
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for val in 0..=21500 {
|
||||
out = out.wrapping_add(blocked_bitpacker.get(val));
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_create(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
blocked_bitpacker
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,8 @@
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
use std::arch::is_aarch64_feature_detected;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod avx2;
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
mod neon;
|
||||
|
||||
// SVE intrinsics are not exposed on aarch64-apple-darwin.
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
mod sve;
|
||||
|
||||
mod scalar;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
|
||||
@@ -19,10 +10,6 @@ mod scalar;
|
||||
enum FilterImplPerInstructionSet {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
AVX2 = 0u8,
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
SVE = 3u8,
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
Neon = 2u8,
|
||||
Scalar = 1u8,
|
||||
}
|
||||
|
||||
@@ -32,57 +19,29 @@ impl FilterImplPerInstructionSet {
|
||||
match *self {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
FilterImplPerInstructionSet::AVX2 => is_x86_feature_detected!("avx2"),
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
FilterImplPerInstructionSet::SVE => is_aarch64_feature_detected!("sve"),
|
||||
// TIL Neon is required on aarch 64.
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
FilterImplPerInstructionSet::Neon => true,
|
||||
FilterImplPerInstructionSet::Scalar => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// List of available implementations in preferred order.
|
||||
// List of available implementation in preferred order.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 2] = [
|
||||
FilterImplPerInstructionSet::AVX2,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
];
|
||||
|
||||
// Non-Apple aarch64: try SVE, NEON, Scalar.
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 3] = [
|
||||
FilterImplPerInstructionSet::SVE,
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
];
|
||||
|
||||
// Apple aarch64 (M-series): SVE not available; use NEON or Scalar.
|
||||
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 2] = [
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
];
|
||||
|
||||
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
|
||||
|
||||
impl FilterImplPerInstructionSet {
|
||||
#[inline]
|
||||
#[allow(unused_variables)]
|
||||
#[allow(unused_variables)] // on non-x86_64, code is unused.
|
||||
fn from(code: u8) -> FilterImplPerInstructionSet {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
if code == FilterImplPerInstructionSet::AVX2 as u8 {
|
||||
return FilterImplPerInstructionSet::AVX2;
|
||||
}
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
if code == FilterImplPerInstructionSet::SVE as u8 {
|
||||
return FilterImplPerInstructionSet::SVE;
|
||||
}
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
if code == FilterImplPerInstructionSet::Neon as u8 {
|
||||
return FilterImplPerInstructionSet::Neon;
|
||||
}
|
||||
FilterImplPerInstructionSet::Scalar
|
||||
}
|
||||
|
||||
@@ -91,13 +50,6 @@ impl FilterImplPerInstructionSet {
|
||||
match self {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
FilterImplPerInstructionSet::AVX2 => avx2::filter_vec_in_place(range, offset, output),
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
// SAFETY: SVE availability was verified by is_available() before selecting this impl.
|
||||
FilterImplPerInstructionSet::SVE => unsafe {
|
||||
sve::filter_vec_in_place(range, offset, output)
|
||||
},
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
FilterImplPerInstructionSet::Neon => neon::filter_vec_in_place(range, offset, output),
|
||||
FilterImplPerInstructionSet::Scalar => {
|
||||
scalar::filter_vec_in_place(range, offset, output)
|
||||
}
|
||||
@@ -105,12 +57,6 @@ impl FilterImplPerInstructionSet {
|
||||
}
|
||||
}
|
||||
|
||||
fn available_impls() -> impl Iterator<Item = FilterImplPerInstructionSet> {
|
||||
IMPLS
|
||||
.into_iter()
|
||||
.filter(FilterImplPerInstructionSet::is_available)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
@@ -118,7 +64,10 @@ fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
|
||||
let instruction_set_byte: u8 = INSTRUCTION_SET_BYTE.load(Ordering::Relaxed);
|
||||
if instruction_set_byte == u8::MAX {
|
||||
// Let's initialize the instruction set and cache it.
|
||||
let instruction_set = available_impls().next().unwrap();
|
||||
let instruction_set = IMPLS
|
||||
.into_iter()
|
||||
.find(FilterImplPerInstructionSet::is_available)
|
||||
.unwrap();
|
||||
INSTRUCTION_SET_BYTE.store(instruction_set as u8, Ordering::Relaxed);
|
||||
return instruction_set;
|
||||
}
|
||||
@@ -131,12 +80,12 @@ pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use proptest::strategy::Strategy;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_get_best_available_instruction_set() {
|
||||
// This does not test much unfortunately.
|
||||
// We just make sure the function returns without crashing and returns the same result.
|
||||
let instruction_set = get_best_available_instruction_set();
|
||||
assert_eq!(get_best_available_instruction_set(), instruction_set);
|
||||
}
|
||||
@@ -153,31 +102,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
#[test]
|
||||
fn test_instruction_set_to_code_from_code() {
|
||||
for instruction_set in [
|
||||
FilterImplPerInstructionSet::SVE,
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
] {
|
||||
let code = instruction_set as u8;
|
||||
assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
|
||||
#[test]
|
||||
fn test_instruction_set_to_code_from_code() {
|
||||
for instruction_set in [
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
] {
|
||||
let code = instruction_set as u8;
|
||||
assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
|
||||
}
|
||||
}
|
||||
|
||||
fn test_filter_impl_empty_aux(filter_impl: FilterImplPerInstructionSet) {
|
||||
let mut output = vec![];
|
||||
filter_impl.filter_vec_in_place(0..=u32::MAX, 0, &mut output);
|
||||
@@ -202,20 +126,11 @@ mod tests {
|
||||
assert_eq!(&output, &[1, 3, 4, 5, 6, 7, 8]);
|
||||
}
|
||||
|
||||
fn test_filter_impl_empty_range_aux(filter_impl: FilterImplPerInstructionSet) {
|
||||
// start > end: RangeInclusive::contains always returns false; output must be empty.
|
||||
// The SVE path's wrapping_sub would otherwise produce a huge range_width.
|
||||
let mut output = vec![3, 2, 1, 5, 11, 2, 5, 10, 2];
|
||||
filter_impl.filter_vec_in_place(10..=5, 0, &mut output);
|
||||
assert_eq!(&output, &[]);
|
||||
}
|
||||
|
||||
fn test_filter_impl_test_suite(filter_impl: FilterImplPerInstructionSet) {
|
||||
test_filter_impl_empty_aux(filter_impl);
|
||||
test_filter_impl_simple_aux(filter_impl);
|
||||
test_filter_impl_simple_aux_shifted(filter_impl);
|
||||
test_filter_impl_simple_outside_i32_range(filter_impl);
|
||||
test_filter_impl_empty_range_aux(filter_impl);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -226,60 +141,25 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
fn test_filter_implementation_sve() {
|
||||
if FilterImplPerInstructionSet::SVE.is_available() {
|
||||
test_filter_impl_test_suite(FilterImplPerInstructionSet::SVE);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
fn test_filter_implementation_neon() {
|
||||
test_filter_impl_test_suite(FilterImplPerInstructionSet::Neon);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_filter_implementation_scalar() {
|
||||
test_filter_impl_test_suite(FilterImplPerInstructionSet::Scalar);
|
||||
}
|
||||
|
||||
fn max_val_strategy() -> impl proptest::strategy::Strategy<Value = u32> {
|
||||
proptest::prop_oneof![
|
||||
0u32..10u32,
|
||||
255u32..258u32,
|
||||
proptest::prelude::Just(1u32 << 25),
|
||||
proptest::prelude::Just(u32::MAX - 1),
|
||||
proptest::prelude::Just(u32::MAX),
|
||||
]
|
||||
}
|
||||
|
||||
fn vals_strategy() -> impl proptest::strategy::Strategy<Value = Vec<u32>> {
|
||||
proptest::prop_oneof![
|
||||
proptest::collection::vec(proptest::prelude::any::<u32>(), 0..300),
|
||||
max_val_strategy()
|
||||
.prop_flat_map(|max_val| { proptest::collection::vec(0..=max_val, 0..300) })
|
||||
]
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
proptest::proptest! {
|
||||
#[test]
|
||||
fn test_filter_compare_scalar_and_impls_impl_proptest(
|
||||
start in 0u32..400u32,
|
||||
end in 0u32..400u32,
|
||||
fn test_filter_compare_scalar_and_avx2_impl_proptest(
|
||||
start in proptest::prelude::any::<u32>(),
|
||||
end in proptest::prelude::any::<u32>(),
|
||||
offset in 0u32..2u32,
|
||||
vals in vals_strategy()) {
|
||||
for implementation in available_impls() {
|
||||
if implementation == FilterImplPerInstructionSet::Scalar {
|
||||
continue;
|
||||
}
|
||||
let mut impl_output = vals.clone();
|
||||
let mut scalar_output = vals.clone();
|
||||
implementation.filter_vec_in_place(start..=end, offset, &mut impl_output);
|
||||
FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut scalar_output);
|
||||
assert_eq!(&impl_output, &scalar_output);
|
||||
}
|
||||
mut vals in proptest::collection::vec(0..u32::MAX, 0..30)) {
|
||||
if FilterImplPerInstructionSet::AVX2.is_available() {
|
||||
let mut vals_clone = vals.clone();
|
||||
FilterImplPerInstructionSet::AVX2.filter_vec_in_place(start..=end, offset, &mut vals);
|
||||
FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut vals_clone);
|
||||
assert_eq!(&vals, &vals_clone);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,118 +0,0 @@
|
||||
use std::arch::aarch64::*;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
const NUM_LANES: usize = 4;
|
||||
|
||||
// Compacts matching lanes to the front using a byte-level shuffle.
|
||||
// `mask` is a 4-bit value: bit k=1 means lane k should appear in the output.
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
unsafe fn compact(data: uint32x4_t, mask: u8) -> uint32x4_t {
|
||||
unsafe {
|
||||
// SAFETY: mask is always in [0, 15] by construction (max sum of [1,2,4,8]).
|
||||
// BYTE_SHUFFLE_TABLE has 16 entries, so this is always in bounds.
|
||||
let shuffle = BYTE_SHUFFLE_TABLE.get_unchecked(mask as usize);
|
||||
let shuffle_vec = vld1q_u8(shuffle.as_ptr());
|
||||
vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(data), shuffle_vec))
|
||||
}
|
||||
}
|
||||
|
||||
// Safe (not unsafe) because NEON is mandatory on aarch64: no runtime feature check needed.
|
||||
#[inline(never)]
|
||||
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
|
||||
let num_words = output.len() / NUM_LANES;
|
||||
let mut output_len = unsafe {
|
||||
filter_vec_neon_aux(
|
||||
output.as_ptr(),
|
||||
range.clone(),
|
||||
output.as_mut_ptr(),
|
||||
offset,
|
||||
num_words,
|
||||
)
|
||||
};
|
||||
let remainder_start = num_words * NUM_LANES;
|
||||
for i in remainder_start..output.len() {
|
||||
let val = output[i];
|
||||
output[output_len] = offset + i as u32;
|
||||
output_len += if range.contains(&val) { 1 } else { 0 };
|
||||
}
|
||||
output.truncate(output_len);
|
||||
}
|
||||
|
||||
#[target_feature(enable = "neon")]
|
||||
unsafe fn filter_vec_neon_aux(
|
||||
input: *const u32,
|
||||
range: RangeInclusive<u32>,
|
||||
output: *mut u32,
|
||||
offset: u32,
|
||||
num_words: usize,
|
||||
) -> usize {
|
||||
unsafe {
|
||||
let mut input = input;
|
||||
let mut output_tail = output;
|
||||
let range_start_simd = vdupq_n_u32(*range.start());
|
||||
let range_end_simd = vdupq_n_u32(*range.end());
|
||||
let mut ids = vld1q_u32([offset, offset + 1, offset + 2, offset + 3].as_ptr());
|
||||
let shift = vdupq_n_u32(NUM_LANES as u32);
|
||||
let bit_weights = vld1q_u32([1u32, 2, 4, 8].as_ptr());
|
||||
|
||||
for _ in 0..num_words {
|
||||
let word = vld1q_u32(input);
|
||||
|
||||
// Unsigned compares: CMHS (compare higher or same) tests `word >= start`
|
||||
// and `end >= word`. ANDing both gives the inside-range mask directly,
|
||||
// which is cheaper than computing `outside` and then negating.
|
||||
let ge_start = vcgeq_u32(word, range_start_simd);
|
||||
let le_end = vcleq_u32(word, range_end_simd);
|
||||
// inside[k] = 0xFFFFFFFF if val[k] is in range, 0 otherwise.
|
||||
let inside = vandq_u32(ge_start, le_end);
|
||||
|
||||
// Build the 4-bit mask: AND bit_weights with the inside lane mask, so each
|
||||
// inside lane contributes its bit_weight (1, 2, 4, or 8). Summing yields the
|
||||
// 4-bit mask in one addv.
|
||||
let inside_bits = vandq_u32(bit_weights, inside);
|
||||
let mask = vaddvq_u32(inside_bits) as u8;
|
||||
// mask is mathematically bounded: max value is 1+2+4+8=15 (all lanes match)
|
||||
debug_assert!(mask <= 15, "mask must fit in 4 bits: {}", mask);
|
||||
|
||||
// Count of matching lanes = popcount(mask). Derives the count directly from
|
||||
// the mask instead of running a parallel SIMD reduction over `outside`.
|
||||
let added_len = mask.count_ones() as usize;
|
||||
|
||||
// Safe because mask is guaranteed to be in [0, 15]
|
||||
let filtered_ids = compact(ids, mask);
|
||||
vst1q_u32(output_tail, filtered_ids);
|
||||
output_tail = output_tail.add(added_len);
|
||||
ids = vaddq_u32(ids, shift);
|
||||
input = input.add(NUM_LANES);
|
||||
}
|
||||
|
||||
output_tail.offset_from(output) as usize
|
||||
}
|
||||
}
|
||||
|
||||
// Byte shuffle patterns to compact matching lanes to the front of the vector.
|
||||
// Index is a 4-bit mask: bit k=1 means lane k (bytes 4k..4k+3) is in-range.
|
||||
// The j-th set bit determines which input lane goes to output position j.
|
||||
const BYTE_SHUFFLE_TABLE: [[u8; 16]; 16] = [
|
||||
[
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
|
||||
], // 0b0000: none
|
||||
[0, 1, 2, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0001: lane 0
|
||||
[4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0010: lane 1
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0011: lanes 0,1
|
||||
[8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0100: lane 2
|
||||
[0, 1, 2, 3, 8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0101: lanes 0,2
|
||||
[4, 5, 6, 7, 8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0110: lanes 1,2
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 16, 16, 16], // 0b0111: lanes 0,1,2
|
||||
[
|
||||
12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
|
||||
], // 0b1000: lane 3
|
||||
[0, 1, 2, 3, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1001: lanes 0,3
|
||||
[4, 5, 6, 7, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1010: lanes 1,3
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 16, 16, 16, 16], // 0b1011: lanes 0,1,3
|
||||
[8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1100: lanes 2,3
|
||||
[0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16], // 0b1101: lanes 0,2,3
|
||||
[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16], // 0b1110: lanes 1,2,3
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], // 0b1111: all lanes
|
||||
];
|
||||
@@ -1,260 +0,0 @@
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
// SVE vector length (in u32 lanes) is not a compile-time constant; query at runtime.
|
||||
// Safe to call only when SVE is confirmed available via is_aarch64_feature_detected!("sve").
|
||||
#[target_feature(enable = "sve")]
|
||||
unsafe fn num_lanes() -> usize {
|
||||
let vl: usize;
|
||||
unsafe {
|
||||
core::arch::asm!(
|
||||
"cntw {vl}",
|
||||
vl = out(reg) vl,
|
||||
options(nostack, nomem, preserves_flags),
|
||||
);
|
||||
}
|
||||
vl
|
||||
}
|
||||
|
||||
// SAFETY: caller must ensure SVE is available (checked via is_aarch64_feature_detected!("sve")).
|
||||
// Unlike NEON, SVE is optional on aarch64 and not guaranteed by the target architecture.
|
||||
pub unsafe fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
|
||||
if range.start() > range.end() {
|
||||
output.clear();
|
||||
return;
|
||||
}
|
||||
let vl = unsafe { num_lanes() };
|
||||
let num_words = output.len() / vl;
|
||||
let range_start = *range.start();
|
||||
// Unsigned subtraction trick: val ∈ [lo, hi] ↔ (val - lo) ≤ᵤ (hi - lo).
|
||||
// Values below lo wrap around to large u32, so the single unsigned ≤ excludes them.
|
||||
let range_width = range.end().wrapping_sub(range_start);
|
||||
let mut output_len = unsafe {
|
||||
filter_vec_sve_aux(
|
||||
output.as_ptr(),
|
||||
range_start,
|
||||
range_width,
|
||||
output.as_mut_ptr(),
|
||||
offset,
|
||||
num_words,
|
||||
vl,
|
||||
)
|
||||
};
|
||||
let remainder_start = num_words * vl;
|
||||
for i in remainder_start..output.len() {
|
||||
let val = output[i];
|
||||
output[output_len] = offset + i as u32;
|
||||
output_len += if range.contains(&val) { 1 } else { 0 };
|
||||
}
|
||||
output.truncate(output_len);
|
||||
}
|
||||
|
||||
// Register allocation for the asm! blocks:
|
||||
// z0 ids_a (index vector for first half of each pair, advances by step2 each iter)
|
||||
// z1 range_width broadcast
|
||||
// z2 range_start broadcast
|
||||
// z3 step2 broadcast (2 * vl)
|
||||
// z4 ids_b (index vector for second half, = ids_a + step, advances by step2)
|
||||
// z5 scratch: loaded word_a, then compacted_a
|
||||
// z6 scratch: loaded word_b, then compacted_b
|
||||
// p0 all-true predicate (ptrue p0.s)
|
||||
// p1 in-range mask for word_a
|
||||
// p2 in-range mask for word_b
|
||||
#[target_feature(enable = "sve")]
|
||||
unsafe fn filter_vec_sve_aux(
|
||||
input: *const u32,
|
||||
range_start: u32,
|
||||
range_width: u32,
|
||||
output: *mut u32,
|
||||
offset: u32,
|
||||
num_words: usize,
|
||||
vl: usize,
|
||||
) -> usize {
|
||||
let num_pairs = num_words / 2;
|
||||
let mut input_ptr = input;
|
||||
let mut output_tail = output;
|
||||
|
||||
if num_pairs > 0 {
|
||||
unsafe {
|
||||
// We rely on asm! because the SVE intrinsics are not available in stable Rust.
|
||||
// The code that follows was generated by Rustc nightly based on the intrinsics version
|
||||
// at the bottom of this file.
|
||||
core::arch::asm!(
|
||||
// --- Setup ---
|
||||
// All-true predicate for 32-bit lanes.
|
||||
"ptrue p0.s",
|
||||
// ids_a = [offset, offset+1, offset+2, ...]
|
||||
"index z0.s, {offset:w}, #1",
|
||||
// Broadcast scalars into SVE vectors.
|
||||
"mov z1.s, {range_width:w}",
|
||||
"mov z2.s, {range_start:w}",
|
||||
// vl_gpr = number of 32-bit lanes (cntw).
|
||||
"cntw {vl_gpr}",
|
||||
// step2_bytes will first hold 2*vl (for the step2 vector), then 2*VL in bytes.
|
||||
"lsl {step2_bytes}, {vl_gpr}, #1",
|
||||
// z4 = step = [vl, vl, ...]; will become ids_b after the add below.
|
||||
"mov z4.s, {vl_gpr:w}",
|
||||
// z3 = step2 = [2*vl, 2*vl, ...], used to advance both id vectors each iter.
|
||||
"mov z3.s, {step2_bytes:w}",
|
||||
// Repurpose step2_bytes to hold the byte stride for advancing the input pointer
|
||||
// by two full SVE vectors per iteration.
|
||||
"rdvl {step2_bytes}, #2",
|
||||
// ids_b = ids_a + step = [offset+vl, offset+vl+1, ...]
|
||||
"add z4.s, z0.s, z4.s",
|
||||
|
||||
// --- Main loop: process two SVE vectors (ids_a and ids_b) per iteration ---
|
||||
"0:",
|
||||
// Load two consecutive SVE vectors from input.
|
||||
"ld1w {{z5.s}}, p0/z, [{input}]",
|
||||
"ld1w {{z6.s}}, p0/z, [{input}, #1, mul vl]",
|
||||
// Advance input pointer by 2 * VL bytes.
|
||||
"add {input}, {input}, {step2_bytes}",
|
||||
// Unsigned shift: subtract range_start so in-range check becomes a single cmpu ≤.
|
||||
"sub z5.s, z5.s, z2.s",
|
||||
"sub z6.s, z6.s, z2.s",
|
||||
// in_range: shifted value ≤ range_width (unsigned, so values below lo also fail).
|
||||
"cmphs p1.s, p0/z, z1.s, z5.s",
|
||||
"cmphs p2.s, p0/z, z1.s, z6.s",
|
||||
// Count matching lanes; both cntp calls have independent inputs for OOO parallelism.
|
||||
"cntp {cnt_a}, p0, p1.s",
|
||||
"compact z5.s, p1, z0.s",
|
||||
"compact z6.s, p2, z4.s",
|
||||
"cntp {cnt_b}, p0, p2.s",
|
||||
// Advance id vectors for the next iteration.
|
||||
"add z0.s, z0.s, z3.s",
|
||||
"add z4.s, z4.s, z3.s",
|
||||
// Store compacted ids. Only the first cnt_a / cnt_b slots are valid; the rest
|
||||
// will be overwritten by subsequent iterations before the final truncate.
|
||||
"str z5, [{out}]",
|
||||
"st1w {{z6.s}}, p0, [{out}, {cnt_a}, lsl #2]",
|
||||
"add {out}, {out}, {cnt_a}, lsl #2",
|
||||
"add {out}, {out}, {cnt_b}, lsl #2",
|
||||
"subs {pairs}, {pairs}, #1",
|
||||
"b.ne 0b",
|
||||
|
||||
// --- Operands ---
|
||||
input = inout(reg) input_ptr,
|
||||
out = inout(reg) output_tail,
|
||||
pairs = inout(reg) num_pairs => _,
|
||||
offset = in(reg) offset,
|
||||
range_start = in(reg) range_start,
|
||||
range_width = in(reg) range_width,
|
||||
vl_gpr = out(reg) _,
|
||||
step2_bytes = out(reg) _,
|
||||
cnt_a = out(reg) _,
|
||||
cnt_b = out(reg) _,
|
||||
out("p0") _, out("p1") _, out("p2") _,
|
||||
out("v0") _, out("v1") _, out("v2") _, out("v3") _,
|
||||
out("v4") _, out("v5") _, out("v6") _,
|
||||
options(nostack),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle an odd trailing vector.
|
||||
if num_words % 2 == 1 {
|
||||
// ids_a for the odd word starts at offset + num_pairs * 2 * vl.
|
||||
// input_ptr was advanced by the main loop and now points at the odd word.
|
||||
let odd_offset =
|
||||
offset.wrapping_add((num_pairs as u32).wrapping_mul(2).wrapping_mul(vl as u32));
|
||||
unsafe {
|
||||
core::arch::asm!(
|
||||
"ptrue p0.s",
|
||||
"index z0.s, {odd_offset:w}, #1",
|
||||
"mov z1.s, {range_width:w}",
|
||||
"mov z2.s, {range_start:w}",
|
||||
"ld1w {{z3.s}}, p0/z, [{input}]",
|
||||
"sub z3.s, z3.s, z2.s",
|
||||
"cmphs p1.s, p0/z, z1.s, z3.s",
|
||||
"cntp {cnt}, p0, p1.s",
|
||||
"compact z0.s, p1, z0.s",
|
||||
"str z0, [{out}]",
|
||||
"add {out}, {out}, {cnt}, lsl #2",
|
||||
odd_offset = in(reg) odd_offset,
|
||||
range_width = in(reg) range_width,
|
||||
range_start = in(reg) range_start,
|
||||
input = in(reg) input_ptr,
|
||||
out = inout(reg) output_tail,
|
||||
cnt = out(reg) _,
|
||||
out("p0") _, out("p1") _,
|
||||
out("v0") _, out("v1") _, out("v2") _, out("v3") _,
|
||||
options(nostack),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
unsafe { output_tail.offset_from(output) as usize }
|
||||
}
|
||||
|
||||
// SVE implements with intrinsics.
|
||||
//
|
||||
// #[target_feature(enable = "sve")]
|
||||
// unsafe fn filter_vec_sve_aux(
|
||||
// input: *const u32,
|
||||
// range_start: u32,
|
||||
// range_width: u32,
|
||||
// output: *mut u32,
|
||||
// offset: u32,
|
||||
// num_words: usize,
|
||||
// vl: usize,
|
||||
// ) -> usize {
|
||||
// unsafe {
|
||||
// let all_true = svptrue_b32();
|
||||
// let range_start_simd = svdup_n_u32(range_start);
|
||||
// let range_width_simd = svdup_n_u32(range_width);
|
||||
// // ids_a covers [offset .. offset+vl), ids_b covers the next vl ids.
|
||||
// // Keeping them separate breaks the loop-carried dependency through ids so
|
||||
// // both compact/cntp chains are fully independent within each unrolled body.
|
||||
// let mut ids_a = svindex_u32(offset, 1);
|
||||
// let step = svdup_n_u32(vl as u32);
|
||||
// let step2 = svdup_n_u32(2 * vl as u32);
|
||||
// let mut ids_b = svadd_u32_x(all_true, ids_a, step);
|
||||
|
||||
// let mut input = input;
|
||||
// let mut output_tail = output;
|
||||
|
||||
// // Unrolled ×2: both cntp calls have independent inputs and execute in parallel.
|
||||
// // The two output_tail updates are sequential but together cost 4+1+1=6 cy per
|
||||
// // pair vs 5+5=10 cy for two scalar iterations, breaking the cntp latency chain.
|
||||
// let num_pairs = num_words / 2;
|
||||
// for _ in 0..num_pairs {
|
||||
// let word_a = svld1_u32(all_true, input);
|
||||
// let word_b = svld1_u32(all_true, input.add(vl));
|
||||
|
||||
// let shifted_a = svsub_u32_x(all_true, word_a, range_start_simd);
|
||||
// let shifted_b = svsub_u32_x(all_true, word_b, range_start_simd);
|
||||
|
||||
// let in_range_a = svcmple_u32(all_true, shifted_a, range_width_simd);
|
||||
// let in_range_b = svcmple_u32(all_true, shifted_b, range_width_simd);
|
||||
|
||||
// let compacted_a = svcompact_u32(in_range_a, ids_a);
|
||||
// let compacted_b = svcompact_u32(in_range_b, ids_b);
|
||||
// // cntp_a and cntp_b have independent inputs: OOO engine issues them in parallel.
|
||||
// let added_len_a = svcntp_b32(all_true, in_range_a) as usize;
|
||||
// let added_len_b = svcntp_b32(all_true, in_range_b) as usize;
|
||||
|
||||
// // Write the full vector — only the first added_len slots are valid.
|
||||
// // Subsequent iterations overwrite the trailing zeros before truncate.
|
||||
// svst1_u32(all_true, output_tail, compacted_a);
|
||||
// output_tail = output_tail.add(added_len_a);
|
||||
// svst1_u32(all_true, output_tail, compacted_b);
|
||||
// output_tail = output_tail.add(added_len_b);
|
||||
|
||||
// ids_a = svadd_u32_x(all_true, ids_a, step2);
|
||||
// ids_b = svadd_u32_x(all_true, ids_b, step2);
|
||||
// input = input.add(2 * vl);
|
||||
// }
|
||||
|
||||
// // Handle an odd trailing word.
|
||||
// if num_words % 2 == 1 {
|
||||
// let word = svld1_u32(all_true, input);
|
||||
// let shifted = svsub_u32_x(all_true, word, range_start_simd);
|
||||
// let in_range = svcmple_u32(all_true, shifted, range_width_simd);
|
||||
// let added_len = svcntp_b32(all_true, in_range) as usize;
|
||||
// let compacted_ids = svcompact_u32(in_range, ids_a);
|
||||
// svst1_u32(all_true, output_tail, compacted_ids);
|
||||
// output_tail = output_tail.add(added_len);
|
||||
// }
|
||||
|
||||
// output_tail.offset_from(output) as usize
|
||||
// }
|
||||
// }
|
||||
@@ -23,7 +23,7 @@ downcast-rs = "2.0.1"
|
||||
proptest = "1"
|
||||
more-asserts = "0.3.1"
|
||||
rand = "0.9"
|
||||
binggan = "0.17.0"
|
||||
binggan = "0.16.1"
|
||||
|
||||
[[bench]]
|
||||
name = "bench_merge"
|
||||
|
||||
@@ -15,37 +15,9 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
|
||||
{
|
||||
#[inline]
|
||||
pub fn fetch_block<'a>(&'a mut self, docs: &'a [u32], accessor: &Column<T>) {
|
||||
self.fetch_block_with_is_full(docs, accessor, accessor.index.get_cardinality().is_full());
|
||||
}
|
||||
|
||||
/// Like [`Self::fetch_block`] but takes the column's fullness instead of querying
|
||||
/// `accessor.index.get_cardinality()` each call — for callers that know it up front (e.g.
|
||||
/// checked once at construction). `is_full` must equal
|
||||
/// `accessor.index.get_cardinality().is_full()`.
|
||||
#[inline]
|
||||
pub fn fetch_block_with_is_full<'a>(
|
||||
&'a mut self,
|
||||
docs: &'a [u32],
|
||||
accessor: &Column<T>,
|
||||
is_full: bool,
|
||||
) {
|
||||
if is_full {
|
||||
// Skip the resize when already the right length (common case: fixed-size blocks).
|
||||
if self.val_cache.len() != docs.len() {
|
||||
self.val_cache.resize(docs.len(), T::default());
|
||||
}
|
||||
// When the docs form a contiguous ascending run we can fetch the values
|
||||
// as a single range. This lets codecs (e.g. bitpacked) bulk-decode the
|
||||
// slice instead of gathering value-by-value, and avoids per-value dynamic
|
||||
// dispatch. `docs` is always sorted ascending and free of duplicates here,
|
||||
// so comparing the endpoints is enough to detect contiguity.
|
||||
if is_contiguous(docs) {
|
||||
accessor
|
||||
.values
|
||||
.get_range(docs[0] as u64, &mut self.val_cache);
|
||||
} else {
|
||||
accessor.values.get_vals(docs, &mut self.val_cache);
|
||||
}
|
||||
if accessor.index.get_cardinality().is_full() {
|
||||
self.val_cache.resize(docs.len(), T::default());
|
||||
accessor.values.get_vals(docs, &mut self.val_cache);
|
||||
} else {
|
||||
self.docid_cache.clear();
|
||||
self.row_id_cache.clear();
|
||||
@@ -186,22 +158,6 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if `docs` is a contiguous ascending run `[d, d + 1, ..., d + n - 1]`.
|
||||
///
|
||||
/// Assumes `docs` is sorted ascending and free of duplicates (the invariant for the
|
||||
/// doc blocks passed to `fetch_block`), so comparing the endpoints is sufficient.
|
||||
#[inline]
|
||||
fn is_contiguous(docs: &[u32]) -> bool {
|
||||
let (Some(&first), Some(&last)) = (docs.first(), docs.last()) else {
|
||||
return false;
|
||||
};
|
||||
debug_assert!(
|
||||
docs.windows(2).all(|w| w[0] < w[1]),
|
||||
"fetch_block requires docs sorted ascending without duplicates"
|
||||
);
|
||||
(last - first) as usize + 1 == docs.len()
|
||||
}
|
||||
|
||||
/// Given two sorted lists of docids `docs` and `hits`, hits is a subset of `docs`.
|
||||
/// Return all docs that are not in `hits`.
|
||||
fn find_missing_docs<F>(docs: &[u32], hits: &[u32], mut callback: F)
|
||||
@@ -332,46 +288,4 @@ mod tests {
|
||||
assert_eq!(accessor.docid_cache, vec![0]);
|
||||
assert_eq!(accessor.val_cache, vec![1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_contiguous() {
|
||||
assert!(!is_contiguous(&[]));
|
||||
assert!(is_contiguous(&[5]));
|
||||
assert!(is_contiguous(&[5, 6, 7, 8]));
|
||||
assert!(is_contiguous(&[0, 1, 2]));
|
||||
assert!(!is_contiguous(&[5, 7, 8]));
|
||||
assert!(!is_contiguous(&[0, 1, 3]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fetch_block_contiguous_and_gather_match() {
|
||||
use crate::column_index::ColumnIndex;
|
||||
use crate::column_values::{
|
||||
ALL_U64_CODEC_TYPES, serialize_and_load_u64_based_column_values,
|
||||
};
|
||||
|
||||
let vals: Vec<u64> = (0..200u64).map(|i| i * 7 + 3).collect();
|
||||
let values =
|
||||
serialize_and_load_u64_based_column_values::<u64>(&&vals[..], &ALL_U64_CODEC_TYPES);
|
||||
let column = Column {
|
||||
index: ColumnIndex::Full,
|
||||
values,
|
||||
};
|
||||
|
||||
let check = |accessor: &mut ColumnBlockAccessor<u64>, docs: &[u32]| {
|
||||
accessor.fetch_block(docs, &column);
|
||||
let got: Vec<(u32, u64)> = accessor.iter_docid_vals(docs, &column).collect();
|
||||
let expected: Vec<(u32, u64)> = docs.iter().map(|&d| (d, vals[d as usize])).collect();
|
||||
assert_eq!(got, expected);
|
||||
};
|
||||
|
||||
let mut accessor = ColumnBlockAccessor::<u64>::default();
|
||||
// Contiguous block -> get_range fast path.
|
||||
check(&mut accessor, &(10..74).collect::<Vec<u32>>());
|
||||
// Non-contiguous block -> get_vals gather path.
|
||||
check(&mut accessor, &[0, 5, 9, 100, 199]);
|
||||
// Single doc and full span.
|
||||
check(&mut accessor, &[42]);
|
||||
check(&mut accessor, &(0..200).collect::<Vec<u32>>());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -119,18 +119,8 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
|
||||
/// the segment's `maxdoc`.
|
||||
#[inline(always)]
|
||||
fn get_range(&self, start: u64, output: &mut [T]) {
|
||||
let mut out_chunks = output.chunks_exact_mut(4);
|
||||
let mut idx = start;
|
||||
for out_x4 in out_chunks.by_ref() {
|
||||
out_x4[0] = self.get_val(idx as u32);
|
||||
out_x4[1] = self.get_val((idx + 1) as u32);
|
||||
out_x4[2] = self.get_val((idx + 2) as u32);
|
||||
out_x4[3] = self.get_val((idx + 3) as u32);
|
||||
idx += 4;
|
||||
}
|
||||
for out in out_chunks.into_remainder() {
|
||||
for (out, idx) in output.iter_mut().zip(start..) {
|
||||
*out = self.get_val(idx as u32);
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -121,22 +121,6 @@ pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
|
||||
reader.get_vals(&all_docs, &mut buffer);
|
||||
assert_eq!(vals, buffer);
|
||||
|
||||
// Validate `get_range` over the full column and a sub-range. The sub-range starts
|
||||
// at a non-zero offset to exercise the entrance-ramp alignment of the batch decode.
|
||||
buffer.resize(all_docs.len(), 0);
|
||||
reader.get_range(0, &mut buffer);
|
||||
assert_eq!(vals, buffer, "get_range (full) mismatch in data set {name}");
|
||||
if vals.len() >= 2 {
|
||||
let start = 1usize;
|
||||
buffer.resize(vals.len() - start, 0);
|
||||
reader.get_range(start as u64, &mut buffer);
|
||||
assert_eq!(
|
||||
&vals[start..],
|
||||
&buffer[..],
|
||||
"get_range (sub-range) mismatch in data set {name}"
|
||||
);
|
||||
}
|
||||
|
||||
if !vals.is_empty() {
|
||||
let test_rand_idx = rand::rng().random_range(0..=vals.len() - 1);
|
||||
let expected_positions: Vec<u32> = vals
|
||||
|
||||
@@ -19,6 +19,6 @@ time = { version = "0.3.47", features = ["serde-well-known"] }
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.17.0"
|
||||
binggan = "0.16.1"
|
||||
proptest = "1.0.0"
|
||||
rand = "0.9"
|
||||
|
||||
@@ -121,7 +121,7 @@ pub struct FileSlice {
|
||||
|
||||
impl fmt::Debug for FileSlice {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "FileSlice({:?}, {:?})", self.data, self.range)
|
||||
write!(f, "FileSlice({:?}, {:?})", &self.data, self.range)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -327,9 +327,7 @@ fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
|
||||
peek(alt((
|
||||
value(
|
||||
"",
|
||||
satisfy(|c: char| {
|
||||
c.is_whitespace() || (ESCAPE_IN_WORD.contains(&c) && c != '\\')
|
||||
}),
|
||||
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
|
||||
),
|
||||
eof,
|
||||
))),
|
||||
@@ -347,9 +345,7 @@ fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
|
||||
peek(alt((
|
||||
value(
|
||||
"",
|
||||
satisfy(|c: char| {
|
||||
c.is_whitespace() || (ESCAPE_IN_WORD.contains(&c) && c != '\\')
|
||||
}),
|
||||
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
|
||||
),
|
||||
eof,
|
||||
))), // we need to check this isn't a wildcard query
|
||||
@@ -711,7 +707,6 @@ fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
|
||||
peek(alt((
|
||||
value((), multispace1),
|
||||
value((), char(')')),
|
||||
value((), char('^')),
|
||||
value((), eof),
|
||||
))),
|
||||
),
|
||||
@@ -733,10 +728,9 @@ fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
peek(alt((
|
||||
value((), multispace1),
|
||||
value((), char(')')),
|
||||
value((), char('^')),
|
||||
value((), eof),
|
||||
))),
|
||||
"expected whitespace, closing parenthesis, boost, or end of input",
|
||||
"expected whitespace, closing parenthesis, or end of input",
|
||||
),
|
||||
)(inp)
|
||||
{
|
||||
@@ -779,10 +773,6 @@ fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
|
||||
value((), multispace1),
|
||||
value((), char(')')),
|
||||
value((), eof),
|
||||
value(
|
||||
(),
|
||||
satisfy(|c: char| ESCAPE_IN_WORD.contains(&c) && c != '\\'),
|
||||
),
|
||||
))),
|
||||
),
|
||||
|_| UserInputAst::from(UserInputLeaf::All),
|
||||
@@ -815,10 +805,6 @@ fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
|
||||
value((), multispace1),
|
||||
value((), char(')')),
|
||||
value((), eof),
|
||||
value(
|
||||
(),
|
||||
satisfy(|c: char| ESCAPE_IN_WORD.contains(&c) && c != '\\'),
|
||||
),
|
||||
))),
|
||||
),
|
||||
),
|
||||
@@ -1059,43 +1045,18 @@ fn operand_leaf(inp: &str) -> IResult<&str, (Option<BinaryOperand>, Option<Occur
|
||||
}
|
||||
|
||||
fn ast(inp: &str) -> IResult<&str, UserInputAst> {
|
||||
// Parse `occur_leaf` once, then conditionally extend into a boolean
|
||||
// expression. The previous implementation used `alt((boolean_expr,
|
||||
// single_leaf))` which, when the input was a single leaf with no
|
||||
// following operand, would parse `occur_leaf` once for `boolean_expr`,
|
||||
// fail at `multispace1`, backtrack, then re-parse `occur_leaf` for
|
||||
// `single_leaf`. With recursively-nested groups like `(+(+(+a)))`, that
|
||||
// doubling at every level produced O(2^n) parse time. Parsing once and
|
||||
// peeking ahead for the operand keeps it O(n).
|
||||
delimited(
|
||||
multispace0,
|
||||
|inp| {
|
||||
let (rest, first) = occur_leaf(inp)?;
|
||||
// Only fall back on `Err::Error` (recoverable), mirroring
|
||||
// `alt`'s behaviour. `Err::Failure` and `Err::Incomplete`
|
||||
// must propagate so cut points and streaming needs are not
|
||||
// accidentally swallowed if they are ever introduced in the
|
||||
// operand parsers.
|
||||
match preceded(multispace1, many1(operand_leaf))(rest) {
|
||||
Ok((rest, more)) => {
|
||||
let combined = aggregate_binary_expressions(first, more)
|
||||
.map_err(|_| nom::Err::Error(Error::new(inp, ErrorKind::MapRes)))?;
|
||||
Ok((rest, combined))
|
||||
}
|
||||
Err(nom::Err::Error(_)) => {
|
||||
let (occur, ast) = first;
|
||||
let single = if occur == Some(Occur::MustNot) {
|
||||
ast.unary(Occur::MustNot)
|
||||
} else {
|
||||
ast
|
||||
};
|
||||
Ok((rest, single))
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
},
|
||||
multispace0,
|
||||
)(inp)
|
||||
let boolean_expr = map_res(
|
||||
separated_pair(occur_leaf, multispace1, many1(operand_leaf)),
|
||||
|(left, right)| aggregate_binary_expressions(left, right),
|
||||
);
|
||||
let single_leaf = map(occur_leaf, |(occur, ast)| {
|
||||
if occur == Some(Occur::MustNot) {
|
||||
ast.unary(Occur::MustNot)
|
||||
} else {
|
||||
ast
|
||||
}
|
||||
});
|
||||
delimited(multispace0, alt((boolean_expr, single_leaf)), multispace0)(inp)
|
||||
}
|
||||
|
||||
fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
|
||||
@@ -1765,8 +1726,6 @@ mod test {
|
||||
test_parse_query_to_ast_helper("*", "*");
|
||||
test_parse_query_to_ast_helper("(*)", "*");
|
||||
test_parse_query_to_ast_helper("(* )", "*");
|
||||
// All query with boost
|
||||
test_parse_query_to_ast_helper("*^2", "(*)^2");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1829,7 +1788,6 @@ mod test {
|
||||
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
|
||||
test_parse_query_to_ast_helper("a:*b", "\"a\":*b");
|
||||
test_parse_query_to_ast_helper(r#"a:*def*"#, "\"a\":*def*");
|
||||
test_parse_query_to_ast_helper("a:*\\:foo", "\"a\":*:foo");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1894,8 +1852,6 @@ mod test {
|
||||
},
|
||||
_ => panic!("Expected a leaf"),
|
||||
}
|
||||
// Regex followed by `^boost`
|
||||
test_parse_query_to_ast_helper(r#"foo:/bar/^2"#, r#"("foo":/bar/)^2"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1935,23 +1891,4 @@ mod test {
|
||||
r#"(+"field":'happy tax payer' +"other_field":1)"#,
|
||||
);
|
||||
}
|
||||
|
||||
// Regression test for https://github.com/quickwit-oss/tantivy/issues/2498:
|
||||
// deeply nested parenthesized queries used to take O(2^n) time because the
|
||||
// top-level `ast()` parser tried `boolean_expr` first and re-parsed the
|
||||
// inner `occur_leaf` when it backtracked to `single_leaf`. Depth 60 would
|
||||
// take ~10^18 operations under the regression; with the fix it parses
|
||||
// instantly. We use `test_parse_query_to_ast_helper` so this test would
|
||||
// never finish if the regression returned.
|
||||
#[test]
|
||||
fn test_parse_deeply_nested_query() {
|
||||
let depth = 60;
|
||||
let leading: String = "(".repeat(depth);
|
||||
let trailing: String = ")".repeat(depth);
|
||||
let query = format!("{leading}title:test{trailing}");
|
||||
test_parse_query_to_ast_helper(&query, r#""title":test"#);
|
||||
|
||||
let query_with_plus = format!("+{leading}title:test{trailing}");
|
||||
test_parse_query_to_ast_helper(&query_with_plus, r#""title":test"#);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,18 +10,18 @@ use crate::aggregation::accessor_helpers::{
|
||||
};
|
||||
use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||
use crate::aggregation::bucket::{
|
||||
build_segment_filter_collector, build_segment_histogram_collector,
|
||||
build_segment_range_collector, CompositeAggReqData, CompositeAggregation,
|
||||
CompositeSourceAccessors, FilterAggReqData, HistogramAggReqData, HistogramBounds,
|
||||
IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData, TermMissingAgg, TermsAggReqData,
|
||||
TermsAggregation, TermsAggregationInternal,
|
||||
build_segment_filter_collector, build_segment_range_collector, CompositeAggReqData,
|
||||
CompositeAggregation, CompositeSourceAccessors, FilterAggReqData, HistogramAggReqData,
|
||||
HistogramBounds, IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData,
|
||||
SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
|
||||
TermsAggregationInternal,
|
||||
};
|
||||
use crate::aggregation::metric::{
|
||||
build_segment_stats_collector, AverageAggregation, CardinalityAggReqData,
|
||||
CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation, MaxAggregation,
|
||||
MetricAggReqData, MinAggregation, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
|
||||
SegmentPercentilesCollector, StatsAggregation, StatsType, SumAggregation, TermOrdSet,
|
||||
TopHitsAggReqData, TopHitsSegmentCollector, BITSET_MAX_TERM_ORD,
|
||||
SegmentPercentilesCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData,
|
||||
TopHitsSegmentCollector,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
GenericSegmentAggregationResultsCollector, SegmentAggregationCollector,
|
||||
@@ -41,7 +41,7 @@ pub struct AggregationsSegmentCtx {
|
||||
|
||||
impl AggregationsSegmentCtx {
|
||||
pub(crate) fn push_term_req_data(&mut self, data: TermsAggReqData) -> usize {
|
||||
self.per_request.term_req_data.push(data);
|
||||
self.per_request.term_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.term_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_cardinality_req_data(&mut self, data: CardinalityAggReqData) -> usize {
|
||||
@@ -61,25 +61,31 @@ impl AggregationsSegmentCtx {
|
||||
self.per_request.missing_term_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_histogram_req_data(&mut self, data: HistogramAggReqData) -> usize {
|
||||
self.per_request.histogram_req_data.push(data);
|
||||
self.per_request
|
||||
.histogram_req_data
|
||||
.push(Some(Box::new(data)));
|
||||
self.per_request.histogram_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_range_req_data(&mut self, data: RangeAggReqData) -> usize {
|
||||
self.per_request.range_req_data.push(data);
|
||||
self.per_request.range_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.range_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_filter_req_data(&mut self, data: FilterAggReqData) -> usize {
|
||||
self.per_request.filter_req_data.push(data);
|
||||
self.per_request.filter_req_data.push(Some(Box::new(data)));
|
||||
self.per_request.filter_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_composite_req_data(&mut self, data: CompositeAggReqData) -> usize {
|
||||
self.per_request.composite_req_data.push(data);
|
||||
self.per_request
|
||||
.composite_req_data
|
||||
.push(Some(Box::new(data)));
|
||||
self.per_request.composite_req_data.len() - 1
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_term_req_data(&self, idx: usize) -> &TermsAggReqData {
|
||||
&self.per_request.term_req_data[idx]
|
||||
self.per_request.term_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("term_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_cardinality_req_data(&self, idx: usize) -> &CardinalityAggReqData {
|
||||
@@ -97,6 +103,116 @@ impl AggregationsSegmentCtx {
|
||||
pub(crate) fn get_missing_term_req_data(&self, idx: usize) -> &MissingTermAggReqData {
|
||||
&self.per_request.missing_term_req_data[idx]
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_histogram_req_data(&self, idx: usize) -> &HistogramAggReqData {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_range_req_data(&self, idx: usize) -> &RangeAggReqData {
|
||||
self.per_request.range_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_composite_req_data(&self, idx: usize) -> &CompositeAggReqData {
|
||||
self.per_request.composite_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
// ---------- mutable getters ----------
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
|
||||
&mut self.per_request.stats_metric_req_data[idx]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_cardinality_req_data_mut(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
) -> &mut CardinalityAggReqData {
|
||||
&mut self.per_request.cardinality_req_data[idx]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.as_deref_mut()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
// ---------- take / put (terms, histogram, range) ----------
|
||||
|
||||
/// Move out the boxed Histogram request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> {
|
||||
self.per_request.histogram_req_data[idx]
|
||||
.take()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Histogram request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_histogram_req_data(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
value: Box<HistogramAggReqData>,
|
||||
) {
|
||||
debug_assert!(self.per_request.histogram_req_data[idx].is_none());
|
||||
self.per_request.histogram_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the boxed Range request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_range_req_data(&mut self, idx: usize) -> Box<RangeAggReqData> {
|
||||
self.per_request.range_req_data[idx]
|
||||
.take()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Range request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_range_req_data(&mut self, idx: usize, value: Box<RangeAggReqData>) {
|
||||
debug_assert!(self.per_request.range_req_data[idx].is_none());
|
||||
self.per_request.range_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the boxed Filter request at `idx`, leaving `None`.
|
||||
#[inline]
|
||||
pub(crate) fn take_filter_req_data(&mut self, idx: usize) -> Box<FilterAggReqData> {
|
||||
self.per_request.filter_req_data[idx]
|
||||
.take()
|
||||
.expect("filter_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Filter request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_filter_req_data(&mut self, idx: usize, value: Box<FilterAggReqData>) {
|
||||
debug_assert!(self.per_request.filter_req_data[idx].is_none());
|
||||
self.per_request.filter_req_data[idx] = Some(value);
|
||||
}
|
||||
|
||||
/// Move out the Composite request at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn take_composite_req_data(&mut self, idx: usize) -> Box<CompositeAggReqData> {
|
||||
self.per_request.composite_req_data[idx]
|
||||
.take()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
}
|
||||
|
||||
/// Put back a Composite request into an empty slot at `idx`.
|
||||
#[inline]
|
||||
pub(crate) fn put_back_composite_req_data(
|
||||
&mut self,
|
||||
idx: usize,
|
||||
value: Box<CompositeAggReqData>,
|
||||
) {
|
||||
debug_assert!(self.per_request.composite_req_data[idx].is_none());
|
||||
self.per_request.composite_req_data[idx] = Some(value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Each type of aggregation has its own request data struct. This struct holds
|
||||
@@ -107,14 +223,15 @@ impl AggregationsSegmentCtx {
|
||||
/// for a node with [AggKind::Terms]).
|
||||
#[derive(Default)]
|
||||
pub struct PerRequestAggSegCtx {
|
||||
// Box for cheap take/put - Only necessary for bucket aggs that have sub-aggregations
|
||||
/// TermsAggReqData contains the request data for a terms aggregation.
|
||||
pub term_req_data: Vec<TermsAggReqData>,
|
||||
pub term_req_data: Vec<Option<Box<TermsAggReqData>>>,
|
||||
/// HistogramAggReqData contains the request data for a histogram aggregation.
|
||||
pub histogram_req_data: Vec<HistogramAggReqData>,
|
||||
pub histogram_req_data: Vec<Option<Box<HistogramAggReqData>>>,
|
||||
/// RangeAggReqData contains the request data for a range aggregation.
|
||||
pub range_req_data: Vec<RangeAggReqData>,
|
||||
pub range_req_data: Vec<Option<Box<RangeAggReqData>>>,
|
||||
/// FilterAggReqData contains the request data for a filter aggregation.
|
||||
pub filter_req_data: Vec<FilterAggReqData>,
|
||||
pub filter_req_data: Vec<Option<Box<FilterAggReqData>>>,
|
||||
/// Shared by avg, min, max, sum, stats, extended_stats, count
|
||||
pub stats_metric_req_data: Vec<MetricAggReqData>,
|
||||
/// CardinalityAggReqData contains the request data for a cardinality aggregation.
|
||||
@@ -124,7 +241,7 @@ pub struct PerRequestAggSegCtx {
|
||||
/// MissingTermAggReqData contains the request data for a missing term aggregation.
|
||||
pub missing_term_req_data: Vec<MissingTermAggReqData>,
|
||||
/// CompositeAggReqData contains the request data for a composite aggregation.
|
||||
pub composite_req_data: Vec<CompositeAggReqData>,
|
||||
pub composite_req_data: Vec<Option<Box<CompositeAggReqData>>>,
|
||||
|
||||
/// Request tree used to build collectors.
|
||||
pub agg_tree: Vec<AggRefNode>,
|
||||
@@ -135,22 +252,22 @@ impl PerRequestAggSegCtx {
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
self.term_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.histogram_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.range_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.filter_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().unwrap().get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.stats_metric_req_data
|
||||
@@ -175,7 +292,7 @@ impl PerRequestAggSegCtx {
|
||||
+ self
|
||||
.composite_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.map(|b| b.as_ref().map(|d| d.get_memory_consumption()).unwrap_or(0))
|
||||
.sum::<usize>()
|
||||
+ self.agg_tree.len() * std::mem::size_of::<AggRefNode>()
|
||||
}
|
||||
@@ -184,16 +301,40 @@ impl PerRequestAggSegCtx {
|
||||
let idx = node.idx_in_req_data;
|
||||
let kind = node.kind;
|
||||
match kind {
|
||||
AggKind::Terms => self.term_req_data[idx].name.as_str(),
|
||||
AggKind::Terms => self.term_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("term_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Cardinality => &self.cardinality_req_data[idx].name,
|
||||
AggKind::StatsKind(_) => &self.stats_metric_req_data[idx].name,
|
||||
AggKind::TopHits => &self.top_hits_req_data[idx].name,
|
||||
AggKind::MissingTerm => &self.missing_term_req_data[idx].name,
|
||||
AggKind::Histogram => self.histogram_req_data[idx].name.as_str(),
|
||||
AggKind::DateHistogram => self.histogram_req_data[idx].name.as_str(),
|
||||
AggKind::Range => self.range_req_data[idx].name.as_str(),
|
||||
AggKind::Filter => self.filter_req_data[idx].name.as_str(),
|
||||
AggKind::Composite => self.composite_req_data[idx].name.as_str(),
|
||||
AggKind::Histogram => self.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::DateHistogram => self.histogram_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("histogram_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Range => self.range_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("range_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Filter => self.filter_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("filter_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
AggKind::Composite => self.composite_req_data[idx]
|
||||
.as_deref()
|
||||
.expect("composite_req_data slot is empty (taken)")
|
||||
.name
|
||||
.as_str(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,39 +412,13 @@ pub(crate) fn build_segment_agg_collector(
|
||||
Ok(Box::new(TermMissingAgg::new(req, node)?))
|
||||
}
|
||||
AggKind::Cardinality => {
|
||||
let req_data = req.get_cardinality_req_data(node.idx_in_req_data);
|
||||
// For str columns, choose the per-bucket entries representation
|
||||
// based on the segment's column.max_value():
|
||||
// * small (< BITSET_MAX_TERM_ORD): `BitSet`, pre-allocated, no promotion machinery.
|
||||
// * large: `TermOrdSet` (sparse FxHashSet that promotes to a paged bitset).
|
||||
// For non-str columns the `entries` field is unused (values go
|
||||
// straight into the HLL sketch); we still pick `TermOrdSet`
|
||||
// because its empty Sparse(FxHashSet) costs nothing.
|
||||
let is_str = req_data.column_type == ColumnType::Str;
|
||||
let max_term_ord_inclusive = if is_str {
|
||||
req_data.accessor.max_value()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let collector: Box<dyn SegmentAggregationCollector> =
|
||||
if is_str && max_term_ord_inclusive < BITSET_MAX_TERM_ORD {
|
||||
Box::new(SegmentCardinalityCollector::<BitSet>::from_req(
|
||||
req_data.column_type,
|
||||
node.idx_in_req_data,
|
||||
req_data.accessor.clone(),
|
||||
req_data.missing_value_for_accessor,
|
||||
max_term_ord_inclusive,
|
||||
))
|
||||
} else {
|
||||
Box::new(SegmentCardinalityCollector::<TermOrdSet>::from_req(
|
||||
req_data.column_type,
|
||||
node.idx_in_req_data,
|
||||
req_data.accessor.clone(),
|
||||
req_data.missing_value_for_accessor,
|
||||
max_term_ord_inclusive,
|
||||
))
|
||||
};
|
||||
Ok(collector)
|
||||
let req_data = &mut req.get_cardinality_req_data_mut(node.idx_in_req_data);
|
||||
Ok(Box::new(SegmentCardinalityCollector::from_req(
|
||||
req_data.column_type,
|
||||
node.idx_in_req_data,
|
||||
req_data.accessor.clone(),
|
||||
req_data.missing_value_for_accessor,
|
||||
)))
|
||||
}
|
||||
AggKind::StatsKind(stats_type) => {
|
||||
let req_data = &mut req.per_request.stats_metric_req_data[node.idx_in_req_data];
|
||||
@@ -318,7 +433,7 @@ pub(crate) fn build_segment_agg_collector(
|
||||
SegmentExtendedStatsCollector::from_req(req_data, sigma),
|
||||
)),
|
||||
StatsType::Percentiles => {
|
||||
let req_data = req.get_metric_req_data(node.idx_in_req_data);
|
||||
let req_data = req.get_metric_req_data_mut(node.idx_in_req_data);
|
||||
Ok(Box::new(
|
||||
SegmentPercentilesCollector::from_req_and_validate(
|
||||
req_data.field_type,
|
||||
@@ -338,8 +453,12 @@ pub(crate) fn build_segment_agg_collector(
|
||||
req_data.segment_ordinal,
|
||||
)))
|
||||
}
|
||||
AggKind::Histogram => build_segment_histogram_collector(req, node),
|
||||
AggKind::DateHistogram => build_segment_histogram_collector(req, node),
|
||||
AggKind::Histogram => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
|
||||
req, node,
|
||||
)?)),
|
||||
AggKind::DateHistogram => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
|
||||
req, node,
|
||||
)?)),
|
||||
AggKind::Range => Ok(build_segment_range_collector(req, node)?),
|
||||
AggKind::Filter => build_segment_filter_collector(req, node),
|
||||
AggKind::Composite => Ok(Box::new(
|
||||
@@ -654,18 +773,23 @@ fn build_nodes(
|
||||
let schema = reader.schema();
|
||||
let tokenizers = &data.context.tokenizers;
|
||||
let query = filter_req.parse_query(schema, tokenizers)?;
|
||||
let evaluator =
|
||||
std::rc::Rc::new(crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
||||
query,
|
||||
schema.clone(),
|
||||
reader,
|
||||
)?);
|
||||
let evaluator = crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
||||
query,
|
||||
schema.clone(),
|
||||
reader,
|
||||
)?;
|
||||
|
||||
// Pre-allocate buffer for batch filtering
|
||||
let max_doc = reader.max_doc();
|
||||
let buffer_capacity = crate::docset::COLLECT_BLOCK_BUFFER_LEN.min(max_doc as usize);
|
||||
let matching_docs_buffer = Vec::with_capacity(buffer_capacity);
|
||||
|
||||
let idx_in_req_data = data.push_filter_req_data(FilterAggReqData {
|
||||
name: agg_name.to_string(),
|
||||
req: filter_req.clone(),
|
||||
segment_reader: reader.clone(),
|
||||
evaluator,
|
||||
matching_docs_buffer,
|
||||
is_top_level,
|
||||
});
|
||||
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
||||
@@ -861,12 +985,8 @@ fn build_terms_or_cardinality_nodes(
|
||||
let str_col = str_dict_column
|
||||
.as_ref()
|
||||
.expect("str_dict_column must exist for string column");
|
||||
allowed_term_ids = build_allowed_term_ids_for_str(
|
||||
str_col,
|
||||
&req.include,
|
||||
&req.exclude,
|
||||
missing.is_some(),
|
||||
)?;
|
||||
allowed_term_ids =
|
||||
build_allowed_term_ids_for_str(str_col, &req.include, &req.exclude)?;
|
||||
};
|
||||
let idx_in_req_data = data.push_term_req_data(TermsAggReqData {
|
||||
accessor,
|
||||
@@ -882,20 +1002,10 @@ fn build_terms_or_cardinality_nodes(
|
||||
(idx_in_req_data, AggKind::Terms)
|
||||
}
|
||||
TermsOrCardinalityRequest::Cardinality(ref req) => {
|
||||
// `str_dict_column` is computed once per field; for JSON paths
|
||||
// with mixed types it's `Some` even on the numeric req_data.
|
||||
// Cardinality only consults it for the str column path, so
|
||||
// gate by column_type to avoid driving non-str collectors
|
||||
// through the coupon-cache path.
|
||||
let str_dict_column_for_req = if column_type == ColumnType::Str {
|
||||
str_dict_column.clone()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let idx_in_req_data = data.push_cardinality_req_data(CardinalityAggReqData {
|
||||
accessor,
|
||||
column_type,
|
||||
str_dict_column: str_dict_column_for_req,
|
||||
str_dict_column: str_dict_column.clone(),
|
||||
missing_value_for_accessor,
|
||||
name: agg_name.to_string(),
|
||||
req: req.clone(),
|
||||
@@ -915,21 +1025,16 @@ fn build_terms_or_cardinality_nodes(
|
||||
|
||||
/// Builds a single BitSet of allowed term ordinals for a string dictionary column according to
|
||||
/// include/exclude parameters.
|
||||
///
|
||||
/// When `reserve_missing_sentinel` is true, the bitset will have 1 additional slot for the missing
|
||||
/// term ordinal
|
||||
fn build_allowed_term_ids_for_str(
|
||||
str_col: &StrColumn,
|
||||
include: &Option<IncludeExcludeParam>,
|
||||
exclude: &Option<IncludeExcludeParam>,
|
||||
reserve_missing_sentinel: bool,
|
||||
) -> crate::Result<Option<BitSet>> {
|
||||
let mut allowed: Option<BitSet> = None;
|
||||
let missing_sentinel_adjustment = if reserve_missing_sentinel { 1 } else { 0 };
|
||||
let allowed_capacity = str_col.dictionary().num_terms() as u32 + missing_sentinel_adjustment;
|
||||
let num_terms = str_col.dictionary().num_terms() as u32;
|
||||
if let Some(include) = include {
|
||||
// add matches
|
||||
allowed = Some(BitSet::with_max_value(allowed_capacity));
|
||||
allowed = Some(BitSet::with_max_value(num_terms));
|
||||
let allowed = allowed.as_mut().unwrap();
|
||||
for_each_matching_term_ord(str_col, include, |ord| allowed.insert(ord))?;
|
||||
};
|
||||
@@ -937,7 +1042,7 @@ fn build_allowed_term_ids_for_str(
|
||||
if let Some(exclude) = exclude {
|
||||
if allowed.is_none() {
|
||||
// Start with all terms allowed
|
||||
allowed = Some(BitSet::with_max_value_and_full(allowed_capacity));
|
||||
allowed = Some(BitSet::with_max_value_and_full(num_terms));
|
||||
}
|
||||
let allowed = allowed.as_mut().unwrap();
|
||||
for_each_matching_term_ord(str_col, exclude, |ord| allowed.remove(ord))?;
|
||||
|
||||
@@ -115,71 +115,6 @@ pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
|
||||
fast_field_names
|
||||
}
|
||||
|
||||
/// Validates that all fields referenced in the aggregation request exist in the schema
|
||||
/// and are configured as fast fields.
|
||||
///
|
||||
/// This is a convenience function for upfront validation before executing aggregations.
|
||||
/// Returns an error if any field doesn't exist or is not a fast field.
|
||||
///
|
||||
/// Validation is intentionally opt-in rather than baked into aggregation execution: the
|
||||
/// default lenient behavior (returning empty results for missing fields) supports
|
||||
/// schema evolution and federated queries where the same request runs against segments
|
||||
/// or indices with different schemas.
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// use tantivy::aggregation::agg_req::{Aggregations, validate_aggregation_fields_exist};
|
||||
/// use tantivy::schema::{Schema, FAST};
|
||||
/// use tantivy::Index;
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// // Create a simple index
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// schema_builder.add_f64_field("price", FAST);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// // Parse aggregation request
|
||||
/// let agg_req: Aggregations = serde_json::from_str(r#"{
|
||||
/// "avg_price": { "avg": { "field": "price" } }
|
||||
/// }"#)?;
|
||||
///
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// // Validate fields before executing
|
||||
/// for segment_reader in searcher.segment_readers() {
|
||||
/// validate_aggregation_fields_exist(&agg_req, segment_reader)?;
|
||||
/// }
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn validate_aggregation_fields_exist(
|
||||
aggs: &Aggregations,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<()> {
|
||||
let field_names = get_fast_field_names(aggs);
|
||||
let schema = reader.schema();
|
||||
|
||||
for field_name in field_names {
|
||||
// Check if the field is either directly in the schema or could be part of a json field
|
||||
// present in the schema, and verify it's a fast field.
|
||||
if let Some((field, _path)) = schema.find_field(&field_name) {
|
||||
let field_type = schema.get_field_entry(field).field_type();
|
||||
if !field_type.is_fast() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field '{}' is not a fast field. Aggregations require fast fields.",
|
||||
field_name
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(crate::TantivyError::FieldNotFound(field_name));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// All aggregation types.
|
||||
pub enum AggregationVariants {
|
||||
@@ -299,12 +234,6 @@ impl AggregationVariants {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_sum(&self) -> Option<&SumAggregation> {
|
||||
match &self {
|
||||
AggregationVariants::Sum(sum) => Some(sum),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1436,46 +1436,3 @@ fn test_aggregation_on_json_object_mixed_numerical_segments() {
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_field_validation_helper() {
|
||||
// Test the standalone validation helper function for field validation
|
||||
let index = get_test_index_2_segments(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
// Test with invalid field
|
||||
let agg_req: Aggregations = serde_json::from_str(
|
||||
r#"{
|
||||
"avg_test": {
|
||||
"avg": { "field": "nonexistent_field" }
|
||||
}
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let result =
|
||||
crate::aggregation::agg_req::validate_aggregation_fields_exist(&agg_req, segment_reader);
|
||||
assert!(result.is_err());
|
||||
match result {
|
||||
Err(crate::TantivyError::FieldNotFound(field_name)) => {
|
||||
assert_eq!(field_name, "nonexistent_field");
|
||||
}
|
||||
_ => panic!("Expected FieldNotFound error, got: {:?}", result),
|
||||
}
|
||||
|
||||
// Test with valid field
|
||||
let agg_req: Aggregations = serde_json::from_str(
|
||||
r#"{
|
||||
"avg_test": {
|
||||
"avg": { "field": "score" }
|
||||
}
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let result =
|
||||
crate::aggregation::agg_req::validate_aggregation_fields_exist(&agg_req, segment_reader);
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ use crate::{SegmentReader, TantivyError};
|
||||
|
||||
/// Contains all information required by the SegmentCompositeCollector to perform the
|
||||
/// composite aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeAggReqData {
|
||||
/// The name of the aggregation.
|
||||
pub name: String,
|
||||
@@ -35,7 +34,6 @@ impl CompositeAggReqData {
|
||||
}
|
||||
|
||||
/// Accessors for a single column in a composite source.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeAccessor {
|
||||
/// The fast field column
|
||||
pub column: Column<u64>,
|
||||
@@ -50,7 +48,6 @@ pub struct CompositeAccessor {
|
||||
}
|
||||
|
||||
/// Accessors to all the columns that belong to the field of a composite source.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompositeSourceAccessors {
|
||||
/// The accessors for this source
|
||||
pub accessors: Vec<CompositeAccessor>,
|
||||
@@ -361,7 +358,7 @@ impl PrecomputedDateInterval {
|
||||
///
|
||||
/// Some column types (term, IP) might not have an exact representation of the
|
||||
/// specified after key
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug)]
|
||||
pub enum PrecomputedAfterKey {
|
||||
/// The after key could be exactly represented in the column space.
|
||||
Exact(u64),
|
||||
|
||||
@@ -118,7 +118,7 @@ impl InternalValueRepr {
|
||||
pub struct SegmentCompositeCollector {
|
||||
/// One DynArrayHeapMap per parent bucket.
|
||||
parent_buckets: Vec<DynArrayHeapMap<InternalValueRepr, CompositeBucketCollector>>,
|
||||
req_data: CompositeAggReqData,
|
||||
accessor_idx: usize,
|
||||
sub_agg: Option<BufferedSubAggs<HighCardSubAggBuffer>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Number of sources, needed when creating new DynArrayHeapMaps.
|
||||
@@ -132,7 +132,10 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data
|
||||
.get_composite_req_data(self.accessor_idx)
|
||||
.name
|
||||
.clone();
|
||||
|
||||
let buckets = self.add_intermediate_bucket_result(agg_data, parent_bucket_id)?;
|
||||
results.push(
|
||||
@@ -149,12 +152,13 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let mem_pre = self.get_memory_consumption();
|
||||
let composite_agg_data = agg_data.take_composite_req_data(self.accessor_idx);
|
||||
|
||||
for doc in docs {
|
||||
let mut visitor = CompositeKeyVisitor {
|
||||
doc_id: *doc,
|
||||
composite_agg_data: &self.req_data,
|
||||
composite_agg_data: &composite_agg_data,
|
||||
buckets: &mut self.parent_buckets[parent_bucket_id as usize],
|
||||
sub_agg: &mut self.sub_agg,
|
||||
bucket_id_provider: &mut self.bucket_id_provider,
|
||||
@@ -162,12 +166,13 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
};
|
||||
visitor.visit(0, true)?;
|
||||
}
|
||||
agg_data.put_back_composite_req_data(self.accessor_idx, composite_agg_data);
|
||||
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
|
||||
let mem_delta = self.get_memory_consumption(parent_bucket_id) - mem_pre;
|
||||
let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
agg_data.context.limits.add_memory_consumed(mem_delta)?;
|
||||
}
|
||||
@@ -194,35 +199,21 @@ impl SegmentAggregationCollector for SegmentCompositeCollector {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
// Composite is a multi-bucket agg with no single value to extract.
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCompositeCollector {
|
||||
fn get_memory_consumption(&self, parent_bucket_id: BucketId) -> u64 {
|
||||
self.parent_buckets[parent_bucket_id as usize].memory_consumption()
|
||||
fn get_memory_consumption(&self) -> u64 {
|
||||
self.parent_buckets
|
||||
.iter()
|
||||
.map(|m| m.memory_consumption())
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Self> {
|
||||
let composite_req_data =
|
||||
req_data.per_request.composite_req_data[node.idx_in_req_data].clone();
|
||||
validate_req(&composite_req_data)?;
|
||||
req_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(composite_req_data.get_memory_consumption() as u64)?;
|
||||
validate_req(req_data, node.idx_in_req_data)?;
|
||||
|
||||
let has_sub_aggregations = !node.children.is_empty();
|
||||
let sub_agg = if has_sub_aggregations {
|
||||
@@ -232,11 +223,12 @@ impl SegmentCompositeCollector {
|
||||
None
|
||||
};
|
||||
|
||||
let composite_req_data = req_data.get_composite_req_data(node.idx_in_req_data);
|
||||
let num_sources = composite_req_data.req.sources.len();
|
||||
|
||||
Ok(SegmentCompositeCollector {
|
||||
parent_buckets: vec![DynArrayHeapMap::try_new(num_sources)?],
|
||||
req_data: composite_req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
sub_agg,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
num_sources,
|
||||
@@ -258,7 +250,7 @@ impl SegmentCompositeCollector {
|
||||
let mut dict: FxHashMap<Vec<CompositeIntermediateKey>, IntermediateCompositeBucketEntry> =
|
||||
Default::default();
|
||||
dict.reserve(heap_map.size());
|
||||
let composite_data = &self.req_data;
|
||||
let composite_data = agg_data.get_composite_req_data(self.accessor_idx);
|
||||
for (key_internal_repr, agg) in heap_map.into_iter() {
|
||||
let key = resolve_key(&key_internal_repr, composite_data)?;
|
||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||
@@ -298,7 +290,8 @@ impl SegmentCompositeCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_req(composite_data: &CompositeAggReqData) -> crate::Result<()> {
|
||||
fn validate_req(req_data: &mut AggregationsSegmentCtx, accessor_idx: usize) -> crate::Result<()> {
|
||||
let composite_data = req_data.get_composite_req_data(accessor_idx);
|
||||
let req = &composite_data.req;
|
||||
if req.sources.is_empty() {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
|
||||
@@ -559,30 +559,34 @@ mod tests {
|
||||
page_size,
|
||||
agg_req,
|
||||
);
|
||||
assert!(
|
||||
res["my_composite"].get("after_key").is_some(),
|
||||
"expected after_key on every non-empty page"
|
||||
);
|
||||
after_key = Some(res["my_composite"]["after_key"].clone());
|
||||
}
|
||||
// Using the after_key from the last page must yield an empty page.
|
||||
let agg_req_json = json!({
|
||||
"my_composite": {
|
||||
"composite": {
|
||||
"sources": composite_agg_sources,
|
||||
"size": page_size,
|
||||
"after": after_key,
|
||||
}
|
||||
if page_idx + 1 < page_count {
|
||||
assert!(
|
||||
res["my_composite"].get("after_key").is_some(),
|
||||
"expected after_key on all but last page"
|
||||
);
|
||||
after_key = Some(res["my_composite"]["after_key"].clone());
|
||||
} else if res["my_composite"].get("after_key").is_some() {
|
||||
// currently we sometime have an after_key on the last page,
|
||||
// check that the next "page" is empty
|
||||
let agg_req_json = json!({
|
||||
"my_composite": {
|
||||
"composite": {
|
||||
"sources": composite_agg_sources,
|
||||
"size": page_size,
|
||||
"after": res["my_composite"]["after_key"].clone(),
|
||||
}
|
||||
}
|
||||
});
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req_json).unwrap();
|
||||
let res = exec_request(agg_req.clone(), index).unwrap();
|
||||
assert_eq!(
|
||||
res["my_composite"]["buckets"],
|
||||
json!([]),
|
||||
"expected no buckets when using after_key from last page, query: {:?}",
|
||||
agg_req
|
||||
);
|
||||
}
|
||||
});
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req_json).unwrap();
|
||||
let res = exec_request(agg_req.clone(), index).unwrap();
|
||||
assert_eq!(
|
||||
res["my_composite"]["buckets"],
|
||||
json!([]),
|
||||
"expected no buckets when using after_key from last page, query: {:?}",
|
||||
agg_req
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -707,28 +711,8 @@ mod tests {
|
||||
{"key": {"myterm": "terme"}, "doc_count": 1}
|
||||
])
|
||||
);
|
||||
|
||||
// paginating past last page should be empty
|
||||
let agg_req_json = json!({
|
||||
"my_composite": {
|
||||
"composite": {
|
||||
"sources": [
|
||||
{"myterm": {"terms": {"field": "string_id"}}}
|
||||
],
|
||||
"size": 3,
|
||||
"after": &res["my_composite"]["after_key"]
|
||||
}
|
||||
}
|
||||
});
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req_json).unwrap();
|
||||
let res = exec_request(agg_req.clone(), &index).unwrap();
|
||||
assert!(res["my_composite"].get("after_key").is_none());
|
||||
assert_eq!(
|
||||
res["my_composite"]["buckets"],
|
||||
json!([]),
|
||||
"expected no buckets when using after_key from last page, query: {:?}",
|
||||
agg_req
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -836,10 +820,7 @@ mod tests {
|
||||
{"key": {"myterm": "apple"}, "doc_count": 1}
|
||||
])
|
||||
);
|
||||
assert_eq!(
|
||||
res["fruity_aggreg"]["after_key"],
|
||||
json!({"myterm": "str:apple"})
|
||||
);
|
||||
assert!(res["fruity_aggreg"].get("after_key").is_none());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1811,14 +1792,7 @@ mod tests {
|
||||
{"key": {"month": ms_timestamp_from_iso_str("2021-02-01T00:00:00Z"), "category": "books"}, "doc_count": 1},
|
||||
]),
|
||||
);
|
||||
let feb_2021_ns = ms_timestamp_from_iso_str("2021-02-01T00:00:00Z") * 1_000_000;
|
||||
assert_eq!(
|
||||
res["my_composite"]["after_key"],
|
||||
json!({
|
||||
"month": format!("dt:{}", feb_2021_ns),
|
||||
"category": "str:books"
|
||||
})
|
||||
);
|
||||
assert!(res["my_composite"].get("after_key").is_none());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::fmt::Debug;
|
||||
use std::rc::Rc;
|
||||
|
||||
use common::BitSet;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
@@ -397,7 +396,6 @@ impl PartialEq for FilterAggregation {
|
||||
|
||||
/// Request data for filter aggregation
|
||||
/// This struct holds the per-segment data needed to execute a filter aggregation
|
||||
#[derive(Clone)]
|
||||
pub struct FilterAggReqData {
|
||||
/// The name of the filter aggregation
|
||||
pub name: String,
|
||||
@@ -405,20 +403,22 @@ pub struct FilterAggReqData {
|
||||
pub req: FilterAggregation,
|
||||
/// The segment reader
|
||||
pub segment_reader: SegmentReader,
|
||||
/// Document evaluator for the filter query (precomputed BitSet).
|
||||
/// Wrapped in `Rc` so cloning the request data does not duplicate the (potentially large)
|
||||
/// underlying BitSet.
|
||||
pub evaluator: Rc<DocumentQueryEvaluator>,
|
||||
/// Document evaluator for the filter query (precomputed BitSet)
|
||||
/// This is built once when the request data is created
|
||||
pub evaluator: DocumentQueryEvaluator,
|
||||
/// Reusable buffer for matching documents to minimize allocations during collection
|
||||
pub matching_docs_buffer: Vec<DocId>,
|
||||
/// True if this filter aggregation is at the top level of the aggregation tree (not nested).
|
||||
pub is_top_level: bool,
|
||||
}
|
||||
|
||||
impl FilterAggReqData {
|
||||
pub(crate) fn get_memory_consumption(&self) -> usize {
|
||||
// Estimate: name + segment reader reference + bitset
|
||||
// Estimate: name + segment reader reference + bitset + buffer capacity
|
||||
self.name.len()
|
||||
+ std::mem::size_of::<SegmentReader>()
|
||||
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
|
||||
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
|
||||
+ std::mem::size_of::<bool>()
|
||||
}
|
||||
}
|
||||
@@ -509,10 +509,8 @@ pub struct SegmentFilterCollector<B: SubAggBuffer> {
|
||||
/// Sub-aggregation collectors
|
||||
sub_aggregations: Option<BufferedSubAggs<B>>,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Per-segment filter request data, owned by this collector.
|
||||
req_data: FilterAggReqData,
|
||||
/// Reusable buffer for matching documents to minimize allocations during collection.
|
||||
matching_docs_buffer: Vec<DocId>,
|
||||
/// Accessor index for this filter aggregation (to access FilterAggReqData)
|
||||
accessor_idx: usize,
|
||||
}
|
||||
|
||||
impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
@@ -520,7 +518,6 @@ impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
req_data: FilterAggReqData,
|
||||
) -> crate::Result<Self> {
|
||||
// Build sub-aggregation collectors if any
|
||||
let sub_agg_collector = if !node.children.is_empty() {
|
||||
@@ -530,15 +527,11 @@ impl<B: SubAggBuffer> SegmentFilterCollector<B> {
|
||||
};
|
||||
let sub_agg_collector = sub_agg_collector.map(BufferedSubAggs::new);
|
||||
|
||||
let max_doc = req_data.segment_reader.max_doc();
|
||||
let buffer_capacity = crate::docset::COLLECT_BLOCK_BUFFER_LEN.min(max_doc as usize);
|
||||
|
||||
Ok(SegmentFilterCollector {
|
||||
parent_buckets: Vec::new(),
|
||||
sub_aggregations: sub_agg_collector,
|
||||
req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
matching_docs_buffer: Vec::with_capacity(buffer_capacity),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -547,23 +540,18 @@ pub(crate) fn build_segment_filter_collector(
|
||||
req: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
let req_data = req.per_request.filter_req_data[node.idx_in_req_data].clone();
|
||||
req.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let is_top_level = req_data.is_top_level;
|
||||
let is_top_level = req.per_request.filter_req_data[node.idx_in_req_data]
|
||||
.as_ref()
|
||||
.expect("filter_req_data slot is empty")
|
||||
.is_top_level;
|
||||
|
||||
if is_top_level {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<LowCardSubAggBuffer>::from_req_and_validate(
|
||||
req, node, req_data,
|
||||
)?,
|
||||
SegmentFilterCollector::<LowCardSubAggBuffer>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
} else {
|
||||
Ok(Box::new(
|
||||
SegmentFilterCollector::<HighCardSubAggBuffer>::from_req_and_validate(
|
||||
req, node, req_data,
|
||||
)?,
|
||||
SegmentFilterCollector::<HighCardSubAggBuffer>::from_req_and_validate(req, node)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -573,7 +561,7 @@ impl<B: SubAggBuffer> Debug for SegmentFilterCollector<B> {
|
||||
f.debug_struct("SegmentFilterCollector")
|
||||
.field("buckets", &self.parent_buckets)
|
||||
.field("has_sub_aggs", &self.sub_aggregations.is_some())
|
||||
.field("name", &self.req_data.name)
|
||||
.field("accessor_idx", &self.accessor_idx)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -610,7 +598,11 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B>
|
||||
};
|
||||
|
||||
// Get the name of this filter aggregation
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data.per_request.filter_req_data[self.accessor_idx]
|
||||
.as_ref()
|
||||
.expect("filter_req_data slot is empty")
|
||||
.name
|
||||
.clone();
|
||||
|
||||
results.push(
|
||||
name,
|
||||
@@ -631,24 +623,27 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B>
|
||||
}
|
||||
|
||||
let mut bucket = self.parent_buckets[parent_bucket_id as usize];
|
||||
// Take the request data to avoid borrow checker issues with sub-aggregations
|
||||
let mut req = agg_data.take_filter_req_data(self.accessor_idx);
|
||||
|
||||
// Use batch filtering with O(1) BitSet lookups
|
||||
self.matching_docs_buffer.clear();
|
||||
self.req_data
|
||||
.evaluator
|
||||
.filter_batch(docs, &mut self.matching_docs_buffer);
|
||||
req.matching_docs_buffer.clear();
|
||||
req.evaluator
|
||||
.filter_batch(docs, &mut req.matching_docs_buffer);
|
||||
|
||||
bucket.doc_count += self.matching_docs_buffer.len() as u64;
|
||||
bucket.doc_count += req.matching_docs_buffer.len() as u64;
|
||||
|
||||
// Batch process sub-aggregations if we have matches
|
||||
if !self.matching_docs_buffer.is_empty() {
|
||||
if !req.matching_docs_buffer.is_empty() {
|
||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||
for &doc_id in &self.matching_docs_buffer {
|
||||
for &doc_id in &req.matching_docs_buffer {
|
||||
sub_aggs.push(bucket.bucket_id, doc_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Put the request data back
|
||||
agg_data.put_back_filter_req_data(self.accessor_idx, req);
|
||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||
sub_aggs.check_flush_local(agg_data)?;
|
||||
}
|
||||
@@ -679,17 +674,6 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentFilterCollector<B>
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
// TODO: forward into the inner `sub_agg` for nested order paths (`filter.metric`).
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Intermediate result for filter aggregation
|
||||
|
||||
@@ -21,7 +21,6 @@ use crate::TantivyError;
|
||||
|
||||
/// Contains all information required by the SegmentHistogramCollector to perform the
|
||||
/// histogram or date_histogram aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HistogramAggReqData {
|
||||
/// The column accessor to access the fast field values.
|
||||
pub accessor: Column<u64>,
|
||||
@@ -244,52 +243,19 @@ impl Display for HistogramBounds {
|
||||
}
|
||||
|
||||
impl HistogramBounds {
|
||||
pub(crate) fn contains(&self, val: f64) -> bool {
|
||||
fn contains(&self, val: f64) -> bool {
|
||||
val >= self.min && val <= self.max
|
||||
}
|
||||
}
|
||||
|
||||
/// The per-bucket identifier stored in a [`SegmentHistogramBucketEntry`].
|
||||
///
|
||||
/// It is [`BucketId`] when the histogram has sub aggregations (which key their state by it), and
|
||||
/// the zero-sized `()` when it does not. Without sub aggregations the id is never read, so storing
|
||||
/// `()` drops 8 bytes per bucket (24 -> 16) and turns id assignment into a no-op.
|
||||
pub trait BucketIdSlot: Copy + Default + std::fmt::Debug + PartialEq {
|
||||
/// Assigns the next id from the provider, called once when a bucket is first filled.
|
||||
fn assign(provider: &mut BucketIdProvider) -> Self;
|
||||
/// Resolves to the `BucketId` for sub-aggregation bookkeeping.
|
||||
///
|
||||
/// Only ever called for the [`BucketId`] slot: the `()` slot is used exactly when there are no
|
||||
/// sub aggregations, so every call site is guarded by `sub_agg.is_some()` and is dead for `()`.
|
||||
fn to_bucket_id(self) -> BucketId;
|
||||
}
|
||||
impl BucketIdSlot for BucketId {
|
||||
#[inline(always)]
|
||||
fn assign(provider: &mut BucketIdProvider) -> Self {
|
||||
provider.next_bucket_id()
|
||||
}
|
||||
#[inline(always)]
|
||||
fn to_bucket_id(self) -> BucketId {
|
||||
self
|
||||
}
|
||||
}
|
||||
impl BucketIdSlot for () {
|
||||
#[inline(always)]
|
||||
fn assign(_provider: &mut BucketIdProvider) -> Self {}
|
||||
#[inline(always)]
|
||||
fn to_bucket_id(self) -> BucketId {
|
||||
unreachable!("bucket ids are only resolved when sub aggregations are present")
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentHistogramBucketEntry<B> {
|
||||
pub(crate) struct SegmentHistogramBucketEntry {
|
||||
pub key: f64,
|
||||
pub doc_count: u64,
|
||||
pub bucket_id: B,
|
||||
pub bucket_id: BucketId,
|
||||
}
|
||||
|
||||
impl<B: BucketIdSlot> SegmentHistogramBucketEntry<B> {
|
||||
impl SegmentHistogramBucketEntry {
|
||||
pub(crate) fn into_intermediate_bucket_entry(
|
||||
self,
|
||||
sub_aggregation: &mut Option<HighCardBufferedSubAggs>,
|
||||
@@ -302,7 +268,7 @@ impl<B: BucketIdSlot> SegmentHistogramBucketEntry<B> {
|
||||
.add_intermediate_aggregation_result(
|
||||
agg_data,
|
||||
&mut sub_aggregation_res,
|
||||
self.bucket_id.to_bucket_id(),
|
||||
self.bucket_id,
|
||||
)?;
|
||||
}
|
||||
Ok(IntermediateHistogramBucketEntry {
|
||||
@@ -313,147 +279,34 @@ impl<B: BucketIdSlot> SegmentHistogramBucketEntry<B> {
|
||||
}
|
||||
}
|
||||
|
||||
/// The contiguous bucket range a histogram can span, derived from the column min/max (clamped to
|
||||
/// the histogram bounds). Buckets in `[base_pos, base_pos + len)` can be stored in a flat `Vec`
|
||||
/// indexed by `bucket_pos - base_pos`, avoiding the hash map on the hot path.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub(crate) struct DenseRange {
|
||||
/// `bucket_pos` mapped to index 0 of the dense `Vec`.
|
||||
pub(crate) base_pos: i64,
|
||||
/// Number of bucket positions in the range.
|
||||
pub(crate) len: usize,
|
||||
}
|
||||
|
||||
/// Storage for the histogram buckets of a single parent bucket.
|
||||
///
|
||||
/// Starts out sparse (a hash map keyed by `bucket_pos`). Once enough distinct buckets have been
|
||||
/// filled that we are clearly going to cover most of the column's theoretical range, it switches
|
||||
/// to a dense `Vec` indexed by `bucket_pos - base_pos`, which removes hashing from the hot loop.
|
||||
#[derive(Clone, Debug)]
|
||||
enum HistogramBuckets<B> {
|
||||
Sparse(FxHashMap<i64, SegmentHistogramBucketEntry<B>>),
|
||||
Dense {
|
||||
base_pos: i64,
|
||||
/// One slot per bucket position; a slot with `doc_count == 0` has not been hit yet.
|
||||
buckets: Vec<SegmentHistogramBucketEntry<B>>,
|
||||
},
|
||||
}
|
||||
impl<B> Default for HistogramBuckets<B> {
|
||||
fn default() -> Self {
|
||||
HistogramBuckets::Sparse(FxHashMap::default())
|
||||
}
|
||||
}
|
||||
impl<B: BucketIdSlot> HistogramBuckets<B> {
|
||||
fn memory_consumption(&self) -> u64 {
|
||||
let num_slots = match self {
|
||||
HistogramBuckets::Sparse(map) => map.capacity(),
|
||||
HistogramBuckets::Dense { buckets, .. } => buckets.capacity(),
|
||||
};
|
||||
num_slots as u64 * std::mem::size_of::<SegmentHistogramBucketEntry<B>>() as u64
|
||||
}
|
||||
|
||||
/// Switches from sparse to dense storage once the dense `Vec` would use no more memory than the
|
||||
/// hash map does now, so the switch never increases memory. Called at block boundaries.
|
||||
///
|
||||
/// The `Vec` holds one `Entry` per bucket position in the range. The map additionally stores
|
||||
/// the key and a control byte per slot, at a load factor of 7/16..7/8, so for a dense histogram
|
||||
/// its footprint grows past the `Vec` well before full coverage. And since the `Vec` never
|
||||
/// grows afterwards while the map would keep growing, dense only gets relatively cheaper — so
|
||||
/// no upper bound on the range is needed: a large but sparse range simply never crosses over.
|
||||
#[inline]
|
||||
fn maybe_densify(&mut self, dense_range: Option<DenseRange>) {
|
||||
let Some(range) = dense_range else { return };
|
||||
let HistogramBuckets::Sparse(map) = self else {
|
||||
return;
|
||||
};
|
||||
let dense_bytes = range
|
||||
.len
|
||||
.saturating_mul(std::mem::size_of::<SegmentHistogramBucketEntry<B>>());
|
||||
let sparse_bytes = map
|
||||
.capacity()
|
||||
.saturating_mul(std::mem::size_of::<(i64, SegmentHistogramBucketEntry<B>)>() + 1);
|
||||
if dense_bytes > sparse_bytes {
|
||||
return;
|
||||
}
|
||||
let map = std::mem::take(map);
|
||||
let mut buckets = vec![SegmentHistogramBucketEntry::<B>::default(); range.len];
|
||||
for (bucket_pos, entry) in map {
|
||||
buckets[(bucket_pos - range.base_pos) as usize] = entry;
|
||||
}
|
||||
*self = HistogramBuckets::Dense {
|
||||
base_pos: range.base_pos,
|
||||
buckets,
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns the bucket entry for `bucket_pos`, setting its key (and `bucket_id`, when `B` is
|
||||
/// [`BucketId`]) on first use.
|
||||
///
|
||||
/// For the dense variant `bucket_pos` is guaranteed to be inside the range, since it is
|
||||
/// derived from the column min/max that bounds every value (see [`compute_dense_range`]).
|
||||
#[inline]
|
||||
fn get_or_create(
|
||||
&mut self,
|
||||
bucket_pos: i64,
|
||||
bucket_id_provider: &mut BucketIdProvider,
|
||||
key_from_pos: impl FnOnce(i64) -> f64,
|
||||
) -> &mut SegmentHistogramBucketEntry<B> {
|
||||
match self {
|
||||
HistogramBuckets::Sparse(map) => {
|
||||
map.entry(bucket_pos)
|
||||
.or_insert_with(|| SegmentHistogramBucketEntry {
|
||||
key: key_from_pos(bucket_pos),
|
||||
doc_count: 0,
|
||||
bucket_id: B::assign(bucket_id_provider),
|
||||
})
|
||||
}
|
||||
HistogramBuckets::Dense { base_pos, buckets } => {
|
||||
let idx = (bucket_pos - *base_pos) as usize;
|
||||
debug_assert!(idx < buckets.len(), "bucket_pos outside the dense range");
|
||||
let entry = &mut buckets[idx];
|
||||
if entry.doc_count == 0 {
|
||||
entry.key = key_from_pos(bucket_pos);
|
||||
entry.bucket_id = B::assign(bucket_id_provider);
|
||||
}
|
||||
entry
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Consumes the storage, yielding all non-empty bucket entries.
|
||||
fn into_filled_entries(self) -> Vec<SegmentHistogramBucketEntry<B>> {
|
||||
match self {
|
||||
HistogramBuckets::Sparse(map) => map.into_values().collect(),
|
||||
HistogramBuckets::Dense { buckets, .. } => {
|
||||
buckets.into_iter().filter(|b| b.doc_count > 0).collect()
|
||||
}
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct HistogramBuckets {
|
||||
pub buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
|
||||
}
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
#[derive(Debug)]
|
||||
pub struct SegmentHistogramCollector<B> {
|
||||
pub struct SegmentHistogramCollector {
|
||||
/// The buckets containing the aggregation data.
|
||||
/// One Histogram bucket per parent bucket id.
|
||||
parent_buckets: Vec<HistogramBuckets<B>>,
|
||||
parent_buckets: Vec<HistogramBuckets>,
|
||||
sub_agg: Option<HighCardBufferedSubAggs>,
|
||||
req_data: HistogramAggReqData,
|
||||
accessor_idx: usize,
|
||||
bucket_id_provider: BucketIdProvider,
|
||||
/// Theoretical bucket range derived from the column min/max, if dense `Vec` storage is
|
||||
/// viable. `None` keeps every parent bucket in the sparse hash map.
|
||||
dense_range: Option<DenseRange>,
|
||||
}
|
||||
|
||||
impl<B: BucketIdSlot> SegmentAggregationCollector for SegmentHistogramCollector<B> {
|
||||
impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
let name = self.req_data.name.clone();
|
||||
let name = agg_data
|
||||
.get_histogram_req_data(self.accessor_idx)
|
||||
.name
|
||||
.clone();
|
||||
// TODO: avoid prepare_max_bucket here and handle empty buckets.
|
||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
||||
let histogram = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
||||
@@ -470,13 +323,10 @@ impl<B: BucketIdSlot> SegmentAggregationCollector for SegmentHistogramCollector<
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
let dense_range = self.dense_range;
|
||||
let store = &mut self.parent_buckets[parent_bucket_id as usize];
|
||||
// Upgrade to dense storage before processing the block if the buckets are dense enough.
|
||||
store.maybe_densify(dense_range);
|
||||
let req = agg_data.take_histogram_req_data(self.accessor_idx);
|
||||
let mem_pre = self.get_memory_consumption();
|
||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize].buckets;
|
||||
|
||||
let req = &self.req_data;
|
||||
let bounds = req.bounds;
|
||||
let interval = req.req.interval;
|
||||
let offset = req.offset;
|
||||
@@ -485,43 +335,35 @@ impl<B: BucketIdSlot> SegmentAggregationCollector for SegmentHistogramCollector<
|
||||
agg_data
|
||||
.column_block_accessor
|
||||
.fetch_block(docs, &req.accessor);
|
||||
// special path for nested buckets
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
for (doc, val) in agg_data
|
||||
.column_block_accessor
|
||||
.iter_docid_vals(docs, &req.accessor)
|
||||
{
|
||||
let val = f64_from_fastfield_u64(val, req.field_type);
|
||||
if bounds.contains(val) {
|
||||
let bucket = store.get_or_create(
|
||||
get_bucket_pos(val),
|
||||
&mut self.bucket_id_provider,
|
||||
|pos| get_bucket_key_from_pos(pos as f64, interval, offset),
|
||||
);
|
||||
bucket.doc_count += 1;
|
||||
sub_agg.push(bucket.bucket_id.to_bucket_id(), doc);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for val in agg_data.column_block_accessor.iter_vals() {
|
||||
let val = f64_from_fastfield_u64(val, req.field_type);
|
||||
if bounds.contains(val) {
|
||||
let bucket = store.get_or_create(
|
||||
get_bucket_pos(val),
|
||||
&mut self.bucket_id_provider,
|
||||
|pos| get_bucket_key_from_pos(pos as f64, interval, offset),
|
||||
);
|
||||
bucket.doc_count += 1;
|
||||
for (doc, val) in agg_data
|
||||
.column_block_accessor
|
||||
.iter_docid_vals(docs, &req.accessor)
|
||||
{
|
||||
let val = f64_from_fastfield_u64(val, req.field_type);
|
||||
let bucket_pos = get_bucket_pos(val);
|
||||
if bounds.contains(val) {
|
||||
let bucket = buckets.entry(bucket_pos).or_insert_with(|| {
|
||||
let key = get_bucket_key_from_pos(bucket_pos as f64, interval, offset);
|
||||
SegmentHistogramBucketEntry {
|
||||
key,
|
||||
doc_count: 0,
|
||||
bucket_id: self.bucket_id_provider.next_bucket_id(),
|
||||
}
|
||||
});
|
||||
bucket.doc_count += 1;
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.push(bucket.bucket_id, doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
agg_data.put_back_histogram_req_data(self.accessor_idx, req);
|
||||
|
||||
// `checked_sub` is `None` when densifying shrank the accounted memory; only account growth.
|
||||
if let Some(mem_delta) = self
|
||||
.get_memory_consumption(parent_bucket_id)
|
||||
.checked_sub(mem_pre)
|
||||
{
|
||||
agg_data.context.limits.add_memory_consumed(mem_delta)?;
|
||||
let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(mem_delta as u64)?;
|
||||
}
|
||||
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
@@ -544,45 +386,39 @@ impl<B: BucketIdSlot> SegmentAggregationCollector for SegmentHistogramCollector<
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
while self.parent_buckets.len() <= max_bucket as usize {
|
||||
self.parent_buckets.push(HistogramBuckets::default());
|
||||
self.parent_buckets.push(HistogramBuckets {
|
||||
buckets: FxHashMap::default(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
// Histogram is a multi-bucket agg with no single value to extract.
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: BucketIdSlot> SegmentHistogramCollector<B> {
|
||||
fn get_memory_consumption(&self, parent_bucket_id: BucketId) -> u64 {
|
||||
self.parent_buckets[parent_bucket_id as usize].memory_consumption()
|
||||
impl SegmentHistogramCollector {
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
let self_mem = std::mem::size_of::<Self>();
|
||||
let buckets_mem = self.parent_buckets.len() * std::mem::size_of::<HistogramBuckets>();
|
||||
self_mem + buckets_mem
|
||||
}
|
||||
|
||||
/// Converts the collector result into a intermediate bucket result.
|
||||
fn add_intermediate_bucket_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
histogram: HistogramBuckets<B>,
|
||||
histogram: HistogramBuckets,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
let filled = histogram.into_filled_entries();
|
||||
let mut buckets = Vec::with_capacity(filled.len());
|
||||
let mut buckets = Vec::with_capacity(histogram.buckets.len());
|
||||
|
||||
for bucket in filled {
|
||||
for bucket in histogram.buckets.into_values() {
|
||||
let bucket_res = bucket.into_intermediate_bucket_entry(&mut self.sub_agg, agg_data);
|
||||
|
||||
buckets.push(bucket_res?);
|
||||
}
|
||||
buckets.sort_unstable_by(|b1, b2| b1.key.total_cmp(&b2.key));
|
||||
|
||||
let is_date_agg = self.req_data.field_type == ColumnType::DateTime;
|
||||
let is_date_agg = agg_data
|
||||
.get_histogram_req_data(self.accessor_idx)
|
||||
.field_type
|
||||
== ColumnType::DateTime;
|
||||
Ok(IntermediateBucketResult::Histogram {
|
||||
buckets,
|
||||
is_date_agg,
|
||||
@@ -598,175 +434,32 @@ impl<B: BucketIdSlot> SegmentHistogramCollector<B> {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut req_data = agg_data.per_request.histogram_req_data[node.idx_in_req_data].clone();
|
||||
normalize_histogram_req(&mut req_data)?;
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let dense_range = compute_dense_range(
|
||||
&req_data.accessor,
|
||||
req_data.field_type,
|
||||
req_data.req.interval,
|
||||
req_data.offset,
|
||||
req_data.bounds,
|
||||
);
|
||||
let req_data = agg_data.get_histogram_req_data_mut(node.idx_in_req_data);
|
||||
req_data.req.validate()?;
|
||||
if req_data.field_type == ColumnType::DateTime && !req_data.is_date_histogram {
|
||||
req_data.req.normalize_date_time();
|
||||
}
|
||||
req_data.bounds = req_data.req.hard_bounds.unwrap_or(HistogramBounds {
|
||||
min: f64::MIN,
|
||||
max: f64::MAX,
|
||||
});
|
||||
req_data.offset = req_data.req.offset.unwrap_or(0.0);
|
||||
let sub_agg = sub_agg.map(BufferedSubAggs::new);
|
||||
|
||||
Ok(Self {
|
||||
parent_buckets: Default::default(),
|
||||
sub_agg,
|
||||
req_data,
|
||||
accessor_idx: node.idx_in_req_data,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
dense_range,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentHistogramCollector<()> {
|
||||
/// Builds a histogram collector whose parent `t` is a dense histogram filled from
|
||||
/// `counts[t * num_time_buckets .. (t + 1) * num_time_buckets]` (row-major). Used by the fused
|
||||
/// terms×histogram collector to turn its flat 2D counters into the regular intermediate result,
|
||||
/// so cross-segment merging is shared with the general path.
|
||||
pub(crate) fn from_dense_rows(
|
||||
req_data: HistogramAggReqData,
|
||||
base_pos: i64,
|
||||
num_time_buckets: usize,
|
||||
counts: &[u32],
|
||||
) -> Self {
|
||||
let interval = req_data.req.interval;
|
||||
let offset = req_data.offset;
|
||||
let num_parents = counts.len().checked_div(num_time_buckets).unwrap_or(0);
|
||||
let parent_buckets = (0..num_parents)
|
||||
.map(|t| {
|
||||
let row = &counts[t * num_time_buckets..(t + 1) * num_time_buckets];
|
||||
let buckets = row
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(b, &doc_count)| SegmentHistogramBucketEntry {
|
||||
key: get_bucket_key_from_pos(
|
||||
(base_pos + b as i64) as f64,
|
||||
interval,
|
||||
offset,
|
||||
),
|
||||
doc_count: doc_count as u64,
|
||||
bucket_id: (),
|
||||
})
|
||||
.collect();
|
||||
HistogramBuckets::Dense { base_pos, buckets }
|
||||
})
|
||||
.collect();
|
||||
Self {
|
||||
parent_buckets,
|
||||
sub_agg: None,
|
||||
req_data,
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
dense_range: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Validates and normalizes a histogram request in place: applies date ns-normalization (for a
|
||||
/// `histogram` on a date column) and resolves `bounds`/`offset` from the request.
|
||||
fn normalize_histogram_req(req_data: &mut HistogramAggReqData) -> crate::Result<()> {
|
||||
req_data.req.validate()?;
|
||||
if req_data.field_type == ColumnType::DateTime && !req_data.is_date_histogram {
|
||||
req_data.req.normalize_date_time();
|
||||
}
|
||||
req_data.bounds = req_data.req.hard_bounds.unwrap_or(HistogramBounds {
|
||||
min: f64::MIN,
|
||||
max: f64::MAX,
|
||||
});
|
||||
req_data.offset = req_data.req.offset.unwrap_or(0.0);
|
||||
// Drop `hard_bounds` that can't exclude any value (the column's range already sits inside
|
||||
// them): the per-doc `bounds.contains` check is then a no-op, so collapsing to the unbounded
|
||||
// sentinel lets the histogram hot loop skip it and the fused term×histogram path derive
|
||||
// per-term counts from the grid. Only this collect-time filter is touched — empty-bucket
|
||||
// emission reads `req.hard_bounds` directly (see `get_req_min_max`), and `hard_bounds` only
|
||||
// ever clips that range, so a wider-than-data bound leaves the result unchanged.
|
||||
if req_data.req.hard_bounds.is_some() {
|
||||
let col_min = f64_from_fastfield_u64(req_data.accessor.min_value(), req_data.field_type);
|
||||
let col_max = f64_from_fastfield_u64(req_data.accessor.max_value(), req_data.field_type);
|
||||
if col_min >= req_data.bounds.min && col_max <= req_data.bounds.max {
|
||||
req_data.bounds = HistogramBounds {
|
||||
min: f64::MIN,
|
||||
max: f64::MAX,
|
||||
};
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clones and normalizes (resolving interval/offset/bounds) the histogram request at `node`, and
|
||||
/// returns it together with its dense bucket range — or `None` if the column has no usable range.
|
||||
/// Used by the fused terms×histogram collector, which then owns the normalized request.
|
||||
pub(crate) fn prepare_histogram_dense_range(
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Option<(HistogramAggReqData, DenseRange)>> {
|
||||
let mut req_data = agg_data.per_request.histogram_req_data[node.idx_in_req_data].clone();
|
||||
normalize_histogram_req(&mut req_data)?;
|
||||
let dense_range = compute_dense_range(
|
||||
&req_data.accessor,
|
||||
req_data.field_type,
|
||||
req_data.req.interval,
|
||||
req_data.offset,
|
||||
req_data.bounds,
|
||||
);
|
||||
Ok(dense_range.map(|range| (req_data, range)))
|
||||
}
|
||||
|
||||
/// Builds a boxed histogram (or date histogram) segment collector, picking the bucket-id storage
|
||||
/// based on whether there are sub aggregations: `()` (no id stored) when there are none, otherwise
|
||||
/// [`BucketId`].
|
||||
pub(crate) fn build_segment_histogram_collector(
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
if node.children.is_empty() {
|
||||
Ok(Box::new(
|
||||
SegmentHistogramCollector::<()>::from_req_and_validate(agg_data, node)?,
|
||||
))
|
||||
} else {
|
||||
Ok(Box::new(
|
||||
SegmentHistogramCollector::<BucketId>::from_req_and_validate(agg_data, node)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_bucket_pos_f64(val: f64, interval: f64, offset: f64) -> f64 {
|
||||
fn get_bucket_pos_f64(val: f64, interval: f64, offset: f64) -> f64 {
|
||||
((val - offset) / interval).floor()
|
||||
}
|
||||
|
||||
/// Computes the dense bucket range for a column from its min/max value (clamped to the histogram
|
||||
/// bounds), or `None` if there are no values within bounds (or the range overflows `usize`).
|
||||
///
|
||||
/// There is no upper bound on the range: whether dense storage is actually used is decided later,
|
||||
/// per parent bucket, by [`HistogramBuckets::maybe_densify`] based on the memory it would save.
|
||||
///
|
||||
/// The column min/max bound every value the collector can see, so a `Vec` sized to this range can
|
||||
/// be indexed by `bucket_pos - base_pos` without any out-of-bounds check on the hot path.
|
||||
fn compute_dense_range(
|
||||
accessor: &Column<u64>,
|
||||
field_type: ColumnType,
|
||||
interval: f64,
|
||||
offset: f64,
|
||||
bounds: HistogramBounds,
|
||||
) -> Option<DenseRange> {
|
||||
let col_min = f64_from_fastfield_u64(accessor.min_value(), field_type);
|
||||
let col_max = f64_from_fastfield_u64(accessor.max_value(), field_type);
|
||||
let lo = col_min.max(bounds.min);
|
||||
let hi = col_max.min(bounds.max);
|
||||
if lo > hi {
|
||||
return None;
|
||||
}
|
||||
let base_pos = get_bucket_pos_f64(lo, interval, offset) as i64;
|
||||
let top_pos = get_bucket_pos_f64(hi, interval, offset) as i64;
|
||||
let len = usize::try_from(top_pos.checked_sub(base_pos)?.checked_add(1)?).ok()?;
|
||||
(len > 0).then_some(DenseRange { base_pos, len })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_bucket_key_from_pos(bucket_pos: f64, interval: f64, offset: f64) -> f64 {
|
||||
bucket_pos * interval + offset
|
||||
@@ -1071,62 +764,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_dense_storage_test() -> crate::Result<()> {
|
||||
histogram_dense_storage_test_with_opt(false)?;
|
||||
histogram_dense_storage_test_with_opt(true)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Exercises the switch from sparse hash map to dense `Vec` storage. The switch happens at a
|
||||
/// block boundary (a block is `COLLECT_BLOCK_BUFFER_LEN` = 64 docs), so we need many docs in a
|
||||
/// single segment, densely covering the bucket range. `with_sub_agg` toggles the `iter_vals`
|
||||
/// fast path vs. the `iter_docid_vals` path used when there is a sub aggregation.
|
||||
fn histogram_dense_storage_test_with_opt(with_sub_agg: bool) -> crate::Result<()> {
|
||||
let num_buckets = 50usize;
|
||||
let docs_per_bucket = 10usize;
|
||||
// Value `k` repeated `docs_per_bucket` times for each bucket `k`, so every value in bucket
|
||||
// `k` equals `k` and the per-bucket average is exactly `k`.
|
||||
let values: Vec<f64> = (0..num_buckets * docs_per_bucket)
|
||||
.map(|i| (i % num_buckets) as f64)
|
||||
.collect();
|
||||
// `merge_segments = true` collapses the per-value segments into a single segment with all
|
||||
// the docs, which is collected in 64-doc blocks and therefore switches to dense storage.
|
||||
let index = get_test_index_from_values(true, &values)?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(if with_sub_agg {
|
||||
json!({
|
||||
"histogram": {
|
||||
"histogram": { "field": "score_f64", "interval": 1.0 },
|
||||
"aggs": { "avg": { "avg": { "field": "score_f64" } } }
|
||||
}
|
||||
})
|
||||
} else {
|
||||
json!({
|
||||
"histogram": {
|
||||
"histogram": { "field": "score_f64", "interval": 1.0 }
|
||||
}
|
||||
})
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
for k in 0..num_buckets {
|
||||
assert_eq!(res["histogram"]["buckets"][k]["key"], k as f64);
|
||||
assert_eq!(
|
||||
res["histogram"]["buckets"][k]["doc_count"],
|
||||
docs_per_bucket as u64
|
||||
);
|
||||
if with_sub_agg {
|
||||
assert_eq!(res["histogram"]["buckets"][k]["avg"]["value"], k as f64);
|
||||
}
|
||||
}
|
||||
assert_eq!(res["histogram"]["buckets"][num_buckets], Value::Null);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_memory_limit() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(true, 100)?;
|
||||
@@ -1421,55 +1058,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_non_binding_hard_bounds_test_multi_segment() -> crate::Result<()> {
|
||||
histogram_non_binding_hard_bounds_test_with_opt(false)
|
||||
}
|
||||
#[test]
|
||||
fn histogram_non_binding_hard_bounds_test_single_segment() -> crate::Result<()> {
|
||||
histogram_non_binding_hard_bounds_test_with_opt(true)
|
||||
}
|
||||
/// `hard_bounds` wider than the data (here with mid-interval edges, to cover the "bound cuts a
|
||||
/// bucket" case) can't exclude any value, so the result must be identical to the same request
|
||||
/// without bounds. Guards the normalization that collapses such bounds to the unbounded
|
||||
/// sentinel so the hot loop / fused path can skip the per-doc bounds check.
|
||||
fn histogram_non_binding_hard_bounds_test_with_opt(merge_segments: bool) -> crate::Result<()> {
|
||||
let values = vec![10.0, 12.0, 14.0, 16.0, 10.0, 13.0, 10.0, 12.0];
|
||||
let index = get_test_index_from_values(merge_segments, &values)?;
|
||||
|
||||
// Mid-interval edges, but wider than the data range [10, 16] -> they exclude nothing.
|
||||
let with_bounds: Aggregations = serde_json::from_value(json!({
|
||||
"histogram": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 1.0,
|
||||
"hard_bounds": { "min": 9.5, "max": 16.5 }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let no_bounds: Aggregations = serde_json::from_value(json!({
|
||||
"histogram": {
|
||||
"histogram": { "field": "score_f64", "interval": 1.0 }
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res_bounds = exec_request(with_bounds, &index)?;
|
||||
let res_plain = exec_request(no_bounds, &index)?;
|
||||
// Dropping a non-binding bound must not change anything.
|
||||
assert_eq!(res_bounds, res_plain);
|
||||
|
||||
// Sanity: buckets span the data range with gaps filled (min_doc_count defaults to 0).
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][0]["key"], 10.0);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][0]["doc_count"], 3);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][6]["key"], 16.0);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][6]["doc_count"], 1);
|
||||
assert_eq!(res_bounds["histogram"]["buckets"][7], Value::Null);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_empty_result_behaviour_test_single_segment() -> crate::Result<()> {
|
||||
histogram_empty_result_behaviour_test_with_opt(true)
|
||||
|
||||
@@ -23,7 +23,6 @@ use crate::TantivyError;
|
||||
|
||||
/// Contains all information required by the SegmentRangeCollector to perform the
|
||||
/// range aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RangeAggReqData {
|
||||
/// The column accessor to access the fast field values.
|
||||
pub accessor: Column<u64>,
|
||||
@@ -162,7 +161,7 @@ pub struct SegmentRangeCollector<B: SubAggBuffer> {
|
||||
/// One for each ParentBucketId
|
||||
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
|
||||
column_type: ColumnType,
|
||||
pub(crate) req_data: RangeAggReqData,
|
||||
pub(crate) accessor_idx: usize,
|
||||
sub_agg: Option<BufferedSubAggs<B>>,
|
||||
/// Here things get a bit weird. We need to assign unique bucket ids across all
|
||||
/// parent buckets. So we keep track of the next available bucket id here.
|
||||
@@ -185,7 +184,7 @@ impl<B: SubAggBuffer> Debug for SegmentRangeCollector<B> {
|
||||
f.debug_struct("SegmentRangeCollector")
|
||||
.field("parent_buckets_len", &self.parent_buckets.len())
|
||||
.field("column_type", &self.column_type)
|
||||
.field("name", &self.req_data.name)
|
||||
.field("accessor_idx", &self.accessor_idx)
|
||||
.field("has_sub_agg", &self.sub_agg.is_some())
|
||||
.finish()
|
||||
}
|
||||
@@ -240,7 +239,10 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
) -> crate::Result<()> {
|
||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
||||
let field_type = self.column_type;
|
||||
let name = self.req_data.name.to_string();
|
||||
let name = agg_data
|
||||
.get_range_req_data(self.accessor_idx)
|
||||
.name
|
||||
.to_string();
|
||||
|
||||
let buckets = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
||||
|
||||
@@ -279,15 +281,17 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let req = agg_data.take_range_req_data(self.accessor_idx);
|
||||
|
||||
agg_data
|
||||
.column_block_accessor
|
||||
.fetch_block(docs, &self.req_data.accessor);
|
||||
.fetch_block(docs, &req.accessor);
|
||||
|
||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize];
|
||||
|
||||
for (doc, val) in agg_data
|
||||
.column_block_accessor
|
||||
.iter_docid_vals(docs, &self.req_data.accessor)
|
||||
.iter_docid_vals(docs, &req.accessor)
|
||||
{
|
||||
let bucket_pos = get_bucket_pos(val, buckets);
|
||||
let bucket = &mut buckets[bucket_pos];
|
||||
@@ -297,6 +301,7 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
}
|
||||
}
|
||||
|
||||
agg_data.put_back_range_req_data(self.accessor_idx, req);
|
||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
@@ -314,26 +319,15 @@ impl<B: SubAggBuffer> SegmentAggregationCollector for SegmentRangeCollector<B> {
|
||||
fn prepare_max_bucket(
|
||||
&mut self,
|
||||
max_bucket: BucketId,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
while self.parent_buckets.len() <= max_bucket as usize {
|
||||
let new_buckets = self.create_new_buckets()?;
|
||||
let new_buckets = self.create_new_buckets(agg_data)?;
|
||||
self.parent_buckets.push(new_buckets);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
// Range is a multi-bucket agg with no single value to extract.
|
||||
None
|
||||
}
|
||||
}
|
||||
/// Build a concrete `SegmentRangeCollector` with either a Vec- or HashMap-backed
|
||||
/// bucket storage, depending on the column type and aggregation level.
|
||||
@@ -341,11 +335,8 @@ pub(crate) fn build_segment_range_collector(
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||
let req_data = agg_data.per_request.range_req_data[node.idx_in_req_data].clone();
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(req_data.get_memory_consumption() as u64)?;
|
||||
let accessor_idx = node.idx_in_req_data;
|
||||
let req_data = agg_data.get_range_req_data(node.idx_in_req_data);
|
||||
let field_type = req_data.field_type;
|
||||
|
||||
// TODO: A better metric instead of is_top_level would be the number of buckets expected.
|
||||
@@ -363,7 +354,7 @@ pub(crate) fn build_segment_range_collector(
|
||||
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggBuffer> {
|
||||
sub_agg: sub_agg.map(LowCardBufferedSubAggs::new),
|
||||
column_type: field_type,
|
||||
req_data,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
limits: agg_data.context.limits.clone(),
|
||||
@@ -372,7 +363,7 @@ pub(crate) fn build_segment_range_collector(
|
||||
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggBuffer> {
|
||||
sub_agg: sub_agg.map(BufferedSubAggs::new),
|
||||
column_type: field_type,
|
||||
req_data,
|
||||
accessor_idx,
|
||||
parent_buckets: Vec::new(),
|
||||
bucket_id_provider: BucketIdProvider::default(),
|
||||
limits: agg_data.context.limits.clone(),
|
||||
@@ -381,9 +372,12 @@ pub(crate) fn build_segment_range_collector(
|
||||
}
|
||||
|
||||
impl<B: SubAggBuffer> SegmentRangeCollector<B> {
|
||||
pub(crate) fn create_new_buckets(&mut self) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
|
||||
pub(crate) fn create_new_buckets(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
|
||||
let field_type = self.column_type;
|
||||
let req_data = &self.req_data;
|
||||
let req_data = agg_data.get_range_req_data(self.accessor_idx);
|
||||
// The range input on the request is f64.
|
||||
// We need to convert to u64 ranges, because we read the values as u64.
|
||||
// The mapping from the conversion is monotonic so ordering is preserved.
|
||||
@@ -558,16 +552,17 @@ mod tests {
|
||||
get_test_index_with_num_docs,
|
||||
};
|
||||
|
||||
pub fn build_test_buckets(
|
||||
ranges: &[RangeAggregationRange],
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
field_type: ColumnType,
|
||||
) -> Vec<SegmentRangeAndBucketEntry> {
|
||||
) -> SegmentRangeCollector<HighCardSubAggBuffer> {
|
||||
let req = RangeAggregation {
|
||||
field: "dummy".to_string(),
|
||||
ranges: ranges.to_vec(),
|
||||
ranges,
|
||||
..Default::default()
|
||||
};
|
||||
extend_validate_ranges(&req.ranges, &field_type)
|
||||
// Build buckets directly as in from_req_and_validate without AggregationsData
|
||||
let buckets: Vec<_> = extend_validate_ranges(&req.ranges, &field_type)
|
||||
.expect("unexpected error in extend_validate_ranges")
|
||||
.iter()
|
||||
.map(|range| {
|
||||
@@ -598,7 +593,16 @@ mod tests {
|
||||
},
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
.collect();
|
||||
|
||||
SegmentRangeCollector {
|
||||
parent_buckets: vec![buckets],
|
||||
column_type: field_type,
|
||||
accessor_idx: 0,
|
||||
sub_agg: None,
|
||||
bucket_id_provider: Default::default(),
|
||||
limits: AggregationLimitsGuard::default(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -841,10 +845,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn bucket_test_extend_range_hole() {
|
||||
let buckets = [(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
@@ -860,14 +864,14 @@ mod tests {
|
||||
fn bucket_test_range_conversion_special_case() {
|
||||
// the monotonic conversion between f64 and u64, does not map f64::MIN.to_u64() ==
|
||||
// u64::MIN, but the into trait converts f64::MIN/MAX to None
|
||||
let buckets = [
|
||||
let buckets = vec![
|
||||
(f64::MIN..10f64).into(),
|
||||
(10f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
@@ -879,28 +883,28 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn bucket_range_test_negative_vals() {
|
||||
let buckets = [(-10f64..-1f64).into()];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let buckets = vec![(-10f64..-1f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
|
||||
}
|
||||
#[test]
|
||||
fn bucket_range_test_positive_vals() {
|
||||
let buckets = [(0f64..10f64).into()];
|
||||
let parent_buckets = [build_test_buckets(&buckets, ColumnType::F64)];
|
||||
let buckets = vec![(0f64..10f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||
|
||||
let buckets = parent_buckets[0].clone();
|
||||
let buckets = collector.parent_buckets[0].clone();
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_u64() {
|
||||
let check_ranges = |ranges: &[RangeAggregationRange]| {
|
||||
let parent_buckets = [build_test_buckets(ranges, ColumnType::U64)];
|
||||
let search = |val: u64| get_bucket_pos(val, &parent_buckets[0]);
|
||||
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
|
||||
let collector = get_collector_from_ranges(ranges, ColumnType::U64);
|
||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9), 0);
|
||||
@@ -913,7 +917,7 @@ mod tests {
|
||||
};
|
||||
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
check_ranges(&ranges);
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
@@ -923,7 +927,7 @@ mod tests {
|
||||
},
|
||||
(10.0..100.0).into(),
|
||||
];
|
||||
check_ranges(&ranges);
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
@@ -938,15 +942,15 @@ mod tests {
|
||||
from: Some(100.0),
|
||||
},
|
||||
];
|
||||
check_ranges(&ranges);
|
||||
check_ranges(ranges);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = [(10.0..100.0).into()];
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
|
||||
let parent_buckets = [build_test_buckets(&ranges, ColumnType::F64)];
|
||||
let search = |val: u64| get_bucket_pos(val, &parent_buckets[0]);
|
||||
let collector = get_collector_from_ranges(ranges, ColumnType::F64);
|
||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9f64.to_u64()), 0);
|
||||
|
||||
@@ -29,8 +29,6 @@ use crate::aggregation::{format_date, BucketId, Key};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::TantivyError;
|
||||
|
||||
mod term_histogram;
|
||||
|
||||
/// Contains all information required by the SegmentTermCollector to perform the
|
||||
/// terms aggregation on a segment.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -354,15 +352,19 @@ pub(crate) fn build_segment_term_collector(
|
||||
)));
|
||||
}
|
||||
|
||||
// Validate that the referenced sub-aggregation exists when ordering by one.
|
||||
if let OrderTarget::SubAggregation(sub_agg_name) = &terms_req_data.req.order.target {
|
||||
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
||||
node.get_sub_agg(agg_name, &req_data.per_request)
|
||||
.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"could not find aggregation with name {agg_name} in metric sub_aggregations"
|
||||
))
|
||||
})?;
|
||||
// Validate sub aggregation exists when ordering by sub-aggregation.
|
||||
{
|
||||
if let OrderTarget::SubAggregation(sub_agg_name) = &terms_req_data.req.order.target {
|
||||
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
||||
|
||||
node.get_sub_agg(agg_name, &req_data.per_request)
|
||||
.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"could not find aggregation with name {agg_name} in metric \
|
||||
sub_aggregations"
|
||||
))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
// Build sub-aggregation blueprint if there are children.
|
||||
@@ -376,21 +378,9 @@ pub(crate) fn build_segment_term_collector(
|
||||
// Let's see if we can use a vec to aggregate our data
|
||||
// instead of a hashmap.
|
||||
let col_max_value = terms_req_data.accessor.max_value();
|
||||
let max_column_val: u64 =
|
||||
let max_term_id: u64 =
|
||||
col_max_value.max(terms_req_data.missing_value_for_accessor.unwrap_or(0u64));
|
||||
|
||||
// Fused fast path: low-cardinality terms × a single `histogram`/`date_histogram` leaf over full
|
||||
// columns with a small enough bucket grid. Anything else falls through to the general path.
|
||||
if let Some(collector) = term_histogram::maybe_build_collector(
|
||||
req_data,
|
||||
node,
|
||||
&terms_req_data,
|
||||
max_column_val,
|
||||
is_top_level,
|
||||
)? {
|
||||
return Ok(collector);
|
||||
}
|
||||
|
||||
let sub_agg_collector = if has_sub_aggregations {
|
||||
Some(build_segment_agg_collectors(req_data, &node.children)?)
|
||||
} else {
|
||||
@@ -399,30 +389,30 @@ pub(crate) fn build_segment_term_collector(
|
||||
|
||||
let mut bucket_id_provider = BucketIdProvider::default();
|
||||
// Decide which bucket storage is best suited for this aggregation.
|
||||
if is_top_level && max_column_val < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
|
||||
let term_buckets = VecTermBucketsNoAgg::new(max_column_val + 1, &mut bucket_id_provider);
|
||||
if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC && !has_sub_aggregations {
|
||||
let term_buckets = VecTermBucketsNoAgg::new(max_term_id + 1, &mut bucket_id_provider);
|
||||
let collector: SegmentTermCollector<_, HighCardSubAggBuffer> = SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg: None,
|
||||
bucket_id_provider,
|
||||
max_term_id: max_column_val,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
Ok(Box::new(collector))
|
||||
} else if is_top_level && max_column_val < MAX_NUM_TERMS_FOR_VEC {
|
||||
let term_buckets = VecTermBuckets::new(max_column_val + 1, &mut bucket_id_provider);
|
||||
} else if is_top_level && max_term_id < MAX_NUM_TERMS_FOR_VEC {
|
||||
let term_buckets = VecTermBuckets::new(max_term_id + 1, &mut bucket_id_provider);
|
||||
let sub_agg = sub_agg_collector.map(LowCardBufferedSubAggs::new);
|
||||
let collector: SegmentTermCollector<_, LowCardSubAggBuffer> = SegmentTermCollector {
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
max_term_id: max_column_val,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
Ok(Box::new(collector))
|
||||
} else if max_column_val < 8_000_000 && is_top_level {
|
||||
} else if max_term_id < 8_000_000 && is_top_level {
|
||||
let term_buckets: PagedTermMap =
|
||||
PagedTermMap::new(max_column_val + 1, &mut bucket_id_provider);
|
||||
PagedTermMap::new(max_term_id + 1, &mut bucket_id_provider);
|
||||
// Build sub-aggregation blueprint (flat pairs)
|
||||
let sub_agg = sub_agg_collector.map(BufferedSubAggs::new);
|
||||
let collector: SegmentTermCollector<PagedTermMap, HighCardSubAggBuffer> =
|
||||
@@ -430,7 +420,7 @@ pub(crate) fn build_segment_term_collector(
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
max_term_id: max_column_val,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
Ok(Box::new(collector))
|
||||
@@ -443,7 +433,7 @@ pub(crate) fn build_segment_term_collector(
|
||||
parent_buckets: vec![term_buckets],
|
||||
sub_agg,
|
||||
bucket_id_provider,
|
||||
max_term_id: max_column_val,
|
||||
max_term_id,
|
||||
terms_req_data,
|
||||
};
|
||||
Ok(Box::new(collector))
|
||||
@@ -819,7 +809,7 @@ impl<TermMap: TermAggregationMap, B: SubAggBuffer> SegmentAggregationCollector
|
||||
docs: &[crate::DocId],
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
let mem_pre = self.get_memory_consumption(parent_bucket_id);
|
||||
// let mem_pre = self.get_memory_consumption();
|
||||
|
||||
let req_data = &mut self.terms_req_data;
|
||||
|
||||
@@ -863,13 +853,16 @@ impl<TermMap: TermAggregationMap, B: SubAggBuffer> SegmentAggregationCollector
|
||||
}
|
||||
}
|
||||
|
||||
let mem_delta = self.get_memory_consumption(parent_bucket_id) - mem_pre;
|
||||
if mem_delta > 0 {
|
||||
agg_data
|
||||
.context
|
||||
.limits
|
||||
.add_memory_consumed(mem_delta as u64)?;
|
||||
}
|
||||
// let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||
// if mem_delta > 0 {
|
||||
// agg_data
|
||||
// .context
|
||||
// .limits
|
||||
// .add_memory_consumed(mem_delta as u64)?;
|
||||
// }
|
||||
|
||||
// After commenting out -> 6000ms -> 36ms
|
||||
|
||||
if let Some(sub_agg) = &mut self.sub_agg {
|
||||
sub_agg.check_flush_local(agg_data)?;
|
||||
}
|
||||
@@ -897,17 +890,6 @@ impl<TermMap: TermAggregationMap, B: SubAggBuffer> SegmentAggregationCollector
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
// Terms is a multi-bucket agg with no single value to extract.
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Missing value are represented as a sentinel value in the column.
|
||||
@@ -967,9 +949,11 @@ where
|
||||
TermMap: TermAggregationMap,
|
||||
B: SubAggBuffer,
|
||||
{
|
||||
#[inline]
|
||||
fn get_memory_consumption(&self, parent_bucket_id: BucketId) -> usize {
|
||||
self.parent_buckets[parent_bucket_id as usize].get_memory_consumption()
|
||||
fn get_memory_consumption(&self) -> usize {
|
||||
self.parent_buckets
|
||||
.iter()
|
||||
.map(|b| b.get_memory_consumption())
|
||||
.sum()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -981,6 +965,9 @@ where
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
let mut entries: Vec<(u64, Bucket)> = term_buckets.into_vec();
|
||||
|
||||
let order_by_sub_aggregation =
|
||||
matches!(term_req.req.order.target, OrderTarget::SubAggregation(_));
|
||||
|
||||
match &term_req.req.order.target {
|
||||
OrderTarget::Key => {
|
||||
// We rely on the fact, that term ordinals match the order of the strings
|
||||
@@ -992,37 +979,10 @@ where
|
||||
entries.sort_unstable_by_key(|bucket| bucket.0);
|
||||
}
|
||||
}
|
||||
OrderTarget::SubAggregation(sub_agg_path) => {
|
||||
// Peek segment-level metric values, sort, then fall through to
|
||||
// `cut_off_buckets`. Like Elasticsearch, we always cut off when ordering
|
||||
// by a sub-agg: top-K results are approximate and may differ from the
|
||||
// global ordering, especially for non-monotonic metrics like avg/min.
|
||||
let coll = sub_agg_collector.as_deref().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"Could not find sub-aggregation collector for path {sub_agg_path}"
|
||||
))
|
||||
})?;
|
||||
let (agg_name, agg_prop) = get_agg_name_and_property(sub_agg_path);
|
||||
// Fetch values up-front; otherwise sort would re-compute per comparison
|
||||
let mut keyed: Vec<(f64, (u64, Bucket))> = entries
|
||||
.into_iter()
|
||||
.map(|bucket| {
|
||||
let metric_value = coll
|
||||
.compute_metric_value(bucket.1.bucket_id, agg_name, agg_prop, agg_data)
|
||||
.unwrap_or(0.0);
|
||||
(metric_value, bucket)
|
||||
})
|
||||
.collect();
|
||||
if term_req.req.order.order == Order::Desc {
|
||||
keyed.sort_unstable_by(|a, b| {
|
||||
b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
} else {
|
||||
keyed.sort_unstable_by(|a, b| {
|
||||
a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
}
|
||||
entries = keyed.into_iter().map(|(_, e)| e).collect();
|
||||
OrderTarget::SubAggregation(_name) => {
|
||||
// don't sort and cut off since it's hard to make assumptions on the quality of the
|
||||
// results when cutting off du to unknown nature of the sub_aggregation (possible
|
||||
// to check).
|
||||
}
|
||||
OrderTarget::Count => {
|
||||
if term_req.req.order.order == Order::Desc {
|
||||
@@ -1033,8 +993,11 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
let (term_doc_count_before_cutoff, sum_other_doc_count) =
|
||||
cut_off_buckets(&mut entries, term_req.req.segment_size as usize);
|
||||
let (term_doc_count_before_cutoff, sum_other_doc_count) = if order_by_sub_aggregation {
|
||||
(0, 0)
|
||||
} else {
|
||||
cut_off_buckets(&mut entries, term_req.req.segment_size as usize)
|
||||
};
|
||||
|
||||
let mut dict: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> = Default::default();
|
||||
dict.reserve(entries.len());
|
||||
@@ -1265,6 +1228,7 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
|
||||
mod tests {
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
use std::time::Instant;
|
||||
|
||||
use common::DateTime;
|
||||
use time::{Date, Month};
|
||||
@@ -1278,9 +1242,10 @@ mod tests {
|
||||
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
||||
};
|
||||
use crate::aggregation::{AggregationLimitsGuard, DistributedAggregationCollector};
|
||||
use crate::collector::{Collector, default_collect_segment_impl};
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, INDEXED, STRING, TEXT};
|
||||
use crate::query::{AllQuery, EnableScoring, Query};
|
||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
#[test]
|
||||
@@ -1809,263 +1774,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggregation_order_by_cardinality_desc_single_segment() -> crate::Result<()> {
|
||||
terms_aggregation_order_by_cardinality_desc(true)
|
||||
}
|
||||
#[test]
|
||||
fn terms_aggregation_order_by_cardinality_desc_multi_segment() -> crate::Result<()> {
|
||||
terms_aggregation_order_by_cardinality_desc(false)
|
||||
}
|
||||
fn terms_aggregation_order_by_cardinality_desc(merge_segments: bool) -> crate::Result<()> {
|
||||
// Distinct score values per bucket key: A→5, B→1, C→3.
|
||||
// Order by cardinality desc must yield A, C, B.
|
||||
let segment_and_terms = vec![vec![
|
||||
(1.0, "A".to_string()),
|
||||
(2.0, "A".to_string()),
|
||||
(3.0, "A".to_string()),
|
||||
(4.0, "A".to_string()),
|
||||
(5.0, "A".to_string()),
|
||||
(1.0, "B".to_string()),
|
||||
(1.0, "B".to_string()),
|
||||
(1.0, "B".to_string()),
|
||||
(1.0, "C".to_string()),
|
||||
(2.0, "C".to_string()),
|
||||
(3.0, "C".to_string()),
|
||||
]];
|
||||
let index = get_test_index_from_values_and_terms(merge_segments, &segment_and_terms)?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"order": { "card": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"card": { "cardinality": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["card"]["value"], 5.0);
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["card"]["value"], 3.0);
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "B");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["card"]["value"], 1.0);
|
||||
|
||||
// Asc engages the segment-cutoff path too (monotonic-safe: discarded buckets had
|
||||
// local card >= cutoff, so merged card >= cutoff and they cannot be globally smallest).
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"order": { "card": "asc" }
|
||||
},
|
||||
"aggs": {
|
||||
"card": { "cardinality": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "B");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "A");
|
||||
|
||||
// size=2 with desc engages the segment cutoff: must keep top-2 by cardinality (A, C),
|
||||
// and `sum_other_doc_count` reflects the dropped B (3 docs).
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"size": 2,
|
||||
"order": { "card": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"card": { "cardinality": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"].as_array().unwrap().len(), 2);
|
||||
|
||||
// size=2 with asc engages the segment cutoff: must keep bottom-2 by cardinality (B, C).
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"size": 2,
|
||||
"order": { "card": "asc" }
|
||||
},
|
||||
"aggs": {
|
||||
"card": { "cardinality": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "B");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"].as_array().unwrap().len(), 2);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggregation_order_by_sum_single_segment() -> crate::Result<()> {
|
||||
terms_aggregation_order_by_sum(true)
|
||||
}
|
||||
#[test]
|
||||
fn terms_aggregation_order_by_sum_multi_segment() -> crate::Result<()> {
|
||||
terms_aggregation_order_by_sum(false)
|
||||
}
|
||||
fn terms_aggregation_order_by_sum(merge_segments: bool) -> crate::Result<()> {
|
||||
// Per-bucket sums on the U64 `score` column (non-negative => sum is monotonic):
|
||||
// A → 1+2+3+4+5 = 15, B → 1+1+1 = 3, C → 1+2+3 = 6.
|
||||
let segment_and_terms = vec![
|
||||
vec![
|
||||
(1.0, "A".to_string()),
|
||||
(2.0, "A".to_string()),
|
||||
(3.0, "A".to_string()),
|
||||
(1.0, "B".to_string()),
|
||||
(1.0, "C".to_string()),
|
||||
],
|
||||
vec![
|
||||
(4.0, "A".to_string()),
|
||||
(5.0, "A".to_string()),
|
||||
(1.0, "B".to_string()),
|
||||
(1.0, "B".to_string()),
|
||||
(2.0, "C".to_string()),
|
||||
(3.0, "C".to_string()),
|
||||
],
|
||||
];
|
||||
let index = get_test_index_from_values_and_terms(merge_segments, &segment_and_terms)?;
|
||||
|
||||
// Desc on a Sum metric engages the fast path (column is U64).
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"order": { "total": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"total": { "sum": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["total"]["value"], 15.0);
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["total"]["value"], 6.0);
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "B");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["total"]["value"], 3.0);
|
||||
|
||||
// Asc engages the fast path too — discarded buckets had local sum >= cutoff,
|
||||
// and merged sum >= local (non-negative addends), so they cannot be globally smallest.
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"order": { "total": "asc" }
|
||||
},
|
||||
"aggs": {
|
||||
"total": { "sum": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "B");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "A");
|
||||
|
||||
// size=2 desc with cutoff: top-2 by sum (A, C).
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"size": 2,
|
||||
"order": { "total": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"total": { "sum": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"].as_array().unwrap().len(), 2);
|
||||
|
||||
// Stats sub-property: ordering by `mystats.sum` on a U64 column also engages.
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"order": { "mystats.sum": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"mystats": { "stats": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "B");
|
||||
|
||||
// Sum on a signed column (I64) takes the same cutoff path. Results may be
|
||||
// approximate near the boundary on adversarial data, but for this dataset the
|
||||
// top-K is unambiguous.
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"order": { "total": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"total": { "sum": { "field": "score_i64" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "B");
|
||||
|
||||
// Order by extended_stats sub-property exercises compute_metric_value on the
|
||||
// ExtendedStats collector. A→max=5, B→max=1, C→max=3, so desc by max → A, C, B.
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "string_id",
|
||||
"order": { "ext.max": "desc" }
|
||||
},
|
||||
"aggs": {
|
||||
"ext": { "extended_stats": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "C");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "B");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggregation_test_order_key_single_segment() -> crate::Result<()> {
|
||||
terms_aggregation_test_order_key_merge_segment(true)
|
||||
@@ -3232,100 +2940,102 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prep_index_with_n_unique_terms_plus_one_null(n: u64) -> crate::Result<Index> {
|
||||
#[test]
|
||||
fn test_terms_double_nesting() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let id_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let title_field = schema_builder.add_text_field("title", TEXT | FAST);
|
||||
let outer_field = schema_builder.add_text_field("outer_term", STRING | FAST);
|
||||
let inner_field = schema_builder.add_text_field("inner_term", STRING | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
// set to one thread to guarantee all docs end up in the same segment
|
||||
let mut writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
|
||||
writer.add_document(doc!(
|
||||
id_field => 0u64,
|
||||
))?;
|
||||
for i in 1u64..=n {
|
||||
let title = format!("foo{i}");
|
||||
writer.add_document(doc!(
|
||||
id_field => i,
|
||||
title_field => title,
|
||||
))?;
|
||||
let outer_values = (0..10_000)
|
||||
.map(|i| format!("outer_{i}"))
|
||||
.collect::<Vec<_>>();
|
||||
let inner_values = ["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 200_000_000).unwrap();
|
||||
for doc_id in 0..1_000_000u64 {
|
||||
let outer_val = &outer_values[doc_id as usize % outer_values.len()];
|
||||
let inner_val = inner_values[doc_id as usize % inner_values.len()];
|
||||
index_writer.add_document(doc!(
|
||||
outer_field => outer_val.as_str(),
|
||||
inner_field => inner_val,
|
||||
)).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"outer": {
|
||||
"terms": { "field": "outer_term", "size": 10 },
|
||||
"aggs": {
|
||||
"inner": {
|
||||
"terms": { "field": "inner_term" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
writer.commit()?;
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
Ok(index)
|
||||
let collector =
|
||||
crate::aggregation::AggregationCollector::from_aggs(agg_req, Default::default());
|
||||
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let all_weight = AllQuery.weight(EnableScoring::disabled_from_schema(&schema)).unwrap();
|
||||
let mut segment_collector = collector.for_segment(0u32, segment_reader).unwrap();
|
||||
let start = Instant::now();
|
||||
default_collect_segment_impl(&mut segment_collector, &*all_weight, segment_reader, false).unwrap();
|
||||
dbg!(start.elapsed());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn null_bitset_bounds_check_regression() -> crate::Result<()> {
|
||||
// include cases
|
||||
for i in 0..=4 {
|
||||
let index = prep_index_with_n_unique_terms_plus_one_null(i * 64)?;
|
||||
let normal_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_bool": {
|
||||
"terms": {
|
||||
"field": "title",
|
||||
"missing": "__NULL__",
|
||||
"size": 1000,
|
||||
}
|
||||
}
|
||||
}))?;
|
||||
let include_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_bool": {
|
||||
"terms": {
|
||||
"field": "title",
|
||||
"include": "foo(.*)",
|
||||
"missing": "__NULL__",
|
||||
"size": 1000,
|
||||
}
|
||||
}
|
||||
}))?;
|
||||
let exclude_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_bool": {
|
||||
"terms": {
|
||||
"field": "title",
|
||||
"exclude": "foo(.*)",
|
||||
"missing": "__NULL__",
|
||||
"size": 1000,
|
||||
}
|
||||
}
|
||||
}))?;
|
||||
fn test_terms_simple_nesting() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let outer_field = schema_builder.add_text_field("outer_term", STRING | FAST);
|
||||
let inner_field = schema_builder.add_text_field("inner_term", STRING | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let normal_res = exec_request(normal_req, &index)?;
|
||||
let normal_buckets = normal_res["my_bool"]["buckets"].as_array().unwrap();
|
||||
assert_eq!(
|
||||
normal_buckets.len(),
|
||||
(i * 64) as usize + 1,
|
||||
"The normal request should return all 'foo' buckets, plus the missing term bucket",
|
||||
);
|
||||
let outer_values = (0..10_000)
|
||||
.map(|i| format!("outer_{i}"))
|
||||
.collect::<Vec<_>>();
|
||||
let inner_values = ["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
|
||||
let include_res = exec_request(include_req, &index)?;
|
||||
eprintln!("include_res: {include_res:?}");
|
||||
let include_buckets = include_res["my_bool"]["buckets"].as_array().unwrap();
|
||||
assert_eq!(
|
||||
include_buckets.len(),
|
||||
(i * 64) as usize,
|
||||
"The include request should return all 'foo' buckets, and not the missing term \
|
||||
bucket",
|
||||
);
|
||||
assert!(include_buckets
|
||||
.iter()
|
||||
.all(|b| b["key"].as_str().unwrap().starts_with("foo")));
|
||||
|
||||
let exclude_res = exec_request(exclude_req, &index)?;
|
||||
let exclude_buckets = exclude_res["my_bool"]["buckets"].as_array().unwrap();
|
||||
if i != 0 {
|
||||
// TODO: Remove this if after fixing exclude + missing bug
|
||||
assert_eq!(
|
||||
exclude_buckets.len(),
|
||||
1,
|
||||
"The exclude request should exclude all 'foo' buckets, and only the missing \
|
||||
term bucket",
|
||||
);
|
||||
assert_eq!(exclude_buckets[0]["key"], "__NULL__");
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 200_000_000).unwrap();
|
||||
for doc_id in 0..1_000_000u64 {
|
||||
let outer_val = &outer_values[doc_id as usize % outer_values.len()];
|
||||
let inner_val = inner_values[doc_id as usize % inner_values.len()];
|
||||
index_writer.add_document(doc!(
|
||||
outer_field => outer_val.as_str(),
|
||||
inner_field => inner_val,
|
||||
)).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
Ok(())
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"outer": {
|
||||
"terms": { "field": "outer_term", "size": 10 },
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let collector =
|
||||
crate::aggregation::AggregationCollector::from_aggs(agg_req, Default::default());
|
||||
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let all_weight = AllQuery.weight(EnableScoring::disabled_from_schema(&schema)).unwrap();
|
||||
let mut segment_collector = collector.for_segment(0u32, segment_reader).unwrap();
|
||||
let start = Instant::now();
|
||||
default_collect_segment_impl(&mut segment_collector, &*all_weight, segment_reader, false).unwrap();
|
||||
dbg!(start.elapsed());
|
||||
}
|
||||
}
|
||||
@@ -1,585 +0,0 @@
|
||||
//! Fused collector for the very common shape `terms` (low cardinality) × a single
|
||||
//! `histogram`/`date_histogram` sub-aggregation with nothing nested below it.
|
||||
//!
|
||||
//! See [`SegmentTermHistogramCollector`] for the approach and [`maybe_build_collector`] for the
|
||||
//! conditions under which it is used.
|
||||
|
||||
use columnar::ColumnBlockAccessor;
|
||||
|
||||
use super::{Bucket, SegmentTermCollector, TermsAggReqData, VecTermBuckets};
|
||||
use crate::aggregation::agg_data::{AggKind, AggRefNode, AggregationsSegmentCtx};
|
||||
use crate::aggregation::bucket::{
|
||||
get_bucket_pos_f64, prepare_histogram_dense_range, HistogramAggReqData,
|
||||
SegmentHistogramCollector,
|
||||
};
|
||||
use crate::aggregation::buffered_sub_aggs::LowCardSubAggBuffer;
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
|
||||
use crate::aggregation::{f64_from_fastfield_u64, BucketId};
|
||||
|
||||
/// Maximum number of cells (`num_terms × num_time_buckets`) in the fused flat 2D grid. Above this
|
||||
/// the grid would be too large/cache-unfriendly, so we fall back to the general buffered path.
|
||||
/// `1 << 14` cells = 128 KB of `u64` counters, comfortably L2-resident.
|
||||
///
|
||||
/// Since we are only at the top-level, this won't be multiplied by any parent buckets.
|
||||
const MAX_FUSED_GRID_BUCKETS: usize = 16384;
|
||||
|
||||
/// Fused collector for `terms` (low cardinality) × a single `histogram`/`date_histogram` leaf with
|
||||
/// nothing nested below it, when the resulting `num_terms × num_time_buckets` grid is small (see
|
||||
/// [`MAX_FUSED_GRID_BUCKETS`]).
|
||||
///
|
||||
/// It keeps a flat, fully dense 2D counter grid (`counts[term * num_time_buckets + bucket]`) and a
|
||||
/// per-term total. A single pass reads both the term and histogram columns in document order and
|
||||
/// bumps the counters directly — no doc-id buffering, no per-term scattered re-fetch, no dynamic
|
||||
/// dispatch on flush, no per-bucket key/id storage during collection (keys are derived from the
|
||||
/// index at the end).
|
||||
///
|
||||
/// At result time the flat grid is expanded back into the regular term map + histogram storage and
|
||||
/// handed to the shared intermediate-result builders, so cross-segment merging is identical to the
|
||||
/// general path.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SegmentTermHistogramCollector {
|
||||
/// Per-term count of docs *outside* `hard_bounds` (still in `doc_count`, but in no bucket).
|
||||
/// Per-term total = this + the term's `counts` row-sum; left empty when there are no hard
|
||||
/// bounds (every doc is in-bounds, so there's no remainder to track).
|
||||
term_counts: Vec<u32>,
|
||||
/// Flattened `[num_terms * num_time_buckets]` histogram counters (`u32`, see
|
||||
/// `term_counts`).
|
||||
///
|
||||
/// Each term id get its own contiguous slice of `num_time_buckets` histogram counter.
|
||||
/// When we count all docs (#nofilter), we can derive the per-term total as the sum over that
|
||||
/// term's slice.
|
||||
counts: Vec<u32>,
|
||||
/// Histogram buckets per term (the dense time-range length).
|
||||
num_time_buckets: usize,
|
||||
/// `bucket_pos` mapped to time-bucket index 0.
|
||||
base_pos: i64,
|
||||
terms_req_data: TermsAggReqData,
|
||||
/// The (cloned, normalized) histogram request: its column + interval/offset/bounds.
|
||||
hist_req_data: HistogramAggReqData,
|
||||
/// Private block accessors for both columns. We read them together, so each needs its own
|
||||
/// (the shared `agg_data` scratch accessor only holds one block at a time). Owning them keeps
|
||||
/// `collect` independent of `agg_data`.
|
||||
term_block: ColumnBlockAccessor<u64>,
|
||||
hist_block: ColumnBlockAccessor<u64>,
|
||||
/// No hard bounds, so every doc is in-bounds.
|
||||
all_docs_in_bounds: bool,
|
||||
/// Both columns are full (fused-path precondition); cached so `collect` skips the per-block
|
||||
/// cardinality lookup in `fetch_block`.
|
||||
is_full: bool,
|
||||
}
|
||||
|
||||
impl SegmentAggregationCollector for SegmentTermHistogramCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
results: &mut IntermediateAggregationResults,
|
||||
parent_bucket_id: BucketId,
|
||||
) -> crate::Result<()> {
|
||||
debug_assert_eq!(
|
||||
parent_bucket_id, 0,
|
||||
"fused term-histogram collector is top-level only"
|
||||
);
|
||||
// Expand the flat grid back into the regular structures and reuse the shared builders, so
|
||||
// ordering/cut-off/dict handling and cross-segment merging match the general path exactly.
|
||||
let mut bucket_id_provider = BucketIdProvider::default();
|
||||
// Per-term total = histogram row-sum (in-bounds) + `term_counts` (out-of-bounds remainder,
|
||||
// empty when there are no hard bounds).
|
||||
let term_buckets = VecTermBuckets {
|
||||
buckets: self
|
||||
.counts
|
||||
.chunks_exact(self.num_time_buckets)
|
||||
.enumerate()
|
||||
.map(|(term_id, row)| {
|
||||
let in_bounds: u32 = row.iter().sum();
|
||||
let out_of_bounds = self.term_counts.get(term_id).copied().unwrap_or(0);
|
||||
Bucket {
|
||||
count: in_bounds + out_of_bounds,
|
||||
bucket_id: bucket_id_provider.next_bucket_id(),
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
let mut histogram = SegmentHistogramCollector::<()>::from_dense_rows(
|
||||
self.hist_req_data.clone(),
|
||||
self.base_pos,
|
||||
self.num_time_buckets,
|
||||
&self.counts,
|
||||
);
|
||||
let name = self.terms_req_data.name.clone();
|
||||
let bucket = SegmentTermCollector::<VecTermBuckets, LowCardSubAggBuffer>::into_intermediate_bucket_result(
|
||||
&self.terms_req_data,
|
||||
Some(&mut histogram as &mut dyn SegmentAggregationCollector),
|
||||
term_buckets,
|
||||
agg_data,
|
||||
)?;
|
||||
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn collect(
|
||||
&mut self,
|
||||
parent_bucket_id: BucketId,
|
||||
docs: &[crate::DocId],
|
||||
_agg_data: &mut AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
debug_assert_eq!(
|
||||
parent_bucket_id, 0,
|
||||
"fused term-histogram collector is top-level only"
|
||||
);
|
||||
|
||||
// Fetch both columns into our own accessors (we read them together, so they can't share the
|
||||
// single `agg_data` scratch accessor). The collector owns all its inputs, so `collect`
|
||||
// doesn't touch `agg_data`.
|
||||
self.term_block
|
||||
.fetch_block_with_is_full(docs, &self.terms_req_data.accessor, self.is_full);
|
||||
self.hist_block
|
||||
.fetch_block_with_is_full(docs, &self.hist_req_data.accessor, self.is_full);
|
||||
|
||||
// Hoist the loop-invariant fields into locals: the optimizer can't prove the
|
||||
// `self.counts`/`self.term_counts` writes don't alias these `self` fields, so it can't keep
|
||||
// them in registers and re-reads them from memory every iteration — ~15% slower on
|
||||
// `terms_status_with_date_histogram` when read straight from `self`.
|
||||
// Note: check which are actually relevant.
|
||||
let field_type = self.hist_req_data.field_type;
|
||||
let bounds = self.hist_req_data.bounds;
|
||||
let interval = self.hist_req_data.req.interval;
|
||||
let offset = self.hist_req_data.offset;
|
||||
let base_pos = self.base_pos;
|
||||
let num_time_buckets = self.num_time_buckets;
|
||||
let all_docs_in_bounds = self.all_docs_in_bounds;
|
||||
let term_counts = &mut self.term_counts;
|
||||
let counts = &mut self.counts;
|
||||
|
||||
// Both columns are full (checked at construction), so values align with `docs` positionally
|
||||
// and are read together in one pass.
|
||||
// In-bounds docs bump the `counts` grid, out-of-bounds bump `term_counts`; deriving the
|
||||
// total at flush avoids a per-doc `term_counts` RMW that serializes on
|
||||
// store-to-load forwarding.
|
||||
for (term_id, hist_raw) in self.term_block.iter_vals().zip(self.hist_block.iter_vals()) {
|
||||
let term_id = term_id as usize;
|
||||
let val = f64_from_fastfield_u64(hist_raw, field_type);
|
||||
if all_docs_in_bounds || bounds.contains(val) {
|
||||
let bucket = (get_bucket_pos_f64(val, interval, offset) as i64 - base_pos) as usize;
|
||||
debug_assert!(
|
||||
bucket < num_time_buckets,
|
||||
"histogram bucket outside dense range"
|
||||
);
|
||||
counts[term_id * num_time_buckets + bucket] += 1;
|
||||
} else {
|
||||
term_counts[term_id] += 1;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush(&mut self, _agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||
// Nothing is buffered: `collect` writes the flat grid directly.
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prepare_max_bucket(
|
||||
&mut self,
|
||||
_max_bucket: BucketId,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
// Top-level: the flat grid is allocated up front.
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the fused terms×histogram collector for a single top-level parent, when the shape is
|
||||
/// eligible. Returns `Ok(None)` to fall back to the general buffered terms path.
|
||||
///
|
||||
/// Eligibility: top-level, low-cardinality terms over a full column with no missing/include-exclude
|
||||
/// handling; a single `histogram`/`date_histogram` leaf (no nesting below it) over a full column;
|
||||
/// and a `num_terms × num_time_buckets` grid no larger than [`MAX_FUSED_GRID_BUCKETS`].
|
||||
pub(super) fn maybe_build_collector(
|
||||
agg_data: &mut AggregationsSegmentCtx,
|
||||
node: &AggRefNode,
|
||||
terms_req_data: &TermsAggReqData,
|
||||
col_max_val: u64,
|
||||
is_top_level: bool,
|
||||
) -> crate::Result<Option<Box<dyn SegmentAggregationCollector>>> {
|
||||
// Both columns must be full (one value per doc) so their values align positionally with `docs`
|
||||
// and we can zip them. Requiring full columns also makes the terms agg's `missing` config a
|
||||
// no-op (`fetch_block_with_missing` early-returns on full columns), so we needn't check for it.
|
||||
//
|
||||
// We don't cap the term cardinality here: the flat grid is bounded by the total cell count
|
||||
// (`num_terms * num_time_buckets <= MAX_FUSED_GRID_BUCKETS`) checked below, which subsumes it.
|
||||
//
|
||||
// We only allow this at the top-level, since we don't know how many buckets are created. We
|
||||
// are less likely to get enough docs for the preallocation to be worth and there's a risk of
|
||||
// using too much memory. We could check the maximum theoretical buckets up-front and pass
|
||||
// them down.
|
||||
let fuseable = is_top_level
|
||||
// TODO: We can easily support this
|
||||
&& terms_req_data.allowed_term_ids.is_none()
|
||||
&& terms_req_data.accessor.get_cardinality().is_full()
|
||||
// The flat counters are `u32`, bumped once per value, so no count can exceed the column's
|
||||
// value count. (Essentially always true here: the column is full, so its value count
|
||||
// equals the doc count, and `DocId` is `u32`.)
|
||||
&& terms_req_data.accessor.values.num_vals() < u32::MAX
|
||||
&& node.children.len() == 1
|
||||
&& matches!(
|
||||
node.children[0].kind,
|
||||
AggKind::Histogram | AggKind::DateHistogram
|
||||
)
|
||||
&& node.children[0].children.is_empty()
|
||||
&& agg_data.per_request.histogram_req_data[node.children[0].idx_in_req_data]
|
||||
.accessor
|
||||
.get_cardinality()
|
||||
.is_full();
|
||||
if !fuseable {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Clone + normalize the histogram request and get its dense bucket range; only take the fused
|
||||
// path when the flat `num_terms × num_time_buckets` grid is small enough.
|
||||
let Some((hist_req_data, range)) = prepare_histogram_dense_range(agg_data, &node.children[0])?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let num_terms = col_max_val.saturating_add(1) as usize;
|
||||
if num_terms.saturating_mul(range.len) > MAX_FUSED_GRID_BUCKETS {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// No hard bounds means every doc is in-bounds, letting `collect` short-circuit the bounds
|
||||
// check — and leaving `term_counts` (the out-of-bounds remainder) unused, so we skip allocating
|
||||
// it.
|
||||
let all_docs_in_bounds =
|
||||
hist_req_data.bounds.min == f64::MIN && hist_req_data.bounds.max == f64::MAX;
|
||||
let counts = vec![0u32; num_terms * range.len];
|
||||
let term_counts = if all_docs_in_bounds {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![0u32; num_terms]
|
||||
};
|
||||
// Charge both grids to the aggregation memory limit.
|
||||
agg_data.context.limits.add_memory_consumed(
|
||||
((counts.len() + term_counts.len()) * std::mem::size_of::<u32>()) as u64,
|
||||
)?;
|
||||
Ok(Some(Box::new(SegmentTermHistogramCollector {
|
||||
term_counts,
|
||||
counts,
|
||||
num_time_buckets: range.len,
|
||||
base_pos: range.base_pos,
|
||||
terms_req_data: terms_req_data.clone(),
|
||||
hist_req_data,
|
||||
term_block: ColumnBlockAccessor::default(),
|
||||
hist_block: ColumnBlockAccessor::default(),
|
||||
all_docs_in_bounds,
|
||||
is_full: terms_req_data.accessor.get_cardinality().is_full(),
|
||||
})))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{
|
||||
exec_request, exec_request_with_query_and_memory_limit,
|
||||
get_test_index_from_values_and_terms,
|
||||
};
|
||||
use crate::aggregation::AggregationLimitsGuard;
|
||||
|
||||
/// Hand-computed correctness check for the fused terms×histogram fast path
|
||||
/// ([`super::SegmentTermHistogramCollector`]): low-cardinality terms × a histogram leaf over
|
||||
/// full columns, exercised single- and multi-segment.
|
||||
#[test]
|
||||
fn fused_term_histogram_test() -> crate::Result<()> {
|
||||
fused_term_histogram_with_opt(false)?;
|
||||
fused_term_histogram_with_opt(true)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn fused_term_histogram_with_opt(merge_segments: bool) -> crate::Result<()> {
|
||||
// 300 docs: term = {a, b, c} by i % 3, histogram value = i % 20 (interval 1 => buckets
|
||||
// 0..19). gcd(3, 20) = 1, so every (term, bucket) pair occurs exactly 300 / 60 = 5 times.
|
||||
let docs: Vec<(f64, String)> = (0..300u64)
|
||||
.map(|i| {
|
||||
(
|
||||
(i % 20) as f64,
|
||||
["a", "b", "c"][(i % 3) as usize].to_string(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
// Two segments, to also exercise cross-segment merging of the fused per-term histograms.
|
||||
let segments = vec![docs[..150].to_vec(), docs[150..].to_vec()];
|
||||
let index = get_test_index_from_values_and_terms(merge_segments, &segments)?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
|
||||
"by_term": {
|
||||
"terms": { "field": "string_id", "order": { "_key": "asc" } },
|
||||
"aggs": {
|
||||
"histo": { "histogram": { "field": "score_f64", "interval": 1.0 } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
for (term_idx, term) in ["a", "b", "c"].iter().enumerate() {
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["key"], *term);
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["doc_count"], 100);
|
||||
let histo = &res["by_term"]["buckets"][term_idx]["histo"]["buckets"];
|
||||
for b in 0..20usize {
|
||||
assert_eq!(histo[b]["key"], b as f64, "term {term} bucket {b}");
|
||||
assert_eq!(histo[b]["doc_count"], 5, "term {term} bucket {b}");
|
||||
}
|
||||
assert_eq!(histo[20], serde_json::Value::Null);
|
||||
}
|
||||
assert_eq!(res["by_term"]["buckets"][3], serde_json::Value::Null);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A `missing` config on a *full* term column still takes the fused path (the string sentinel
|
||||
/// is just `col_max + 1`, so the column stays low-cardinality). Since no doc is missing, the
|
||||
/// real term buckets must be exactly as without `missing`.
|
||||
#[test]
|
||||
fn fused_term_histogram_with_missing_on_full_column() -> crate::Result<()> {
|
||||
let docs: Vec<(f64, String)> = (0..300u64)
|
||||
.map(|i| {
|
||||
(
|
||||
(i % 20) as f64,
|
||||
["a", "b", "c"][(i % 3) as usize].to_string(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
let index = get_test_index_from_values_and_terms(true, &[docs])?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
|
||||
"by_term": {
|
||||
"terms": { "field": "string_id", "missing": "MISSING", "order": { "_key": "asc" } },
|
||||
"aggs": {
|
||||
"histo": { "histogram": { "field": "score_f64", "interval": 1.0 } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
// Column is full, so "MISSING" never applies: a, b, c are unchanged (100 docs, 5 per
|
||||
// bucket).
|
||||
for (term_idx, term) in ["a", "b", "c"].iter().enumerate() {
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["key"], *term);
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["doc_count"], 100);
|
||||
let histo = &res["by_term"]["buckets"][term_idx]["histo"]["buckets"];
|
||||
for b in 0..20usize {
|
||||
assert_eq!(histo[b]["doc_count"], 5, "term {term} bucket {b}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Term cardinality above the general path's `MAX_NUM_TERMS_FOR_VEC` (100) still fuses: the
|
||||
/// flat grid is bounded by the total cell count (`num_terms * num_time_buckets`), not the
|
||||
/// term count.
|
||||
#[test]
|
||||
fn fused_term_histogram_many_terms() -> crate::Result<()> {
|
||||
let num_terms = 150usize;
|
||||
let docs_per_term = 2usize;
|
||||
// All docs share histogram value 0 (a single bucket), so the grid is 150 x 1 = 150 cells.
|
||||
let docs: Vec<(f64, String)> = (0..num_terms * docs_per_term)
|
||||
.map(|i| (0.0, format!("t{:03}", i % num_terms)))
|
||||
.collect();
|
||||
let index = get_test_index_from_values_and_terms(true, &[docs])?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
|
||||
"by_term": {
|
||||
"terms": { "field": "string_id", "size": 1000, "order": { "_key": "asc" } },
|
||||
"aggs": {
|
||||
"histo": { "histogram": { "field": "score_f64", "interval": 1.0 } }
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
let buckets = res["by_term"]["buckets"].as_array().unwrap();
|
||||
assert_eq!(buckets.len(), num_terms);
|
||||
for (i, bucket) in buckets.iter().enumerate() {
|
||||
assert_eq!(bucket["key"], format!("t{i:03}"));
|
||||
assert_eq!(bucket["doc_count"], docs_per_term as u64);
|
||||
assert_eq!(bucket["histo"]["buckets"][0]["key"], 0.0);
|
||||
assert_eq!(
|
||||
bucket["histo"]["buckets"][0]["doc_count"],
|
||||
docs_per_term as u64
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// `hard_bounds` exercises the non-derived `term_counts` branch: a term's `doc_count` must
|
||||
/// count *every* doc with that term, including docs whose histogram value is outside the
|
||||
/// bounds (those are excluded from the histogram buckets but still counted for the term). This
|
||||
/// is the case where the per-doc `term_counts` increment cannot be replaced by the grid
|
||||
/// row-sum.
|
||||
#[test]
|
||||
fn fused_term_histogram_with_hard_bounds() -> crate::Result<()> {
|
||||
// 300 docs: term = {a, b, c} by i % 3, value = i % 20. Per term: 100 docs, each value in
|
||||
// 0..=19 occurring 5 times.
|
||||
let docs: Vec<(f64, String)> = (0..300u64)
|
||||
.map(|i| {
|
||||
(
|
||||
(i % 20) as f64,
|
||||
["a", "b", "c"][(i % 3) as usize].to_string(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
let index = get_test_index_from_values_and_terms(true, &[docs])?;
|
||||
|
||||
// hard_bounds [5, 14] (inclusive) keeps only values 5..=14 in the histogram (10 buckets);
|
||||
// values 0..=4 and 15..=19 are out of bounds.
|
||||
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
|
||||
"by_term": {
|
||||
"terms": { "field": "string_id", "order": { "_key": "asc" } },
|
||||
"aggs": {
|
||||
"histo": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 1.0,
|
||||
"hard_bounds": { "min": 5.0, "max": 14.0 }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
for (term_idx, term) in ["a", "b", "c"].iter().enumerate() {
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["key"], *term);
|
||||
// doc_count includes the 50 per-term docs whose value is outside [5, 14].
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["doc_count"], 100);
|
||||
let histo = &res["by_term"]["buckets"][term_idx]["histo"]["buckets"];
|
||||
for b in 0..10usize {
|
||||
let key = 5 + b;
|
||||
assert_eq!(histo[b]["key"], key as f64, "term {term} bucket key {key}");
|
||||
assert_eq!(histo[b]["doc_count"], 5, "term {term} bucket {key}");
|
||||
}
|
||||
// Only the 10 in-bounds buckets exist.
|
||||
assert_eq!(histo[10], serde_json::Value::Null);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Non-binding `hard_bounds` (wider than the data, with mid-interval edges) must still produce
|
||||
/// exact results via the derive-from-grid path: since no doc is out of bounds, normalization
|
||||
/// drops the bound, every doc lands in the dense range, and each term's total equals its
|
||||
/// histogram row-sum. This is the case that previously fell back to the per-doc counter only
|
||||
/// because `bounds != [MIN, MAX]`.
|
||||
#[test]
|
||||
fn fused_term_histogram_with_non_binding_hard_bounds() -> crate::Result<()> {
|
||||
// 300 docs: term = {a, b, c} by i % 3, value = i % 20. Data values span [0, 19].
|
||||
let docs: Vec<(f64, String)> = (0..300u64)
|
||||
.map(|i| {
|
||||
(
|
||||
(i % 20) as f64,
|
||||
["a", "b", "c"][(i % 3) as usize].to_string(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
let index = get_test_index_from_values_and_terms(true, &[docs])?;
|
||||
|
||||
// Bounds wider than [0, 19], with mid-interval edges -> they exclude nothing.
|
||||
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
|
||||
"by_term": {
|
||||
"terms": { "field": "string_id", "order": { "_key": "asc" } },
|
||||
"aggs": {
|
||||
"histo": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 1.0,
|
||||
"hard_bounds": { "min": -0.5, "max": 19.5 }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
for (term_idx, term) in ["a", "b", "c"].iter().enumerate() {
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["key"], *term);
|
||||
// Every doc is in-bounds, so the per-term total is the full 100 (as without bounds).
|
||||
assert_eq!(res["by_term"]["buckets"][term_idx]["doc_count"], 100);
|
||||
let histo = &res["by_term"]["buckets"][term_idx]["histo"]["buckets"];
|
||||
for b in 0..20usize {
|
||||
assert_eq!(histo[b]["key"], b as f64, "term {term} bucket {b}");
|
||||
assert_eq!(histo[b]["doc_count"], 5, "term {term} bucket {b}");
|
||||
}
|
||||
assert_eq!(histo[20], serde_json::Value::Null);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Regression: with hard bounds the fused path allocates `term_counts` (one `u32`/term) on top
|
||||
/// of the grid, and that allocation must be charged to the memory limit. With many terms and a
|
||||
/// single time bucket the two are equal in size, so a limit admitting the grid alone but not
|
||||
/// grid + `term_counts` must fail.
|
||||
#[test]
|
||||
fn fused_term_histogram_hard_bounds_charges_term_counts() -> crate::Result<()> {
|
||||
// 16k distinct terms, one doc each; values alternate in/out of the single-bucket bounds
|
||||
// [5, 5] so the bounds bind and `term_counts` is allocated. num_terms=16000,
|
||||
// num_time_buckets=1 => `counts` and `term_counts` are ~64 KB each.
|
||||
let docs: Vec<(f64, String)> = (0..16_000u64)
|
||||
.map(|i| (if i % 2 == 0 { 5.0 } else { 10.0 }, format!("t{i:05}")))
|
||||
.collect();
|
||||
let index = get_test_index_from_values_and_terms(true, &[docs])?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
|
||||
"by_term": {
|
||||
"terms": { "field": "string_id" },
|
||||
"aggs": {
|
||||
"histo": {
|
||||
"histogram": {
|
||||
"field": "score_f64",
|
||||
"interval": 1.0,
|
||||
"hard_bounds": { "min": 5.0, "max": 5.0 }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
// ~96 KB admits the grid (~64 KB) but not grid + `term_counts` (~128 KB).
|
||||
let err = exec_request_with_query_and_memory_limit(
|
||||
agg_req,
|
||||
&index,
|
||||
None,
|
||||
AggregationLimitsGuard::new(Some(96_000), None),
|
||||
)
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("memory limit was exceeded"),
|
||||
"expected a memory-limit error, got: {err}"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -177,17 +177,6 @@ impl SegmentAggregationCollector for TermMissingAgg {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
// TODO: forward to `sub_agg` for nested order paths (`missing_agg>metric`).
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -138,7 +138,6 @@ impl SubAggBuffer for HighCardSubAggBuffer {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
||||
let idx = bucket_id % NUM_PARTITIONS as u32;
|
||||
let slot = &mut self.partitions[idx as usize];
|
||||
@@ -197,7 +196,6 @@ impl SubAggBuffer for LowCardSubAggBuffer {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
||||
let idx = bucket_id as usize;
|
||||
if self.per_bucket_docs.len() <= idx {
|
||||
|
||||
@@ -377,22 +377,7 @@ impl IntermediateMetricResult {
|
||||
MetricResult::ExtendedStats(intermediate_stats.finalize())
|
||||
}
|
||||
IntermediateMetricResult::Sum(intermediate_sum) => {
|
||||
// By default match Elasticsearch: empty / all-missing sum
|
||||
// buckets serialize as `"value": 0`, not `"value": null`.
|
||||
// The non-ES `none_if_no_match` flag on `SumAggregation`
|
||||
// opts into SQL-style `null` for downstream consumers.
|
||||
let none_if_no_match = req
|
||||
.agg
|
||||
.as_sum()
|
||||
.and_then(|sum| sum.none_if_no_match)
|
||||
.unwrap_or(false);
|
||||
let value = intermediate_sum.finalize();
|
||||
if none_if_no_match {
|
||||
MetricResult::Sum(value.into())
|
||||
} else {
|
||||
let value = Some(value.unwrap_or(0.0));
|
||||
MetricResult::Sum(value.into())
|
||||
}
|
||||
MetricResult::Sum(intermediate_sum.finalize().into())
|
||||
}
|
||||
IntermediateMetricResult::Percentiles(percentiles) => MetricResult::Percentiles(
|
||||
percentiles
|
||||
@@ -1019,20 +1004,24 @@ impl IntermediateCompositeBucketResult {
|
||||
) -> crate::Result<BucketResult> {
|
||||
let trimmed_entry_vec =
|
||||
trim_composite_buckets(self.entries, &self.orders, self.target_size)?;
|
||||
let after_key = trimmed_entry_vec
|
||||
.last()
|
||||
.map(|bucket| {
|
||||
let (intermediate_key, _entry) = bucket;
|
||||
intermediate_key
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, intermediate_key)| {
|
||||
let source = &req.sources[idx];
|
||||
(source.name().to_string(), intermediate_key.clone().into())
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
let after_key = if trimmed_entry_vec.len() == req.size as usize {
|
||||
trimmed_entry_vec
|
||||
.last()
|
||||
.map(|bucket| {
|
||||
let (intermediate_key, _entry) = bucket;
|
||||
intermediate_key
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, intermediate_key)| {
|
||||
let source = &req.sources[idx];
|
||||
(source.name().to_string(), intermediate_key.clone().into())
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap()
|
||||
} else {
|
||||
FxHashMap::default()
|
||||
};
|
||||
|
||||
let buckets = trimmed_entry_vec
|
||||
.into_iter()
|
||||
|
||||
@@ -4,7 +4,6 @@ use std::io;
|
||||
|
||||
use columnar::column_values::CompactSpaceU64Accessor;
|
||||
use columnar::{Column, ColumnType, Dictionary, StrColumn};
|
||||
use common::{BitSet, TinySet};
|
||||
use datasketches::hll::{Coupon, HllSketch, HllType, HllUnion};
|
||||
use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet};
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
@@ -21,12 +20,6 @@ use crate::TantivyError;
|
||||
/// 2^11 = 2048 registers, giving ~2.3% relative error and ~1KB per sketch (Hll4).
|
||||
const LG_K: u8 = 11;
|
||||
|
||||
/// Promote FxHashSet<u64> -> PagedBitset at ~3% density (`len * 32 >
|
||||
/// dict_num_terms`). Past this point the bitset (~`dict_num_terms / 7.5`
|
||||
/// bytes) is smaller than the hashset (~10 B/entry minimum) and avoids
|
||||
/// the per-insert hash.
|
||||
const PROMOTION_RATIO: u64 = 32;
|
||||
|
||||
/// # Cardinality
|
||||
///
|
||||
/// The cardinality aggregation allows for computing an estimate
|
||||
@@ -166,13 +159,8 @@ impl CouponCache {
|
||||
let should_use_dense =
|
||||
highest_term_ord < 1_000_000u64 || highest_term_ord < num_terms as u64 * 3u64;
|
||||
if should_use_dense {
|
||||
// We don't really care about the value here. We will populate all the values we will
|
||||
// read anyway.
|
||||
let uninitialized_coupon = Coupon::from_hash(0);
|
||||
let mut coupon_map: Vec<Coupon> =
|
||||
vec![uninitialized_coupon; highest_term_ord as usize + 1];
|
||||
|
||||
for (term_ord, coupon) in term_ords.into_iter().zip(coupons) {
|
||||
let mut coupon_map: Vec<Coupon> = vec![Coupon::EMPTY; highest_term_ord as usize + 1];
|
||||
for (term_ord, coupon) in term_ords.into_iter().zip(coupons.into_iter()) {
|
||||
coupon_map[term_ord as usize] = coupon;
|
||||
}
|
||||
CouponCache::Dense {
|
||||
@@ -189,263 +177,9 @@ impl CouponCache {
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// PagedBitset: a sparse bitset indexed by term_ord.
|
||||
//
|
||||
// Used as the dense alternative to FxHashSet<u64> once a string
|
||||
// cardinality bucket has accumulated enough unique term ordinals.
|
||||
// Memory is bounded to (touched pages) * (page bytes), not
|
||||
// (max_term_ord / 8).
|
||||
//
|
||||
// Page geometry mirrors `PagedTermMap` in `term_agg.rs`: 1024 ords
|
||||
// per page, lazy `Vec<Option<Box<Page>>>` directory.
|
||||
// =================================================================
|
||||
const BITSET_PAGE_SHIFT: u32 = 10;
|
||||
const BITSET_PAGE_BITS: u64 = 1u64 << BITSET_PAGE_SHIFT; // 1024
|
||||
const BITSET_PAGE_MASK: u64 = BITSET_PAGE_BITS - 1;
|
||||
const BITSET_WORDS_PER_PAGE: usize = (BITSET_PAGE_BITS / 64) as usize; // 16
|
||||
|
||||
#[derive(Clone)]
|
||||
struct PagedBitsetPage {
|
||||
words: [TinySet; BITSET_WORDS_PER_PAGE],
|
||||
}
|
||||
|
||||
impl PagedBitsetPage {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
words: [TinySet::empty(); BITSET_WORDS_PER_PAGE],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct PagedBitset {
|
||||
pages: Vec<Option<Box<PagedBitsetPage>>>,
|
||||
/// Cached number of set bits, maintained on insert.
|
||||
count: u64,
|
||||
}
|
||||
|
||||
impl PagedBitset {
|
||||
/// Allocates a directory big enough to hold ords up to and including
|
||||
/// `max_term_ord`. Pages are allocated lazily on first set.
|
||||
fn with_max_term_ord(max_term_ord: u64) -> Self {
|
||||
let max_page_idx = (max_term_ord >> BITSET_PAGE_SHIFT) as usize;
|
||||
let num_pages = max_page_idx + 1;
|
||||
Self {
|
||||
pages: vec![None; num_pages],
|
||||
count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn insert(&mut self, term_ord: u64) {
|
||||
let page_idx = (term_ord >> BITSET_PAGE_SHIFT) as usize;
|
||||
let intra = term_ord & BITSET_PAGE_MASK;
|
||||
let word_idx = (intra >> 6) as usize;
|
||||
let bit_idx = (intra & 63) as u32;
|
||||
|
||||
let page = match &mut self.pages[page_idx] {
|
||||
Some(p) => p,
|
||||
None => {
|
||||
self.pages[page_idx] = Some(Box::new(PagedBitsetPage::new()));
|
||||
self.pages[page_idx].as_mut().unwrap()
|
||||
}
|
||||
};
|
||||
if page.words[word_idx].insert_mut(bit_idx) {
|
||||
self.count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of set bits. O(1).
|
||||
#[inline]
|
||||
fn len(&self) -> u64 {
|
||||
self.count
|
||||
}
|
||||
|
||||
/// Iterate set ords in ascending order.
|
||||
fn iter_sorted(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
self.pages
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(page_idx, page_opt)| page_opt.as_ref().map(|p| (page_idx, p)))
|
||||
.flat_map(|(page_idx, page)| {
|
||||
let page_base_ord = (page_idx as u64) << BITSET_PAGE_SHIFT;
|
||||
page.words
|
||||
.iter()
|
||||
.enumerate()
|
||||
.flat_map(move |(word_idx, &word)| {
|
||||
let word_base_ord = page_base_ord + (word_idx as u64) * 64;
|
||||
word.into_iter()
|
||||
.map(move |bit| word_base_ord + u64::from(bit))
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Threshold below which we use `BitSet` instead of `TermOrdSet`.
|
||||
///
|
||||
/// Both `BitSet` and `FxHashSet<u64>` have the same 32-byte struct, so the comparison is heap only:
|
||||
/// * `BitSet` at T=256: 5 `TinySet` words covering 258 bits (with the missing-value sentinel) =
|
||||
/// 40 bytes.
|
||||
/// * `FxHashSet<u64>` after one insert: 4-bucket hashbrown table ≈ 56 bytes
|
||||
pub(crate) const BITSET_MAX_TERM_ORD: u64 = 256;
|
||||
|
||||
// =================================================================
|
||||
// TermOrdAccumulator: per-bucket abstraction over the entries set.
|
||||
//
|
||||
// Implementations:
|
||||
// - `BitSet` (from `common`): used when `column.max_value()` is small (< BITSET_MAX_TERM_ORD).
|
||||
// Pre-allocated, no promotion.
|
||||
// - `TermOrdSet`: adaptive, starts as FxHashSet and promotes to a paged bitset when occupancy
|
||||
// crosses the density threshold (only if promotion is enabled — typically gated on top-level
|
||||
// aggregation).
|
||||
//
|
||||
// The trait lets `SegmentCardinalityCollector` be generic over the choice
|
||||
// so the hot collect() loop monomorphizes to a direct call (no enum
|
||||
// dispatch per insert).
|
||||
// =================================================================
|
||||
pub(crate) trait TermOrdAccumulator: Sized {
|
||||
/// Construct an empty accumulator.
|
||||
/// `max_term_ord_inclusive` is the largest term_ord that may be
|
||||
/// inserted (used to size pre-allocated bitsets and the dense bitset
|
||||
/// on promotion).
|
||||
fn new(max_term_ord_inclusive: u64) -> Self;
|
||||
fn insert(&mut self, term_ord: u64);
|
||||
/// Bulk insert. Implementations may override to hoist any inner
|
||||
/// dispatch outside the loop. Default loops `insert`.
|
||||
#[inline]
|
||||
fn extend_from_iter<I: IntoIterator<Item = u64>>(&mut self, ords: I) {
|
||||
for ord in ords {
|
||||
self.insert(ord);
|
||||
}
|
||||
}
|
||||
/// Hook called once per ingested block. Adaptive impls use this to
|
||||
/// decide on sparse->dense promotion.
|
||||
fn maybe_compact(&mut self) {}
|
||||
fn len(&self) -> usize;
|
||||
fn iter_ords(&self) -> impl Iterator<Item = u64> + '_;
|
||||
}
|
||||
|
||||
impl TermOrdAccumulator for BitSet {
|
||||
#[inline]
|
||||
fn new(max_term_ord_inclusive: u64) -> Self {
|
||||
// `BitSet::with_max_value(M)` accepts ords in [0, M).
|
||||
// We need ords up to and including `max_term_ord_inclusive`, plus
|
||||
// the missing-value sentinel `column.max_value() + 1`.
|
||||
BitSet::with_max_value((max_term_ord_inclusive + 2) as u32)
|
||||
}
|
||||
#[inline]
|
||||
fn insert(&mut self, term_ord: u64) {
|
||||
BitSet::insert(self, term_ord as u32);
|
||||
}
|
||||
#[inline]
|
||||
fn len(&self) -> usize {
|
||||
BitSet::len(self)
|
||||
}
|
||||
fn iter_ords(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
// `BitSet` itself doesn't expose iteration, but
|
||||
// `BitSet::tinyset(bucket)` does. Walk per-bucket and yield each
|
||||
// set bit. The capacity is `max_value()`; iterating to
|
||||
// `div_ceil(64)` covers every possible ord exactly once.
|
||||
let num_buckets = self.max_value().div_ceil(64);
|
||||
(0..num_buckets).flat_map(move |bucket| {
|
||||
let chunk_base = u64::from(bucket) * 64;
|
||||
self.tinyset(bucket)
|
||||
.into_iter()
|
||||
.map(move |bit| chunk_base + u64::from(bit))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// TermOrdSet: adaptive sparse->dense accumulator.
|
||||
//
|
||||
// Starts as an FxHashSet (cheap when few ords are seen). When occupancy
|
||||
// crosses `len * PROMOTION_RATIO > max_term_ord_inclusive`, drains into
|
||||
// a `PagedBitset` and continues dense. Promotion is one-way.
|
||||
// =================================================================
|
||||
pub(crate) struct TermOrdSet {
|
||||
inner: TermOrdSetInner,
|
||||
/// Largest term_ord that may be inserted. Used for both sizing the
|
||||
/// dense bitset on promotion and as the promotion-threshold reference.
|
||||
max_term_ord_inclusive: u64,
|
||||
}
|
||||
|
||||
enum TermOrdSetInner {
|
||||
Sparse(FxHashSet<u64>),
|
||||
Dense(PagedBitset),
|
||||
}
|
||||
|
||||
impl TermOrdAccumulator for TermOrdSet {
|
||||
fn new(max_term_ord_inclusive: u64) -> Self {
|
||||
Self {
|
||||
inner: TermOrdSetInner::Sparse(FxHashSet::default()),
|
||||
max_term_ord_inclusive,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn insert(&mut self, term_ord: u64) {
|
||||
match &mut self.inner {
|
||||
TermOrdSetInner::Sparse(set) => {
|
||||
set.insert(term_ord);
|
||||
}
|
||||
TermOrdSetInner::Dense(bitset) => bitset.insert(term_ord),
|
||||
}
|
||||
}
|
||||
|
||||
/// Hoist the Sparse/Dense match outside the per-ord loop so that a
|
||||
/// block of inserts dispatches once.
|
||||
fn extend_from_iter<I: IntoIterator<Item = u64>>(&mut self, ords: I) {
|
||||
match &mut self.inner {
|
||||
TermOrdSetInner::Sparse(set) => {
|
||||
for ord in ords {
|
||||
set.insert(ord);
|
||||
}
|
||||
}
|
||||
TermOrdSetInner::Dense(bitset) => {
|
||||
for ord in ords {
|
||||
bitset.insert(ord);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn maybe_compact(&mut self) {
|
||||
let TermOrdSetInner::Sparse(set) = &mut self.inner else {
|
||||
return;
|
||||
};
|
||||
if set.len() as u64 * PROMOTION_RATIO <= self.max_term_ord_inclusive {
|
||||
return;
|
||||
}
|
||||
// Size for ord <= max_term_ord_inclusive plus the missing sentinel
|
||||
// (column.max_value() + 1, which may equal max_term_ord_inclusive
|
||||
// when the column references every dictionary term).
|
||||
let mut bitset = PagedBitset::with_max_term_ord(self.max_term_ord_inclusive + 1);
|
||||
let set = std::mem::take(set);
|
||||
for ord in set {
|
||||
bitset.insert(ord);
|
||||
}
|
||||
self.inner = TermOrdSetInner::Dense(bitset);
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
match &self.inner {
|
||||
TermOrdSetInner::Sparse(set) => set.len(),
|
||||
TermOrdSetInner::Dense(bitset) => bitset.len() as usize,
|
||||
}
|
||||
}
|
||||
|
||||
fn iter_ords(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
match &self.inner {
|
||||
TermOrdSetInner::Sparse(set) => itertools::Either::Left(set.iter().copied()),
|
||||
TermOrdSetInner::Dense(bitset) => itertools::Either::Right(bitset.iter_sorted()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct SegmentCardinalityCollector<S: TermOrdAccumulator> {
|
||||
pub(crate) struct SegmentCardinalityCollector {
|
||||
/// Buckets are Some(_) until they get consumed by into_intermediate_results().
|
||||
buckets: Vec<Option<SegmentCardinalityCollectorBucket<S>>>,
|
||||
buckets: Vec<Option<SegmentCardinalityCollectorBucket>>,
|
||||
accessor_idx: usize,
|
||||
/// The column accessor to access the fast field values.
|
||||
accessor: Column<u64>,
|
||||
@@ -454,13 +188,9 @@ pub(crate) struct SegmentCardinalityCollector<S: TermOrdAccumulator> {
|
||||
/// The missing value normalized to the internal u64 representation of the field type.
|
||||
missing_value_for_accessor: Option<u64>,
|
||||
coupon_cache: Option<CouponCache>,
|
||||
/// Largest term_ord that may be inserted into a bucket. For str columns
|
||||
/// this is `accessor.max_value()`; for non-str columns this is unused
|
||||
/// (no inserts go into `entries`) and set to 0.
|
||||
max_term_ord_inclusive: u64,
|
||||
}
|
||||
|
||||
impl<S: TermOrdAccumulator> Debug for SegmentCardinalityCollector<S> {
|
||||
impl Debug for SegmentCardinalityCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentCardinalityCollector")
|
||||
.field("column_type", &self.column_type)
|
||||
@@ -472,21 +202,16 @@ impl<S: TermOrdAccumulator> Debug for SegmentCardinalityCollector<S> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-bucket state. Shape depends on column kind: str columns dedup
|
||||
/// term ords and only build the HLL sketch at finalization (saves the
|
||||
/// ~96 B `CardinalityCollector` per bucket during collect); numeric/IpAddr
|
||||
/// columns feed the sketch directly during collect.
|
||||
pub(crate) enum SegmentCardinalityCollectorBucket<S: TermOrdAccumulator> {
|
||||
Str(S),
|
||||
Numeric(CardinalityCollector),
|
||||
pub(crate) struct SegmentCardinalityCollectorBucket {
|
||||
cardinality: CardinalityCollector,
|
||||
entries: FxHashSet<u64>,
|
||||
}
|
||||
impl<S: TermOrdAccumulator> SegmentCardinalityCollectorBucket<S> {
|
||||
impl SegmentCardinalityCollectorBucket {
|
||||
#[inline(always)]
|
||||
pub fn new(column_type: ColumnType, max_term_ord_inclusive: u64) -> Self {
|
||||
if column_type == ColumnType::Str {
|
||||
Self::Str(S::new(max_term_ord_inclusive))
|
||||
} else {
|
||||
Self::Numeric(CardinalityCollector::new(column_type as u8))
|
||||
pub fn new(column_type: ColumnType) -> Self {
|
||||
Self {
|
||||
cardinality: CardinalityCollector::new(column_type as u8),
|
||||
entries: FxHashSet::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -497,57 +222,37 @@ impl<S: TermOrdAccumulator> SegmentCardinalityCollectorBucket<S> {
|
||||
//
|
||||
// If the column is str, then the values are dictionary encoded
|
||||
// and have not been added to the sketch yet.
|
||||
// We need to resolves the term ords accumulated in the str entries
|
||||
// with the coupon cache, and append the results to a fresh sketch.
|
||||
// We need to resolves the term ords accumulated in self.entries
|
||||
// with the coupon cache, and append the results to the sketch.
|
||||
fn into_intermediate_metric_result(
|
||||
self,
|
||||
mut self,
|
||||
coupon_cache_opt: Option<&CouponCache>,
|
||||
) -> crate::Result<IntermediateMetricResult> {
|
||||
let cardinality = match self {
|
||||
Self::Str(entries) => {
|
||||
let mut cardinality = CardinalityCollector::new(ColumnType::Str as u8);
|
||||
if let Some(coupon_cache) = coupon_cache_opt {
|
||||
// Sketch must be empty for str columns: coupons are appended here
|
||||
// from the term_ord set (and not directly during collection).
|
||||
assert!(cardinality.sketch.is_empty());
|
||||
append_to_sketch(&entries, coupon_cache, &mut cardinality);
|
||||
}
|
||||
cardinality
|
||||
}
|
||||
Self::Numeric(cardinality) => cardinality,
|
||||
};
|
||||
Ok(IntermediateMetricResult::Cardinality(cardinality))
|
||||
if let Some(coupon_cache) = coupon_cache_opt {
|
||||
assert!(self.cardinality.sketch.is_empty());
|
||||
append_to_sketch(&self.entries, coupon_cache, &mut self.cardinality);
|
||||
}
|
||||
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a coupon cache from the given buckets, dictionary, and optional missing value.
|
||||
/// Returns a mapping from term_ord to the hash (coupon) of the associated term.
|
||||
fn build_coupon_cache<S: TermOrdAccumulator>(
|
||||
buckets: &[Option<SegmentCardinalityCollectorBucket<S>>],
|
||||
fn build_coupon_cache(
|
||||
buckets: &[Option<SegmentCardinalityCollectorBucket>],
|
||||
dictionary: &Dictionary,
|
||||
missing_value_opt: Option<&Key>,
|
||||
) -> io::Result<CouponCache> {
|
||||
// Caller restricts this to str cardinality collectors, so every
|
||||
// present bucket must be the `Str` variant. Pass 1 validates and
|
||||
// computes the capacity hint; pass 2 inserts.
|
||||
let mut max_bucket_len = 0usize;
|
||||
let term_ords_capacity: usize = buckets
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|bucket| bucket.entries.len())
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
* 2;
|
||||
let mut term_ords_set = FxHashSet::with_capacity_and_hasher(term_ords_capacity, FxBuildHasher);
|
||||
for bucket in buckets.iter().flatten() {
|
||||
match bucket {
|
||||
SegmentCardinalityCollectorBucket::Str(entries) => {
|
||||
max_bucket_len = max_bucket_len.max(entries.len());
|
||||
}
|
||||
SegmentCardinalityCollectorBucket::Numeric(_) => {
|
||||
return Err(io::Error::other(
|
||||
"build_coupon_cache invoked with a non-str bucket",
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut term_ords_set = FxHashSet::with_capacity_and_hasher(max_bucket_len * 2, FxBuildHasher);
|
||||
for bucket in buckets.iter().flatten() {
|
||||
if let SegmentCardinalityCollectorBucket::Str(entries) = bucket {
|
||||
term_ords_set.extend(entries.iter_ords());
|
||||
}
|
||||
term_ords_set.extend(bucket.entries.iter().copied());
|
||||
}
|
||||
let mut term_ords: Vec<u64> = term_ords_set.into_iter().collect();
|
||||
term_ords.sort_unstable();
|
||||
@@ -579,8 +284,8 @@ fn build_coupon_cache<S: TermOrdAccumulator>(
|
||||
Ok(CouponCache::new(term_ords, coupons, missing_coupon_opt))
|
||||
}
|
||||
|
||||
fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
term_ords: &S,
|
||||
fn append_to_sketch(
|
||||
term_ords: &FxHashSet<u64>,
|
||||
coupon_cache: &CouponCache,
|
||||
sketch: &mut CardinalityCollector,
|
||||
) {
|
||||
@@ -589,7 +294,7 @@ fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
coupon_map,
|
||||
missing_coupon_opt,
|
||||
} => {
|
||||
for term_ord in term_ords.iter_ords() {
|
||||
for &term_ord in term_ords {
|
||||
if let Some(coupon) = coupon_map
|
||||
.get(term_ord as usize)
|
||||
.copied()
|
||||
@@ -603,8 +308,8 @@ fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
coupon_map,
|
||||
missing_coupon_opt,
|
||||
} => {
|
||||
for term_ord in term_ords.iter_ords() {
|
||||
if let Some(coupon) = coupon_map.get(&term_ord).copied().or(*missing_coupon_opt) {
|
||||
for term_ord in term_ords {
|
||||
if let Some(coupon) = coupon_map.get(term_ord).copied().or(*missing_coupon_opt) {
|
||||
sketch.insert_coupon(coupon);
|
||||
}
|
||||
}
|
||||
@@ -612,13 +317,12 @@ fn append_to_sketch<S: TermOrdAccumulator>(
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: TermOrdAccumulator> SegmentCardinalityCollector<S> {
|
||||
impl SegmentCardinalityCollector {
|
||||
pub fn from_req(
|
||||
column_type: ColumnType,
|
||||
accessor_idx: usize,
|
||||
accessor: Column<u64>,
|
||||
missing_value_for_accessor: Option<u64>,
|
||||
max_term_ord_inclusive: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
buckets: Vec::new(),
|
||||
@@ -627,7 +331,6 @@ impl<S: TermOrdAccumulator> SegmentCardinalityCollector<S> {
|
||||
accessor,
|
||||
missing_value_for_accessor,
|
||||
coupon_cache: None,
|
||||
max_term_ord_inclusive,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -644,9 +347,7 @@ impl<S: TermOrdAccumulator> SegmentCardinalityCollector<S> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: TermOrdAccumulator + 'static> SegmentAggregationCollector
|
||||
for SegmentCardinalityCollector<S>
|
||||
{
|
||||
impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||
fn add_intermediate_aggregation_result(
|
||||
&mut self,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
@@ -701,41 +402,31 @@ impl<S: TermOrdAccumulator + 'static> SegmentAggregationCollector
|
||||
));
|
||||
};
|
||||
let col_block_accessor = &agg_data.column_block_accessor;
|
||||
match bucket {
|
||||
SegmentCardinalityCollectorBucket::Str(entries) => {
|
||||
// Promotion check runs on the pre-block state: the first call
|
||||
// sees an empty set (no-op), and the last block of inserts
|
||||
// doesn't trigger a promotion of a set we won't grow further.
|
||||
// The trait dispatches once per block (via `extend_from_iter`)
|
||||
// for adaptive variants and inlines to a tight loop for the
|
||||
// BitSet path.
|
||||
entries.maybe_compact();
|
||||
entries.extend_from_iter(col_block_accessor.iter_vals());
|
||||
if self.column_type == ColumnType::Str {
|
||||
for term_ord in col_block_accessor.iter_vals() {
|
||||
bucket.entries.insert(term_ord);
|
||||
}
|
||||
SegmentCardinalityCollectorBucket::Numeric(cardinality) => {
|
||||
if self.column_type == ColumnType::IpAddr {
|
||||
let compact_space_accessor = self
|
||||
.accessor
|
||||
.values
|
||||
.clone()
|
||||
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||
.map_err(|_| {
|
||||
TantivyError::AggregationError(
|
||||
crate::aggregation::AggregationError::InternalError(
|
||||
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
|
||||
.to_string(),
|
||||
),
|
||||
)
|
||||
})?;
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||
cardinality.insert(val);
|
||||
}
|
||||
} else {
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
cardinality.insert(val);
|
||||
}
|
||||
}
|
||||
} else if self.column_type == ColumnType::IpAddr {
|
||||
let compact_space_accessor = self
|
||||
.accessor
|
||||
.values
|
||||
.clone()
|
||||
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||
.map_err(|_| {
|
||||
TantivyError::AggregationError(
|
||||
crate::aggregation::AggregationError::InternalError(
|
||||
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
|
||||
.to_string(),
|
||||
),
|
||||
)
|
||||
})?;
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||
bucket.cardinality.insert(val);
|
||||
}
|
||||
} else {
|
||||
for val in col_block_accessor.iter_vals() {
|
||||
bucket.cardinality.insert(val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -748,40 +439,12 @@ impl<S: TermOrdAccumulator + 'static> SegmentAggregationCollector
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> crate::Result<()> {
|
||||
if max_bucket as usize >= self.buckets.len() {
|
||||
let column_type = self.column_type;
|
||||
let max_term_ord_inclusive = self.max_term_ord_inclusive;
|
||||
self.buckets.resize_with(max_bucket as usize + 1, || {
|
||||
Some(SegmentCardinalityCollectorBucket::<S>::new(
|
||||
column_type,
|
||||
max_term_ord_inclusive,
|
||||
))
|
||||
Some(SegmentCardinalityCollectorBucket::new(self.column_type))
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
bucket_id: BucketId,
|
||||
sub_agg_name: &str,
|
||||
sub_agg_property: &str,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
|
||||
if req_data.name != sub_agg_name || !sub_agg_property.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let bucket = self.buckets.get(bucket_id as usize)?.as_ref()?;
|
||||
// For string columns the sketch isn't built until finalization; the
|
||||
// term_ord set's len is the exact distinct count. For numeric columns
|
||||
// the sketch is populated during collect.
|
||||
match bucket {
|
||||
SegmentCardinalityCollectorBucket::Str(entries) => Some(entries.len() as f64),
|
||||
SegmentCardinalityCollectorBucket::Numeric(cardinality) => {
|
||||
Some(cardinality.sketch.estimate().trunc())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -826,7 +489,7 @@ impl<'de> Deserialize<'de> for CardinalityCollector {
|
||||
impl CardinalityCollector {
|
||||
fn new(salt: u8) -> Self {
|
||||
Self {
|
||||
sketch: HllSketch::new(LG_K, HllType::Hll8),
|
||||
sketch: HllSketch::new(LG_K, HllType::Hll4),
|
||||
salt,
|
||||
}
|
||||
}
|
||||
@@ -857,7 +520,7 @@ impl CardinalityCollector {
|
||||
let mut union = HllUnion::new(LG_K);
|
||||
union.update(&self.sketch);
|
||||
union.update(&right.sketch);
|
||||
self.sketch = union.to_sketch(HllType::Hll8);
|
||||
self.sketch = union.to_sketch(HllType::Hll4);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -929,134 +592,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build a single-segment string-cardinality index with 32 unique terms.
|
||||
/// `column.max_value() = 31` is well below `BITSET_MAX_TERM_ORD`,
|
||||
/// so the bucket exercises the `BitSet` path end to end.
|
||||
#[test]
|
||||
fn cardinality_aggregation_test_str_bitset() -> crate::Result<()> {
|
||||
let terms: Vec<String> = (0..32).map(|i| format!("term_{i}")).collect();
|
||||
let term_refs: Vec<Vec<&str>> = terms.iter().map(|t| vec![t.as_str()]).collect::<Vec<_>>();
|
||||
// single segment so we have a single dictionary of 32 terms.
|
||||
let index = get_test_index_from_terms(true, &term_refs)?;
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"cardinality": {
|
||||
"cardinality": { "field": "string_id" }
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
assert_eq!(res["cardinality"]["value"], 32.0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// `BitSet` path with a `missing` parameter: the column-level missing
|
||||
/// sentinel (`column.max_value() + 1`) flows into the bitset, the
|
||||
/// dict lookup filter at finalization drops it, and the missing
|
||||
/// coupon is applied separately.
|
||||
#[test]
|
||||
fn cardinality_aggregation_test_str_bitset_with_missing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let name_field = schema_builder.add_text_field("name", STRING | FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
for i in 0..16 {
|
||||
let term = format!("t{i:02}");
|
||||
writer.add_document(doc!(name_field => term)).unwrap();
|
||||
}
|
||||
// One empty doc, exercising the missing sentinel.
|
||||
writer.add_document(doc!()).unwrap();
|
||||
writer.commit().unwrap();
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "name",
|
||||
"missing": "MISSING_SENTINEL_KEY",
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index).unwrap();
|
||||
// 16 distinct real terms + 1 distinct "missing" value = 17.
|
||||
assert_eq!(res["cardinality"]["value"], 17.0);
|
||||
}
|
||||
|
||||
/// Unit-test the PagedBitset itself: cross-page inserts produce sorted
|
||||
/// iteration, len() matches the inserted set, and duplicates are
|
||||
/// idempotent.
|
||||
#[test]
|
||||
fn paged_bitset_basic() {
|
||||
use super::PagedBitset;
|
||||
// Span several pages: BITSET_PAGE_BITS = 1024, so ords > 1024 land
|
||||
// on the second page, > 2048 on the third, etc.
|
||||
let ords = [0u64, 1, 63, 64, 1023, 1024, 1025, 4096, 4097, 9999, 10_000];
|
||||
let max_ord = *ords.iter().max().unwrap();
|
||||
let mut bitset = PagedBitset::with_max_term_ord(max_ord);
|
||||
for &ord in &ords {
|
||||
bitset.insert(ord);
|
||||
// Idempotent: inserting again must not increase count.
|
||||
bitset.insert(ord);
|
||||
}
|
||||
assert_eq!(bitset.len(), ords.len() as u64);
|
||||
let collected: Vec<u64> = bitset.iter_sorted().collect();
|
||||
let mut expected: Vec<u64> = ords.to_vec();
|
||||
expected.sort_unstable();
|
||||
assert_eq!(collected, expected);
|
||||
}
|
||||
|
||||
/// Unit-test `TermOrdSet`: starts Sparse, promotes to Dense on
|
||||
/// `maybe_compact` once the density threshold is crossed, and
|
||||
/// `iter_ords()` yields the same set in either state. Ords spanning
|
||||
/// multiple paged-bitset pages exercise the Dense iter ordering.
|
||||
#[test]
|
||||
fn term_ord_set_promotes_on_maybe_compact() {
|
||||
use super::{TermOrdAccumulator, TermOrdSet, PROMOTION_RATIO};
|
||||
// Pick max so promotion needs few inserts: len * RATIO > max with
|
||||
// RATIO=32 and max=64 trips at len=3 (3*32=96 > 64).
|
||||
let max_term_ord = 64u64;
|
||||
let mut set = <TermOrdSet as TermOrdAccumulator>::new(max_term_ord);
|
||||
// Two inserts: should stay Sparse after maybe_compact (2 * RATIO = 64, not > 64).
|
||||
set.insert(0);
|
||||
set.insert(7);
|
||||
set.maybe_compact();
|
||||
assert_eq!(set.len(), 2);
|
||||
|
||||
// Third insert promotes on next maybe_compact.
|
||||
set.insert(20);
|
||||
assert_eq!(set.len(), 3);
|
||||
// Sanity check: at len=3, 3 * PROMOTION_RATIO = 96 > 64.
|
||||
assert!(3u64 * PROMOTION_RATIO > max_term_ord);
|
||||
set.maybe_compact();
|
||||
|
||||
// Post-promotion: extending continues to work.
|
||||
set.insert(15);
|
||||
set.insert(15); // dup
|
||||
assert_eq!(set.len(), 4);
|
||||
|
||||
let mut collected: Vec<u64> = set.iter_ords().collect();
|
||||
collected.sort_unstable();
|
||||
assert_eq!(collected, vec![0, 7, 15, 20]);
|
||||
}
|
||||
|
||||
/// Unit-test the `BitSet` impl of `TermOrdAccumulator`: insert,
|
||||
/// dedup, and iter_ords order.
|
||||
#[test]
|
||||
fn bitset_accumulator_basic() {
|
||||
use common::BitSet;
|
||||
|
||||
use super::TermOrdAccumulator;
|
||||
let mut set = <BitSet as TermOrdAccumulator>::new(255);
|
||||
for ord in [0u64, 1, 63, 64, 65, 128, 200, 200, 0] {
|
||||
<BitSet as TermOrdAccumulator>::insert(&mut set, ord);
|
||||
}
|
||||
assert_eq!(<BitSet as TermOrdAccumulator>::len(&set), 7);
|
||||
let collected: Vec<u64> = set.iter_ords().collect();
|
||||
assert_eq!(collected, vec![0, 1, 63, 64, 65, 128, 200]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_aggregation_u64() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -1148,42 +683,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A JSON path that resolves to both a Str column and a numeric column
|
||||
/// produces two collector instances per segment — one with `Str` buckets
|
||||
/// and one with `Numeric` buckets. Their `IntermediateMetricResult`s must
|
||||
/// merge into the union cardinality.
|
||||
#[test]
|
||||
fn cardinality_aggregation_json_str_and_numeric() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_json_field("json", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut writer = index.writer_for_tests()?;
|
||||
writer.add_document(doc!(field => json!({"value": "hello"})))?;
|
||||
writer.add_document(doc!(field => json!({"value": "world"})))?;
|
||||
writer.add_document(doc!(field => json!({"value": "hello"})))?; // dup str
|
||||
writer.add_document(doc!(field => json!({"value": i64::from_u64(7u64)})))?;
|
||||
writer.add_document(doc!(field => json!({"value": i64::from_u64(42u64)})))?;
|
||||
writer.add_document(doc!(field => json!({"value": i64::from_u64(7u64)})))?; // dup num
|
||||
writer.commit()?;
|
||||
}
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"cardinality": {
|
||||
"cardinality": {
|
||||
"field": "json.value"
|
||||
},
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// 4 distinct values: "hello", "world", 7, 42.
|
||||
assert_eq!(res["cardinality"]["value"], 4.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cardinality_collector_serde_roundtrip() {
|
||||
use super::CardinalityCollector;
|
||||
|
||||
@@ -399,26 +399,6 @@ impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
bucket_id: BucketId,
|
||||
sub_agg_name: &str,
|
||||
sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
if self.name != sub_agg_name {
|
||||
return None;
|
||||
}
|
||||
let extended = self.buckets.get(bucket_id as usize)?;
|
||||
// Finalize is a pure read of accumulators — calling it here for the cutoff sort
|
||||
// doesn't disturb the eventual intermediate result.
|
||||
extended
|
||||
.finalize()
|
||||
.get_value(sub_agg_property)
|
||||
.ok()
|
||||
.flatten()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -312,26 +312,6 @@ impl SegmentAggregationCollector for SegmentPercentilesCollector {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
bucket_id: BucketId,
|
||||
sub_agg_name: &str,
|
||||
sub_agg_property: &str,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
if agg_data.get_metric_req_data(self.accessor_idx).name != sub_agg_name {
|
||||
return None;
|
||||
}
|
||||
let percentile: f64 = sub_agg_property.parse().ok()?;
|
||||
if !(0.0..=100.0).contains(&percentile) {
|
||||
return None;
|
||||
}
|
||||
let bucket = self.buckets.get(bucket_id as usize)?;
|
||||
// DDSketch.quantile is a pure read; calling it here for the cutoff sort does
|
||||
// not affect the intermediate state used for the final result.
|
||||
bucket.sketch.quantile(percentile / 100.0).ok().flatten()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -321,40 +321,6 @@ impl<const COLUMN_TYPE_ID: u8> SegmentAggregationCollector
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
bucket_id: BucketId,
|
||||
sub_agg_name: &str,
|
||||
sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
if self.name != sub_agg_name {
|
||||
return None;
|
||||
}
|
||||
let stats = self.buckets.get(bucket_id as usize)?;
|
||||
// The property depends on what we're collecting:
|
||||
// - StatsType::Stats exposes count/sum/min/max/avg via dotted property.
|
||||
// - Single-value kinds (Sum/Count/Min/Max/Average) expect an empty property and return
|
||||
// the value they were configured to collect.
|
||||
let prop = match self.collecting_for {
|
||||
StatsType::Stats if !sub_agg_property.is_empty() => sub_agg_property,
|
||||
StatsType::Sum if sub_agg_property.is_empty() => "sum",
|
||||
StatsType::Count if sub_agg_property.is_empty() => "count",
|
||||
StatsType::Max if sub_agg_property.is_empty() => "max",
|
||||
StatsType::Min if sub_agg_property.is_empty() => "min",
|
||||
StatsType::Average if sub_agg_property.is_empty() => "avg",
|
||||
_ => return None,
|
||||
};
|
||||
match prop {
|
||||
"count" => Some(stats.count as f64),
|
||||
"sum" => Some(stats.sum),
|
||||
"min" if stats.count > 0 => Some(stats.min),
|
||||
"max" if stats.count > 0 => Some(stats.max),
|
||||
"avg" if stats.count > 0 => Some(stats.sum / stats.count as f64),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
||||
@@ -27,16 +27,6 @@ pub struct SumAggregation {
|
||||
/// { "field": "my_numbers", "missing": "10.0" }
|
||||
#[serde(default, deserialize_with = "deserialize_option_f64")]
|
||||
pub missing: Option<f64>,
|
||||
/// Non-Elasticsearch extension. When `Some(true)`, the serialized result
|
||||
/// returns `"value": null` if no values were collected (all documents had
|
||||
/// missing/NULL values for the field), matching the behavior of `min`,
|
||||
/// `max`, and `avg`. When `None` or `Some(false)` (the default) the
|
||||
/// result returns `"value": 0`, matching Elasticsearch.
|
||||
///
|
||||
/// Intended for SQL-style consumers where `SUM` of zero rows is `NULL`
|
||||
/// and must be distinguishable from a bucket that genuinely sums to `0`.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub none_if_no_match: Option<bool>,
|
||||
}
|
||||
|
||||
impl SumAggregation {
|
||||
@@ -45,7 +35,6 @@ impl SumAggregation {
|
||||
Self {
|
||||
field: field_name,
|
||||
missing: None,
|
||||
none_if_no_match: None,
|
||||
}
|
||||
}
|
||||
/// Returns the field name the aggregation is computed on.
|
||||
@@ -70,104 +59,8 @@ impl IntermediateSum {
|
||||
pub fn merge_fruits(&mut self, other: IntermediateSum) {
|
||||
self.stats.merge_fruits(other.stats);
|
||||
}
|
||||
/// Computes the final sum value.
|
||||
///
|
||||
/// Returns `None` when no values were collected, matching the Rust-side
|
||||
/// behavior of `IntermediateMin`, `IntermediateMax`, and
|
||||
/// `IntermediateAvg`. The Elasticsearch-vs-SQL choice for the
|
||||
/// user-visible result is made at the boundary in
|
||||
/// [`IntermediateMetricResult::into_final_metric_result`]: by default
|
||||
/// `None` is coerced to `Some(0.0)` to match Elasticsearch
|
||||
/// (`"value": 0`), and the [`SumAggregation::none_if_no_match`] flag
|
||||
/// opts out of that coercion for SQL-style consumers.
|
||||
/// Computes the final minimum value.
|
||||
pub fn finalize(&self) -> Option<f64> {
|
||||
let stats = self.stats.finalize();
|
||||
if stats.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(stats.sum)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_sum_finalize_returns_none_when_no_values() {
|
||||
// Default IntermediateSum has count=0 — finalize should return None,
|
||||
// matching MIN/MAX/AVG behavior for all-NULL groups.
|
||||
let sum = IntermediateSum::default();
|
||||
assert_eq!(sum.finalize(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_finalize_returns_value_when_has_values() {
|
||||
let mut sum = IntermediateSum::default();
|
||||
// Merge in a result that has actual values
|
||||
let stats = IntermediateStats {
|
||||
count: 3,
|
||||
sum: 42.0,
|
||||
min: 10.0,
|
||||
max: 20.0,
|
||||
..Default::default()
|
||||
};
|
||||
let other = IntermediateSum::from_stats(stats);
|
||||
sum.merge_fruits(other);
|
||||
assert_eq!(sum.finalize(), Some(42.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_merge_two_empty_still_none() {
|
||||
let mut a = IntermediateSum::default();
|
||||
let b = IntermediateSum::default();
|
||||
a.merge_fruits(b);
|
||||
assert_eq!(a.finalize(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_aggregation_empty_index_default_matches_es() -> crate::Result<()> {
|
||||
use serde_json::json;
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||
|
||||
// Empty index — sum has no values to collect.
|
||||
let values: Vec<Vec<&str>> = vec![];
|
||||
let index = get_test_index_from_terms(false, &values)?;
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"score_sum": { "sum": { "field": "score" } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// Default: match Elasticsearch — empty sum serializes as 0, not null.
|
||||
assert_eq!(res["score_sum"]["value"], 0.0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sum_aggregation_empty_index_none_if_no_match_opt_in() -> crate::Result<()> {
|
||||
use serde_json::json;
|
||||
|
||||
use crate::aggregation::agg_req::Aggregations;
|
||||
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||
|
||||
let values: Vec<Vec<&str>> = vec![];
|
||||
let index = get_test_index_from_terms(false, &values)?;
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"score_sum": { "sum": { "field": "score", "none_if_no_match": true } }
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
// Opt-in non-ES extension — empty sum serializes as null.
|
||||
assert!(
|
||||
res["score_sum"]["value"].is_null(),
|
||||
"expected null, got {:?}",
|
||||
res["score_sum"]["value"]
|
||||
);
|
||||
Ok(())
|
||||
Some(self.stats.finalize().sum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -644,17 +644,6 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
_bucket_id: BucketId,
|
||||
_sub_agg_name: &str,
|
||||
_sub_agg_property: &str,
|
||||
_agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
// top_hits is not a numeric metric and cannot be used as an order target.
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -76,31 +76,6 @@ pub trait SegmentAggregationCollector: Debug {
|
||||
fn flush(&mut self, _agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute the segment-level metric value of the named direct-child metric for `bucket_id`.
|
||||
///
|
||||
/// Used by parent term aggs that order by a sub-aggregation: the parent sorts on
|
||||
/// this value and cuts off at segment time, matching the approximation tradeoff
|
||||
/// Elasticsearch makes for any sub-agg ordering.
|
||||
///
|
||||
/// `sub_agg_property` is the dotted suffix (e.g. `"sum"` in `mystats.sum`); empty when
|
||||
/// the metric is a single-value kind such as cardinality.
|
||||
///
|
||||
/// Returns `None` only on name mismatch, unknown property, or empty bucket. Implementations
|
||||
/// may finalize their per-bucket state (e.g. compute a percentile from a sketch); calls
|
||||
/// must be idempotent so the final intermediate result is unaffected.
|
||||
///
|
||||
/// No default impl on purpose: every collector must decide explicitly whether it
|
||||
/// produces a metric value, forwards into children (single-bucket aggs), or rejects
|
||||
/// the lookup. A silent `None` default would let a parent term agg's cutoff sort all
|
||||
/// buckets to the same key and drop arbitrary winners.
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
bucket_id: BucketId,
|
||||
sub_agg_name: &str,
|
||||
sub_agg_property: &str,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64>;
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -162,21 +137,4 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_metric_value(
|
||||
&self,
|
||||
bucket_id: BucketId,
|
||||
sub_agg_name: &str,
|
||||
sub_agg_property: &str,
|
||||
agg_data: &AggregationsSegmentCtx,
|
||||
) -> Option<f64> {
|
||||
for agg in &self.aggs {
|
||||
if let Some(value) =
|
||||
agg.compute_metric_value(bucket_id, sub_agg_name, sub_agg_property, agg_data)
|
||||
{
|
||||
return Some(value);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,13 +389,6 @@ impl SegmentCollector for FacetSegmentCollector {
|
||||
}
|
||||
let mut facet = vec![];
|
||||
let (facet_ord, facet_depth) = self.unique_facet_ords[collapsed_facet_ord];
|
||||
// u64::MAX is used as a sentinel for unmapped ordinals (e.g. when a
|
||||
// document has the exact registered facet, not a child of it).
|
||||
// Passing it to ord_to_term would resolve to the last dictionary
|
||||
// entry and produce a spurious facet from an unrelated branch.
|
||||
if facet_ord == u64::MAX {
|
||||
continue;
|
||||
}
|
||||
// TODO handle errors.
|
||||
if facet_dict.ord_to_term(facet_ord, &mut facet).is_ok() {
|
||||
if let Some((end_collapsed_facet, _)) = facet
|
||||
@@ -821,63 +814,6 @@ mod tests {
|
||||
assert!(!super::is_child_facet(&b"foo\0bar"[..], &b"foo"[..]));
|
||||
assert!(!super::is_child_facet(&b"foo"[..], &b"foobar\0baz"[..]));
|
||||
}
|
||||
|
||||
// Regression test for https://github.com/quickwit-oss/tantivy/issues/2494
|
||||
// When a document has the exact registered facet path (not just a child),
|
||||
// harvest() must not turn the unmapped sentinel into a spurious root entry.
|
||||
#[test]
|
||||
fn test_facet_collector_wrong_root() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
let facets: Vec<&str> = vec![
|
||||
"/science-fiction/asimov",
|
||||
"/science-fiction/clarke",
|
||||
"/science-fiction/dick",
|
||||
"/science-fiction/herbert",
|
||||
"/science-fiction/orwell",
|
||||
// This exact match on the registered facet is the bug trigger:
|
||||
// its ordinal maps to the sentinel (u64::MAX, 0) in the collapse
|
||||
// mapping, which without the fix resolves to an unrelated term.
|
||||
"/fantasy/epic-fantasy",
|
||||
"/fantasy/epic-fantasy/tolkien",
|
||||
"/fantasy/epic-fantasy/martin",
|
||||
];
|
||||
for facet_str in &facets {
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from(*facet_str)
|
||||
))?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let term = Term::from_facet(facet_field, &Facet::from("/fantasy/epic-fantasy"));
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field("facet");
|
||||
facet_collector.add_facet("/fantasy/epic-fantasy");
|
||||
let counts: FacetCounts = searcher.search(&query, &facet_collector)?;
|
||||
|
||||
let result: Vec<(String, u64)> = counts
|
||||
.get("/")
|
||||
.map(|(facet, count)| (facet.to_string(), count))
|
||||
.collect();
|
||||
|
||||
// Only children of /fantasy/epic-fantasy should appear, not /science-fiction
|
||||
assert_eq!(
|
||||
result,
|
||||
vec![
|
||||
("/fantasy/epic-fantasy/martin".to_string(), 1),
|
||||
("/fantasy/epic-fantasy/tolkien".to_string(), 1),
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -52,7 +52,7 @@ impl<T: FastValue> SortKeyComputer for SortByStaticFastValue<T> {
|
||||
if schema_type != T::to_type() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field `{}` is of type {schema_type:?}, not of the type {:?}.",
|
||||
self.field,
|
||||
&self.field,
|
||||
T::to_type()
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use common::{ByteCount, HasLen};
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::directory::error::OpenReadError;
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
|
||||
@@ -160,10 +159,12 @@ impl SegmentReader {
|
||||
let postings_file = segment.open_read(SegmentComponent::Postings)?;
|
||||
let postings_composite = CompositeFile::open(&postings_file)?;
|
||||
|
||||
let positions_composite = match segment.open_read(SegmentComponent::Positions) {
|
||||
Ok(positions_file) => CompositeFile::open(&positions_file)?,
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => CompositeFile::empty(),
|
||||
Err(open_read_error) => return Err(open_read_error.into()),
|
||||
let positions_composite = {
|
||||
if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) {
|
||||
CompositeFile::open(&positions_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
@@ -322,7 +323,7 @@ impl SegmentReader {
|
||||
// Without expand dots enabled dots need to be escaped.
|
||||
let escaped_json_path = json_path.replace('.', "\\.");
|
||||
let full_path = format!("{field_name}.{escaped_json_path}");
|
||||
let full_path_unescaped = format!("{}.{}", field_name, json_path);
|
||||
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
|
||||
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
|
||||
full_path
|
||||
} else {
|
||||
|
||||
@@ -249,12 +249,6 @@ impl BlockSegmentPostings {
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// Returns the decoded term-frequency buffer for the current block.
|
||||
#[inline]
|
||||
pub(crate) fn freq_output_array(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
@@ -287,33 +281,6 @@ impl BlockSegmentPostings {
|
||||
doc
|
||||
}
|
||||
|
||||
/// Returns the number of documents with a doc id strictly smaller than `target`
|
||||
/// (i.e. the *rank* of `target` in this posting list).
|
||||
///
|
||||
/// This jumps to the block that may contain `target` through the skip list, so no
|
||||
/// skipped block is decoded; a single block is then decoded to locate `target`
|
||||
/// within it. The cost is therefore `O(number_of_skip_list_entries)` plus one block
|
||||
/// decode, rather than `O(doc_freq)`.
|
||||
///
|
||||
/// Like [`Self::seek`], the underlying cursor only ever moves forward. This method
|
||||
/// must be called with **non-decreasing** `target` values (galloping); calling it
|
||||
/// with a `target` smaller than a previous one yields an incorrect result. `target`
|
||||
/// must be a valid doc id (i.e. `target <= TERMINATED`), exactly as for `seek`.
|
||||
///
|
||||
/// Edge cases: returns `0` when `target` is smaller than every doc id, and
|
||||
/// `doc_freq()` when `target` is larger than every doc id.
|
||||
pub fn rank(&mut self, target: DocId) -> u32 {
|
||||
if self.doc_freq == 0 {
|
||||
return 0;
|
||||
}
|
||||
// `within` = number of docs in the landed block with a doc id < target.
|
||||
let within = self.seek(target);
|
||||
// `remaining_docs` counts the landed block and everything after it, so the
|
||||
// difference is the number of docs in all blocks strictly before it.
|
||||
let docs_before_block = self.doc_freq - self.skip_reader.remaining_docs();
|
||||
docs_before_block + within as u32
|
||||
}
|
||||
|
||||
pub(crate) fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
@@ -331,11 +298,6 @@ impl BlockSegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn has_remaining_docs(&self) -> bool {
|
||||
self.skip_reader.has_remaining_docs()
|
||||
}
|
||||
|
||||
pub(crate) fn block_is_loaded(&self) -> bool {
|
||||
self.block_loaded
|
||||
}
|
||||
@@ -595,38 +557,4 @@ mod tests {
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_rank() -> crate::Result<()> {
|
||||
// ~8 blocks worth of docs so the skip list is actually exercised.
|
||||
let docs: Vec<DocId> = (0..1000u32).map(|i| i * 3).collect();
|
||||
let mut block_postings = build_block_postings(&docs[..])?;
|
||||
let doc_freq = block_postings.doc_freq();
|
||||
|
||||
// rank(target) must equal the number of docs strictly below target.
|
||||
// Targets are queried in non-decreasing order, as the API requires.
|
||||
// `target` values must be a valid doc id (<= TERMINATED) and non-decreasing.
|
||||
let targets = [
|
||||
0u32, 1, 2, 3, 4, 299, 300, 301, 1500, 2996, 2997, 3000, 10_000,
|
||||
];
|
||||
for &target in &targets {
|
||||
let expected = docs.iter().filter(|&&d| d < target).count() as u32;
|
||||
assert_eq!(
|
||||
block_postings.rank(target),
|
||||
expected,
|
||||
"rank({target}) mismatch"
|
||||
);
|
||||
}
|
||||
|
||||
// Edge cases: below the first doc -> 0, above the last doc -> doc_freq.
|
||||
let mut fresh = build_block_postings(&docs[..])?;
|
||||
assert_eq!(fresh.rank(0), 0);
|
||||
let mut fresh = build_block_postings(&docs[..])?;
|
||||
assert_eq!(fresh.rank(1_000_000), doc_freq);
|
||||
|
||||
// Empty postings: rank is always 0.
|
||||
let mut empty = BlockSegmentPostings::empty();
|
||||
assert_eq!(empty.rank(42), 0);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -275,9 +275,8 @@ impl Recorder for TfAndPositionRecorder {
|
||||
mod tests {
|
||||
|
||||
use common::write_u32_vint;
|
||||
use stacker::MemoryArena;
|
||||
|
||||
use super::{BufferLender, Recorder, TermFrequencyRecorder, VInt32Reader};
|
||||
use super::{BufferLender, VInt32Reader};
|
||||
|
||||
#[test]
|
||||
fn test_buffer_lender() {
|
||||
@@ -315,98 +314,4 @@ mod tests {
|
||||
let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
|
||||
assert_eq!(&res[..], &vals[..]);
|
||||
}
|
||||
|
||||
// ── TermFrequencyRecorder ─────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_has_term_freq() {
|
||||
let rec = TermFrequencyRecorder::default();
|
||||
assert!(
|
||||
rec.has_term_freq(),
|
||||
"TermFrequencyRecorder must advertise term-frequency support"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_term_doc_freq_single_doc() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// Record one document with two term occurrences.
|
||||
rec.new_doc(0, &mut arena);
|
||||
rec.record_position(0, &mut arena);
|
||||
rec.record_position(1, &mut arena);
|
||||
rec.close_doc(&mut arena);
|
||||
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(1),
|
||||
"term_doc_freq should be 1 after recording one document"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_term_doc_freq_multiple_docs() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// Three documents with 1, 3, and 2 occurrences respectively.
|
||||
for (doc, tf) in [(0u32, 1u32), (5, 3), (10, 2)] {
|
||||
rec.new_doc(doc, &mut arena);
|
||||
for pos in 0..tf {
|
||||
rec.record_position(pos, &mut arena);
|
||||
}
|
||||
rec.close_doc(&mut arena);
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(3),
|
||||
"term_doc_freq should equal the number of documents recorded"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_zero_docs() {
|
||||
let rec = TermFrequencyRecorder::default();
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(0),
|
||||
"term_doc_freq should be 0 before any document is recorded"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_single_occurrence_per_doc() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// Each document has exactly one occurrence — the minimum non-trivial case.
|
||||
for doc in [1u32, 2, 100] {
|
||||
rec.new_doc(doc, &mut arena);
|
||||
rec.record_position(0, &mut arena);
|
||||
rec.close_doc(&mut arena);
|
||||
}
|
||||
|
||||
assert_eq!(rec.term_doc_freq(), Some(3));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_high_frequency_doc() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// A document where the term appears many times.
|
||||
rec.new_doc(42, &mut arena);
|
||||
for pos in 0..1000 {
|
||||
rec.record_position(pos, &mut arena);
|
||||
}
|
||||
rec.close_doc(&mut arena);
|
||||
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(1),
|
||||
"term_doc_freq counts documents, not occurrences"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -146,11 +146,6 @@ impl SkipReader {
|
||||
skip_reader
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn has_remaining_docs(&self) -> bool {
|
||||
self.remaining_docs != 0
|
||||
}
|
||||
|
||||
pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) {
|
||||
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
|
||||
0
|
||||
@@ -187,12 +182,6 @@ impl SkipReader {
|
||||
self.last_doc_in_block
|
||||
}
|
||||
|
||||
/// Number of docs from the start of the current block to the end of the postings
|
||||
/// (i.e. the current block plus every block after it).
|
||||
pub(crate) fn remaining_docs(&self) -> u32 {
|
||||
self.remaining_docs
|
||||
}
|
||||
|
||||
pub fn position_offset(&self) -> u64 {
|
||||
self.position_offset
|
||||
}
|
||||
|
||||
@@ -50,7 +50,7 @@ fn block_max_was_too_low_advance_one_scorer(
|
||||
scorers: &mut [TermScorerWithMaxScore],
|
||||
pivot_len: usize,
|
||||
) {
|
||||
debug_assert!(scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
let mut scorer_to_seek = pivot_len - 1;
|
||||
let mut global_max_score = scorers[scorer_to_seek].max_score;
|
||||
let mut doc_to_seek_after = scorers[scorer_to_seek].last_doc_in_block();
|
||||
@@ -76,7 +76,7 @@ fn block_max_was_too_low_advance_one_scorer(
|
||||
scorers[scorer_to_seek].seek(doc_to_seek_after);
|
||||
|
||||
restore_ordering(scorers, scorer_to_seek);
|
||||
debug_assert!(scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
}
|
||||
|
||||
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
|
||||
@@ -90,7 +90,7 @@ fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) {
|
||||
}
|
||||
term_scorers.swap(i, i - 1);
|
||||
}
|
||||
debug_assert!(term_scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
debug_assert!(is_sorted(term_scorers.iter().map(|scorer| scorer.doc())));
|
||||
}
|
||||
|
||||
// Attempts to advance all term_scorers between `&term_scorers[0..before_len]` to the pivot.
|
||||
@@ -150,21 +150,17 @@ pub fn block_wand(
|
||||
mut threshold: Score,
|
||||
callback: &mut dyn FnMut(u32, Score) -> Score,
|
||||
) {
|
||||
scorers.retain(|scorer| scorer.doc() < TERMINATED);
|
||||
if scorers.len() == 1 {
|
||||
let scorer = scorers.pop().unwrap();
|
||||
return block_wand_single_scorer(scorer, threshold, callback);
|
||||
}
|
||||
let mut scorers: Vec<TermScorerWithMaxScore> = scorers
|
||||
.iter_mut()
|
||||
.map(TermScorerWithMaxScore::from)
|
||||
.collect();
|
||||
// At this point we need to ensure that the scorers are sorted!
|
||||
scorers.sort_by_key(|scorer| scorer.doc());
|
||||
// At this point we need to ensure that the scorers are sorted!
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
|
||||
find_pivot_doc(&scorers[..], threshold)
|
||||
{
|
||||
debug_assert!(scorers.iter().map(|scorer| scorer.doc()).is_sorted());
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
debug_assert_ne!(pivot_doc, TERMINATED);
|
||||
debug_assert!(before_pivot_len < pivot_len);
|
||||
|
||||
@@ -232,7 +228,7 @@ pub fn block_wand_single_scorer(
|
||||
loop {
|
||||
// We position the scorer on a block that can reach
|
||||
// the threshold.
|
||||
while scorer.block_max_score() <= threshold {
|
||||
while scorer.block_max_score() < threshold {
|
||||
let last_doc_in_block = scorer.last_doc_in_block();
|
||||
if last_doc_in_block == TERMINATED {
|
||||
return;
|
||||
@@ -290,6 +286,18 @@ impl DerefMut for TermScorerWithMaxScore<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
|
||||
if let Some(first) = it.next() {
|
||||
let mut prev = first;
|
||||
for doc in it {
|
||||
if doc < prev {
|
||||
return false;
|
||||
}
|
||||
prev = doc;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::cmp::Ordering;
|
||||
@@ -1,464 +0,0 @@
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::Scorer;
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
|
||||
/// Block-max pruning for top-K over intersection of term scorers.
|
||||
///
|
||||
/// Uses the least-frequent term as "leader" to define 128-doc processing windows.
|
||||
/// For each window, the sum of block_max_scores is compared to the current threshold;
|
||||
/// if the block can't beat it, the entire block is skipped.
|
||||
///
|
||||
/// Within non-skipped blocks, individual documents are pruned by checking whether
|
||||
/// leader_score + sum(secondary block_max_scores) can exceed the threshold before
|
||||
/// performing the expensive intersection membership check (seeking into secondary scorers).
|
||||
///
|
||||
/// # Preconditions
|
||||
/// - `scorers` has at least 2 elements
|
||||
/// - All scorers read frequencies (`FreqReadingOption::ReadFreq`)
|
||||
pub(crate) fn block_wand_intersection(
|
||||
mut scorers: Vec<TermScorer>,
|
||||
mut threshold: Score,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) {
|
||||
assert!(scorers.len() >= 2);
|
||||
|
||||
// Sort by cost (ascending). scorers[0] becomes the "leader" (rarest term).
|
||||
scorers.sort_by_key(TermScorer::size_hint);
|
||||
|
||||
let (leader, secondaries) = scorers.split_first_mut().unwrap();
|
||||
|
||||
// Precompute global max scores for early termination checks.
|
||||
let leader_max_score: Score = leader.max_score();
|
||||
let secondaries_global_max_sum: Score = secondaries.iter().map(TermScorer::max_score).sum();
|
||||
|
||||
// Early exit: no document can possibly beat the threshold.
|
||||
if leader_max_score + secondaries_global_max_sum <= threshold {
|
||||
return;
|
||||
}
|
||||
|
||||
// Borrow fieldnorm reader and BM25 weight before the main loop.
|
||||
// These are immutable references to disjoint fields from block_cursor,
|
||||
// but Rust's borrow checker can't see through method calls, so we
|
||||
// extract them once upfront.
|
||||
let fieldnorm_reader = leader.fieldnorm_reader().clone();
|
||||
let bm25_weight = leader.bm25_weight().clone();
|
||||
|
||||
let mut doc = leader.doc();
|
||||
|
||||
let mut secondary_block_max_scores: Box<[f32]> =
|
||||
vec![0.0f32; secondaries.len()].into_boxed_slice();
|
||||
let mut secondary_suffix_block_max: Box<[f32]> =
|
||||
vec![0.0f32; secondaries.len()].into_boxed_slice();
|
||||
|
||||
while doc < TERMINATED {
|
||||
// --- Phase 1: Block-level pruning ---
|
||||
//
|
||||
// Position all skip readers on the block containing `doc`.
|
||||
// seek_block is cheap: it only advances the skip reader, no block decompression.
|
||||
leader.seek_block(doc);
|
||||
let leader_block_max: Score = leader.block_max_score();
|
||||
|
||||
// Compute the window end as the minimum last_doc_in_block across all scorers.
|
||||
// This ensures the block_max values are valid for all docs in [doc, window_end].
|
||||
// Different scorers have independently aligned blocks, so we must use the
|
||||
// smallest window where all block_max values hold.
|
||||
let mut window_end: DocId = leader.last_doc_in_block();
|
||||
|
||||
let mut secondary_block_max_sum: Score = 0.0;
|
||||
let num_secondaries = secondaries.len();
|
||||
for (idx, secondary) in secondaries.iter_mut().enumerate() {
|
||||
secondary.block_cursor().seek_block(doc);
|
||||
if !secondary.block_cursor().has_remaining_docs() {
|
||||
return;
|
||||
}
|
||||
window_end = window_end.min(secondary.last_doc_in_block());
|
||||
let bms = secondary.block_max_score();
|
||||
secondary_block_max_scores[idx] = bms;
|
||||
secondary_block_max_sum += bms;
|
||||
}
|
||||
|
||||
if leader_block_max + secondary_block_max_sum <= threshold {
|
||||
// The entire window cannot beat the threshold. Skip past it.
|
||||
doc = window_end + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Phase 2: Batch processing within the window ---
|
||||
//
|
||||
// Score-first approach: decode the leader's block, filter by threshold,
|
||||
// then check intersection membership only for survivors. This avoids expensive
|
||||
// secondary seeks for docs that can't beat the threshold.
|
||||
let block_cursor = leader.block_cursor();
|
||||
// seek loads the block and returns the in-block index of the first doc >= `doc`.
|
||||
let start_idx = block_cursor.seek(doc);
|
||||
|
||||
// Use the branchless binary search on the doc decoder to find the first
|
||||
// index past window_end.
|
||||
let end_idx = block_cursor
|
||||
.doc_decoder
|
||||
.seek_within_block(window_end + 1)
|
||||
.min(block_cursor.block_len());
|
||||
|
||||
let block_docs = &block_cursor.doc_decoder.output_array()[start_idx..end_idx];
|
||||
let block_freqs = &block_cursor.freq_output_array()[start_idx..end_idx];
|
||||
|
||||
// Pass 1: Batch-compute leader BM25 scores and branchlessly filter
|
||||
// candidates that can't beat the threshold.
|
||||
//
|
||||
// The trick: always write to the buffer at `num_candidates`, then
|
||||
// conditionally advance the count. The compiler can turn this into
|
||||
// a cmov instead of a branch, avoiding misprediction costs.
|
||||
let score_threshold = threshold - secondary_block_max_sum;
|
||||
let mut candidate_doc_ids = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
let mut candidate_scores = [0.0f32; COMPRESSION_BLOCK_SIZE];
|
||||
let mut num_candidates = 0usize;
|
||||
|
||||
for (candidate_doc, term_freq) in
|
||||
block_docs.iter().copied().zip(block_freqs.iter().copied())
|
||||
{
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(candidate_doc);
|
||||
let leader_score = bm25_weight.score(fieldnorm_id, term_freq);
|
||||
candidate_doc_ids[num_candidates] = candidate_doc;
|
||||
candidate_scores[num_candidates] = leader_score;
|
||||
num_candidates += (leader_score > score_threshold) as usize;
|
||||
}
|
||||
|
||||
// Precompute suffix sums: suffix[i] = sum of block_max for secondaries[i+1..].
|
||||
// Used in Phase 2 to prune candidates that can't beat threshold even with
|
||||
// remaining secondaries contributing their block_max.
|
||||
if num_candidates == 0 {
|
||||
doc = window_end + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut running = 0.0f32;
|
||||
for idx in (0..num_secondaries).rev() {
|
||||
secondary_suffix_block_max[idx] = running;
|
||||
running += secondary_block_max_scores[idx];
|
||||
}
|
||||
|
||||
// Pass 2: Check intersection membership only for survivors.
|
||||
// score_threshold may be stale (threshold can increase from callbacks),
|
||||
// but that's conservative — we may check a few extra candidates, never miss one.
|
||||
'next_candidate: for candidate_idx in 0..num_candidates {
|
||||
let candidate_doc = candidate_doc_ids[candidate_idx];
|
||||
let mut total_score: Score = candidate_scores[candidate_idx];
|
||||
|
||||
for (secondary_idx, secondary) in secondaries.iter_mut().enumerate() {
|
||||
// If a previous candidate already advanced this secondary past
|
||||
// candidate_doc, the candidate can't be in the intersection.
|
||||
if secondary.doc() > candidate_doc {
|
||||
continue 'next_candidate;
|
||||
}
|
||||
let seek_result = secondary.seek(candidate_doc);
|
||||
if seek_result != candidate_doc {
|
||||
continue 'next_candidate;
|
||||
}
|
||||
total_score += secondary.score();
|
||||
|
||||
// Prune: even if all remaining secondaries score at their block max,
|
||||
// can we still beat the threshold?
|
||||
if total_score + secondary_suffix_block_max[secondary_idx] <= threshold {
|
||||
continue 'next_candidate;
|
||||
}
|
||||
}
|
||||
|
||||
// All secondaries matched.
|
||||
if total_score > threshold {
|
||||
threshold = callback(candidate_doc, total_score);
|
||||
|
||||
if leader_max_score + secondaries_global_max_sum <= threshold {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
doc = window_end + 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{Bm25Weight, Scorer};
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
|
||||
struct Float(Score);
|
||||
|
||||
impl Eq for Float {}
|
||||
|
||||
impl PartialEq for Float {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Float {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Float {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)
|
||||
}
|
||||
}
|
||||
|
||||
fn nearly_equals(left: Score, right: Score) -> bool {
|
||||
(left - right).abs() < 0.0001 * (left + right).abs()
|
||||
}
|
||||
|
||||
/// Run block_wand_intersection and collect (doc, score) pairs above threshold.
|
||||
fn compute_checkpoints_block_wand_intersection(
|
||||
term_scorers: Vec<TermScorer>,
|
||||
top_k: usize,
|
||||
) -> Vec<(DocId, Score)> {
|
||||
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(top_k);
|
||||
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
|
||||
let mut limit: Score = 0.0;
|
||||
|
||||
let callback = &mut |doc, score| {
|
||||
heap.push(Float(score));
|
||||
if heap.len() > top_k {
|
||||
heap.pop().unwrap();
|
||||
}
|
||||
if heap.len() == top_k {
|
||||
limit = heap.peek().unwrap().0;
|
||||
}
|
||||
if !nearly_equals(score, limit) {
|
||||
checkpoints.push((doc, score));
|
||||
}
|
||||
limit
|
||||
};
|
||||
|
||||
super::block_wand_intersection(term_scorers, Score::MIN, callback);
|
||||
checkpoints
|
||||
}
|
||||
|
||||
/// Naive baseline: intersect by iterating all docs.
|
||||
fn compute_checkpoints_naive_intersection(
|
||||
mut term_scorers: Vec<TermScorer>,
|
||||
top_k: usize,
|
||||
) -> Vec<(DocId, Score)> {
|
||||
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(top_k);
|
||||
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
|
||||
let mut limit = Score::MIN;
|
||||
|
||||
// Sort by cost to use the cheapest as driver.
|
||||
term_scorers.sort_by_key(|s| s.cost());
|
||||
|
||||
let (leader, secondaries) = term_scorers.split_first_mut().unwrap();
|
||||
|
||||
let mut doc = leader.doc();
|
||||
while doc != TERMINATED {
|
||||
let mut all_match = true;
|
||||
for secondary in secondaries.iter_mut() {
|
||||
let secondary_doc = secondary.doc();
|
||||
let seek_result = if secondary_doc <= doc {
|
||||
secondary.seek(doc)
|
||||
} else {
|
||||
secondary_doc
|
||||
};
|
||||
if seek_result != doc {
|
||||
all_match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if all_match {
|
||||
let score: Score =
|
||||
leader.score() + secondaries.iter_mut().map(|s| s.score()).sum::<Score>();
|
||||
|
||||
if score > limit {
|
||||
heap.push(Float(score));
|
||||
if heap.len() > top_k {
|
||||
heap.pop().unwrap();
|
||||
}
|
||||
if heap.len() == top_k {
|
||||
limit = heap.peek().unwrap().0;
|
||||
}
|
||||
if !nearly_equals(score, limit) {
|
||||
checkpoints.push((doc, score));
|
||||
}
|
||||
}
|
||||
}
|
||||
doc = leader.advance();
|
||||
}
|
||||
checkpoints
|
||||
}
|
||||
|
||||
const MAX_TERM_FREQ: u32 = 100u32;
|
||||
|
||||
fn posting_list(max_doc: u32) -> BoxedStrategy<Vec<(DocId, u32)>> {
|
||||
(1..max_doc + 1)
|
||||
.prop_flat_map(move |doc_freq| {
|
||||
(
|
||||
proptest::bits::bitset::sampled(doc_freq as usize, 0..max_doc as usize),
|
||||
proptest::collection::vec(1u32..MAX_TERM_FREQ, doc_freq as usize),
|
||||
)
|
||||
})
|
||||
.prop_map(|(docset, term_freqs)| {
|
||||
docset
|
||||
.iter()
|
||||
.map(|doc| doc as u32)
|
||||
.zip(term_freqs.iter().cloned())
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
#[expect(clippy::type_complexity)]
|
||||
fn gen_term_scorers(num_scorers: usize) -> BoxedStrategy<(Vec<Vec<(DocId, u32)>>, Vec<u32>)> {
|
||||
(1u32..100u32)
|
||||
.prop_flat_map(move |max_doc: u32| {
|
||||
(
|
||||
proptest::collection::vec(posting_list(max_doc), num_scorers),
|
||||
proptest::collection::vec(2u32..10u32 * MAX_TERM_FREQ, max_doc as usize),
|
||||
)
|
||||
})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
fn test_block_wand_intersection_aux(posting_lists: &[Vec<(DocId, u32)>], fieldnorms: &[u32]) {
|
||||
// Repeat docs 64 times to create multi-block scenarios, matching block_wand.rs test
|
||||
// strategy.
|
||||
const REPEAT: usize = 64;
|
||||
let fieldnorms_expanded: Vec<u32> = fieldnorms
|
||||
.iter()
|
||||
.cloned()
|
||||
.flat_map(|fieldnorm| std::iter::repeat_n(fieldnorm, REPEAT))
|
||||
.collect();
|
||||
|
||||
let postings_lists_expanded: Vec<Vec<(DocId, u32)>> = posting_lists
|
||||
.iter()
|
||||
.map(|posting_list| {
|
||||
posting_list
|
||||
.iter()
|
||||
.cloned()
|
||||
.flat_map(|(doc, term_freq)| {
|
||||
(0_u32..REPEAT as u32).map(move |offset| {
|
||||
(
|
||||
doc * (REPEAT as u32) + offset,
|
||||
if offset == 0 { term_freq } else { 1 },
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<(DocId, u32)>>()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let total_fieldnorms: u64 = fieldnorms_expanded
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|fieldnorm| fieldnorm as u64)
|
||||
.sum();
|
||||
let average_fieldnorm = (total_fieldnorms as Score) / (fieldnorms_expanded.len() as Score);
|
||||
let max_doc = fieldnorms_expanded.len();
|
||||
|
||||
let make_scorers = || -> Vec<TermScorer> {
|
||||
postings_lists_expanded
|
||||
.iter()
|
||||
.map(|postings| {
|
||||
let bm25_weight = Bm25Weight::for_one_term(
|
||||
postings.len() as u64,
|
||||
max_doc as u64,
|
||||
average_fieldnorm,
|
||||
);
|
||||
TermScorer::create_for_test(postings, &fieldnorms_expanded[..], bm25_weight)
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
for top_k in 1..4 {
|
||||
let checkpoints_optimized =
|
||||
compute_checkpoints_block_wand_intersection(make_scorers(), top_k);
|
||||
let checkpoints_naive = compute_checkpoints_naive_intersection(make_scorers(), top_k);
|
||||
assert_eq!(
|
||||
checkpoints_optimized.len(),
|
||||
checkpoints_naive.len(),
|
||||
"Mismatch in checkpoint count for top_k={top_k}"
|
||||
);
|
||||
for (&(left_doc, left_score), &(right_doc, right_score)) in
|
||||
checkpoints_optimized.iter().zip(checkpoints_naive.iter())
|
||||
{
|
||||
assert_eq!(left_doc, right_doc);
|
||||
assert!(
|
||||
nearly_equals(left_score, right_score),
|
||||
"Score mismatch for doc {left_doc}: {left_score} vs {right_score}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
||||
#[test]
|
||||
fn test_block_wand_intersection_two_scorers(
|
||||
(posting_lists, fieldnorms) in gen_term_scorers(2)
|
||||
) {
|
||||
test_block_wand_intersection_aux(&posting_lists[..], &fieldnorms[..]);
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
||||
#[test]
|
||||
fn test_block_wand_intersection_three_scorers(
|
||||
(posting_lists, fieldnorms) in gen_term_scorers(3)
|
||||
) {
|
||||
test_block_wand_intersection_aux(&posting_lists[..], &fieldnorms[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_wand_intersection_disjoint() {
|
||||
// Two posting lists with no overlap — intersection is empty.
|
||||
let fieldnorms: Vec<u32> = vec![10; 200];
|
||||
let average_fieldnorm = 10.0;
|
||||
let postings_a: Vec<(DocId, u32)> = (0..100).map(|d| (d, 1)).collect();
|
||||
let postings_b: Vec<(DocId, u32)> = (100..200).map(|d| (d, 1)).collect();
|
||||
|
||||
let scorer_a = TermScorer::create_for_test(
|
||||
&postings_a,
|
||||
&fieldnorms,
|
||||
Bm25Weight::for_one_term(100, 200, average_fieldnorm),
|
||||
);
|
||||
let scorer_b = TermScorer::create_for_test(
|
||||
&postings_b,
|
||||
&fieldnorms,
|
||||
Bm25Weight::for_one_term(100, 200, average_fieldnorm),
|
||||
);
|
||||
|
||||
let checkpoints = compute_checkpoints_block_wand_intersection(vec![scorer_a, scorer_b], 10);
|
||||
assert!(checkpoints.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_wand_intersection_all_overlap() {
|
||||
// Two posting lists with full overlap.
|
||||
let fieldnorms: Vec<u32> = vec![10; 50];
|
||||
let average_fieldnorm = 10.0;
|
||||
let postings: Vec<(DocId, u32)> = (0..50).map(|d| (d, 3)).collect();
|
||||
|
||||
let make_scorer = || {
|
||||
TermScorer::create_for_test(
|
||||
&postings,
|
||||
&fieldnorms,
|
||||
Bm25Weight::for_one_term(50, 50, average_fieldnorm),
|
||||
)
|
||||
};
|
||||
|
||||
let checkpoints_opt =
|
||||
compute_checkpoints_block_wand_intersection(vec![make_scorer(), make_scorer()], 5);
|
||||
let checkpoints_naive =
|
||||
compute_checkpoints_naive_intersection(vec![make_scorer(), make_scorer()], 5);
|
||||
assert_eq!(checkpoints_opt.len(), checkpoints_naive.len());
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,6 @@ use crate::{DocId, Score};
|
||||
|
||||
enum SpecializedScorer {
|
||||
TermUnion(Vec<TermScorer>),
|
||||
TermIntersection(Vec<TermScorer>),
|
||||
Other(Box<dyn Scorer>),
|
||||
}
|
||||
|
||||
@@ -50,9 +49,10 @@ where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
assert!(!scorers.is_empty());
|
||||
if scorers.len() == 1 && !scorers[0].is::<TermScorer>() {
|
||||
if scorers.len() == 1 {
|
||||
return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand
|
||||
}
|
||||
|
||||
{
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
|
||||
if is_all_term_queries {
|
||||
@@ -66,9 +66,6 @@ where
|
||||
{
|
||||
// Block wand is only available if we read frequencies.
|
||||
return SpecializedScorer::TermUnion(scorers);
|
||||
} else if scorers.len() == 1 {
|
||||
// Single TermScorer without freq reading — unwrap directly.
|
||||
return SpecializedScorer::Other(Box::new(scorers.into_iter().next().unwrap()));
|
||||
} else {
|
||||
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
@@ -96,13 +93,6 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
|
||||
Box::new(union_scorer)
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|s| Box::new(s) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
intersect_scorers(boxed_scorers, num_docs)
|
||||
}
|
||||
SpecializedScorer::Other(scorer) => scorer,
|
||||
}
|
||||
}
|
||||
@@ -307,43 +297,14 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
|
||||
// Result depends entirely on MUST + any removed AllScorers.
|
||||
let combined_all_scorer_count = must_special_scorer_counts.num_all_scorers
|
||||
+ should_special_scorer_counts.num_all_scorers;
|
||||
|
||||
// Try to detect a pure TermScorer intersection for block-max optimization.
|
||||
// Preconditions: no removed AllScorers, at least 2 scorers, all TermScorer
|
||||
// with frequency reading enabled.
|
||||
if combined_all_scorer_count == 0
|
||||
&& must_scorers.len() >= 2
|
||||
&& must_scorers.iter().all(|s| s.is::<TermScorer>())
|
||||
{
|
||||
let term_scorers: Vec<TermScorer> = must_scorers
|
||||
.into_iter()
|
||||
.map(|s| *(s.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
|
||||
.collect();
|
||||
if term_scorers
|
||||
.iter()
|
||||
.all(|s| s.freq_reading_option() == FreqReadingOption::ReadFreq)
|
||||
{
|
||||
SpecializedScorer::TermIntersection(term_scorers)
|
||||
} else {
|
||||
let must_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|s| Box::new(s) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
let boxed_scorer: Box<dyn Scorer> =
|
||||
effective_must_scorer(must_scorers, 0, reader.max_doc(), num_docs)
|
||||
.unwrap_or_else(|| Box::new(EmptyScorer));
|
||||
SpecializedScorer::Other(boxed_scorer)
|
||||
}
|
||||
} else {
|
||||
let boxed_scorer: Box<dyn Scorer> = effective_must_scorer(
|
||||
must_scorers,
|
||||
combined_all_scorer_count,
|
||||
reader.max_doc(),
|
||||
num_docs,
|
||||
)
|
||||
.unwrap_or_else(|| Box::new(EmptyScorer));
|
||||
SpecializedScorer::Other(boxed_scorer)
|
||||
}
|
||||
let boxed_scorer: Box<dyn Scorer> = effective_must_scorer(
|
||||
must_scorers,
|
||||
combined_all_scorer_count,
|
||||
reader.max_doc(),
|
||||
num_docs,
|
||||
)
|
||||
.unwrap_or_else(|| Box::new(EmptyScorer));
|
||||
SpecializedScorer::Other(boxed_scorer)
|
||||
}
|
||||
(ShouldScorersCombinationMethod::Optional(should_scorer), must_scorers) => {
|
||||
// Optional SHOULD: contributes to scoring but not required for matching.
|
||||
@@ -502,21 +463,15 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
callback: &mut dyn FnMut(DocId, Score),
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
|
||||
let num_docs = reader.num_docs();
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
|
||||
let mut union_scorer = BufferedUnionScorer::build(
|
||||
term_scorers,
|
||||
&self.score_combiner_fn,
|
||||
reader.num_docs(),
|
||||
);
|
||||
for_each_scorer(&mut union_scorer, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|term_scorer| Box::new(term_scorer) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
let mut intersection = intersect_scorers(boxed_scorers, num_docs);
|
||||
for_each_scorer(intersection.as_mut(), callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_scorer(scorer.as_mut(), callback);
|
||||
}
|
||||
@@ -530,23 +485,17 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
callback: &mut dyn FnMut(&[DocId]),
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
|
||||
let num_docs = reader.num_docs();
|
||||
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
|
||||
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
|
||||
let mut union_scorer = BufferedUnionScorer::build(
|
||||
term_scorers,
|
||||
&self.score_combiner_fn,
|
||||
reader.num_docs(),
|
||||
);
|
||||
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
.into_iter()
|
||||
.map(|term_scorer| Box::new(term_scorer) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
let mut intersection = intersect_scorers(boxed_scorers, num_docs);
|
||||
for_each_docset_buffered(intersection.as_mut(), &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
|
||||
}
|
||||
@@ -575,9 +524,6 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
super::block_wand(term_scorers, threshold, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
super::block_wand_intersection(term_scorers, threshold, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
|
||||
}
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
mod block_wand_intersection;
|
||||
mod block_wand_union;
|
||||
mod block_wand;
|
||||
mod boolean_query;
|
||||
mod boolean_weight;
|
||||
|
||||
pub(crate) use self::block_wand_intersection::block_wand_intersection;
|
||||
pub(crate) use self::block_wand_union::{block_wand, block_wand_single_scorer};
|
||||
pub(crate) use self::block_wand::{block_wand, block_wand_single_scorer};
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::boolean_weight::BooleanWeight;
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::fmt;
|
||||
|
||||
use crate::docset::{SeekDangerResult, COLLECT_BLOCK_BUFFER_LEN};
|
||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
||||
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
|
||||
|
||||
@@ -119,10 +119,6 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
|
||||
self.docset.seek(target)
|
||||
}
|
||||
|
||||
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
|
||||
self.docset.seek_danger(target)
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
||||
self.docset.fill_buffer(buffer)
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::ops::RangeInclusive;
|
||||
|
||||
use columnar::Column;
|
||||
|
||||
use crate::docset::SeekDangerResult;
|
||||
use crate::{DocId, DocSet, TERMINATED};
|
||||
|
||||
/// Helper to have a cursor over a vec of docids
|
||||
@@ -185,37 +184,6 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
||||
doc
|
||||
}
|
||||
|
||||
/// `seek_danger` only needs to answer whether `target` itself matches, so it does a cheap
|
||||
/// point lookup on the column instead of scanning forward to materialize the next match (the
|
||||
/// expensive part of a regular `seek`).
|
||||
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
|
||||
// Covers `target == TERMINATED` and any target past the last doc: no match is possible.
|
||||
if target >= self.column.num_docs() {
|
||||
return SeekDangerResult::SeekLowerBound(TERMINATED);
|
||||
}
|
||||
|
||||
if self.is_last_seek_distance_large(target) {
|
||||
self.reset_fetch_range();
|
||||
}
|
||||
self.last_seek_pos_opt = Some(target);
|
||||
|
||||
let is_match = self
|
||||
.column
|
||||
.values_for_doc(target)
|
||||
.any(|value| self.value_range.contains(&value));
|
||||
if is_match {
|
||||
// Leave the docset in a valid state positioned on `target`, so `doc()` returns it and a
|
||||
// following `advance()` resumes the scan right after it.
|
||||
self.loaded_docs.get_cleared_data().push(target);
|
||||
self.next_fetch_start = target + 1;
|
||||
SeekDangerResult::Found
|
||||
} else {
|
||||
// `target` is not in the docset. The next match is strictly greater than `target`, so
|
||||
// `target + 1` is a valid lower bound. We may leave the docset in an invalid state.
|
||||
SeekDangerResult::SeekLowerBound(target + 1)
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
// TODO: Implement a better size hint
|
||||
self.column.num_docs() / 10
|
||||
@@ -241,148 +209,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::{Bound, RangeInclusive};
|
||||
use std::ops::Bound;
|
||||
|
||||
use columnar::Column;
|
||||
|
||||
use super::RangeDocSet;
|
||||
use crate::collector::Count;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::docset::{SeekDangerResult, TERMINATED};
|
||||
use crate::query::RangeQuery;
|
||||
use crate::{schema, DocSet, Index, IndexBuilder, TantivyDocument, Term};
|
||||
|
||||
/// Builds a single-segment index where doc `i` carries `values_for_doc(i)` in a u64 fast
|
||||
/// field, then returns its column so we can drive a `RangeDocSet` directly.
|
||||
fn build_u64_column(
|
||||
num_docs: usize,
|
||||
values_for_doc: impl Fn(usize) -> Vec<u64>,
|
||||
) -> Column<u64> {
|
||||
let mut schema_builder = schema::SchemaBuilder::new();
|
||||
let value_field = schema_builder.add_u64_field("value", schema::FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
for i in 0..num_docs {
|
||||
let mut doc = TantivyDocument::new();
|
||||
for v in values_for_doc(i) {
|
||||
doc.add_u64(value_field, v);
|
||||
}
|
||||
writer.add_document(doc).unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
}
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
searcher
|
||||
.segment_reader(0)
|
||||
.fast_fields()
|
||||
.u64("value")
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn range_docset(
|
||||
value_range: RangeInclusive<u64>,
|
||||
num_docs: usize,
|
||||
values_for_doc: impl Fn(usize) -> Vec<u64>,
|
||||
) -> RangeDocSet<u64> {
|
||||
RangeDocSet::new(value_range, build_u64_column(num_docs, values_for_doc))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seek_danger_found_leaves_valid_state() {
|
||||
// Even docs match the range, odd docs do not.
|
||||
let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);
|
||||
|
||||
// Matching target: `Found`, and the docset is positioned exactly on it.
|
||||
assert_eq!(docset.seek_danger(10), SeekDangerResult::Found);
|
||||
assert_eq!(docset.doc(), 10);
|
||||
// A following advance resumes the scan right after the found doc.
|
||||
assert_eq!(docset.advance(), 12);
|
||||
assert_eq!(docset.doc(), 12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seek_danger_miss_returns_lower_bound() {
|
||||
let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);
|
||||
|
||||
// Odd target does not match: lower bound is strictly greater than the target and never
|
||||
// skips past the next real match (here doc 12, the first even doc after 11).
|
||||
match docset.seek_danger(11) {
|
||||
SeekDangerResult::SeekLowerBound(lower_bound) => {
|
||||
assert!(lower_bound > 11);
|
||||
assert!(lower_bound <= 12);
|
||||
}
|
||||
SeekDangerResult::Found => panic!("11 should not match"),
|
||||
}
|
||||
// After a miss we may be in an invalid state; another seek_danger recovers it.
|
||||
assert_eq!(docset.seek_danger(12), SeekDangerResult::Found);
|
||||
assert_eq!(docset.doc(), 12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seek_danger_terminated_and_out_of_bounds() {
|
||||
let mut docset = range_docset(0..=0, 10, |i| vec![(i % 2) as u64]);
|
||||
assert_eq!(
|
||||
docset.seek_danger(TERMINATED),
|
||||
SeekDangerResult::SeekLowerBound(TERMINATED)
|
||||
);
|
||||
// A target past the last doc has no possible match either.
|
||||
assert_eq!(
|
||||
docset.seek_danger(10),
|
||||
SeekDangerResult::SeekLowerBound(TERMINATED)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seek_danger_multivalued() {
|
||||
// Doc `i` holds values [i, i+1]; the range {5} matches docs 4 and 5.
|
||||
let mut docset = range_docset(5..=5, 20, |i| vec![i as u64, i as u64 + 1]);
|
||||
|
||||
assert_eq!(docset.seek_danger(4), SeekDangerResult::Found);
|
||||
assert_eq!(docset.doc(), 4);
|
||||
assert_eq!(docset.advance(), 5);
|
||||
// No further match after doc 5.
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seek_danger_matches_seek() {
|
||||
// Cross-check seek_danger against the true next match for every target, on a column with a
|
||||
// few sparse matches.
|
||||
let matches = [3u32, 7, 50, 51, 99];
|
||||
let num_docs = 100;
|
||||
let values_for_doc = |i: usize| {
|
||||
vec![if matches.contains(&(i as u32)) {
|
||||
1u64
|
||||
} else {
|
||||
0u64
|
||||
}]
|
||||
};
|
||||
|
||||
for target in 0..num_docs as u32 {
|
||||
// The first matching doc greater than or equal to `target`, i.e. what `seek` returns.
|
||||
let expected = matches
|
||||
.iter()
|
||||
.copied()
|
||||
.find(|&m| m >= target)
|
||||
.unwrap_or(TERMINATED);
|
||||
|
||||
let mut danger = range_docset(1..=1, num_docs, values_for_doc);
|
||||
match danger.seek_danger(target) {
|
||||
SeekDangerResult::Found => {
|
||||
assert_eq!(expected, target, "target {target} reported Found");
|
||||
assert_eq!(danger.doc(), target);
|
||||
}
|
||||
SeekDangerResult::SeekLowerBound(lower_bound) => {
|
||||
assert_ne!(expected, target, "target {target} should have been Found");
|
||||
assert!(lower_bound > target);
|
||||
// The lower bound must never skip past the true next match.
|
||||
assert!(lower_bound <= expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
use crate::{schema, IndexBuilder, TantivyDocument, Term};
|
||||
|
||||
#[test]
|
||||
fn range_query_fast_optional_field_minimum() {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::docset::DocSet;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::{BlockSegmentPostings, FreqReadingOption, Postings, SegmentPostings};
|
||||
use crate::postings::{FreqReadingOption, Postings, SegmentPostings};
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::{Explanation, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
@@ -95,21 +95,6 @@ impl TermScorer {
|
||||
pub fn last_doc_in_block(&self) -> DocId {
|
||||
self.postings.block_cursor.skip_reader().last_doc_in_block()
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to the underlying block cursor.
|
||||
pub(crate) fn block_cursor(&mut self) -> &mut BlockSegmentPostings {
|
||||
&mut self.postings.block_cursor
|
||||
}
|
||||
|
||||
/// Returns a reference to the fieldnorm reader for batch lookups.
|
||||
pub(crate) fn fieldnorm_reader(&self) -> &FieldNormReader {
|
||||
&self.fieldnorm_reader
|
||||
}
|
||||
|
||||
/// Returns a reference to the BM25 weight for batch score computation.
|
||||
pub(crate) fn bm25_weight(&self) -> &Bm25Weight {
|
||||
&self.similarity_weight
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for TermScorer {
|
||||
|
||||
@@ -94,7 +94,13 @@ impl SkipIndex {
|
||||
byte_range: 0..first_layer_len,
|
||||
};
|
||||
for layer in &self.layers {
|
||||
cur_checkpoint = layer.seek_start_at_offset(target, cur_checkpoint.byte_range.start)?;
|
||||
if let Some(checkpoint) =
|
||||
layer.seek_start_at_offset(target, cur_checkpoint.byte_range.start)
|
||||
{
|
||||
cur_checkpoint = checkpoint;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
Some(cur_checkpoint)
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ zstd-compression = ["zstd"]
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1"
|
||||
criterion = { version = "0.8", default-features = false }
|
||||
criterion = { version = "0.5", default-features = false }
|
||||
names = "0.14"
|
||||
rand = "0.9"
|
||||
|
||||
|
||||
@@ -14,8 +14,11 @@ use itertools::Itertools;
|
||||
use tantivy_fst::Automaton;
|
||||
use tantivy_fst::automaton::AlwaysMatch;
|
||||
|
||||
use crate::sstable_index_v3::SSTableIndexV3Empty;
|
||||
use crate::streamer::{Streamer, StreamerBuilder};
|
||||
use crate::{BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, TermOrdinal, VoidSSTable};
|
||||
use crate::{
|
||||
BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, SSTableIndexV3, TermOrdinal, VoidSSTable,
|
||||
};
|
||||
|
||||
/// An SSTable is a sorted map that associates sorted `&[u8]` keys
|
||||
/// to any kind of typed values.
|
||||
@@ -285,7 +288,33 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
|
||||
let sstable_index_bytes = index_slice.read_bytes()?;
|
||||
|
||||
let sstable_index = SSTableIndex::open(version, index_offset, sstable_index_bytes)?;
|
||||
let sstable_index = match version {
|
||||
2 => SSTableIndex::V2(
|
||||
crate::sstable_index_v2::SSTableIndex::load(sstable_index_bytes).map_err(|_| {
|
||||
io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
|
||||
})?,
|
||||
),
|
||||
3 => {
|
||||
let (sstable_index_bytes, mut footerv3_len_bytes) = sstable_index_bytes.rsplit(8);
|
||||
let store_offset = u64::deserialize(&mut footerv3_len_bytes)?;
|
||||
if store_offset != 0 {
|
||||
SSTableIndex::V3(
|
||||
SSTableIndexV3::load(sstable_index_bytes, store_offset).map_err(|_| {
|
||||
io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
|
||||
})?,
|
||||
)
|
||||
} else {
|
||||
// if store_offset is zero, there is no index, so we build a pseudo-index
|
||||
// assuming a single block of sstable covering everything.
|
||||
SSTableIndex::V3Empty(SSTableIndexV3Empty::load(index_offset as usize))
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
return Err(io::Error::other(format!(
|
||||
"Unsupported sstable version, expected one of [2, 3], found {version}"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Dictionary {
|
||||
sstable_slice,
|
||||
@@ -496,15 +525,10 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
|
||||
// Open the block for the first ordinal.
|
||||
let mut bytes = Vec::new();
|
||||
let (mut current_block_addr, block_id) = self.sstable_index.get_and_locate_with_ord(ord);
|
||||
let mut current_block_addr = self.sstable_index.get_block_with_ord(ord);
|
||||
let mut current_sstable_delta_reader =
|
||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||
let mut current_block_ordinal = current_block_addr.first_ordinal;
|
||||
let mut current_block_end_bound = self
|
||||
.sstable_index
|
||||
.get_block(block_id + 1)
|
||||
.map(|block_addr| block_addr.first_ordinal)
|
||||
.unwrap_or(u64::MAX);
|
||||
|
||||
loop {
|
||||
// move to the ord inside the current block
|
||||
@@ -533,19 +557,17 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
}
|
||||
};
|
||||
|
||||
if next_ord >= current_block_end_bound {
|
||||
let (new_block_addr, block_id) =
|
||||
self.sstable_index.get_and_locate_with_ord(next_ord);
|
||||
// TODO optimization: it is silly to do a binary search to get the block every single
|
||||
// time.
|
||||
//
|
||||
// Check if block changed for new term_ord
|
||||
let new_block_addr = self.sstable_index.get_block_with_ord(next_ord);
|
||||
if new_block_addr != current_block_addr {
|
||||
current_block_addr = new_block_addr;
|
||||
current_block_ordinal = current_block_addr.first_ordinal;
|
||||
current_sstable_delta_reader =
|
||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||
bytes.clear();
|
||||
current_block_end_bound = self
|
||||
.sstable_index
|
||||
.get_block(block_id + 1)
|
||||
.map(|block_addr| block_addr.first_ordinal)
|
||||
.unwrap_or(u64::MAX)
|
||||
}
|
||||
ord = next_ord;
|
||||
}
|
||||
|
||||
@@ -1,319 +0,0 @@
|
||||
pub(crate) mod v2;
|
||||
pub(crate) mod v3;
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Range;
|
||||
|
||||
use common::{BinarySerializable, FixedSize, OwnedBytes};
|
||||
use tantivy_fst::{Automaton, MapBuilder};
|
||||
|
||||
use crate::{TermOrdinal, common_prefix_len};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum SSTableIndex {
|
||||
V2(v2::SSTableIndex),
|
||||
V3(v3::SSTableIndexV3),
|
||||
V3Empty(v3::SSTableIndexV3Empty),
|
||||
}
|
||||
|
||||
impl SSTableIndex {
|
||||
pub(crate) fn open(
|
||||
version: u32,
|
||||
index_offset: u64,
|
||||
index_bytes: OwnedBytes,
|
||||
) -> io::Result<Self> {
|
||||
let index = match version {
|
||||
2 => {
|
||||
SSTableIndex::V2(v2::SSTableIndex::load(index_bytes).map_err(|_| {
|
||||
io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
|
||||
})?)
|
||||
}
|
||||
3 => {
|
||||
let (index_bytes, mut footerv3_len_bytes) = index_bytes.rsplit(8);
|
||||
let store_offset = u64::deserialize(&mut footerv3_len_bytes)?;
|
||||
if store_offset != 0 {
|
||||
SSTableIndex::V3(v3::SSTableIndexV3::load(index_bytes, store_offset).map_err(
|
||||
|_| io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption"),
|
||||
)?)
|
||||
} else {
|
||||
// if store_offset is zero, there is no index, so we build a pseudo-index
|
||||
// assuming a single block of sstable covering everything.
|
||||
SSTableIndex::V3Empty(v3::SSTableIndexV3Empty::load(index_offset as usize))
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
return Err(io::Error::other(format!(
|
||||
"Unsupported sstable version, expected one of [2, 3], found {version}"
|
||||
)));
|
||||
}
|
||||
};
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the requested block.
|
||||
pub(crate) fn get_block(&self, block_id: u64) -> Option<BlockAddr> {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block(block_id as usize),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block(block_id),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block(block_id),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the block id of the block that would contain `key`.
|
||||
///
|
||||
/// Returns None if `key` is lexicographically after the last key recorded.
|
||||
pub(crate) fn locate_with_key(&self, key: &[u8]) -> Option<u64> {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.locate_with_key(key).map(|i| i as u64),
|
||||
SSTableIndex::V3(v3_index) => v3_index.locate_with_key(key),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.locate_with_key(key),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block that would contain `key`.
|
||||
///
|
||||
/// Returns None if `key` is lexicographically after the last key recorded.
|
||||
pub fn get_block_with_key(&self, key: &[u8]) -> Option<BlockAddr> {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block_with_key(key),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_key(key),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_key(key),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.locate_with_ord(ord) as u64,
|
||||
SSTableIndex::V3(v3_index) => v3_index.locate_with_ord(ord),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.locate_with_ord(ord),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_and_locate_with_ord(&self, ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_and_locate_with_ord(ord),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_and_locate_with_ord(ord),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_and_locate_with_ord(ord),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_block_for_automaton<'a>(
|
||||
&'a self,
|
||||
automaton: &'a impl Automaton,
|
||||
) -> impl Iterator<Item = (u64, BlockAddr)> + 'a {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => {
|
||||
BlockIter::V2(v2_index.get_block_for_automaton(automaton))
|
||||
}
|
||||
SSTableIndex::V3(v3_index) => {
|
||||
BlockIter::V3(v3_index.get_block_for_automaton(automaton))
|
||||
}
|
||||
SSTableIndex::V3Empty(v3_empty) => {
|
||||
BlockIter::V3Empty(std::iter::once((0, v3_empty.block_addr.clone())))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum BlockIter<V2, V3, T> {
|
||||
V2(V2),
|
||||
V3(V3),
|
||||
V3Empty(std::iter::Once<T>),
|
||||
}
|
||||
|
||||
impl<V2: Iterator<Item = T>, V3: Iterator<Item = T>, T> Iterator for BlockIter<V2, V3, T> {
|
||||
type Item = T;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self {
|
||||
BlockIter::V2(v2) => v2.next(),
|
||||
BlockIter::V3(v3) => v3.next(),
|
||||
BlockIter::V3Empty(once) => once.next(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct BlockAddr {
|
||||
pub first_ordinal: u64,
|
||||
pub byte_range: Range<usize>,
|
||||
}
|
||||
|
||||
impl BlockAddr {
|
||||
fn to_block_start(&self) -> BlockStartAddr {
|
||||
BlockStartAddr {
|
||||
first_ordinal: self.first_ordinal,
|
||||
byte_range_start: self.byte_range.start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct BlockStartAddr {
|
||||
first_ordinal: u64,
|
||||
byte_range_start: usize,
|
||||
}
|
||||
|
||||
impl BlockStartAddr {
|
||||
fn to_block_addr(&self, byte_range_end: usize) -> BlockAddr {
|
||||
BlockAddr {
|
||||
first_ordinal: self.first_ordinal,
|
||||
byte_range: self.byte_range_start..byte_range_end,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct BlockMeta {
|
||||
/// Any byte string that is lexicographically greater or equal to
|
||||
/// the last key in the block,
|
||||
/// and yet strictly smaller than the first key in the next block.
|
||||
pub last_key_or_greater: Vec<u8>,
|
||||
pub block_addr: BlockAddr,
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockStartAddr {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let start = self.byte_range_start as u64;
|
||||
start.serialize(writer)?;
|
||||
self.first_ordinal.serialize(writer)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let byte_range_start = u64::deserialize(reader)? as usize;
|
||||
let first_ordinal = u64::deserialize(reader)?;
|
||||
Ok(BlockStartAddr {
|
||||
first_ordinal,
|
||||
byte_range_start,
|
||||
})
|
||||
}
|
||||
|
||||
// Provided method
|
||||
fn num_bytes(&self) -> u64 {
|
||||
BlockStartAddr::SIZE_IN_BYTES as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for BlockStartAddr {
|
||||
const SIZE_IN_BYTES: usize = 2 * u64::SIZE_IN_BYTES;
|
||||
}
|
||||
|
||||
/// Given that left < right,
|
||||
/// mutates `left into a shorter byte string left'` that
|
||||
/// matches `left <= left' < right`.
|
||||
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
|
||||
assert!(&left[..] < right);
|
||||
let common_len = common_prefix_len(left, right);
|
||||
if left.len() == common_len {
|
||||
return;
|
||||
}
|
||||
// It is possible to do one character shorter in some case,
|
||||
// but it is not worth the extra complexity
|
||||
for pos in (common_len + 1)..left.len() {
|
||||
if left[pos] != u8::MAX {
|
||||
left[pos] += 1;
|
||||
left.truncate(pos + 1);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SSTableIndexBuilder {
|
||||
blocks: Vec<BlockMeta>,
|
||||
}
|
||||
|
||||
impl SSTableIndexBuilder {
|
||||
/// In order to make the index as light as possible, we
|
||||
/// try to find a shorter alternative to the last key of the last block
|
||||
/// that is still smaller than the next key.
|
||||
pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) {
|
||||
if let Some(last_block) = self.blocks.last_mut() {
|
||||
find_shorter_str_in_between(&mut last_block.last_key_or_greater, next_key);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_block(&mut self, last_key: &[u8], byte_range: Range<usize>, first_ordinal: u64) {
|
||||
self.blocks.push(BlockMeta {
|
||||
last_key_or_greater: last_key.to_vec(),
|
||||
block_addr: BlockAddr {
|
||||
byte_range,
|
||||
first_ordinal,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
pub fn serialize<W: std::io::Write>(&self, wrt: W) -> io::Result<u64> {
|
||||
if self.blocks.len() <= 1 {
|
||||
return Ok(0);
|
||||
}
|
||||
let counting_writer = common::CountingWriter::wrap(wrt);
|
||||
let mut map_builder = MapBuilder::new(counting_writer).map_err(fst_error_to_io_error)?;
|
||||
for (i, block) in self.blocks.iter().enumerate() {
|
||||
map_builder
|
||||
.insert(&block.last_key_or_greater, i as u64)
|
||||
.map_err(fst_error_to_io_error)?;
|
||||
}
|
||||
let counting_writer = map_builder.into_inner().map_err(fst_error_to_io_error)?;
|
||||
let written_bytes = counting_writer.written_bytes();
|
||||
let mut wrt = counting_writer.finish();
|
||||
|
||||
let mut block_store_writer = v3::BlockAddrStoreWriter::new();
|
||||
for block in &self.blocks {
|
||||
block_store_writer.write_block_meta(block.block_addr.clone())?;
|
||||
}
|
||||
block_store_writer.serialize(&mut wrt)?;
|
||||
|
||||
Ok(written_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
fn fst_error_to_io_error(error: tantivy_fst::Error) -> io::Error {
|
||||
match error {
|
||||
tantivy_fst::Error::Fst(fst_error) => io::Error::other(fst_error),
|
||||
tantivy_fst::Error::Io(ioerror) => ioerror,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[track_caller]
|
||||
fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) {
|
||||
let mut left_buf = left.to_vec();
|
||||
super::find_shorter_str_in_between(&mut left_buf, right);
|
||||
assert!(left_buf.len() <= left.len());
|
||||
assert!(left <= &left_buf);
|
||||
assert!(&left_buf[..] < right);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_shorter_str_in_between() {
|
||||
test_find_shorter_str_in_between_aux(b"", b"hello");
|
||||
test_find_shorter_str_in_between_aux(b"abc", b"abcd");
|
||||
test_find_shorter_str_in_between_aux(b"abcd", b"abd");
|
||||
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]);
|
||||
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]);
|
||||
test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]);
|
||||
}
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(100))]
|
||||
#[test]
|
||||
fn test_proptest_find_shorter_str(left in any::<Vec<u8>>(), right in any::<Vec<u8>>()) {
|
||||
if left < right {
|
||||
test_find_shorter_str_in_between_aux(&left, &right);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -47,8 +47,9 @@ pub mod merge;
|
||||
mod streamer;
|
||||
pub mod value;
|
||||
|
||||
mod index;
|
||||
pub use index::{BlockAddr, SSTableIndex, SSTableIndexBuilder};
|
||||
mod sstable_index_v3;
|
||||
pub use sstable_index_v3::{BlockAddr, SSTableIndex, SSTableIndexBuilder, SSTableIndexV3};
|
||||
mod sstable_index_v2;
|
||||
pub(crate) mod vint;
|
||||
pub use dictionary::{Dictionary, TermOrdHit};
|
||||
pub use streamer::{Streamer, StreamerBuilder};
|
||||
|
||||
@@ -77,13 +77,6 @@ impl SSTableIndex {
|
||||
self.get_block(self.locate_with_ord(ord)).unwrap()
|
||||
}
|
||||
|
||||
pub(crate) fn get_and_locate_with_ord(&self, ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
let location = self.locate_with_ord(ord);
|
||||
// locate_with_ord always returns an index within range
|
||||
let block_addr = self.get_block(location).unwrap();
|
||||
(block_addr, location as u64)
|
||||
}
|
||||
|
||||
pub(crate) fn get_block_for_automaton<'a>(
|
||||
&'a self,
|
||||
automaton: &'a impl Automaton,
|
||||
@@ -1,14 +1,106 @@
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::{BinarySerializable, FixedSize, OwnedBytes};
|
||||
use tantivy_bitpacker::{BitPacker, compute_num_bits};
|
||||
use tantivy_fst::raw::Fst;
|
||||
use tantivy_fst::{Automaton, IntoStreamer, Map, Streamer};
|
||||
use tantivy_fst::{Automaton, IntoStreamer, Map, MapBuilder, Streamer};
|
||||
|
||||
use super::{BlockAddr, BlockStartAddr};
|
||||
use crate::block_match_automaton::can_block_match_automaton;
|
||||
use crate::{SSTableDataCorruption, TermOrdinal};
|
||||
use crate::{SSTableDataCorruption, TermOrdinal, common_prefix_len};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum SSTableIndex {
|
||||
V2(crate::sstable_index_v2::SSTableIndex),
|
||||
V3(SSTableIndexV3),
|
||||
V3Empty(SSTableIndexV3Empty),
|
||||
}
|
||||
|
||||
impl SSTableIndex {
|
||||
/// Get the [`BlockAddr`] of the requested block.
|
||||
pub(crate) fn get_block(&self, block_id: u64) -> Option<BlockAddr> {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block(block_id as usize),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block(block_id),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block(block_id),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the block id of the block that would contain `key`.
|
||||
///
|
||||
/// Returns None if `key` is lexicographically after the last key recorded.
|
||||
pub(crate) fn locate_with_key(&self, key: &[u8]) -> Option<u64> {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.locate_with_key(key).map(|i| i as u64),
|
||||
SSTableIndex::V3(v3_index) => v3_index.locate_with_key(key),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.locate_with_key(key),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block that would contain `key`.
|
||||
///
|
||||
/// Returns None if `key` is lexicographically after the last key recorded.
|
||||
pub fn get_block_with_key(&self, key: &[u8]) -> Option<BlockAddr> {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block_with_key(key),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_key(key),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_key(key),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.locate_with_ord(ord) as u64,
|
||||
SSTableIndex::V3(v3_index) => v3_index.locate_with_ord(ord),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.locate_with_ord(ord),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_block_for_automaton<'a>(
|
||||
&'a self,
|
||||
automaton: &'a impl Automaton,
|
||||
) -> impl Iterator<Item = (u64, BlockAddr)> + 'a {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => {
|
||||
BlockIter::V2(v2_index.get_block_for_automaton(automaton))
|
||||
}
|
||||
SSTableIndex::V3(v3_index) => {
|
||||
BlockIter::V3(v3_index.get_block_for_automaton(automaton))
|
||||
}
|
||||
SSTableIndex::V3Empty(v3_empty) => {
|
||||
BlockIter::V3Empty(std::iter::once((0, v3_empty.block_addr.clone())))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum BlockIter<V2, V3, T> {
|
||||
V2(V2),
|
||||
V3(V3),
|
||||
V3Empty(std::iter::Once<T>),
|
||||
}
|
||||
|
||||
impl<V2: Iterator<Item = T>, V3: Iterator<Item = T>, T> Iterator for BlockIter<V2, V3, T> {
|
||||
type Item = T;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self {
|
||||
BlockIter::V2(v2) => v2.next(),
|
||||
BlockIter::V3(v3) => v3.next(),
|
||||
BlockIter::V3Empty(once) => once.next(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SSTableIndexV3 {
|
||||
@@ -68,11 +160,6 @@ impl SSTableIndexV3 {
|
||||
self.block_addr_store.binary_search_ord(ord).1
|
||||
}
|
||||
|
||||
pub(crate) fn get_and_locate_with_ord(&self, ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
let (location, block_addr) = self.block_addr_store.binary_search_ord(ord);
|
||||
(block_addr, location)
|
||||
}
|
||||
|
||||
pub(crate) fn get_block_for_automaton<'a>(
|
||||
&'a self,
|
||||
automaton: &'a impl Automaton,
|
||||
@@ -129,7 +216,7 @@ impl<A: Automaton> Iterator for GetBlockForAutomaton<'_, A> {
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SSTableIndexV3Empty {
|
||||
pub block_addr: BlockAddr,
|
||||
block_addr: BlockAddr,
|
||||
}
|
||||
|
||||
impl SSTableIndexV3Empty {
|
||||
@@ -143,8 +230,8 @@ impl SSTableIndexV3Empty {
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the requested block.
|
||||
pub(crate) fn get_block(&self, block_id: u64) -> Option<BlockAddr> {
|
||||
(block_id == 0).then(|| self.block_addr.clone())
|
||||
pub(crate) fn get_block(&self, _block_id: u64) -> Option<BlockAddr> {
|
||||
Some(self.block_addr.clone())
|
||||
}
|
||||
|
||||
/// Get the block id of the block that would contain `key`.
|
||||
@@ -169,9 +256,146 @@ impl SSTableIndexV3Empty {
|
||||
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> BlockAddr {
|
||||
self.block_addr.clone()
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct BlockAddr {
|
||||
pub first_ordinal: u64,
|
||||
pub byte_range: Range<usize>,
|
||||
}
|
||||
|
||||
pub(crate) fn get_and_locate_with_ord(&self, _ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
(self.block_addr.clone(), 0)
|
||||
impl BlockAddr {
|
||||
fn to_block_start(&self) -> BlockStartAddr {
|
||||
BlockStartAddr {
|
||||
first_ordinal: self.first_ordinal,
|
||||
byte_range_start: self.byte_range.start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct BlockStartAddr {
|
||||
first_ordinal: u64,
|
||||
byte_range_start: usize,
|
||||
}
|
||||
|
||||
impl BlockStartAddr {
|
||||
fn to_block_addr(&self, byte_range_end: usize) -> BlockAddr {
|
||||
BlockAddr {
|
||||
first_ordinal: self.first_ordinal,
|
||||
byte_range: self.byte_range_start..byte_range_end,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct BlockMeta {
|
||||
/// Any byte string that is lexicographically greater or equal to
|
||||
/// the last key in the block,
|
||||
/// and yet strictly smaller than the first key in the next block.
|
||||
pub last_key_or_greater: Vec<u8>,
|
||||
pub block_addr: BlockAddr,
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockStartAddr {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let start = self.byte_range_start as u64;
|
||||
start.serialize(writer)?;
|
||||
self.first_ordinal.serialize(writer)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let byte_range_start = u64::deserialize(reader)? as usize;
|
||||
let first_ordinal = u64::deserialize(reader)?;
|
||||
Ok(BlockStartAddr {
|
||||
first_ordinal,
|
||||
byte_range_start,
|
||||
})
|
||||
}
|
||||
|
||||
// Provided method
|
||||
fn num_bytes(&self) -> u64 {
|
||||
BlockStartAddr::SIZE_IN_BYTES as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for BlockStartAddr {
|
||||
const SIZE_IN_BYTES: usize = 2 * u64::SIZE_IN_BYTES;
|
||||
}
|
||||
|
||||
/// Given that left < right,
|
||||
/// mutates `left into a shorter byte string left'` that
|
||||
/// matches `left <= left' < right`.
|
||||
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
|
||||
assert!(&left[..] < right);
|
||||
let common_len = common_prefix_len(left, right);
|
||||
if left.len() == common_len {
|
||||
return;
|
||||
}
|
||||
// It is possible to do one character shorter in some case,
|
||||
// but it is not worth the extra complexity
|
||||
for pos in (common_len + 1)..left.len() {
|
||||
if left[pos] != u8::MAX {
|
||||
left[pos] += 1;
|
||||
left.truncate(pos + 1);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SSTableIndexBuilder {
|
||||
blocks: Vec<BlockMeta>,
|
||||
}
|
||||
|
||||
impl SSTableIndexBuilder {
|
||||
/// In order to make the index as light as possible, we
|
||||
/// try to find a shorter alternative to the last key of the last block
|
||||
/// that is still smaller than the next key.
|
||||
pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) {
|
||||
if let Some(last_block) = self.blocks.last_mut() {
|
||||
find_shorter_str_in_between(&mut last_block.last_key_or_greater, next_key);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_block(&mut self, last_key: &[u8], byte_range: Range<usize>, first_ordinal: u64) {
|
||||
self.blocks.push(BlockMeta {
|
||||
last_key_or_greater: last_key.to_vec(),
|
||||
block_addr: BlockAddr {
|
||||
byte_range,
|
||||
first_ordinal,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
pub fn serialize<W: std::io::Write>(&self, wrt: W) -> io::Result<u64> {
|
||||
if self.blocks.len() <= 1 {
|
||||
return Ok(0);
|
||||
}
|
||||
let counting_writer = common::CountingWriter::wrap(wrt);
|
||||
let mut map_builder = MapBuilder::new(counting_writer).map_err(fst_error_to_io_error)?;
|
||||
for (i, block) in self.blocks.iter().enumerate() {
|
||||
map_builder
|
||||
.insert(&block.last_key_or_greater, i as u64)
|
||||
.map_err(fst_error_to_io_error)?;
|
||||
}
|
||||
let counting_writer = map_builder.into_inner().map_err(fst_error_to_io_error)?;
|
||||
let written_bytes = counting_writer.written_bytes();
|
||||
let mut wrt = counting_writer.finish();
|
||||
|
||||
let mut block_store_writer = BlockAddrStoreWriter::new();
|
||||
for block in &self.blocks {
|
||||
block_store_writer.write_block_meta(block.block_addr.clone())?;
|
||||
}
|
||||
block_store_writer.serialize(&mut wrt)?;
|
||||
|
||||
Ok(written_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
fn fst_error_to_io_error(error: tantivy_fst::Error) -> io::Error {
|
||||
match error {
|
||||
tantivy_fst::Error::Fst(fst_error) => io::Error::other(fst_error),
|
||||
tantivy_fst::Error::Io(ioerror) => ioerror,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -423,14 +647,14 @@ fn binary_search(max: u64, cmp_fn: impl Fn(u64) -> std::cmp::Ordering) -> Result
|
||||
Err(left)
|
||||
}
|
||||
|
||||
pub(crate) struct BlockAddrStoreWriter {
|
||||
struct BlockAddrStoreWriter {
|
||||
buffer_block_metas: Vec<u8>,
|
||||
buffer_addrs: Vec<u8>,
|
||||
block_addrs: Vec<BlockAddr>,
|
||||
}
|
||||
|
||||
impl BlockAddrStoreWriter {
|
||||
pub(crate) fn new() -> Self {
|
||||
fn new() -> Self {
|
||||
BlockAddrStoreWriter {
|
||||
buffer_block_metas: Vec::new(),
|
||||
buffer_addrs: Vec::new(),
|
||||
@@ -438,7 +662,7 @@ impl BlockAddrStoreWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn flush_block(&mut self) -> io::Result<()> {
|
||||
fn flush_block(&mut self) -> io::Result<()> {
|
||||
if self.block_addrs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -517,7 +741,7 @@ impl BlockAddrStoreWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_block_meta(&mut self, block_addr: BlockAddr) -> io::Result<()> {
|
||||
fn write_block_meta(&mut self, block_addr: BlockAddr) -> io::Result<()> {
|
||||
self.block_addrs.push(block_addr);
|
||||
if self.block_addrs.len() >= STORE_BLOCK_LEN {
|
||||
self.flush_block()?;
|
||||
@@ -525,7 +749,7 @@ impl BlockAddrStoreWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn serialize<W: std::io::Write>(&mut self, wrt: &mut W) -> io::Result<()> {
|
||||
fn serialize<W: std::io::Write>(&mut self, wrt: &mut W) -> io::Result<()> {
|
||||
self.flush_block()?;
|
||||
let len = self.buffer_block_metas.len() as u64;
|
||||
len.serialize(wrt)?;
|
||||
@@ -600,9 +824,8 @@ mod tests {
|
||||
use common::OwnedBytes;
|
||||
|
||||
use super::*;
|
||||
use crate::SSTableDataCorruption;
|
||||
use crate::block_match_automaton::tests::EqBuffer;
|
||||
use crate::index::BlockMeta;
|
||||
use crate::{SSTableDataCorruption, SSTableIndexBuilder};
|
||||
|
||||
#[test]
|
||||
fn test_sstable_index() {
|
||||
@@ -651,7 +874,36 @@ mod tests {
|
||||
assert!(matches!(data_corruption_err, SSTableDataCorruption));
|
||||
}
|
||||
|
||||
// use proptest::prelude::*;
|
||||
#[track_caller]
|
||||
fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) {
|
||||
let mut left_buf = left.to_vec();
|
||||
super::find_shorter_str_in_between(&mut left_buf, right);
|
||||
assert!(left_buf.len() <= left.len());
|
||||
assert!(left <= &left_buf);
|
||||
assert!(&left_buf[..] < right);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_shorter_str_in_between() {
|
||||
test_find_shorter_str_in_between_aux(b"", b"hello");
|
||||
test_find_shorter_str_in_between_aux(b"abc", b"abcd");
|
||||
test_find_shorter_str_in_between_aux(b"abcd", b"abd");
|
||||
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]);
|
||||
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]);
|
||||
test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]);
|
||||
}
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(100))]
|
||||
#[test]
|
||||
fn test_proptest_find_shorter_str(left in any::<Vec<u8>>(), right in any::<Vec<u8>>()) {
|
||||
if left < right {
|
||||
test_find_shorter_str_in_between_aux(&left, &right);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_best_slop() {
|
||||
@@ -8,7 +8,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
description = "term hashmap used for indexing"
|
||||
|
||||
[dependencies]
|
||||
murmurhash32 = "0.4"
|
||||
murmurhash32 = "0.3"
|
||||
common = { version = "0.11", path = "../common/", package = "tantivy-common" }
|
||||
ahash = { version = "0.8.11", default-features = false, optional = true }
|
||||
|
||||
@@ -27,7 +27,7 @@ rand = "0.9"
|
||||
zipf = "7.0.0"
|
||||
rustc-hash = "2.1.0"
|
||||
proptest = "1.2.0"
|
||||
binggan = { version = "0.17.0" }
|
||||
binggan = { version = "0.16.1" }
|
||||
rand_distr = "0.5"
|
||||
|
||||
[features]
|
||||
|
||||
Reference in New Issue
Block a user