mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-10 11:02:55 +00:00
Compare commits
1 Commits
main
...
low_card_o
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b2573a3b16 |
4
.github/workflows/coverage.yml
vendored
4
.github/workflows/coverage.yml
vendored
@@ -15,11 +15,11 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- name: Install Rust
|
- name: Install Rust
|
||||||
run: rustup toolchain install nightly-2025-12-01 --profile minimal --component llvm-tools-preview
|
run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
|
||||||
- uses: Swatinem/rust-cache@v2
|
- uses: Swatinem/rust-cache@v2
|
||||||
- uses: taiki-e/install-action@cargo-llvm-cov
|
- uses: taiki-e/install-action@cargo-llvm-cov
|
||||||
- name: Generate code coverage
|
- name: Generate code coverage
|
||||||
run: cargo +nightly-2025-12-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
|
||||||
- name: Upload coverage to Codecov
|
- name: Upload coverage to Codecov
|
||||||
uses: codecov/codecov-action@v3
|
uses: codecov/codecov-action@v3
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
|
|||||||
30
.github/workflows/test.yml
vendored
30
.github/workflows/test.yml
vendored
@@ -39,11 +39,11 @@ jobs:
|
|||||||
|
|
||||||
- name: Check Formatting
|
- name: Check Formatting
|
||||||
run: cargo +nightly fmt --all -- --check
|
run: cargo +nightly fmt --all -- --check
|
||||||
|
|
||||||
- name: Check Stable Compilation
|
- name: Check Stable Compilation
|
||||||
run: cargo build --all-features
|
run: cargo build --all-features
|
||||||
|
|
||||||
|
|
||||||
- name: Check Bench Compilation
|
- name: Check Bench Compilation
|
||||||
run: cargo +nightly bench --no-run --profile=dev --all-features
|
run: cargo +nightly bench --no-run --profile=dev --all-features
|
||||||
|
|
||||||
@@ -59,10 +59,10 @@ jobs:
|
|||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
features:
|
features: [
|
||||||
- { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints,stemmer" }
|
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
|
||||||
- { label: "quickwit", flags: "mmap,quickwit,failpoints" }
|
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
|
||||||
- { label: "none", flags: "" }
|
]
|
||||||
|
|
||||||
name: test-${{ matrix.features.label}}
|
name: test-${{ matrix.features.label}}
|
||||||
|
|
||||||
@@ -80,21 +80,7 @@ jobs:
|
|||||||
- uses: Swatinem/rust-cache@v2
|
- uses: Swatinem/rust-cache@v2
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: cargo +stable nextest run --features ${{ matrix.features.flags }} --verbose --workspace
|
||||||
# if matrix.feature.flags is empty then run on --lib to avoid compiling examples
|
|
||||||
# (as most of them rely on mmap) otherwise run all
|
|
||||||
if [ -z "${{ matrix.features.flags }}" ]; then
|
|
||||||
cargo +stable nextest run --lib --no-default-features --verbose --workspace
|
|
||||||
else
|
|
||||||
cargo +stable nextest run --features ${{ matrix.features.flags }} --no-default-features --verbose --workspace
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Run doctests
|
- name: Run doctests
|
||||||
run: |
|
run: cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
|
||||||
# if matrix.feature.flags is empty then run on --lib to avoid compiling examples
|
|
||||||
# (as most of them rely on mmap) otherwise run all
|
|
||||||
if [ -z "${{ matrix.features.flags }}" ]; then
|
|
||||||
echo "no doctest for no feature flag"
|
|
||||||
else
|
|
||||||
cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
|
|
||||||
fi
|
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
|
|||||||
|
|
||||||
- **Store DateTime as nanoseconds in doc store** DateTime in the doc store was truncated to microseconds previously. This removes this truncation, while still keeping backwards compatibility. [#2486](https://github.com/quickwit-oss/tantivy/pull/2486)(@PSeitz)
|
- **Store DateTime as nanoseconds in doc store** DateTime in the doc store was truncated to microseconds previously. This removes this truncation, while still keeping backwards compatibility. [#2486](https://github.com/quickwit-oss/tantivy/pull/2486)(@PSeitz)
|
||||||
|
|
||||||
- **Performance/Memory**
|
- **Performace/Memory**
|
||||||
- lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
|
- lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
|
||||||
- Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
|
- Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
|
||||||
- Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
|
- Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
|
||||||
|
|||||||
25
Cargo.toml
25
Cargo.toml
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "tantivy"
|
name = "tantivy"
|
||||||
version = "0.26.0"
|
version = "0.25.0"
|
||||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
categories = ["database-implementations", "data-structures"]
|
categories = ["database-implementations", "data-structures"]
|
||||||
@@ -37,9 +37,9 @@ fs4 = { version = "0.13.1", optional = true }
|
|||||||
levenshtein_automata = "0.2.1"
|
levenshtein_automata = "0.2.1"
|
||||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||||
crossbeam-channel = "0.5.4"
|
crossbeam-channel = "0.5.4"
|
||||||
rust-stemmers = { version = "1.2.0", optional = true }
|
rust-stemmers = "1.2.0"
|
||||||
downcast-rs = "2.0.1"
|
downcast-rs = "2.0.1"
|
||||||
bitpacking = { version = "0.9.3", default-features = false, features = [
|
bitpacking = { version = "0.9.2", default-features = false, features = [
|
||||||
"bitpacker4x",
|
"bitpacker4x",
|
||||||
] }
|
] }
|
||||||
census = "0.4.2"
|
census = "0.4.2"
|
||||||
@@ -75,12 +75,12 @@ typetag = "0.2.21"
|
|||||||
winapi = "0.3.9"
|
winapi = "0.3.9"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
binggan = "0.14.2"
|
binggan = "0.14.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
matches = "0.1.9"
|
matches = "0.1.9"
|
||||||
pretty_assertions = "1.2.1"
|
pretty_assertions = "1.2.1"
|
||||||
proptest = "1.7.0"
|
proptest = "1.0.0"
|
||||||
test-log = "0.2.10"
|
test-log = "0.2.10"
|
||||||
futures = "0.3.21"
|
futures = "0.3.21"
|
||||||
paste = "1.0.11"
|
paste = "1.0.11"
|
||||||
@@ -113,8 +113,7 @@ debug-assertions = true
|
|||||||
overflow-checks = true
|
overflow-checks = true
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression", "stemmer"]
|
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression"]
|
||||||
stemmer = ["rust-stemmers"]
|
|
||||||
mmap = ["fs4", "tempfile", "memmap2"]
|
mmap = ["fs4", "tempfile", "memmap2"]
|
||||||
stopwords = []
|
stopwords = []
|
||||||
|
|
||||||
@@ -174,18 +173,6 @@ harness = false
|
|||||||
name = "exists_json"
|
name = "exists_json"
|
||||||
harness = false
|
harness = false
|
||||||
|
|
||||||
[[bench]]
|
|
||||||
name = "range_query"
|
|
||||||
harness = false
|
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "and_or_queries"
|
name = "and_or_queries"
|
||||||
harness = false
|
harness = false
|
||||||
|
|
||||||
[[bench]]
|
|
||||||
name = "range_queries"
|
|
||||||
harness = false
|
|
||||||
|
|
||||||
[[bench]]
|
|
||||||
name = "bool_queries_with_range"
|
|
||||||
harness = false
|
|
||||||
|
|||||||
@@ -123,7 +123,6 @@ You can also find other bindings on [GitHub](https://github.com/search?q=tantivy
|
|||||||
- [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer
|
- [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer
|
||||||
- [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby
|
- [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby
|
||||||
- [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API
|
- [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API
|
||||||
- [Bichon](https://github.com/rustmailer/bichon): A lightweight, high-performance Rust email archiver with WebUI
|
|
||||||
- and [more](https://github.com/search?q=tantivy)!
|
- and [more](https://github.com/search?q=tantivy)!
|
||||||
|
|
||||||
### On average, how much faster is Tantivy compared to Lucene?
|
### On average, how much faster is Tantivy compared to Lucene?
|
||||||
|
|||||||
2
TODO.txt
2
TODO.txt
@@ -10,7 +10,7 @@ rename FastFieldReaders::open to load
|
|||||||
remove fast field reader
|
remove fast field reader
|
||||||
|
|
||||||
find a way to unify the two DateTime.
|
find a way to unify the two DateTime.
|
||||||
re-add type check in the filter wrapper
|
readd type check in the filter wrapper
|
||||||
|
|
||||||
add unit test on columnar list columns.
|
add unit test on columnar list columns.
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
use binggan::plugins::PeakMemAllocPlugin;
|
use binggan::plugins::PeakMemAllocPlugin;
|
||||||
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||||
use rand::distributions::WeightedIndex;
|
|
||||||
use rand::prelude::SliceRandom;
|
use rand::prelude::SliceRandom;
|
||||||
use rand::rngs::StdRng;
|
use rand::rngs::StdRng;
|
||||||
use rand::{Rng, SeedableRng};
|
use rand::{Rng, SeedableRng};
|
||||||
@@ -54,33 +53,25 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
|||||||
register!(group, stats_f64);
|
register!(group, stats_f64);
|
||||||
register!(group, extendedstats_f64);
|
register!(group, extendedstats_f64);
|
||||||
register!(group, percentiles_f64);
|
register!(group, percentiles_f64);
|
||||||
register!(group, terms_7);
|
register!(group, terms_few);
|
||||||
register!(group, terms_all_unique);
|
register!(group, terms_many);
|
||||||
register!(group, terms_150_000);
|
|
||||||
register!(group, terms_many_top_1000);
|
register!(group, terms_many_top_1000);
|
||||||
register!(group, terms_many_order_by_term);
|
register!(group, terms_many_order_by_term);
|
||||||
register!(group, terms_many_with_top_hits);
|
register!(group, terms_many_with_top_hits);
|
||||||
register!(group, terms_all_unique_with_avg_sub_agg);
|
|
||||||
register!(group, terms_many_with_avg_sub_agg);
|
register!(group, terms_many_with_avg_sub_agg);
|
||||||
register!(group, terms_status_with_avg_sub_agg);
|
|
||||||
register!(group, terms_status_with_histogram);
|
|
||||||
register!(group, terms_zipf_1000);
|
|
||||||
register!(group, terms_zipf_1000_with_histogram);
|
|
||||||
register!(group, terms_zipf_1000_with_avg_sub_agg);
|
|
||||||
|
|
||||||
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
|
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
|
||||||
|
|
||||||
register!(group, cardinality_agg);
|
register!(group, cardinality_agg);
|
||||||
register!(group, terms_status_with_cardinality_agg);
|
register!(group, terms_few_with_cardinality_agg);
|
||||||
|
|
||||||
register!(group, range_agg);
|
register!(group, range_agg);
|
||||||
register!(group, range_agg_with_avg_sub_agg);
|
register!(group, range_agg_with_avg_sub_agg);
|
||||||
register!(group, range_agg_with_term_agg_status);
|
register!(group, range_agg_with_term_agg_few);
|
||||||
register!(group, range_agg_with_term_agg_many);
|
register!(group, range_agg_with_term_agg_many);
|
||||||
register!(group, histogram);
|
register!(group, histogram);
|
||||||
register!(group, histogram_hard_bounds);
|
register!(group, histogram_hard_bounds);
|
||||||
register!(group, histogram_with_avg_sub_agg);
|
register!(group, histogram_with_avg_sub_agg);
|
||||||
register!(group, histogram_with_term_agg_status);
|
register!(group, histogram_with_term_agg_few);
|
||||||
register!(group, avg_and_range_with_avg_sub_agg);
|
register!(group, avg_and_range_with_avg_sub_agg);
|
||||||
|
|
||||||
// Filter aggregation benchmarks
|
// Filter aggregation benchmarks
|
||||||
@@ -139,12 +130,12 @@ fn extendedstats_f64(index: &Index) {
|
|||||||
}
|
}
|
||||||
fn percentiles_f64(index: &Index) {
|
fn percentiles_f64(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"mypercentiles": {
|
"mypercentiles": {
|
||||||
"percentiles": {
|
"percentiles": {
|
||||||
"field": "score_f64",
|
"field": "score_f64",
|
||||||
"percents": [ 95, 99, 99.9 ]
|
"percents": [ 95, 99, 99.9 ]
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
@@ -159,10 +150,10 @@ fn cardinality_agg(index: &Index) {
|
|||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
fn terms_status_with_cardinality_agg(index: &Index) {
|
fn terms_few_with_cardinality_agg(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"my_texts": {
|
"my_texts": {
|
||||||
"terms": { "field": "text_few_terms_status" },
|
"terms": { "field": "text_few_terms" },
|
||||||
"aggs": {
|
"aggs": {
|
||||||
"cardinality": {
|
"cardinality": {
|
||||||
"cardinality": {
|
"cardinality": {
|
||||||
@@ -175,20 +166,13 @@ fn terms_status_with_cardinality_agg(index: &Index) {
|
|||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn terms_7(index: &Index) {
|
fn terms_few(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"my_texts": { "terms": { "field": "text_few_terms_status" } },
|
"my_texts": { "terms": { "field": "text_few_terms" } },
|
||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
fn terms_all_unique(index: &Index) {
|
fn terms_many(index: &Index) {
|
||||||
let agg_req = json!({
|
|
||||||
"my_texts": { "terms": { "field": "text_all_unique_terms" } },
|
|
||||||
});
|
|
||||||
execute_agg(index, agg_req);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn terms_150_000(index: &Index) {
|
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"my_texts": { "terms": { "field": "text_many_terms" } },
|
"my_texts": { "terms": { "field": "text_many_terms" } },
|
||||||
});
|
});
|
||||||
@@ -236,72 +220,6 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
|
|||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
fn terms_all_unique_with_avg_sub_agg(index: &Index) {
|
|
||||||
let agg_req = json!({
|
|
||||||
"my_texts": {
|
|
||||||
"terms": { "field": "text_all_unique_terms" },
|
|
||||||
"aggs": {
|
|
||||||
"average_f64": { "avg": { "field": "score_f64" } }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
});
|
|
||||||
execute_agg(index, agg_req);
|
|
||||||
}
|
|
||||||
fn terms_status_with_histogram(index: &Index) {
|
|
||||||
let agg_req = json!({
|
|
||||||
"my_texts": {
|
|
||||||
"terms": { "field": "text_few_terms_status" },
|
|
||||||
"aggs": {
|
|
||||||
"histo": {"histogram": { "field": "score_f64", "interval": 10 }}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
execute_agg(index, agg_req);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn terms_zipf_1000_with_histogram(index: &Index) {
|
|
||||||
let agg_req = json!({
|
|
||||||
"my_texts": {
|
|
||||||
"terms": { "field": "text_1000_terms_zipf" },
|
|
||||||
"aggs": {
|
|
||||||
"histo": {"histogram": { "field": "score_f64", "interval": 10 }}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
execute_agg(index, agg_req);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn terms_status_with_avg_sub_agg(index: &Index) {
|
|
||||||
let agg_req = json!({
|
|
||||||
"my_texts": {
|
|
||||||
"terms": { "field": "text_few_terms_status" },
|
|
||||||
"aggs": {
|
|
||||||
"average_f64": { "avg": { "field": "score_f64" } }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
});
|
|
||||||
execute_agg(index, agg_req);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn terms_zipf_1000_with_avg_sub_agg(index: &Index) {
|
|
||||||
let agg_req = json!({
|
|
||||||
"my_texts": {
|
|
||||||
"terms": { "field": "text_1000_terms_zipf" },
|
|
||||||
"aggs": {
|
|
||||||
"average_f64": { "avg": { "field": "score_f64" } }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
});
|
|
||||||
execute_agg(index, agg_req);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn terms_zipf_1000(index: &Index) {
|
|
||||||
let agg_req = json!({
|
|
||||||
"my_texts": { "terms": { "field": "text_1000_terms_zipf" } },
|
|
||||||
});
|
|
||||||
execute_agg(index, agg_req);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
|
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"my_texts": {
|
"my_texts": {
|
||||||
@@ -357,7 +275,7 @@ fn range_agg_with_avg_sub_agg(index: &Index) {
|
|||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn range_agg_with_term_agg_status(index: &Index) {
|
fn range_agg_with_term_agg_few(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"rangef64": {
|
"rangef64": {
|
||||||
"range": {
|
"range": {
|
||||||
@@ -372,7 +290,7 @@ fn range_agg_with_term_agg_status(index: &Index) {
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"aggs": {
|
"aggs": {
|
||||||
"my_texts": { "terms": { "field": "text_few_terms_status" } },
|
"my_texts": { "terms": { "field": "text_few_terms" } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
@@ -428,12 +346,12 @@ fn histogram_with_avg_sub_agg(index: &Index) {
|
|||||||
});
|
});
|
||||||
execute_agg(index, agg_req);
|
execute_agg(index, agg_req);
|
||||||
}
|
}
|
||||||
fn histogram_with_term_agg_status(index: &Index) {
|
fn histogram_with_term_agg_few(index: &Index) {
|
||||||
let agg_req = json!({
|
let agg_req = json!({
|
||||||
"rangef64": {
|
"rangef64": {
|
||||||
"histogram": { "field": "score_f64", "interval": 10 },
|
"histogram": { "field": "score_f64", "interval": 10 },
|
||||||
"aggs": {
|
"aggs": {
|
||||||
"my_texts": { "terms": { "field": "text_few_terms_status" } }
|
"my_texts": { "terms": { "field": "text_few_terms" } }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -478,13 +396,6 @@ fn get_collector(agg_req: Aggregations) -> AggregationCollector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
||||||
// Flag to use existing index
|
|
||||||
let reuse_index = std::env::var("REUSE_AGG_BENCH_INDEX").is_ok();
|
|
||||||
if reuse_index && std::path::Path::new("agg_bench").exists() {
|
|
||||||
return Index::open_in_dir("agg_bench");
|
|
||||||
}
|
|
||||||
// crreate dir
|
|
||||||
std::fs::create_dir_all("agg_bench")?;
|
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let text_fieldtype = tantivy::schema::TextOptions::default()
|
let text_fieldtype = tantivy::schema::TextOptions::default()
|
||||||
.set_indexing_options(
|
.set_indexing_options(
|
||||||
@@ -493,47 +404,20 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
|||||||
.set_stored();
|
.set_stored();
|
||||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||||
let json_field = schema_builder.add_json_field("json", FAST);
|
let json_field = schema_builder.add_json_field("json", FAST);
|
||||||
let text_field_all_unique_terms =
|
|
||||||
schema_builder.add_text_field("text_all_unique_terms", STRING | FAST);
|
|
||||||
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
||||||
let text_field_few_terms_status =
|
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
||||||
schema_builder.add_text_field("text_few_terms_status", STRING | FAST);
|
|
||||||
let text_field_1000_terms_zipf =
|
|
||||||
schema_builder.add_text_field("text_1000_terms_zipf", STRING | FAST);
|
|
||||||
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
|
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
|
||||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||||
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||||
// use tmp dir
|
let index = Index::create_from_tempdir(schema_builder.build())?;
|
||||||
let index = if reuse_index {
|
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
|
||||||
Index::create_in_dir("agg_bench", schema_builder.build())?
|
|
||||||
} else {
|
|
||||||
Index::create_from_tempdir(schema_builder.build())?
|
|
||||||
};
|
|
||||||
// Approximate log proportions
|
|
||||||
let status_field_data = [
|
|
||||||
("INFO", 8000),
|
|
||||||
("ERROR", 300),
|
|
||||||
("WARN", 1200),
|
|
||||||
("DEBUG", 500),
|
|
||||||
("OK", 500),
|
|
||||||
("CRITICAL", 20),
|
|
||||||
("EMERGENCY", 1),
|
|
||||||
];
|
|
||||||
let log_level_distribution =
|
|
||||||
WeightedIndex::new(status_field_data.iter().map(|item| item.1)).unwrap();
|
|
||||||
|
|
||||||
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
|
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
|
||||||
|
|
||||||
let many_terms_data = (0..150_000)
|
let many_terms_data = (0..150_000)
|
||||||
.map(|num| format!("author{num}"))
|
.map(|num| format!("author{num}"))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
// Prepare 1000 unique terms sampled using a Zipf distribution.
|
|
||||||
// Exponent ~1.1 approximates top-20 terms covering around ~20%.
|
|
||||||
let terms_1000: Vec<String> = (1..=1000).map(|i| format!("term_{i}")).collect();
|
|
||||||
let zipf_1000 = rand_distr::Zipf::new(1000, 1.1f64).unwrap();
|
|
||||||
|
|
||||||
{
|
{
|
||||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
|
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
|
||||||
@@ -543,25 +427,15 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
|||||||
index_writer.add_document(doc!())?;
|
index_writer.add_document(doc!())?;
|
||||||
}
|
}
|
||||||
if cardinality == Cardinality::Multivalued {
|
if cardinality == Cardinality::Multivalued {
|
||||||
let log_level_sample_a = status_field_data[log_level_distribution.sample(&mut rng)].0;
|
|
||||||
let log_level_sample_b = status_field_data[log_level_distribution.sample(&mut rng)].0;
|
|
||||||
let idx_a = zipf_1000.sample(&mut rng) as usize - 1;
|
|
||||||
let idx_b = zipf_1000.sample(&mut rng) as usize - 1;
|
|
||||||
let term_1000_a = &terms_1000[idx_a];
|
|
||||||
let term_1000_b = &terms_1000[idx_b];
|
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
json_field => json!({"mixed_type": 10.0}),
|
json_field => json!({"mixed_type": 10.0}),
|
||||||
json_field => json!({"mixed_type": 10.0}),
|
json_field => json!({"mixed_type": 10.0}),
|
||||||
text_field => "cool",
|
text_field => "cool",
|
||||||
text_field => "cool",
|
text_field => "cool",
|
||||||
text_field_all_unique_terms => "cool",
|
|
||||||
text_field_all_unique_terms => "coolo",
|
|
||||||
text_field_many_terms => "cool",
|
text_field_many_terms => "cool",
|
||||||
text_field_many_terms => "cool",
|
text_field_many_terms => "cool",
|
||||||
text_field_few_terms_status => log_level_sample_a,
|
text_field_few_terms => "cool",
|
||||||
text_field_few_terms_status => log_level_sample_b,
|
text_field_few_terms => "cool",
|
||||||
text_field_1000_terms_zipf => term_1000_a.as_str(),
|
|
||||||
text_field_1000_terms_zipf => term_1000_b.as_str(),
|
|
||||||
score_field => 1u64,
|
score_field => 1u64,
|
||||||
score_field => 1u64,
|
score_field => 1u64,
|
||||||
score_field_f64 => lg_norm.sample(&mut rng),
|
score_field_f64 => lg_norm.sample(&mut rng),
|
||||||
@@ -586,10 +460,8 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
|
|||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "cool",
|
text_field => "cool",
|
||||||
json_field => json,
|
json_field => json,
|
||||||
text_field_all_unique_terms => format!("unique_term_{}", rng.gen::<u64>()),
|
|
||||||
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||||
text_field_few_terms_status => status_field_data[log_level_distribution.sample(&mut rng)].0,
|
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||||
text_field_1000_terms_zipf => terms_1000[zipf_1000.sample(&mut rng) as usize - 1].as_str(),
|
|
||||||
score_field => val as u64,
|
score_field => val as u64,
|
||||||
score_field_f64 => lg_norm.sample(&mut rng),
|
score_field_f64 => lg_norm.sample(&mut rng),
|
||||||
score_field_i64 => val as i64,
|
score_field_i64 => val as i64,
|
||||||
@@ -641,7 +513,7 @@ fn filter_agg_all_query_with_sub_aggs(index: &Index) {
|
|||||||
"avg_score": { "avg": { "field": "score" } },
|
"avg_score": { "avg": { "field": "score" } },
|
||||||
"stats_score": { "stats": { "field": "score_f64" } },
|
"stats_score": { "stats": { "field": "score_f64" } },
|
||||||
"terms_text": {
|
"terms_text": {
|
||||||
"terms": { "field": "text_few_terms_status" }
|
"terms": { "field": "text_few_terms" }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -657,7 +529,7 @@ fn filter_agg_term_query_with_sub_aggs(index: &Index) {
|
|||||||
"avg_score": { "avg": { "field": "score" } },
|
"avg_score": { "avg": { "field": "score" } },
|
||||||
"stats_score": { "stats": { "field": "score_f64" } },
|
"stats_score": { "stats": { "field": "score_f64" } },
|
||||||
"terms_text": {
|
"terms_text": {
|
||||||
"terms": { "field": "text_few_terms_status" }
|
"terms": { "field": "text_few_terms" }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,15 +16,14 @@
|
|||||||
// - This bench isolates boolean iteration speed and intersection/union cost.
|
// - This bench isolates boolean iteration speed and intersection/union cost.
|
||||||
// - Use `cargo bench --bench boolean_conjunction` to run.
|
// - Use `cargo bench --bench boolean_conjunction` to run.
|
||||||
|
|
||||||
use binggan::{black_box, BenchGroup, BenchRunner};
|
use binggan::{black_box, BenchRunner};
|
||||||
use rand::prelude::*;
|
use rand::prelude::*;
|
||||||
use rand::rngs::StdRng;
|
use rand::rngs::StdRng;
|
||||||
use rand::SeedableRng;
|
use rand::SeedableRng;
|
||||||
use tantivy::collector::sort_key::SortByStaticFastValue;
|
use tantivy::collector::{Count, TopDocs};
|
||||||
use tantivy::collector::{Collector, Count, TopDocs};
|
use tantivy::query::QueryParser;
|
||||||
use tantivy::query::{Query, QueryParser};
|
use tantivy::schema::{Schema, TEXT};
|
||||||
use tantivy::schema::{Schema, FAST, TEXT};
|
use tantivy::{doc, Index, ReloadPolicy, Searcher};
|
||||||
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher};
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
struct BenchIndex {
|
struct BenchIndex {
|
||||||
@@ -34,6 +33,23 @@ struct BenchIndex {
|
|||||||
query_parser: QueryParser,
|
query_parser: QueryParser,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl BenchIndex {
|
||||||
|
#[inline(always)]
|
||||||
|
fn count_query(&self, query_str: &str) -> usize {
|
||||||
|
let query = self.query_parser.parse_query(query_str).unwrap();
|
||||||
|
self.searcher.search(&query, &Count).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn topk_len(&self, query_str: &str, k: usize) -> usize {
|
||||||
|
let query = self.query_parser.parse_query(query_str).unwrap();
|
||||||
|
self.searcher
|
||||||
|
.search(&query, &TopDocs::with_limit(k))
|
||||||
|
.unwrap()
|
||||||
|
.len()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Build a single index containing both fields (title, body) and
|
/// Build a single index containing both fields (title, body) and
|
||||||
/// return two BenchIndex views:
|
/// return two BenchIndex views:
|
||||||
/// - single_field: QueryParser defaults to only "body"
|
/// - single_field: QueryParser defaults to only "body"
|
||||||
@@ -43,8 +59,6 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
|
|||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let f_title = schema_builder.add_text_field("title", TEXT);
|
let f_title = schema_builder.add_text_field("title", TEXT);
|
||||||
let f_body = schema_builder.add_text_field("body", TEXT);
|
let f_body = schema_builder.add_text_field("body", TEXT);
|
||||||
let f_score = schema_builder.add_u64_field("score", FAST);
|
|
||||||
let f_score2 = schema_builder.add_u64_field("score2", FAST);
|
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema.clone());
|
let index = Index::create_in_ram(schema.clone());
|
||||||
|
|
||||||
@@ -53,13 +67,11 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
|
|||||||
|
|
||||||
// Populate: spread each present token 90/10 to body/title
|
// Populate: spread each present token 90/10 to body/title
|
||||||
{
|
{
|
||||||
let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
|
let mut writer = index.writer(500_000_000).unwrap();
|
||||||
for _ in 0..num_docs {
|
for _ in 0..num_docs {
|
||||||
let has_a = rng.gen_bool(p_a as f64);
|
let has_a = rng.gen_bool(p_a as f64);
|
||||||
let has_b = rng.gen_bool(p_b as f64);
|
let has_b = rng.gen_bool(p_b as f64);
|
||||||
let has_c = rng.gen_bool(p_c as f64);
|
let has_c = rng.gen_bool(p_c as f64);
|
||||||
let score = rng.gen_range(0u64..100u64);
|
|
||||||
let score2 = rng.gen_range(0u64..100_000u64);
|
|
||||||
let mut title_tokens: Vec<&str> = Vec::new();
|
let mut title_tokens: Vec<&str> = Vec::new();
|
||||||
let mut body_tokens: Vec<&str> = Vec::new();
|
let mut body_tokens: Vec<&str> = Vec::new();
|
||||||
if has_a {
|
if has_a {
|
||||||
@@ -89,9 +101,7 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
|
|||||||
writer
|
writer
|
||||||
.add_document(doc!(
|
.add_document(doc!(
|
||||||
f_title=>title_tokens.join(" "),
|
f_title=>title_tokens.join(" "),
|
||||||
f_body=>body_tokens.join(" "),
|
f_body=>body_tokens.join(" ")
|
||||||
f_score=>score,
|
|
||||||
f_score2=>score2,
|
|
||||||
))
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
@@ -143,76 +153,72 @@ fn main() {
|
|||||||
),
|
),
|
||||||
];
|
];
|
||||||
|
|
||||||
let queries = &["a", "+a +b", "+a +b +c", "a OR b", "a OR b OR c"];
|
|
||||||
|
|
||||||
let mut runner = BenchRunner::new();
|
let mut runner = BenchRunner::new();
|
||||||
for (label, n, pa, pb, pc) in scenarios {
|
for (label, n, pa, pb, pc) in scenarios {
|
||||||
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
|
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
|
||||||
|
|
||||||
for (view_name, bench_index) in [("single_field", single_view), ("multi_field", multi_view)]
|
// Single-field group: default field is body only
|
||||||
{
|
{
|
||||||
// Single-field group: default field is body only
|
|
||||||
let mut group = runner.new_group();
|
let mut group = runner.new_group();
|
||||||
group.set_name(format!("{} — {}", view_name, label));
|
group.set_name(format!("single_field — {}", label));
|
||||||
for query_str in queries {
|
group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
|
||||||
add_bench_task(&mut group, &bench_index, query_str, Count, "count");
|
black_box(benv.count_query("+a +b"))
|
||||||
add_bench_task(
|
});
|
||||||
&mut group,
|
group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
|
||||||
&bench_index,
|
black_box(benv.count_query("+a +b +c"))
|
||||||
query_str,
|
});
|
||||||
TopDocs::with_limit(10).order_by_score(),
|
group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
|
||||||
"top10",
|
black_box(benv.topk_len("+a +b", 10))
|
||||||
);
|
});
|
||||||
add_bench_task(
|
group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
|
||||||
&mut group,
|
black_box(benv.topk_len("+a +b +c", 10))
|
||||||
&bench_index,
|
});
|
||||||
query_str,
|
// OR queries
|
||||||
TopDocs::with_limit(10).order_by_fast_field::<u64>("score", Order::Asc),
|
group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
|
||||||
"top10_by_ff",
|
black_box(benv.count_query("a OR b"))
|
||||||
);
|
});
|
||||||
add_bench_task(
|
group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
|
||||||
&mut group,
|
black_box(benv.count_query("a OR b OR c"))
|
||||||
&bench_index,
|
});
|
||||||
query_str,
|
group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
|
||||||
TopDocs::with_limit(10).order_by((
|
black_box(benv.topk_len("a OR b", 10))
|
||||||
SortByStaticFastValue::<u64>::for_field("score"),
|
});
|
||||||
SortByStaticFastValue::<u64>::for_field("score2"),
|
group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
|
||||||
)),
|
black_box(benv.topk_len("a OR b OR c", 10))
|
||||||
"top10_by_2ff",
|
});
|
||||||
);
|
group.run();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Multi-field group: default fields are [title, body]
|
||||||
|
{
|
||||||
|
let mut group = runner.new_group();
|
||||||
|
group.set_name(format!("multi_field — {}", label));
|
||||||
|
group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.count_query("+a +b"))
|
||||||
|
});
|
||||||
|
group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.count_query("+a +b +c"))
|
||||||
|
});
|
||||||
|
group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.topk_len("+a +b", 10))
|
||||||
|
});
|
||||||
|
group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.topk_len("+a +b +c", 10))
|
||||||
|
});
|
||||||
|
// OR queries
|
||||||
|
group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.count_query("a OR b"))
|
||||||
|
});
|
||||||
|
group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.count_query("a OR b OR c"))
|
||||||
|
});
|
||||||
|
group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.topk_len("a OR b", 10))
|
||||||
|
});
|
||||||
|
group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
|
||||||
|
black_box(benv.topk_len("a OR b OR c", 10))
|
||||||
|
});
|
||||||
group.run();
|
group.run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn add_bench_task<C: Collector + 'static>(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query_str: &str,
|
|
||||||
collector: C,
|
|
||||||
collector_name: &str,
|
|
||||||
) {
|
|
||||||
let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name);
|
|
||||||
let query = bench_index.query_parser.parse_query(query_str).unwrap();
|
|
||||||
let search_task = SearchTask {
|
|
||||||
searcher: bench_index.searcher.clone(),
|
|
||||||
collector,
|
|
||||||
query,
|
|
||||||
};
|
|
||||||
bench_group.register(task_name, move |_| black_box(search_task.run()));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct SearchTask<C: Collector> {
|
|
||||||
searcher: Searcher,
|
|
||||||
collector: C,
|
|
||||||
query: Box<dyn Query>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<C: Collector> SearchTask<C> {
|
|
||||||
#[inline(never)]
|
|
||||||
pub fn run(&self) -> usize {
|
|
||||||
self.searcher.search(&self.query, &self.collector).unwrap();
|
|
||||||
1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,288 +0,0 @@
|
|||||||
use binggan::{black_box, BenchGroup, BenchRunner};
|
|
||||||
use rand::prelude::*;
|
|
||||||
use rand::rngs::StdRng;
|
|
||||||
use rand::SeedableRng;
|
|
||||||
use tantivy::collector::{Collector, Count, DocSetCollector, TopDocs};
|
|
||||||
use tantivy::query::{Query, QueryParser};
|
|
||||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
|
||||||
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher};
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct BenchIndex {
|
|
||||||
#[allow(dead_code)]
|
|
||||||
index: Index,
|
|
||||||
searcher: Searcher,
|
|
||||||
query_parser: QueryParser,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_shared_indices(num_docs: usize, p_title_a: f32, distribution: &str) -> BenchIndex {
|
|
||||||
// Unified schema
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let f_title = schema_builder.add_text_field("title", TEXT);
|
|
||||||
let f_num_rand = schema_builder.add_u64_field("num_rand", INDEXED);
|
|
||||||
let f_num_asc = schema_builder.add_u64_field("num_asc", INDEXED);
|
|
||||||
let f_num_rand_fast = schema_builder.add_u64_field("num_rand_fast", INDEXED | FAST);
|
|
||||||
let f_num_asc_fast = schema_builder.add_u64_field("num_asc_fast", INDEXED | FAST);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema.clone());
|
|
||||||
|
|
||||||
// Populate index with stable RNG for reproducibility.
|
|
||||||
let mut rng = StdRng::from_seed([7u8; 32]);
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut writer = index.writer_with_num_threads(1, 4_000_000_000).unwrap();
|
|
||||||
|
|
||||||
match distribution {
|
|
||||||
"dense" => {
|
|
||||||
for doc_id in 0..num_docs {
|
|
||||||
// Always add title to avoid empty documents
|
|
||||||
let title_token = if rng.gen_bool(p_title_a as f64) {
|
|
||||||
"a"
|
|
||||||
} else {
|
|
||||||
"b"
|
|
||||||
};
|
|
||||||
|
|
||||||
let num_rand = rng.gen_range(0u64..1000u64);
|
|
||||||
|
|
||||||
let num_asc = (doc_id / 10000) as u64;
|
|
||||||
|
|
||||||
writer
|
|
||||||
.add_document(doc!(
|
|
||||||
f_title=>title_token,
|
|
||||||
f_num_rand=>num_rand,
|
|
||||||
f_num_asc=>num_asc,
|
|
||||||
f_num_rand_fast=>num_rand,
|
|
||||||
f_num_asc_fast=>num_asc,
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"sparse" => {
|
|
||||||
for doc_id in 0..num_docs {
|
|
||||||
// Always add title to avoid empty documents
|
|
||||||
let title_token = if rng.gen_bool(p_title_a as f64) {
|
|
||||||
"a"
|
|
||||||
} else {
|
|
||||||
"b"
|
|
||||||
};
|
|
||||||
|
|
||||||
let num_rand = rng.gen_range(0u64..10000000u64);
|
|
||||||
|
|
||||||
let num_asc = doc_id as u64;
|
|
||||||
|
|
||||||
writer
|
|
||||||
.add_document(doc!(
|
|
||||||
f_title=>title_token,
|
|
||||||
f_num_rand=>num_rand,
|
|
||||||
f_num_asc=>num_asc,
|
|
||||||
f_num_rand_fast=>num_rand,
|
|
||||||
f_num_asc_fast=>num_asc,
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
panic!("Unsupported distribution type");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
writer.commit().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prepare reader/searcher once.
|
|
||||||
let reader = index
|
|
||||||
.reader_builder()
|
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
|
||||||
.try_into()
|
|
||||||
.unwrap();
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
|
|
||||||
// Build query parser for title field
|
|
||||||
let qp_title = QueryParser::for_index(&index, vec![f_title]);
|
|
||||||
|
|
||||||
BenchIndex {
|
|
||||||
index,
|
|
||||||
searcher,
|
|
||||||
query_parser: qp_title,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() {
|
|
||||||
// Prepare corpora with varying scenarios
|
|
||||||
let scenarios = vec![
|
|
||||||
(
|
|
||||||
"dense and 99% a".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
0.99,
|
|
||||||
"dense",
|
|
||||||
0,
|
|
||||||
9,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"dense and 99% a".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
0.99,
|
|
||||||
"dense",
|
|
||||||
990,
|
|
||||||
999,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"sparse and 99% a".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
0.99,
|
|
||||||
"sparse",
|
|
||||||
0,
|
|
||||||
9,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"sparse and 99% a".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
0.99,
|
|
||||||
"sparse",
|
|
||||||
9_999_990,
|
|
||||||
9_999_999,
|
|
||||||
),
|
|
||||||
];
|
|
||||||
|
|
||||||
let mut runner = BenchRunner::new();
|
|
||||||
for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios {
|
|
||||||
// Build index for this scenario
|
|
||||||
let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution);
|
|
||||||
|
|
||||||
// Create benchmark group
|
|
||||||
let mut group = runner.new_group();
|
|
||||||
|
|
||||||
// Now set the name (this moves scenario_id)
|
|
||||||
group.set_name(scenario_id);
|
|
||||||
|
|
||||||
// Define all four field types
|
|
||||||
let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"];
|
|
||||||
|
|
||||||
// Define the three terms we want to test with
|
|
||||||
let terms = ["a", "b", "z"];
|
|
||||||
|
|
||||||
// Generate all combinations of terms and field names
|
|
||||||
let mut queries = Vec::new();
|
|
||||||
for &term in &terms {
|
|
||||||
for &field_name in &field_names {
|
|
||||||
let query_str = format!(
|
|
||||||
"{} AND {}:[{} TO {}]",
|
|
||||||
term, field_name, range_low, range_high
|
|
||||||
);
|
|
||||||
queries.push((query_str, field_name.to_string()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let query_str = format!(
|
|
||||||
"{}:[{} TO {}] AND {}:[{} TO {}]",
|
|
||||||
"num_rand_fast", range_low, range_high, "num_asc_fast", range_low, range_high
|
|
||||||
);
|
|
||||||
queries.push((query_str, "num_asc_fast".to_string()));
|
|
||||||
|
|
||||||
// Run all benchmark tasks for each query and its corresponding field name
|
|
||||||
for (query_str, field_name) in queries {
|
|
||||||
run_benchmark_tasks(&mut group, &bench_index, &query_str, &field_name);
|
|
||||||
}
|
|
||||||
|
|
||||||
group.run();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Run all benchmark tasks for a given query string and field name
|
|
||||||
fn run_benchmark_tasks(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query_str: &str,
|
|
||||||
field_name: &str,
|
|
||||||
) {
|
|
||||||
// Test count
|
|
||||||
add_bench_task(bench_group, bench_index, query_str, Count, "count");
|
|
||||||
|
|
||||||
// Test all results
|
|
||||||
add_bench_task(
|
|
||||||
bench_group,
|
|
||||||
bench_index,
|
|
||||||
query_str,
|
|
||||||
DocSetCollector,
|
|
||||||
"all results",
|
|
||||||
);
|
|
||||||
|
|
||||||
// Test top 100 by the field (if it's a FAST field)
|
|
||||||
if field_name.ends_with("_fast") {
|
|
||||||
// Ascending order
|
|
||||||
{
|
|
||||||
let collector_name = format!("top100_by_{}_asc", field_name);
|
|
||||||
let field_name_owned = field_name.to_string();
|
|
||||||
add_bench_task(
|
|
||||||
bench_group,
|
|
||||||
bench_index,
|
|
||||||
query_str,
|
|
||||||
TopDocs::with_limit(100).order_by_fast_field::<u64>(field_name_owned, Order::Asc),
|
|
||||||
&collector_name,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Descending order
|
|
||||||
{
|
|
||||||
let collector_name = format!("top100_by_{}_desc", field_name);
|
|
||||||
let field_name_owned = field_name.to_string();
|
|
||||||
add_bench_task(
|
|
||||||
bench_group,
|
|
||||||
bench_index,
|
|
||||||
query_str,
|
|
||||||
TopDocs::with_limit(100).order_by_fast_field::<u64>(field_name_owned, Order::Desc),
|
|
||||||
&collector_name,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_bench_task<C: Collector + 'static>(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query_str: &str,
|
|
||||||
collector: C,
|
|
||||||
collector_name: &str,
|
|
||||||
) {
|
|
||||||
let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name);
|
|
||||||
let query = bench_index.query_parser.parse_query(query_str).unwrap();
|
|
||||||
let search_task = SearchTask {
|
|
||||||
searcher: bench_index.searcher.clone(),
|
|
||||||
collector,
|
|
||||||
query,
|
|
||||||
};
|
|
||||||
bench_group.register(task_name, move |_| black_box(search_task.run()));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct SearchTask<C: Collector> {
|
|
||||||
searcher: Searcher,
|
|
||||||
collector: C,
|
|
||||||
query: Box<dyn Query>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<C: Collector> SearchTask<C> {
|
|
||||||
#[inline(never)]
|
|
||||||
pub fn run(&self) -> usize {
|
|
||||||
let result = self.searcher.search(&self.query, &self.collector).unwrap();
|
|
||||||
if let Some(count) = (&result as &dyn std::any::Any).downcast_ref::<usize>() {
|
|
||||||
*count
|
|
||||||
} else if let Some(top_docs) = (&result as &dyn std::any::Any)
|
|
||||||
.downcast_ref::<Vec<(Option<u64>, tantivy::DocAddress)>>()
|
|
||||||
{
|
|
||||||
top_docs.len()
|
|
||||||
} else if let Some(top_docs) =
|
|
||||||
(&result as &dyn std::any::Any).downcast_ref::<Vec<(u64, tantivy::DocAddress)>>()
|
|
||||||
{
|
|
||||||
top_docs.len()
|
|
||||||
} else if let Some(doc_set) = (&result as &dyn std::any::Any)
|
|
||||||
.downcast_ref::<std::collections::HashSet<tantivy::DocAddress>>()
|
|
||||||
{
|
|
||||||
doc_set.len()
|
|
||||||
} else {
|
|
||||||
eprintln!(
|
|
||||||
"Unknown collector result type: {:?}",
|
|
||||||
std::any::type_name::<C::Fruit>()
|
|
||||||
);
|
|
||||||
0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,365 +0,0 @@
|
|||||||
use std::ops::Bound;
|
|
||||||
|
|
||||||
use binggan::{black_box, BenchGroup, BenchRunner};
|
|
||||||
use rand::prelude::*;
|
|
||||||
use rand::rngs::StdRng;
|
|
||||||
use rand::SeedableRng;
|
|
||||||
use tantivy::collector::{Count, DocSetCollector, TopDocs};
|
|
||||||
use tantivy::query::RangeQuery;
|
|
||||||
use tantivy::schema::{Schema, FAST, INDEXED};
|
|
||||||
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher, Term};
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct BenchIndex {
|
|
||||||
#[allow(dead_code)]
|
|
||||||
index: Index,
|
|
||||||
searcher: Searcher,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
|
|
||||||
// Schema with fast fields only
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let f_num_rand_fast = schema_builder.add_u64_field("num_rand_fast", INDEXED | FAST);
|
|
||||||
let f_num_asc_fast = schema_builder.add_u64_field("num_asc_fast", INDEXED | FAST);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema.clone());
|
|
||||||
|
|
||||||
// Populate index with stable RNG for reproducibility.
|
|
||||||
let mut rng = StdRng::from_seed([7u8; 32]);
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut writer = index.writer_with_num_threads(1, 4_000_000_000).unwrap();
|
|
||||||
|
|
||||||
match distribution {
|
|
||||||
"dense" => {
|
|
||||||
for doc_id in 0..num_docs {
|
|
||||||
let num_rand = rng.gen_range(0u64..1000u64);
|
|
||||||
let num_asc = (doc_id / 10000) as u64;
|
|
||||||
|
|
||||||
writer
|
|
||||||
.add_document(doc!(
|
|
||||||
f_num_rand_fast=>num_rand,
|
|
||||||
f_num_asc_fast=>num_asc,
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"sparse" => {
|
|
||||||
for doc_id in 0..num_docs {
|
|
||||||
let num_rand = rng.gen_range(0u64..10000000u64);
|
|
||||||
let num_asc = doc_id as u64;
|
|
||||||
|
|
||||||
writer
|
|
||||||
.add_document(doc!(
|
|
||||||
f_num_rand_fast=>num_rand,
|
|
||||||
f_num_asc_fast=>num_asc,
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
panic!("Unsupported distribution type");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
writer.commit().unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prepare reader/searcher once.
|
|
||||||
let reader = index
|
|
||||||
.reader_builder()
|
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
|
||||||
.try_into()
|
|
||||||
.unwrap();
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
|
|
||||||
BenchIndex { index, searcher }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() {
|
|
||||||
// Prepare corpora with varying scenarios
|
|
||||||
let scenarios = vec![
|
|
||||||
// Dense distribution - random values in small range (0-999)
|
|
||||||
(
|
|
||||||
"dense_values_search_low_value_range".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
"dense",
|
|
||||||
0,
|
|
||||||
9,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"dense_values_search_high_value_range".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
"dense",
|
|
||||||
990,
|
|
||||||
999,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"dense_values_search_out_of_range".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
"dense",
|
|
||||||
1000,
|
|
||||||
1002,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"sparse_values_search_low_value_range".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
"sparse",
|
|
||||||
0,
|
|
||||||
9,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"sparse_values_search_high_value_range".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
"sparse",
|
|
||||||
9_999_990,
|
|
||||||
9_999_999,
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"sparse_values_search_out_of_range".to_string(),
|
|
||||||
10_000_000,
|
|
||||||
"sparse",
|
|
||||||
10_000_000,
|
|
||||||
10_000_002,
|
|
||||||
),
|
|
||||||
];
|
|
||||||
|
|
||||||
let mut runner = BenchRunner::new();
|
|
||||||
for (scenario_id, n, num_rand_distribution, range_low, range_high) in scenarios {
|
|
||||||
// Build index for this scenario
|
|
||||||
let bench_index = build_shared_indices(n, num_rand_distribution);
|
|
||||||
|
|
||||||
// Create benchmark group
|
|
||||||
let mut group = runner.new_group();
|
|
||||||
|
|
||||||
// Now set the name (this moves scenario_id)
|
|
||||||
group.set_name(scenario_id);
|
|
||||||
|
|
||||||
// Define fast field types
|
|
||||||
let field_names = ["num_rand_fast", "num_asc_fast"];
|
|
||||||
|
|
||||||
// Generate range queries for fast fields
|
|
||||||
for &field_name in &field_names {
|
|
||||||
// Create the range query
|
|
||||||
let field = bench_index.searcher.schema().get_field(field_name).unwrap();
|
|
||||||
let lower_term = Term::from_field_u64(field, range_low);
|
|
||||||
let upper_term = Term::from_field_u64(field, range_high);
|
|
||||||
|
|
||||||
let query = RangeQuery::new(Bound::Included(lower_term), Bound::Included(upper_term));
|
|
||||||
|
|
||||||
run_benchmark_tasks(
|
|
||||||
&mut group,
|
|
||||||
&bench_index,
|
|
||||||
query,
|
|
||||||
field_name,
|
|
||||||
range_low,
|
|
||||||
range_high,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
group.run();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Run all benchmark tasks for a given range query and field name
|
|
||||||
fn run_benchmark_tasks(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query: RangeQuery,
|
|
||||||
field_name: &str,
|
|
||||||
range_low: u64,
|
|
||||||
range_high: u64,
|
|
||||||
) {
|
|
||||||
// Test count
|
|
||||||
add_bench_task_count(
|
|
||||||
bench_group,
|
|
||||||
bench_index,
|
|
||||||
query.clone(),
|
|
||||||
"count",
|
|
||||||
field_name,
|
|
||||||
range_low,
|
|
||||||
range_high,
|
|
||||||
);
|
|
||||||
|
|
||||||
// Test top 100 by the field (ascending order)
|
|
||||||
{
|
|
||||||
let collector_name = format!("top100_by_{}_asc", field_name);
|
|
||||||
let field_name_owned = field_name.to_string();
|
|
||||||
add_bench_task_top100_asc(
|
|
||||||
bench_group,
|
|
||||||
bench_index,
|
|
||||||
query.clone(),
|
|
||||||
&collector_name,
|
|
||||||
field_name,
|
|
||||||
range_low,
|
|
||||||
range_high,
|
|
||||||
field_name_owned,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test top 100 by the field (descending order)
|
|
||||||
{
|
|
||||||
let collector_name = format!("top100_by_{}_desc", field_name);
|
|
||||||
let field_name_owned = field_name.to_string();
|
|
||||||
add_bench_task_top100_desc(
|
|
||||||
bench_group,
|
|
||||||
bench_index,
|
|
||||||
query,
|
|
||||||
&collector_name,
|
|
||||||
field_name,
|
|
||||||
range_low,
|
|
||||||
range_high,
|
|
||||||
field_name_owned,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_bench_task_count(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query: RangeQuery,
|
|
||||||
collector_name: &str,
|
|
||||||
field_name: &str,
|
|
||||||
range_low: u64,
|
|
||||||
range_high: u64,
|
|
||||||
) {
|
|
||||||
let task_name = format!(
|
|
||||||
"range_{}_[{} TO {}]_{}",
|
|
||||||
field_name, range_low, range_high, collector_name
|
|
||||||
);
|
|
||||||
|
|
||||||
let search_task = CountSearchTask {
|
|
||||||
searcher: bench_index.searcher.clone(),
|
|
||||||
query,
|
|
||||||
};
|
|
||||||
bench_group.register(task_name, move |_| black_box(search_task.run()));
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_bench_task_docset(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query: RangeQuery,
|
|
||||||
collector_name: &str,
|
|
||||||
field_name: &str,
|
|
||||||
range_low: u64,
|
|
||||||
range_high: u64,
|
|
||||||
) {
|
|
||||||
let task_name = format!(
|
|
||||||
"range_{}_[{} TO {}]_{}",
|
|
||||||
field_name, range_low, range_high, collector_name
|
|
||||||
);
|
|
||||||
|
|
||||||
let search_task = DocSetSearchTask {
|
|
||||||
searcher: bench_index.searcher.clone(),
|
|
||||||
query,
|
|
||||||
};
|
|
||||||
bench_group.register(task_name, move |_| black_box(search_task.run()));
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_bench_task_top100_asc(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query: RangeQuery,
|
|
||||||
collector_name: &str,
|
|
||||||
field_name: &str,
|
|
||||||
range_low: u64,
|
|
||||||
range_high: u64,
|
|
||||||
field_name_owned: String,
|
|
||||||
) {
|
|
||||||
let task_name = format!(
|
|
||||||
"range_{}_[{} TO {}]_{}",
|
|
||||||
field_name, range_low, range_high, collector_name
|
|
||||||
);
|
|
||||||
|
|
||||||
let search_task = Top100AscSearchTask {
|
|
||||||
searcher: bench_index.searcher.clone(),
|
|
||||||
query,
|
|
||||||
field_name: field_name_owned,
|
|
||||||
};
|
|
||||||
bench_group.register(task_name, move |_| black_box(search_task.run()));
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_bench_task_top100_desc(
|
|
||||||
bench_group: &mut BenchGroup,
|
|
||||||
bench_index: &BenchIndex,
|
|
||||||
query: RangeQuery,
|
|
||||||
collector_name: &str,
|
|
||||||
field_name: &str,
|
|
||||||
range_low: u64,
|
|
||||||
range_high: u64,
|
|
||||||
field_name_owned: String,
|
|
||||||
) {
|
|
||||||
let task_name = format!(
|
|
||||||
"range_{}_[{} TO {}]_{}",
|
|
||||||
field_name, range_low, range_high, collector_name
|
|
||||||
);
|
|
||||||
|
|
||||||
let search_task = Top100DescSearchTask {
|
|
||||||
searcher: bench_index.searcher.clone(),
|
|
||||||
query,
|
|
||||||
field_name: field_name_owned,
|
|
||||||
};
|
|
||||||
bench_group.register(task_name, move |_| black_box(search_task.run()));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct CountSearchTask {
|
|
||||||
searcher: Searcher,
|
|
||||||
query: RangeQuery,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CountSearchTask {
|
|
||||||
#[inline(never)]
|
|
||||||
pub fn run(&self) -> usize {
|
|
||||||
self.searcher.search(&self.query, &Count).unwrap()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct DocSetSearchTask {
|
|
||||||
searcher: Searcher,
|
|
||||||
query: RangeQuery,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DocSetSearchTask {
|
|
||||||
#[inline(never)]
|
|
||||||
pub fn run(&self) -> usize {
|
|
||||||
let result = self.searcher.search(&self.query, &DocSetCollector).unwrap();
|
|
||||||
result.len()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Top100AscSearchTask {
|
|
||||||
searcher: Searcher,
|
|
||||||
query: RangeQuery,
|
|
||||||
field_name: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Top100AscSearchTask {
|
|
||||||
#[inline(never)]
|
|
||||||
pub fn run(&self) -> usize {
|
|
||||||
let collector =
|
|
||||||
TopDocs::with_limit(100).order_by_fast_field::<u64>(&self.field_name, Order::Asc);
|
|
||||||
let result = self.searcher.search(&self.query, &collector).unwrap();
|
|
||||||
for (_score, doc_address) in &result {
|
|
||||||
let _doc: tantivy::TantivyDocument = self.searcher.doc(*doc_address).unwrap();
|
|
||||||
}
|
|
||||||
result.len()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Top100DescSearchTask {
|
|
||||||
searcher: Searcher,
|
|
||||||
query: RangeQuery,
|
|
||||||
field_name: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Top100DescSearchTask {
|
|
||||||
#[inline(never)]
|
|
||||||
pub fn run(&self) -> usize {
|
|
||||||
let collector =
|
|
||||||
TopDocs::with_limit(100).order_by_fast_field::<u64>(&self.field_name, Order::Desc);
|
|
||||||
let result = self.searcher.search(&self.query, &collector).unwrap();
|
|
||||||
for (_score, doc_address) in &result {
|
|
||||||
let _doc: tantivy::TantivyDocument = self.searcher.doc(*doc_address).unwrap();
|
|
||||||
}
|
|
||||||
result.len()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,260 +0,0 @@
|
|||||||
use std::fmt::Display;
|
|
||||||
use std::net::Ipv6Addr;
|
|
||||||
use std::ops::RangeInclusive;
|
|
||||||
|
|
||||||
use binggan::plugins::PeakMemAllocPlugin;
|
|
||||||
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
|
||||||
use columnar::MonotonicallyMappableToU128;
|
|
||||||
use rand::rngs::StdRng;
|
|
||||||
use rand::{Rng, SeedableRng};
|
|
||||||
use tantivy::collector::{Count, TopDocs};
|
|
||||||
use tantivy::query::QueryParser;
|
|
||||||
use tantivy::schema::*;
|
|
||||||
use tantivy::{doc, Index};
|
|
||||||
|
|
||||||
#[global_allocator]
|
|
||||||
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
|
||||||
|
|
||||||
fn main() {
|
|
||||||
bench_range_query();
|
|
||||||
}
|
|
||||||
|
|
||||||
fn bench_range_query() {
|
|
||||||
let index = get_index_0_to_100();
|
|
||||||
let mut runner = BenchRunner::new();
|
|
||||||
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
|
|
||||||
|
|
||||||
runner.set_name("range_query on u64");
|
|
||||||
let field_name_and_descr: Vec<_> = vec![
|
|
||||||
("id", "Single Valued Range Field"),
|
|
||||||
("ids", "Multi Valued Range Field"),
|
|
||||||
];
|
|
||||||
let range_num_hits = vec![
|
|
||||||
("90_percent", get_90_percent()),
|
|
||||||
("10_percent", get_10_percent()),
|
|
||||||
("1_percent", get_1_percent()),
|
|
||||||
];
|
|
||||||
|
|
||||||
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
|
|
||||||
|
|
||||||
runner.set_name("range_query on ip");
|
|
||||||
let field_name_and_descr: Vec<_> = vec![
|
|
||||||
("ip", "Single Valued Range Field"),
|
|
||||||
("ips", "Multi Valued Range Field"),
|
|
||||||
];
|
|
||||||
let range_num_hits = vec![
|
|
||||||
("90_percent", get_90_percent_ip()),
|
|
||||||
("10_percent", get_10_percent_ip()),
|
|
||||||
("1_percent", get_1_percent_ip()),
|
|
||||||
];
|
|
||||||
|
|
||||||
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_range<T: Display>(
|
|
||||||
runner: &mut BenchRunner,
|
|
||||||
index: &Index,
|
|
||||||
field_name_and_descr: &[(&str, &str)],
|
|
||||||
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
|
|
||||||
) {
|
|
||||||
for (field, suffix) in field_name_and_descr {
|
|
||||||
let term_num_hits = vec![
|
|
||||||
("", ""),
|
|
||||||
("1_percent", "veryfew"),
|
|
||||||
("10_percent", "few"),
|
|
||||||
("90_percent", "most"),
|
|
||||||
];
|
|
||||||
let mut group = runner.new_group();
|
|
||||||
group.set_name(suffix);
|
|
||||||
// all intersect combinations
|
|
||||||
for (range_name, range) in &range_num_hits {
|
|
||||||
for (term_name, term) in &term_num_hits {
|
|
||||||
let index = &index;
|
|
||||||
let test_name = if term_name.is_empty() {
|
|
||||||
format!("id_range_hit_{}", range_name)
|
|
||||||
} else {
|
|
||||||
format!(
|
|
||||||
"id_range_hit_{}_intersect_with_term_{}",
|
|
||||||
range_name, term_name
|
|
||||||
)
|
|
||||||
};
|
|
||||||
group.register(test_name, move |_| {
|
|
||||||
let query = if term_name.is_empty() {
|
|
||||||
"".to_string()
|
|
||||||
} else {
|
|
||||||
format!("AND id_name:{}", term)
|
|
||||||
};
|
|
||||||
black_box(execute_query(field, range, &query, index));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
group.run();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_index_0_to_100() -> Index {
|
|
||||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
|
||||||
let num_vals = 100_000;
|
|
||||||
let docs: Vec<_> = (0..num_vals)
|
|
||||||
.map(|_i| {
|
|
||||||
let id_name = if rng.gen_bool(0.01) {
|
|
||||||
"veryfew".to_string() // 1%
|
|
||||||
} else if rng.gen_bool(0.1) {
|
|
||||||
"few".to_string() // 9%
|
|
||||||
} else {
|
|
||||||
"most".to_string() // 90%
|
|
||||||
};
|
|
||||||
Doc {
|
|
||||||
id_name,
|
|
||||||
id: rng.gen_range(0..100),
|
|
||||||
// Multiply by 1000, so that we create most buckets in the compact space
|
|
||||||
// The benches depend on this range to select n-percent of elements with the
|
|
||||||
// methods below.
|
|
||||||
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
create_index_from_docs(&docs)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub struct Doc {
|
|
||||||
pub id_name: String,
|
|
||||||
pub id: u64,
|
|
||||||
pub ip: Ipv6Addr,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
|
|
||||||
let ids_u64_field =
|
|
||||||
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());
|
|
||||||
|
|
||||||
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
|
|
||||||
let ids_f64_field = schema_builder.add_f64_field(
|
|
||||||
"ids_f64",
|
|
||||||
NumericOptions::default().set_fast().set_indexed(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
|
|
||||||
let ids_i64_field = schema_builder.add_i64_field(
|
|
||||||
"ids_i64",
|
|
||||||
NumericOptions::default().set_fast().set_indexed(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
|
|
||||||
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
|
|
||||||
|
|
||||||
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
|
|
||||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);
|
|
||||||
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
|
|
||||||
for doc in docs.iter() {
|
|
||||||
index_writer
|
|
||||||
.add_document(doc!(
|
|
||||||
ids_i64_field => doc.id as i64,
|
|
||||||
ids_i64_field => doc.id as i64,
|
|
||||||
ids_f64_field => doc.id as f64,
|
|
||||||
ids_f64_field => doc.id as f64,
|
|
||||||
ids_u64_field => doc.id,
|
|
||||||
ids_u64_field => doc.id,
|
|
||||||
id_u64_field => doc.id,
|
|
||||||
id_f64_field => doc.id as f64,
|
|
||||||
id_i64_field => doc.id as i64,
|
|
||||||
text_field => doc.id_name.to_string(),
|
|
||||||
text_field2 => doc.id_name.to_string(),
|
|
||||||
ips_field => doc.ip,
|
|
||||||
ips_field => doc.ip,
|
|
||||||
ip_field => doc.ip,
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
}
|
|
||||||
index
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_90_percent() -> RangeInclusive<u64> {
|
|
||||||
0..=90
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_10_percent() -> RangeInclusive<u64> {
|
|
||||||
0..=10
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_1_percent() -> RangeInclusive<u64> {
|
|
||||||
10..=10
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
|
|
||||||
let start = Ipv6Addr::from_u128(0);
|
|
||||||
let end = Ipv6Addr::from_u128(90 * 1000);
|
|
||||||
start..=end
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
|
|
||||||
let start = Ipv6Addr::from_u128(0);
|
|
||||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
|
||||||
start..=end
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
|
|
||||||
let start = Ipv6Addr::from_u128(10 * 1000);
|
|
||||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
|
||||||
start..=end
|
|
||||||
}
|
|
||||||
|
|
||||||
struct NumHits {
|
|
||||||
count: usize,
|
|
||||||
}
|
|
||||||
impl OutputValue for NumHits {
|
|
||||||
fn column_title() -> &'static str {
|
|
||||||
"NumHits"
|
|
||||||
}
|
|
||||||
fn format(&self) -> Option<String> {
|
|
||||||
Some(self.count.to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn execute_query<T: Display>(
|
|
||||||
field: &str,
|
|
||||||
id_range: &RangeInclusive<T>,
|
|
||||||
suffix: &str,
|
|
||||||
index: &Index,
|
|
||||||
) -> NumHits {
|
|
||||||
let gen_query_inclusive = |from: &T, to: &T| {
|
|
||||||
format!(
|
|
||||||
"{}:[{} TO {}] {}",
|
|
||||||
field,
|
|
||||||
&from.to_string(),
|
|
||||||
&to.to_string(),
|
|
||||||
suffix
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
let query = gen_query_inclusive(id_range.start(), id_range.end());
|
|
||||||
execute_query_(&query, index)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn execute_query_(query: &str, index: &Index) -> NumHits {
|
|
||||||
let query_from_text = |text: &str| {
|
|
||||||
QueryParser::for_index(index, vec![])
|
|
||||||
.parse_query(text)
|
|
||||||
.unwrap()
|
|
||||||
};
|
|
||||||
let query = query_from_text(query);
|
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
let num_hits = searcher
|
|
||||||
.search(&query, &(TopDocs::with_limit(10).order_by_score(), Count))
|
|
||||||
.unwrap()
|
|
||||||
.1;
|
|
||||||
NumHits { count: num_hits }
|
|
||||||
}
|
|
||||||
@@ -258,7 +258,7 @@ mod test {
|
|||||||
bitpacker.write(val, num_bits, &mut data).unwrap();
|
bitpacker.write(val, num_bits, &mut data).unwrap();
|
||||||
}
|
}
|
||||||
bitpacker.close(&mut data).unwrap();
|
bitpacker.close(&mut data).unwrap();
|
||||||
assert_eq!(data.len(), ((num_bits as usize) * len).div_ceil(8));
|
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8);
|
||||||
let bitunpacker = BitUnpacker::new(num_bits);
|
let bitunpacker = BitUnpacker::new(num_bits);
|
||||||
(bitunpacker, vals, data)
|
(bitunpacker, vals, data)
|
||||||
}
|
}
|
||||||
@@ -304,7 +304,7 @@ mod test {
|
|||||||
bitpacker.write(val, num_bits, &mut buffer).unwrap();
|
bitpacker.write(val, num_bits, &mut buffer).unwrap();
|
||||||
}
|
}
|
||||||
bitpacker.flush(&mut buffer).unwrap();
|
bitpacker.flush(&mut buffer).unwrap();
|
||||||
assert_eq!(buffer.len(), (vals.len() * num_bits as usize).div_ceil(8));
|
assert_eq!(buffer.len(), (vals.len() * num_bits as usize + 7) / 8);
|
||||||
let bitunpacker = BitUnpacker::new(num_bits);
|
let bitunpacker = BitUnpacker::new(num_bits);
|
||||||
let max_val = if num_bits == 64 {
|
let max_val = if num_bits == 64 {
|
||||||
u64::MAX
|
u64::MAX
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ fn u32_to_i32(val: u32) -> i32 {
|
|||||||
#[inline]
|
#[inline]
|
||||||
unsafe fn u32_to_i32_avx2(vals_u32x8s: DataType) -> DataType {
|
unsafe fn u32_to_i32_avx2(vals_u32x8s: DataType) -> DataType {
|
||||||
const HIGHEST_BIT_MASK: DataType = from_u32x8([HIGHEST_BIT; NUM_LANES]);
|
const HIGHEST_BIT_MASK: DataType = from_u32x8([HIGHEST_BIT; NUM_LANES]);
|
||||||
unsafe { op_xor(vals_u32x8s, HIGHEST_BIT_MASK) }
|
op_xor(vals_u32x8s, HIGHEST_BIT_MASK)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
|
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
|
||||||
@@ -66,19 +66,17 @@ unsafe fn filter_vec_avx2_aux(
|
|||||||
]);
|
]);
|
||||||
const SHIFT: __m256i = from_u32x8([NUM_LANES as u32; NUM_LANES]);
|
const SHIFT: __m256i = from_u32x8([NUM_LANES as u32; NUM_LANES]);
|
||||||
for _ in 0..num_words {
|
for _ in 0..num_words {
|
||||||
unsafe {
|
let word = load_unaligned(input);
|
||||||
let word = load_unaligned(input);
|
let word = u32_to_i32_avx2(word);
|
||||||
let word = u32_to_i32_avx2(word);
|
let keeper_bitset = compute_filter_bitset(word, range_simd.clone());
|
||||||
let keeper_bitset = compute_filter_bitset(word, range_simd.clone());
|
let added_len = keeper_bitset.count_ones();
|
||||||
let added_len = keeper_bitset.count_ones();
|
let filtered_doc_ids = compact(ids, keeper_bitset);
|
||||||
let filtered_doc_ids = compact(ids, keeper_bitset);
|
store_unaligned(output_tail as *mut __m256i, filtered_doc_ids);
|
||||||
store_unaligned(output_tail as *mut __m256i, filtered_doc_ids);
|
output_tail = output_tail.offset(added_len as isize);
|
||||||
output_tail = output_tail.offset(added_len as isize);
|
ids = op_add(ids, SHIFT);
|
||||||
ids = op_add(ids, SHIFT);
|
input = input.offset(1);
|
||||||
input = input.offset(1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
unsafe { output_tail.offset_from(output) as usize }
|
output_tail.offset_from(output) as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
@@ -94,7 +92,8 @@ unsafe fn compute_filter_bitset(val: __m256i, range: std::ops::RangeInclusive<__
|
|||||||
let too_low = op_greater(*range.start(), val);
|
let too_low = op_greater(*range.start(), val);
|
||||||
let too_high = op_greater(val, *range.end());
|
let too_high = op_greater(val, *range.end());
|
||||||
let inside = op_or(too_low, too_high);
|
let inside = op_or(too_low, too_high);
|
||||||
255 - std::arch::x86_64::_mm256_movemask_ps(_mm256_castsi256_ps(inside)) as u8
|
255 - std::arch::x86_64::_mm256_movemask_ps(std::mem::transmute::<DataType, __m256>(inside))
|
||||||
|
as u8
|
||||||
}
|
}
|
||||||
|
|
||||||
union U8x32 {
|
union U8x32 {
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ The crate introduces the following concepts.
|
|||||||
`Columnar` is an equivalent of a dataframe.
|
`Columnar` is an equivalent of a dataframe.
|
||||||
It maps `column_key` to `Column`.
|
It maps `column_key` to `Column`.
|
||||||
|
|
||||||
A `Column<T>` associates a `RowId` (u32) to any
|
A `Column<T>` asssociates a `RowId` (u32) to any
|
||||||
number of values.
|
number of values.
|
||||||
|
|
||||||
This is made possible by wrapping a `ColumnIndex` and a `ColumnValue` object.
|
This is made possible by wrapping a `ColumnIndex` and a `ColumnValue` object.
|
||||||
|
|||||||
@@ -89,6 +89,13 @@ fn main() {
|
|||||||
black_box(sum);
|
black_box(sum);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
group.register("first_block_fetch", |column| {
|
||||||
|
let mut block: Vec<Option<u64>> = vec![None; 64];
|
||||||
|
let fetch_docids = (0..64).collect::<Vec<_>>();
|
||||||
|
column.first_vals(&fetch_docids, &mut block);
|
||||||
|
black_box(block[0]);
|
||||||
|
});
|
||||||
|
|
||||||
group.register("first_block_single_calls", |column| {
|
group.register("first_block_single_calls", |column| {
|
||||||
let mut block: Vec<Option<u64>> = vec![None; 64];
|
let mut block: Vec<Option<u64>> = vec![None; 64];
|
||||||
let fetch_docids = (0..64).collect::<Vec<_>>();
|
let fetch_docids = (0..64).collect::<Vec<_>>();
|
||||||
|
|||||||
@@ -29,20 +29,12 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn fetch_block_with_missing(
|
pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
|
||||||
&mut self,
|
|
||||||
docs: &[u32],
|
|
||||||
accessor: &Column<T>,
|
|
||||||
missing: Option<T>,
|
|
||||||
) {
|
|
||||||
self.fetch_block(docs, accessor);
|
self.fetch_block(docs, accessor);
|
||||||
// no missing values
|
// no missing values
|
||||||
if accessor.index.get_cardinality().is_full() {
|
if accessor.index.get_cardinality().is_full() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
let Some(missing) = missing else {
|
|
||||||
return;
|
|
||||||
};
|
|
||||||
|
|
||||||
// We can compare docid_cache length with docs to find missing docs
|
// We can compare docid_cache length with docs to find missing docs
|
||||||
// For multi value columns we can't rely on the length and always need to scan
|
// For multi value columns we can't rely on the length and always need to scan
|
||||||
|
|||||||
@@ -85,8 +85,8 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn first(&self, doc_id: DocId) -> Option<T> {
|
pub fn first(&self, row_id: RowId) -> Option<T> {
|
||||||
self.values_for_doc(doc_id).next()
|
self.values_for_doc(row_id).next()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Load the first value for each docid in the provided slice.
|
/// Load the first value for each docid in the provided slice.
|
||||||
@@ -131,8 +131,6 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
|
|||||||
self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids)
|
self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get an iterator over the values for the provided docid.
|
|
||||||
#[inline]
|
|
||||||
pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
|
pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
|
||||||
self.index
|
self.index
|
||||||
.value_row_ids(doc_id)
|
.value_row_ids(doc_id)
|
||||||
@@ -160,6 +158,15 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
|
|||||||
.select_batch_in_place(selected_docid_range.start, doc_ids);
|
.select_batch_in_place(selected_docid_range.start, doc_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fills the output vector with the (possibly multiple values that are associated_with
|
||||||
|
/// `row_id`.
|
||||||
|
///
|
||||||
|
/// This method clears the `output` vector.
|
||||||
|
pub fn fill_vals(&self, row_id: RowId, output: &mut Vec<T>) {
|
||||||
|
output.clear();
|
||||||
|
output.extend(self.values_for_doc(row_id));
|
||||||
|
}
|
||||||
|
|
||||||
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
|
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
|
||||||
Arc::new(FirstValueWithDefault {
|
Arc::new(FirstValueWithDefault {
|
||||||
column: self,
|
column: self,
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
use std::net::Ipv6Addr;
|
use std::net::Ipv6Addr;
|
||||||
|
|
||||||
/// Monotonic maps a value to u128 value space
|
/// Montonic maps a value to u128 value space
|
||||||
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
|
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
|
||||||
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync {
|
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync {
|
||||||
/// Converts a value to u128.
|
/// Converts a value to u128.
|
||||||
|
|||||||
@@ -41,6 +41,12 @@ fn transform_range_before_linear_transformation(
|
|||||||
if range.is_empty() {
|
if range.is_empty() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
if stats.min_value > *range.end() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
if stats.max_value < *range.start() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
let shifted_range =
|
let shifted_range =
|
||||||
range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value);
|
range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value);
|
||||||
let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd);
|
let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd);
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use crate::column_values::ColumnValues;
|
|||||||
const MID_POINT: u64 = (1u64 << 32) - 1u64;
|
const MID_POINT: u64 = (1u64 << 32) - 1u64;
|
||||||
|
|
||||||
/// `Line` describes a line function `y: ax + b` using integer
|
/// `Line` describes a line function `y: ax + b` using integer
|
||||||
/// arithmetic.
|
/// arithmetics.
|
||||||
///
|
///
|
||||||
/// The slope is in fact a decimal split into a 32 bit integer value,
|
/// The slope is in fact a decimal split into a 32 bit integer value,
|
||||||
/// and a 32-bit decimal value.
|
/// and a 32-bit decimal value.
|
||||||
@@ -94,7 +94,7 @@ impl Line {
|
|||||||
// `(i, ys[])`.
|
// `(i, ys[])`.
|
||||||
//
|
//
|
||||||
// The best intercept therefore has the form
|
// The best intercept therefore has the form
|
||||||
// `y[i] - line.eval(i)` (using wrapping arithmetic).
|
// `y[i] - line.eval(i)` (using wrapping arithmetics).
|
||||||
// In other words, the best intercept is one of the `y - Line::eval(ys[i])`
|
// In other words, the best intercept is one of the `y - Line::eval(ys[i])`
|
||||||
// and our task is just to pick the one that minimizes our error.
|
// and our task is just to pick the one that minimizes our error.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
|
|||||||
) -> io::Result<()>;
|
) -> io::Result<()>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A column codec describes a column serialization format.
|
/// A column codec describes a colunm serialization format.
|
||||||
pub trait ColumnCodec<T: PartialOrd = u64> {
|
pub trait ColumnCodec<T: PartialOrd = u64> {
|
||||||
/// Specialized `ColumnValues` type.
|
/// Specialized `ColumnValues` type.
|
||||||
type ColumnValues: ColumnValues<T> + 'static;
|
type ColumnValues: ColumnValues<T> + 'static;
|
||||||
|
|||||||
@@ -3,8 +3,7 @@ use std::sync::Arc;
|
|||||||
use std::{fmt, io};
|
use std::{fmt, io};
|
||||||
|
|
||||||
use common::file_slice::FileSlice;
|
use common::file_slice::FileSlice;
|
||||||
use common::{ByteCount, DateTime, OwnedBytes};
|
use common::{ByteCount, DateTime, HasLen, OwnedBytes};
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
|
|
||||||
use crate::column::{BytesColumn, Column, StrColumn};
|
use crate::column::{BytesColumn, Column, StrColumn};
|
||||||
use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
|
use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
|
||||||
@@ -318,89 +317,10 @@ impl DynamicColumnHandle {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn num_bytes(&self) -> ByteCount {
|
pub fn num_bytes(&self) -> ByteCount {
|
||||||
self.file_slice.num_bytes()
|
self.file_slice.len().into()
|
||||||
}
|
|
||||||
|
|
||||||
/// Legacy helper returning the column space usage.
|
|
||||||
pub fn column_and_dictionary_num_bytes(&self) -> io::Result<ColumnSpaceUsage> {
|
|
||||||
self.space_usage()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the space usage of the column, optionally broken down by dictionary and column
|
|
||||||
/// values.
|
|
||||||
///
|
|
||||||
/// For dictionary encoded columns (strings and bytes), this splits the total footprint into
|
|
||||||
/// the dictionary and the remaining column data (including index and values).
|
|
||||||
/// For all other column types, the dictionary size is `None` and the column size
|
|
||||||
/// equals the total bytes.
|
|
||||||
pub fn space_usage(&self) -> io::Result<ColumnSpaceUsage> {
|
|
||||||
let total_num_bytes = self.num_bytes();
|
|
||||||
let dynamic_column = self.open()?;
|
|
||||||
let dictionary_num_bytes = match &dynamic_column {
|
|
||||||
DynamicColumn::Bytes(bytes_column) => bytes_column.dictionary().num_bytes(),
|
|
||||||
DynamicColumn::Str(str_column) => str_column.dictionary().num_bytes(),
|
|
||||||
_ => {
|
|
||||||
return Ok(ColumnSpaceUsage::new(self.num_bytes(), None));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
assert!(dictionary_num_bytes <= total_num_bytes);
|
|
||||||
let column_num_bytes =
|
|
||||||
ByteCount::from(total_num_bytes.get_bytes() - dictionary_num_bytes.get_bytes());
|
|
||||||
Ok(ColumnSpaceUsage::new(
|
|
||||||
column_num_bytes,
|
|
||||||
Some(dictionary_num_bytes),
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn column_type(&self) -> ColumnType {
|
pub fn column_type(&self) -> ColumnType {
|
||||||
self.column_type
|
self.column_type
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Represents space usage of a column.
|
|
||||||
///
|
|
||||||
/// `column_num_bytes` tracks the column payload (index, values and footer).
|
|
||||||
/// For dictionary encoded columns, `dictionary_num_bytes` captures the dictionary footprint.
|
|
||||||
/// [`ColumnSpaceUsage::total_num_bytes`] returns the sum of both parts.
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
|
||||||
pub struct ColumnSpaceUsage {
|
|
||||||
column_num_bytes: ByteCount,
|
|
||||||
dictionary_num_bytes: Option<ByteCount>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ColumnSpaceUsage {
|
|
||||||
pub(crate) fn new(
|
|
||||||
column_num_bytes: ByteCount,
|
|
||||||
dictionary_num_bytes: Option<ByteCount>,
|
|
||||||
) -> Self {
|
|
||||||
ColumnSpaceUsage {
|
|
||||||
column_num_bytes,
|
|
||||||
dictionary_num_bytes,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn column_num_bytes(&self) -> ByteCount {
|
|
||||||
self.column_num_bytes
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
|
|
||||||
self.dictionary_num_bytes
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn total_num_bytes(&self) -> ByteCount {
|
|
||||||
self.column_num_bytes + self.dictionary_num_bytes.unwrap_or_default()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Merge two space usage values by summing their components.
|
|
||||||
pub fn merge(&self, other: &ColumnSpaceUsage) -> ColumnSpaceUsage {
|
|
||||||
let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) {
|
|
||||||
(Some(lhs), Some(rhs)) => Some(lhs + rhs),
|
|
||||||
(Some(val), None) | (None, Some(val)) => Some(val),
|
|
||||||
(None, None) => None,
|
|
||||||
};
|
|
||||||
ColumnSpaceUsage {
|
|
||||||
column_num_bytes: self.column_num_bytes + other.column_num_bytes,
|
|
||||||
dictionary_num_bytes,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ pub use columnar::{
|
|||||||
use sstable::VoidSSTable;
|
use sstable::VoidSSTable;
|
||||||
pub use value::{NumericalType, NumericalValue};
|
pub use value::{NumericalType, NumericalValue};
|
||||||
|
|
||||||
pub use self::dynamic_column::{ColumnSpaceUsage, DynamicColumn, DynamicColumnHandle};
|
pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
|
||||||
|
|
||||||
pub type RowId = u32;
|
pub type RowId = u32;
|
||||||
pub type DocId = u32;
|
pub type DocId = u32;
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ fn test_dataframe_writer_bool() {
|
|||||||
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
|
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
|
||||||
panic!();
|
panic!();
|
||||||
};
|
};
|
||||||
let vals: Vec<Option<bool>> = (0..5).map(|doc_id| bool_col.first(doc_id)).collect();
|
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
|
||||||
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
|
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -108,7 +108,7 @@ fn test_dataframe_writer_ip_addr() {
|
|||||||
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
|
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
|
||||||
panic!();
|
panic!();
|
||||||
};
|
};
|
||||||
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|doc_id| ip_col.first(doc_id)).collect();
|
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&vals,
|
&vals,
|
||||||
&[
|
&[
|
||||||
@@ -169,7 +169,7 @@ fn test_dictionary_encoded_str() {
|
|||||||
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
|
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
|
||||||
panic!();
|
panic!();
|
||||||
};
|
};
|
||||||
let index: Vec<Option<u64>> = (0..5).map(|doc_id| str_col.ords().first(doc_id)).collect();
|
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
|
||||||
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
|
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
|
||||||
assert_eq!(str_col.num_rows(), 5);
|
assert_eq!(str_col.num_rows(), 5);
|
||||||
let mut term_buffer = String::new();
|
let mut term_buffer = String::new();
|
||||||
@@ -204,7 +204,7 @@ fn test_dictionary_encoded_bytes() {
|
|||||||
panic!();
|
panic!();
|
||||||
};
|
};
|
||||||
let index: Vec<Option<u64>> = (0..5)
|
let index: Vec<Option<u64>> = (0..5)
|
||||||
.map(|doc_id| bytes_col.ords().first(doc_id))
|
.map(|row_id| bytes_col.ords().first(row_id))
|
||||||
.collect();
|
.collect();
|
||||||
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
|
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
|
||||||
assert_eq!(bytes_col.num_rows(), 5);
|
assert_eq!(bytes_col.num_rows(), 5);
|
||||||
|
|||||||
@@ -181,14 +181,6 @@ pub struct BitSet {
|
|||||||
len: u64,
|
len: u64,
|
||||||
max_value: u32,
|
max_value: u32,
|
||||||
}
|
}
|
||||||
impl std::fmt::Debug for BitSet {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
f.debug_struct("BitSet")
|
|
||||||
.field("len", &self.len)
|
|
||||||
.field("max_value", &self.max_value)
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn num_buckets(max_val: u32) -> u32 {
|
fn num_buckets(max_val: u32) -> u32 {
|
||||||
max_val.div_ceil(64u32)
|
max_val.div_ceil(64u32)
|
||||||
|
|||||||
@@ -28,9 +28,7 @@ impl BinarySerializable for VIntU128 {
|
|||||||
writer.write_all(&buffer)
|
writer.write_all(&buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::unbuffered_bytes)]
|
|
||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
#[allow(clippy::unbuffered_bytes)]
|
|
||||||
let mut bytes = reader.bytes();
|
let mut bytes = reader.bytes();
|
||||||
let mut result = 0u128;
|
let mut result = 0u128;
|
||||||
let mut shift = 0u64;
|
let mut shift = 0u64;
|
||||||
@@ -197,9 +195,7 @@ impl BinarySerializable for VInt {
|
|||||||
writer.write_all(&buffer[0..num_bytes])
|
writer.write_all(&buffer[0..num_bytes])
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::unbuffered_bytes)]
|
|
||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
#[allow(clippy::unbuffered_bytes)]
|
|
||||||
let mut bytes = reader.bytes();
|
let mut bytes = reader.bytes();
|
||||||
let mut result = 0u64;
|
let mut result = 0u64;
|
||||||
let mut shift = 0u64;
|
let mut shift = 0u64;
|
||||||
|
|||||||
@@ -208,7 +208,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
// is the role of the `TopDocs` collector.
|
// is the role of the `TopDocs` collector.
|
||||||
|
|
||||||
// We can now perform our query.
|
// We can now perform our query.
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||||
|
|
||||||
// The actual documents still need to be
|
// The actual documents still need to be
|
||||||
// retrieved from Tantivy's store.
|
// retrieved from Tantivy's store.
|
||||||
@@ -226,7 +226,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
let query = query_parser.parse_query("title:sea^20 body:whale^70")?;
|
let query = query_parser.parse_query("title:sea^20 body:whale^70")?;
|
||||||
|
|
||||||
let (_score, doc_address) = searcher
|
let (_score, doc_address) = searcher
|
||||||
.search(&query, &TopDocs::with_limit(1).order_by_score())?
|
.search(&query, &TopDocs::with_limit(1))?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.next()
|
.next()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|||||||
@@ -100,7 +100,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
// here we want to get a hit on the 'ken' in Frankenstein
|
// here we want to get a hit on the 'ken' in Frankenstein
|
||||||
let query = query_parser.parse_query("ken")?;
|
let query = query_parser.parse_query("ken")?;
|
||||||
|
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||||
|
|
||||||
for (_, doc_address) in top_docs {
|
for (_, doc_address) in top_docs {
|
||||||
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
|
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
|
||||||
|
|||||||
@@ -50,14 +50,14 @@ fn main() -> tantivy::Result<()> {
|
|||||||
{
|
{
|
||||||
// Simple exact search on the date
|
// Simple exact search on the date
|
||||||
let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?;
|
let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
|
||||||
assert_eq!(count_docs.len(), 1);
|
assert_eq!(count_docs.len(), 1);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Range query on the date field
|
// Range query on the date field
|
||||||
let query = query_parser
|
let query = query_parser
|
||||||
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
|
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
|
||||||
assert_eq!(count_docs.len(), 1);
|
assert_eq!(count_docs.len(), 1);
|
||||||
for (_score, doc_address) in count_docs {
|
for (_score, doc_address) in count_docs {
|
||||||
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
|
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ fn extract_doc_given_isbn(
|
|||||||
// The second argument is here to tell we don't care about decoding positions,
|
// The second argument is here to tell we don't care about decoding positions,
|
||||||
// or term frequencies.
|
// or term frequencies.
|
||||||
let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
|
let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
|
||||||
let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1).order_by_score())?;
|
let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;
|
||||||
|
|
||||||
if let Some((_score, doc_address)) = top_docs.first() {
|
if let Some((_score, doc_address)) = top_docs.first() {
|
||||||
let doc = searcher.doc(*doc_address)?;
|
let doc = searcher.doc(*doc_address)?;
|
||||||
|
|||||||
@@ -145,7 +145,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
let query = FuzzyTermQuery::new(term, 2, true);
|
let query = FuzzyTermQuery::new(term, 2, true);
|
||||||
|
|
||||||
let (top_docs, count) = searcher
|
let (top_docs, count) = searcher
|
||||||
.search(&query, &(TopDocs::with_limit(5).order_by_score(), Count))
|
.search(&query, &(TopDocs::with_limit(5), Count))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(count, 3);
|
assert_eq!(count, 3);
|
||||||
assert_eq!(top_docs.len(), 3);
|
assert_eq!(top_docs.len(), 3);
|
||||||
|
|||||||
@@ -69,25 +69,25 @@ fn main() -> tantivy::Result<()> {
|
|||||||
{
|
{
|
||||||
// Inclusive range queries
|
// Inclusive range queries
|
||||||
let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?;
|
let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
|
||||||
assert_eq!(count_docs.len(), 1);
|
assert_eq!(count_docs.len(), 1);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Exclusive range queries
|
// Exclusive range queries
|
||||||
let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?;
|
let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(count_docs.len(), 0);
|
assert_eq!(count_docs.len(), 0);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Find docs with IP addresses smaller equal 192.168.1.100
|
// Find docs with IP addresses smaller equal 192.168.1.100
|
||||||
let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?;
|
let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(count_docs.len(), 2);
|
assert_eq!(count_docs.len(), 2);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Find docs with IP addresses smaller than 192.168.1.100
|
// Find docs with IP addresses smaller than 192.168.1.100
|
||||||
let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?;
|
let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(count_docs.len(), 2);
|
assert_eq!(count_docs.len(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -59,12 +59,12 @@ fn main() -> tantivy::Result<()> {
|
|||||||
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
|
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("target:submit-button")?;
|
let query = query_parser.parse_query("target:submit-button")?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(count_docs.len(), 2);
|
assert_eq!(count_docs.len(), 2);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("target:submit")?;
|
let query = query_parser.parse_query("target:submit")?;
|
||||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(count_docs.len(), 2);
|
assert_eq!(count_docs.len(), 2);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@@ -74,33 +74,33 @@ fn main() -> tantivy::Result<()> {
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("click AND cart.product_id:133")?;
|
let query = query_parser.parse_query("click AND cart.product_id:133")?;
|
||||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(hits.len(), 1);
|
assert_eq!(hits.len(), 1);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// The sub-fields in the json field marked as default field still need to be explicitly
|
// The sub-fields in the json field marked as default field still need to be explicitly
|
||||||
// addressed
|
// addressed
|
||||||
let query = query_parser.parse_query("click AND 133")?;
|
let query = query_parser.parse_query("click AND 133")?;
|
||||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(hits.len(), 0);
|
assert_eq!(hits.len(), 0);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Default json fields are ignored if they collide with the schema
|
// Default json fields are ignored if they collide with the schema
|
||||||
let query = query_parser.parse_query("event_type:holiday-sale")?;
|
let query = query_parser.parse_query("event_type:holiday-sale")?;
|
||||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(hits.len(), 0);
|
assert_eq!(hits.len(), 0);
|
||||||
}
|
}
|
||||||
// # Query via full attribute path
|
// # Query via full attribute path
|
||||||
{
|
{
|
||||||
// This only searches in our schema's `event_type` field
|
// This only searches in our schema's `event_type` field
|
||||||
let query = query_parser.parse_query("event_type:click")?;
|
let query = query_parser.parse_query("event_type:click")?;
|
||||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(hits.len(), 2);
|
assert_eq!(hits.len(), 2);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Default json fields can still be accessed by full path
|
// Default json fields can still be accessed by full path
|
||||||
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
|
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
|
||||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?;
|
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||||
assert_eq!(hits.len(), 1);
|
assert_eq!(hits.len(), 1);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ fn main() -> Result<()> {
|
|||||||
// but not "in the Gulf Stream".
|
// but not "in the Gulf Stream".
|
||||||
let query = query_parser.parse_query("\"in the su\"*")?;
|
let query = query_parser.parse_query("\"in the su\"*")?;
|
||||||
|
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||||
let mut titles = top_docs
|
let mut titles = top_docs
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(_score, doc_address)| {
|
.map(|(_score, doc_address)| {
|
||||||
|
|||||||
@@ -107,8 +107,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
IndexRecordOption::Basic,
|
IndexRecordOption::Basic,
|
||||||
);
|
);
|
||||||
|
|
||||||
let (top_docs, count) =
|
let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
|
||||||
searcher.search(&query, &(TopDocs::with_limit(2).order_by_score(), Count))?;
|
|
||||||
|
|
||||||
assert_eq!(count, 2);
|
assert_eq!(count, 2);
|
||||||
|
|
||||||
@@ -129,8 +128,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
IndexRecordOption::Basic,
|
IndexRecordOption::Basic,
|
||||||
);
|
);
|
||||||
|
|
||||||
let (_top_docs, count) =
|
let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
|
||||||
searcher.search(&query, &(TopDocs::with_limit(2).order_by_score(), Count))?;
|
|
||||||
|
|
||||||
assert_eq!(count, 0);
|
assert_eq!(count, 0);
|
||||||
|
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||||
let query = query_parser.parse_query("sycamore spring")?;
|
let query = query_parser.parse_query("sycamore spring")?;
|
||||||
|
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||||
|
|
||||||
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
|
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
|
||||||
|
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
// stop words are applied on the query as well.
|
// stop words are applied on the query as well.
|
||||||
// The following will be equivalent to `title:frankenstein`
|
// The following will be equivalent to `title:frankenstein`
|
||||||
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
|
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||||
|
|
||||||
for (score, doc_address) in top_docs {
|
for (score, doc_address) in top_docs {
|
||||||
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
|
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
|
||||||
|
|||||||
@@ -164,7 +164,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
move |doc_id: DocId| Reverse(price[doc_id as usize])
|
move |doc_id: DocId| Reverse(price[doc_id as usize])
|
||||||
};
|
};
|
||||||
|
|
||||||
let most_expensive_first = TopDocs::with_limit(10).order_by(score_by_price);
|
let most_expensive_first = TopDocs::with_limit(10).custom_score(score_by_price);
|
||||||
|
|
||||||
let hits = searcher.search(&query, &most_expensive_first)?;
|
let hits = searcher.search(&query, &most_expensive_first)?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|||||||
@@ -758,17 +758,7 @@ fn negate(expr: UserInputAst) -> UserInputAst {
|
|||||||
fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
|
fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
|
||||||
alt((
|
alt((
|
||||||
delimited(char('('), ast, char(')')),
|
delimited(char('('), ast, char(')')),
|
||||||
map(
|
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
|
||||||
terminated(
|
|
||||||
char('*'),
|
|
||||||
peek(alt((
|
|
||||||
value((), multispace1),
|
|
||||||
value((), char(')')),
|
|
||||||
value((), eof),
|
|
||||||
))),
|
|
||||||
),
|
|
||||||
|_| UserInputAst::from(UserInputLeaf::All),
|
|
||||||
),
|
|
||||||
map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate),
|
map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate),
|
||||||
literal,
|
literal,
|
||||||
))(inp)
|
))(inp)
|
||||||
@@ -789,17 +779,7 @@ fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
value(
|
value((), char('*')),
|
||||||
(),
|
|
||||||
terminated(
|
|
||||||
char('*'),
|
|
||||||
peek(alt((
|
|
||||||
value((), multispace1),
|
|
||||||
value((), char(')')),
|
|
||||||
value((), eof),
|
|
||||||
))),
|
|
||||||
),
|
|
||||||
),
|
|
||||||
map(nothing, |_| {
|
map(nothing, |_| {
|
||||||
(Some(UserInputAst::from(UserInputLeaf::All)), Vec::new())
|
(Some(UserInputAst::from(UserInputLeaf::All)), Vec::new())
|
||||||
}),
|
}),
|
||||||
@@ -1691,21 +1671,6 @@ mod test {
|
|||||||
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)");
|
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)");
|
||||||
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
|
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
|
||||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
|
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
|
||||||
|
|
||||||
// Phrase prefixed with *
|
|
||||||
test_parse_query_to_ast_helper("foo:(*A)", "\"foo\":*A");
|
|
||||||
test_parse_query_to_ast_helper("*A", "*A");
|
|
||||||
test_parse_query_to_ast_helper("(*A)", "*A");
|
|
||||||
test_parse_query_to_ast_helper("foo:(A OR B)", "(?\"foo\":A ?\"foo\":B)");
|
|
||||||
test_parse_query_to_ast_helper("foo:(A* OR B*)", "(?\"foo\":A* ?\"foo\":B*)");
|
|
||||||
test_parse_query_to_ast_helper("foo:(*A OR *B)", "(?\"foo\":*A ?\"foo\":*B)");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_parse_query_all() {
|
|
||||||
test_parse_query_to_ast_helper("*", "*");
|
|
||||||
test_parse_query_to_ast_helper("(*)", "*");
|
|
||||||
test_parse_query_to_ast_helper("(* )", "*");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -16,16 +16,15 @@ use crate::index::SegmentReader;
|
|||||||
/// That way we can use it the same way as if it would come from the fastfield.
|
/// That way we can use it the same way as if it would come from the fastfield.
|
||||||
pub(crate) fn get_missing_val_as_u64_lenient(
|
pub(crate) fn get_missing_val_as_u64_lenient(
|
||||||
column_type: ColumnType,
|
column_type: ColumnType,
|
||||||
column_max_value: u64,
|
|
||||||
missing: &Key,
|
missing: &Key,
|
||||||
field_name: &str,
|
field_name: &str,
|
||||||
) -> crate::Result<Option<u64>> {
|
) -> crate::Result<Option<u64>> {
|
||||||
let missing_val = match missing {
|
let missing_val = match missing {
|
||||||
Key::Str(_) if column_type == ColumnType::Str => Some(column_max_value + 1),
|
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
// Allow fallback to number on text fields
|
// Allow fallback to number on text fields
|
||||||
Key::F64(_) if column_type == ColumnType::Str => Some(column_max_value + 1),
|
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
Key::U64(_) if column_type == ColumnType::Str => Some(column_max_value + 1),
|
Key::U64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
Key::I64(_) if column_type == ColumnType::Str => Some(column_max_value + 1),
|
Key::I64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
Key::F64(val) if column_type.numerical_type().is_some() => {
|
Key::F64(val) if column_type.numerical_type().is_some() => {
|
||||||
f64_to_fastfield_u64(*val, &column_type)
|
f64_to_fastfield_u64(*val, &column_type)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use columnar::{Column, ColumnBlockAccessor, ColumnType, StrColumn};
|
use columnar::{Column, ColumnType, StrColumn};
|
||||||
use common::BitSet;
|
use common::BitSet;
|
||||||
use rustc_hash::FxHashSet;
|
use rustc_hash::FxHashSet;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
@@ -10,16 +10,16 @@ use crate::aggregation::accessor_helpers::{
|
|||||||
};
|
};
|
||||||
use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||||
use crate::aggregation::bucket::{
|
use crate::aggregation::bucket::{
|
||||||
build_segment_filter_collector, build_segment_range_collector, FilterAggReqData,
|
build_segment_aggregation_collector, FilterAggReqData, HistogramAggReqData, HistogramBounds,
|
||||||
HistogramAggReqData, HistogramBounds, IncludeExcludeParam, MissingTermAggReqData,
|
IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData, SegmentFilterCollector,
|
||||||
RangeAggReqData, SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
|
SegmentHistogramCollector, SegmentRangeCollector, TermMissingAgg, TermsAggReqData,
|
||||||
TermsAggregationInternal,
|
TermsAggregation, TermsAggregationInternal,
|
||||||
};
|
};
|
||||||
use crate::aggregation::metric::{
|
use crate::aggregation::metric::{
|
||||||
build_segment_stats_collector, AverageAggregation, CardinalityAggReqData,
|
AverageAggregation, CardinalityAggReqData, CardinalityAggregationReq, CountAggregation,
|
||||||
CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation, MaxAggregation,
|
ExtendedStatsAggregation, MaxAggregation, MetricAggReqData, MinAggregation,
|
||||||
MetricAggReqData, MinAggregation, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
|
SegmentCardinalityCollector, SegmentExtendedStatsCollector, SegmentPercentilesCollector,
|
||||||
SegmentPercentilesCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData,
|
SegmentStatsCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData,
|
||||||
TopHitsSegmentCollector,
|
TopHitsSegmentCollector,
|
||||||
};
|
};
|
||||||
use crate::aggregation::segment_agg_result::{
|
use crate::aggregation::segment_agg_result::{
|
||||||
@@ -35,7 +35,6 @@ pub struct AggregationsSegmentCtx {
|
|||||||
/// Request data for each aggregation type.
|
/// Request data for each aggregation type.
|
||||||
pub per_request: PerRequestAggSegCtx,
|
pub per_request: PerRequestAggSegCtx,
|
||||||
pub context: AggContextParams,
|
pub context: AggContextParams,
|
||||||
pub column_block_accessor: ColumnBlockAccessor<u64>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AggregationsSegmentCtx {
|
impl AggregationsSegmentCtx {
|
||||||
@@ -108,14 +107,21 @@ impl AggregationsSegmentCtx {
|
|||||||
.as_deref()
|
.as_deref()
|
||||||
.expect("range_req_data slot is empty (taken)")
|
.expect("range_req_data slot is empty (taken)")
|
||||||
}
|
}
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn get_filter_req_data(&self, idx: usize) -> &FilterAggReqData {
|
||||||
|
self.per_request.filter_req_data[idx]
|
||||||
|
.as_deref()
|
||||||
|
.expect("filter_req_data slot is empty (taken)")
|
||||||
|
}
|
||||||
|
|
||||||
// ---------- mutable getters ----------
|
// ---------- mutable getters ----------
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
|
pub(crate) fn get_term_req_data_mut(&mut self, idx: usize) -> &mut TermsAggReqData {
|
||||||
&mut self.per_request.stats_metric_req_data[idx]
|
self.per_request.term_req_data[idx]
|
||||||
|
.as_deref_mut()
|
||||||
|
.expect("term_req_data slot is empty (taken)")
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn get_cardinality_req_data_mut(
|
pub(crate) fn get_cardinality_req_data_mut(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -123,7 +129,10 @@ impl AggregationsSegmentCtx {
|
|||||||
) -> &mut CardinalityAggReqData {
|
) -> &mut CardinalityAggReqData {
|
||||||
&mut self.per_request.cardinality_req_data[idx]
|
&mut self.per_request.cardinality_req_data[idx]
|
||||||
}
|
}
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
|
||||||
|
&mut self.per_request.stats_metric_req_data[idx]
|
||||||
|
}
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData {
|
pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData {
|
||||||
self.per_request.histogram_req_data[idx]
|
self.per_request.histogram_req_data[idx]
|
||||||
@@ -133,6 +142,21 @@ impl AggregationsSegmentCtx {
|
|||||||
|
|
||||||
// ---------- take / put (terms, histogram, range) ----------
|
// ---------- take / put (terms, histogram, range) ----------
|
||||||
|
|
||||||
|
/// Move out the boxed Terms request at `idx`, leaving `None`.
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn take_term_req_data(&mut self, idx: usize) -> Box<TermsAggReqData> {
|
||||||
|
self.per_request.term_req_data[idx]
|
||||||
|
.take()
|
||||||
|
.expect("term_req_data slot is empty (taken)")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Put back a Terms request into an empty slot at `idx`.
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn put_back_term_req_data(&mut self, idx: usize, value: Box<TermsAggReqData>) {
|
||||||
|
debug_assert!(self.per_request.term_req_data[idx].is_none());
|
||||||
|
self.per_request.term_req_data[idx] = Some(value);
|
||||||
|
}
|
||||||
|
|
||||||
/// Move out the boxed Histogram request at `idx`, leaving `None`.
|
/// Move out the boxed Histogram request at `idx`, leaving `None`.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> {
|
pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> {
|
||||||
@@ -296,7 +320,6 @@ impl PerRequestAggSegCtx {
|
|||||||
|
|
||||||
/// Convert the aggregation tree into a serializable struct representation.
|
/// Convert the aggregation tree into a serializable struct representation.
|
||||||
/// Each node contains: { name, kind, children }.
|
/// Each node contains: { name, kind, children }.
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn get_view_tree(&self) -> Vec<AggTreeViewNode> {
|
pub fn get_view_tree(&self) -> Vec<AggTreeViewNode> {
|
||||||
fn node_to_view(node: &AggRefNode, pr: &PerRequestAggSegCtx) -> AggTreeViewNode {
|
fn node_to_view(node: &AggRefNode, pr: &PerRequestAggSegCtx) -> AggTreeViewNode {
|
||||||
let mut children: Vec<AggTreeViewNode> =
|
let mut children: Vec<AggTreeViewNode> =
|
||||||
@@ -322,19 +345,12 @@ impl PerRequestAggSegCtx {
|
|||||||
pub(crate) fn build_segment_agg_collectors_root(
|
pub(crate) fn build_segment_agg_collectors_root(
|
||||||
req: &mut AggregationsSegmentCtx,
|
req: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||||
build_segment_agg_collectors_generic(req, &req.per_request.agg_tree.clone())
|
build_segment_agg_collectors(req, &req.per_request.agg_tree.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn build_segment_agg_collectors(
|
pub(crate) fn build_segment_agg_collectors(
|
||||||
req: &mut AggregationsSegmentCtx,
|
req: &mut AggregationsSegmentCtx,
|
||||||
nodes: &[AggRefNode],
|
nodes: &[AggRefNode],
|
||||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
|
||||||
build_segment_agg_collectors_generic(req, nodes)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_segment_agg_collectors_generic(
|
|
||||||
req: &mut AggregationsSegmentCtx,
|
|
||||||
nodes: &[AggRefNode],
|
|
||||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||||
let mut collectors = Vec::new();
|
let mut collectors = Vec::new();
|
||||||
for node in nodes.iter() {
|
for node in nodes.iter() {
|
||||||
@@ -357,7 +373,7 @@ pub(crate) fn build_segment_agg_collector(
|
|||||||
node: &AggRefNode,
|
node: &AggRefNode,
|
||||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
||||||
match node.kind {
|
match node.kind {
|
||||||
AggKind::Terms => crate::aggregation::bucket::build_segment_term_collector(req, node),
|
AggKind::Terms => build_segment_aggregation_collector(req, node),
|
||||||
AggKind::MissingTerm => {
|
AggKind::MissingTerm => {
|
||||||
let req_data = &mut req.per_request.missing_term_req_data[node.idx_in_req_data];
|
let req_data = &mut req.per_request.missing_term_req_data[node.idx_in_req_data];
|
||||||
if req_data.accessors.is_empty() {
|
if req_data.accessors.is_empty() {
|
||||||
@@ -372,8 +388,6 @@ pub(crate) fn build_segment_agg_collector(
|
|||||||
Ok(Box::new(SegmentCardinalityCollector::from_req(
|
Ok(Box::new(SegmentCardinalityCollector::from_req(
|
||||||
req_data.column_type,
|
req_data.column_type,
|
||||||
node.idx_in_req_data,
|
node.idx_in_req_data,
|
||||||
req_data.accessor.clone(),
|
|
||||||
req_data.missing_value_for_accessor,
|
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
AggKind::StatsKind(stats_type) => {
|
AggKind::StatsKind(stats_type) => {
|
||||||
@@ -384,21 +398,20 @@ pub(crate) fn build_segment_agg_collector(
|
|||||||
| StatsType::Count
|
| StatsType::Count
|
||||||
| StatsType::Max
|
| StatsType::Max
|
||||||
| StatsType::Min
|
| StatsType::Min
|
||||||
| StatsType::Stats => build_segment_stats_collector(req_data),
|
| StatsType::Stats => Ok(Box::new(SegmentStatsCollector::from_req(
|
||||||
StatsType::ExtendedStats(sigma) => Ok(Box::new(
|
node.idx_in_req_data,
|
||||||
SegmentExtendedStatsCollector::from_req(req_data, sigma),
|
))),
|
||||||
)),
|
StatsType::ExtendedStats(sigma) => {
|
||||||
StatsType::Percentiles => {
|
Ok(Box::new(SegmentExtendedStatsCollector::from_req(
|
||||||
let req_data = req.get_metric_req_data_mut(node.idx_in_req_data);
|
req_data.field_type,
|
||||||
Ok(Box::new(
|
sigma,
|
||||||
SegmentPercentilesCollector::from_req_and_validate(
|
node.idx_in_req_data,
|
||||||
req_data.field_type,
|
req_data.missing,
|
||||||
req_data.missing_u64,
|
)))
|
||||||
req_data.accessor.clone(),
|
|
||||||
node.idx_in_req_data,
|
|
||||||
),
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
StatsType::Percentiles => Ok(Box::new(
|
||||||
|
SegmentPercentilesCollector::from_req_and_validate(node.idx_in_req_data)?,
|
||||||
|
)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
AggKind::TopHits => {
|
AggKind::TopHits => {
|
||||||
@@ -415,8 +428,12 @@ pub(crate) fn build_segment_agg_collector(
|
|||||||
AggKind::DateHistogram => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
|
AggKind::DateHistogram => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
|
||||||
req, node,
|
req, node,
|
||||||
)?)),
|
)?)),
|
||||||
AggKind::Range => Ok(build_segment_range_collector(req, node)?),
|
AggKind::Range => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
|
||||||
AggKind::Filter => build_segment_filter_collector(req, node),
|
req, node,
|
||||||
|
)?)),
|
||||||
|
AggKind::Filter => Ok(Box::new(SegmentFilterCollector::from_req_and_validate(
|
||||||
|
req, node,
|
||||||
|
)?)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -476,11 +493,10 @@ pub(crate) fn build_aggregations_data_from_req(
|
|||||||
let mut data = AggregationsSegmentCtx {
|
let mut data = AggregationsSegmentCtx {
|
||||||
per_request: Default::default(),
|
per_request: Default::default(),
|
||||||
context,
|
context,
|
||||||
column_block_accessor: ColumnBlockAccessor::default(),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
for (name, agg) in aggs.iter() {
|
for (name, agg) in aggs.iter() {
|
||||||
let nodes = build_nodes(name, agg, reader, segment_ordinal, &mut data, true)?;
|
let nodes = build_nodes(name, agg, reader, segment_ordinal, &mut data)?;
|
||||||
data.per_request.agg_tree.extend(nodes);
|
data.per_request.agg_tree.extend(nodes);
|
||||||
}
|
}
|
||||||
Ok(data)
|
Ok(data)
|
||||||
@@ -492,7 +508,6 @@ fn build_nodes(
|
|||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
data: &mut AggregationsSegmentCtx,
|
data: &mut AggregationsSegmentCtx,
|
||||||
is_top_level: bool,
|
|
||||||
) -> crate::Result<Vec<AggRefNode>> {
|
) -> crate::Result<Vec<AggRefNode>> {
|
||||||
use AggregationVariants::*;
|
use AggregationVariants::*;
|
||||||
match &req.agg {
|
match &req.agg {
|
||||||
@@ -505,9 +520,9 @@ fn build_nodes(
|
|||||||
let idx_in_req_data = data.push_range_req_data(RangeAggReqData {
|
let idx_in_req_data = data.push_range_req_data(RangeAggReqData {
|
||||||
accessor,
|
accessor,
|
||||||
field_type,
|
field_type,
|
||||||
|
column_block_accessor: Default::default(),
|
||||||
name: agg_name.to_string(),
|
name: agg_name.to_string(),
|
||||||
req: range_req.clone(),
|
req: range_req.clone(),
|
||||||
is_top_level,
|
|
||||||
});
|
});
|
||||||
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
||||||
Ok(vec![AggRefNode {
|
Ok(vec![AggRefNode {
|
||||||
@@ -525,7 +540,9 @@ fn build_nodes(
|
|||||||
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
|
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
|
||||||
accessor,
|
accessor,
|
||||||
field_type,
|
field_type,
|
||||||
|
column_block_accessor: Default::default(),
|
||||||
name: agg_name.to_string(),
|
name: agg_name.to_string(),
|
||||||
|
sub_aggregation_blueprint: None,
|
||||||
req: histo_req.clone(),
|
req: histo_req.clone(),
|
||||||
is_date_histogram: false,
|
is_date_histogram: false,
|
||||||
bounds: HistogramBounds {
|
bounds: HistogramBounds {
|
||||||
@@ -550,7 +567,9 @@ fn build_nodes(
|
|||||||
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
|
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
|
||||||
accessor,
|
accessor,
|
||||||
field_type,
|
field_type,
|
||||||
|
column_block_accessor: Default::default(),
|
||||||
name: agg_name.to_string(),
|
name: agg_name.to_string(),
|
||||||
|
sub_aggregation_blueprint: None,
|
||||||
req: histo_req,
|
req: histo_req,
|
||||||
is_date_histogram: true,
|
is_date_histogram: true,
|
||||||
bounds: HistogramBounds {
|
bounds: HistogramBounds {
|
||||||
@@ -575,7 +594,6 @@ fn build_nodes(
|
|||||||
data,
|
data,
|
||||||
&req.sub_aggregation,
|
&req.sub_aggregation,
|
||||||
TermsOrCardinalityRequest::Terms(terms_req.clone()),
|
TermsOrCardinalityRequest::Terms(terms_req.clone()),
|
||||||
is_top_level,
|
|
||||||
),
|
),
|
||||||
Cardinality(card_req) => build_terms_or_cardinality_nodes(
|
Cardinality(card_req) => build_terms_or_cardinality_nodes(
|
||||||
agg_name,
|
agg_name,
|
||||||
@@ -586,7 +604,6 @@ fn build_nodes(
|
|||||||
data,
|
data,
|
||||||
&req.sub_aggregation,
|
&req.sub_aggregation,
|
||||||
TermsOrCardinalityRequest::Cardinality(card_req.clone()),
|
TermsOrCardinalityRequest::Cardinality(card_req.clone()),
|
||||||
is_top_level,
|
|
||||||
),
|
),
|
||||||
Average(AverageAggregation { field, missing, .. })
|
Average(AverageAggregation { field, missing, .. })
|
||||||
| Max(MaxAggregation { field, missing, .. })
|
| Max(MaxAggregation { field, missing, .. })
|
||||||
@@ -630,6 +647,7 @@ fn build_nodes(
|
|||||||
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
|
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
|
||||||
accessor,
|
accessor,
|
||||||
field_type,
|
field_type,
|
||||||
|
column_block_accessor: Default::default(),
|
||||||
name: agg_name.to_string(),
|
name: agg_name.to_string(),
|
||||||
collecting_for,
|
collecting_for,
|
||||||
missing: *missing,
|
missing: *missing,
|
||||||
@@ -657,6 +675,7 @@ fn build_nodes(
|
|||||||
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
|
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
|
||||||
accessor,
|
accessor,
|
||||||
field_type,
|
field_type,
|
||||||
|
column_block_accessor: Default::default(),
|
||||||
name: agg_name.to_string(),
|
name: agg_name.to_string(),
|
||||||
collecting_for: StatsType::Percentiles,
|
collecting_for: StatsType::Percentiles,
|
||||||
missing: percentiles_req.missing,
|
missing: percentiles_req.missing,
|
||||||
@@ -713,7 +732,7 @@ fn build_nodes(
|
|||||||
// Build the query and evaluator upfront
|
// Build the query and evaluator upfront
|
||||||
let schema = reader.schema();
|
let schema = reader.schema();
|
||||||
let tokenizers = &data.context.tokenizers;
|
let tokenizers = &data.context.tokenizers;
|
||||||
let query = filter_req.parse_query(schema, tokenizers)?;
|
let query = filter_req.parse_query(&schema, tokenizers)?;
|
||||||
let evaluator = crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
let evaluator = crate::aggregation::bucket::DocumentQueryEvaluator::new(
|
||||||
query,
|
query,
|
||||||
schema.clone(),
|
schema.clone(),
|
||||||
@@ -731,7 +750,6 @@ fn build_nodes(
|
|||||||
segment_reader: reader.clone(),
|
segment_reader: reader.clone(),
|
||||||
evaluator,
|
evaluator,
|
||||||
matching_docs_buffer,
|
matching_docs_buffer,
|
||||||
is_top_level,
|
|
||||||
});
|
});
|
||||||
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
|
||||||
Ok(vec![AggRefNode {
|
Ok(vec![AggRefNode {
|
||||||
@@ -751,14 +769,7 @@ fn build_children(
|
|||||||
) -> crate::Result<Vec<AggRefNode>> {
|
) -> crate::Result<Vec<AggRefNode>> {
|
||||||
let mut children = Vec::new();
|
let mut children = Vec::new();
|
||||||
for (name, agg) in aggs.iter() {
|
for (name, agg) in aggs.iter() {
|
||||||
children.extend(build_nodes(
|
children.extend(build_nodes(name, agg, reader, segment_ordinal, data)?);
|
||||||
name,
|
|
||||||
agg,
|
|
||||||
reader,
|
|
||||||
segment_ordinal,
|
|
||||||
data,
|
|
||||||
false,
|
|
||||||
)?);
|
|
||||||
}
|
}
|
||||||
Ok(children)
|
Ok(children)
|
||||||
}
|
}
|
||||||
@@ -822,7 +833,6 @@ fn build_terms_or_cardinality_nodes(
|
|||||||
data: &mut AggregationsSegmentCtx,
|
data: &mut AggregationsSegmentCtx,
|
||||||
sub_aggs: &Aggregations,
|
sub_aggs: &Aggregations,
|
||||||
req: TermsOrCardinalityRequest,
|
req: TermsOrCardinalityRequest,
|
||||||
is_top_level: bool,
|
|
||||||
) -> crate::Result<Vec<AggRefNode>> {
|
) -> crate::Result<Vec<AggRefNode>> {
|
||||||
let mut nodes = Vec::new();
|
let mut nodes = Vec::new();
|
||||||
|
|
||||||
@@ -874,12 +884,12 @@ fn build_terms_or_cardinality_nodes(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add one node per accessor
|
// Add one node per accessor to mirror previous behavior and allow per-type missing handling.
|
||||||
for (accessor, column_type) in column_and_types {
|
for (accessor, column_type) in column_and_types {
|
||||||
let missing_value_for_accessor = if use_special_missing_agg {
|
let missing_value_for_accessor = if use_special_missing_agg {
|
||||||
None
|
None
|
||||||
} else if let Some(m) = missing.as_ref() {
|
} else if let Some(m) = missing.as_ref() {
|
||||||
get_missing_val_as_u64_lenient(column_type, accessor.max_value(), m, field_name)?
|
get_missing_val_as_u64_lenient(column_type, m, field_name)?
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
@@ -905,11 +915,13 @@ fn build_terms_or_cardinality_nodes(
|
|||||||
column_type,
|
column_type,
|
||||||
str_dict_column: str_dict_column.clone(),
|
str_dict_column: str_dict_column.clone(),
|
||||||
missing_value_for_accessor,
|
missing_value_for_accessor,
|
||||||
|
column_block_accessor: Default::default(),
|
||||||
name: agg_name.to_string(),
|
name: agg_name.to_string(),
|
||||||
req: TermsAggregationInternal::from_req(req),
|
req: TermsAggregationInternal::from_req(req),
|
||||||
|
// Will be filled later when building collectors
|
||||||
|
sub_aggregation_blueprint: None,
|
||||||
sug_aggregations: sub_aggs.clone(),
|
sug_aggregations: sub_aggs.clone(),
|
||||||
allowed_term_ids,
|
allowed_term_ids,
|
||||||
is_top_level,
|
|
||||||
});
|
});
|
||||||
(idx_in_req_data, AggKind::Terms)
|
(idx_in_req_data, AggKind::Terms)
|
||||||
}
|
}
|
||||||
@@ -919,6 +931,7 @@ fn build_terms_or_cardinality_nodes(
|
|||||||
column_type,
|
column_type,
|
||||||
str_dict_column: str_dict_column.clone(),
|
str_dict_column: str_dict_column.clone(),
|
||||||
missing_value_for_accessor,
|
missing_value_for_accessor,
|
||||||
|
column_block_accessor: Default::default(),
|
||||||
name: agg_name.to_string(),
|
name: agg_name.to_string(),
|
||||||
req: req.clone(),
|
req: req.clone(),
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ pub struct AggregationLimitsGuard {
|
|||||||
/// Allocated memory with this guard.
|
/// Allocated memory with this guard.
|
||||||
allocated_with_the_guard: u64,
|
allocated_with_the_guard: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for AggregationLimitsGuard {
|
impl Clone for AggregationLimitsGuard {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
Self {
|
Self {
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ use super::{AggregationError, Key};
|
|||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
/// The final aggregation result.
|
/// The final aggegation result.
|
||||||
pub struct AggregationResults(pub FxHashMap<String, AggregationResult>);
|
pub struct AggregationResults(pub FxHashMap<String, AggregationResult>);
|
||||||
|
|
||||||
impl AggregationResults {
|
impl AggregationResults {
|
||||||
|
|||||||
@@ -2,441 +2,15 @@ use serde_json::Value;
|
|||||||
|
|
||||||
use crate::aggregation::agg_req::{Aggregation, Aggregations};
|
use crate::aggregation::agg_req::{Aggregation, Aggregations};
|
||||||
use crate::aggregation::agg_result::AggregationResults;
|
use crate::aggregation::agg_result::AggregationResults;
|
||||||
|
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
|
||||||
use crate::aggregation::collector::AggregationCollector;
|
use crate::aggregation::collector::AggregationCollector;
|
||||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
||||||
use crate::aggregation::DistributedAggregationCollector;
|
use crate::aggregation::DistributedAggregationCollector;
|
||||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
|
||||||
use crate::query::{AllQuery, TermQuery};
|
use crate::query::{AllQuery, TermQuery};
|
||||||
use crate::schema::{IndexRecordOption, Schema, FAST};
|
use crate::schema::{IndexRecordOption, Schema, FAST};
|
||||||
use crate::{Index, IndexWriter, Term};
|
use crate::{Index, IndexWriter, Term};
|
||||||
|
|
||||||
// The following tests ensure that each bucket aggregation type correctly functions as a
|
|
||||||
// sub-aggregation of another bucket aggregation in two scenarios:
|
|
||||||
// 1) The parent has more buckets than the child sub-aggregation
|
|
||||||
// 2) The child sub-aggregation has more buckets than the parent
|
|
||||||
//
|
|
||||||
// These scenarios exercise the bucket id mapping and sub-aggregation routing logic.
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_terms_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
|
|
||||||
let index = get_test_index_2_segments(false)?;
|
|
||||||
|
|
||||||
// Case A: parent has more buckets than child
|
|
||||||
// Parent: range with 4 buckets
|
|
||||||
// Child: terms on text -> 2 buckets
|
|
||||||
let agg_parent_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_range": {
|
|
||||||
"range": {
|
|
||||||
"field": "score",
|
|
||||||
"ranges": [
|
|
||||||
{"to": 3.0},
|
|
||||||
{"from": 3.0, "to": 7.0},
|
|
||||||
{"from": 7.0, "to": 20.0},
|
|
||||||
{"from": 20.0}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"aggs": {
|
|
||||||
"child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
|
|
||||||
// Exact expected structure and counts
|
|
||||||
assert_eq!(
|
|
||||||
res["parent_range"]["buckets"],
|
|
||||||
json!([
|
|
||||||
{
|
|
||||||
"key": "*-3",
|
|
||||||
"doc_count": 1,
|
|
||||||
"to": 3.0,
|
|
||||||
"child_terms": {
|
|
||||||
"buckets": [
|
|
||||||
{"doc_count": 1, "key": "cool"}
|
|
||||||
],
|
|
||||||
"sum_other_doc_count": 0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"key": "3-7",
|
|
||||||
"doc_count": 3,
|
|
||||||
"from": 3.0,
|
|
||||||
"to": 7.0,
|
|
||||||
"child_terms": {
|
|
||||||
"buckets": [
|
|
||||||
{"doc_count": 2, "key": "cool"},
|
|
||||||
{"doc_count": 1, "key": "nohit"}
|
|
||||||
],
|
|
||||||
"sum_other_doc_count": 0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"key": "7-20",
|
|
||||||
"doc_count": 3,
|
|
||||||
"from": 7.0,
|
|
||||||
"to": 20.0,
|
|
||||||
"child_terms": {
|
|
||||||
"buckets": [
|
|
||||||
{"doc_count": 3, "key": "cool"}
|
|
||||||
],
|
|
||||||
"sum_other_doc_count": 0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"key": "20-*",
|
|
||||||
"doc_count": 2,
|
|
||||||
"from": 20.0,
|
|
||||||
"child_terms": {
|
|
||||||
"buckets": [
|
|
||||||
{"doc_count": 1, "key": "cool"},
|
|
||||||
{"doc_count": 1, "key": "nohit"}
|
|
||||||
],
|
|
||||||
"sum_other_doc_count": 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
])
|
|
||||||
);
|
|
||||||
|
|
||||||
// Case B: child has more buckets than parent
|
|
||||||
// Parent: histogram on score with large interval -> 1 bucket
|
|
||||||
// Child: terms on text -> 2 buckets (cool/nohit)
|
|
||||||
let agg_child_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_hist": {
|
|
||||||
"histogram": {"field": "score", "interval": 100.0},
|
|
||||||
"aggs": {
|
|
||||||
"child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
|
|
||||||
assert_eq!(
|
|
||||||
res["parent_hist"],
|
|
||||||
json!({
|
|
||||||
"buckets": [
|
|
||||||
{
|
|
||||||
"key": 0.0,
|
|
||||||
"doc_count": 9,
|
|
||||||
"child_terms": {
|
|
||||||
"buckets": [
|
|
||||||
{"doc_count": 7, "key": "cool"},
|
|
||||||
{"doc_count": 2, "key": "nohit"}
|
|
||||||
],
|
|
||||||
"sum_other_doc_count": 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
})
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_range_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
|
|
||||||
let index = get_test_index_2_segments(false)?;
|
|
||||||
|
|
||||||
// Case A: parent has more buckets than child
|
|
||||||
// Parent: range with 5 buckets
|
|
||||||
// Child: coarse range with 3 buckets
|
|
||||||
let agg_parent_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_range": {
|
|
||||||
"range": {
|
|
||||||
"field": "score",
|
|
||||||
"ranges": [
|
|
||||||
{"to": 3.0},
|
|
||||||
{"from": 3.0, "to": 7.0},
|
|
||||||
{"from": 7.0, "to": 11.0},
|
|
||||||
{"from": 11.0, "to": 20.0},
|
|
||||||
{"from": 20.0}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"aggs": {
|
|
||||||
"child_range": {
|
|
||||||
"range": {
|
|
||||||
"field": "score",
|
|
||||||
"ranges": [
|
|
||||||
{"to": 3.0},
|
|
||||||
{"from": 3.0, "to": 20.0}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
|
|
||||||
assert_eq!(
|
|
||||||
res["parent_range"]["buckets"],
|
|
||||||
json!([
|
|
||||||
{"key": "*-3", "doc_count": 1, "to": 3.0,
|
|
||||||
"child_range": {"buckets": [
|
|
||||||
{"key": "*-3", "doc_count": 1, "to": 3.0},
|
|
||||||
{"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0},
|
|
||||||
{"key": "20-*", "doc_count": 0, "from": 20.0}
|
|
||||||
]}
|
|
||||||
},
|
|
||||||
{"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0,
|
|
||||||
"child_range": {"buckets": [
|
|
||||||
{"key": "*-3", "doc_count": 0, "to": 3.0},
|
|
||||||
{"key": "3-20", "doc_count": 3, "from": 3.0, "to": 20.0},
|
|
||||||
{"key": "20-*", "doc_count": 0, "from": 20.0}
|
|
||||||
]}
|
|
||||||
},
|
|
||||||
{"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0,
|
|
||||||
"child_range": {"buckets": [
|
|
||||||
{"key": "*-3", "doc_count": 0, "to": 3.0},
|
|
||||||
{"key": "3-20", "doc_count": 1, "from": 3.0, "to": 20.0},
|
|
||||||
{"key": "20-*", "doc_count": 0, "from": 20.0}
|
|
||||||
]}
|
|
||||||
},
|
|
||||||
{"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0,
|
|
||||||
"child_range": {"buckets": [
|
|
||||||
{"key": "*-3", "doc_count": 0, "to": 3.0},
|
|
||||||
{"key": "3-20", "doc_count": 2, "from": 3.0, "to": 20.0},
|
|
||||||
{"key": "20-*", "doc_count": 0, "from": 20.0}
|
|
||||||
]}
|
|
||||||
},
|
|
||||||
{"key": "20-*", "doc_count": 2, "from": 20.0,
|
|
||||||
"child_range": {"buckets": [
|
|
||||||
{"key": "*-3", "doc_count": 0, "to": 3.0},
|
|
||||||
{"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0},
|
|
||||||
{"key": "20-*", "doc_count": 2, "from": 20.0}
|
|
||||||
]}
|
|
||||||
}
|
|
||||||
])
|
|
||||||
);
|
|
||||||
|
|
||||||
// Case B: child has more buckets than parent
|
|
||||||
// Parent: terms on text (2 buckets)
|
|
||||||
// Child: range with 4 buckets
|
|
||||||
let agg_child_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_terms": {
|
|
||||||
"terms": {"field": "text"},
|
|
||||||
"aggs": {
|
|
||||||
"child_range": {
|
|
||||||
"range": {
|
|
||||||
"field": "score",
|
|
||||||
"ranges": [
|
|
||||||
{"to": 3.0},
|
|
||||||
{"from": 3.0, "to": 7.0},
|
|
||||||
{"from": 7.0, "to": 20.0}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
res["parent_terms"],
|
|
||||||
json!({
|
|
||||||
"buckets": [
|
|
||||||
{
|
|
||||||
"key": "cool",
|
|
||||||
"doc_count": 7,
|
|
||||||
"child_range": {
|
|
||||||
"buckets": [
|
|
||||||
{"key": "*-3", "doc_count": 1, "to": 3.0},
|
|
||||||
{"key": "3-7", "doc_count": 2, "from": 3.0, "to": 7.0},
|
|
||||||
{"key": "7-20", "doc_count": 3, "from": 7.0, "to": 20.0},
|
|
||||||
{"key": "20-*", "doc_count": 1, "from": 20.0}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"key": "nohit",
|
|
||||||
"doc_count": 2,
|
|
||||||
"child_range": {
|
|
||||||
"buckets": [
|
|
||||||
{"key": "*-3", "doc_count": 0, "to": 3.0},
|
|
||||||
{"key": "3-7", "doc_count": 1, "from": 3.0, "to": 7.0},
|
|
||||||
{"key": "7-20", "doc_count": 0, "from": 7.0, "to": 20.0},
|
|
||||||
{"key": "20-*", "doc_count": 1, "from": 20.0}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"doc_count_error_upper_bound": 0,
|
|
||||||
"sum_other_doc_count": 0
|
|
||||||
})
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
|
|
||||||
let index = get_test_index_2_segments(false)?;
|
|
||||||
|
|
||||||
// Case A: parent has more buckets than child
|
|
||||||
// Parent: range with several ranges
|
|
||||||
// Child: histogram with large interval (single bucket per parent)
|
|
||||||
let agg_parent_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_range": {
|
|
||||||
"range": {
|
|
||||||
"field": "score",
|
|
||||||
"ranges": [
|
|
||||||
{"to": 3.0},
|
|
||||||
{"from": 3.0, "to": 7.0},
|
|
||||||
{"from": 7.0, "to": 11.0},
|
|
||||||
{"from": 11.0, "to": 20.0},
|
|
||||||
{"from": 20.0}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"aggs": {
|
|
||||||
"child_hist": {"histogram": {"field": "score", "interval": 100.0}}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
|
|
||||||
assert_eq!(
|
|
||||||
res["parent_range"]["buckets"],
|
|
||||||
json!([
|
|
||||||
{"key": "*-3", "doc_count": 1, "to": 3.0,
|
|
||||||
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]}
|
|
||||||
},
|
|
||||||
{"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0,
|
|
||||||
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 3} ]}
|
|
||||||
},
|
|
||||||
{"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0,
|
|
||||||
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]}
|
|
||||||
},
|
|
||||||
{"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0,
|
|
||||||
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]}
|
|
||||||
},
|
|
||||||
{"key": "20-*", "doc_count": 2, "from": 20.0,
|
|
||||||
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]}
|
|
||||||
}
|
|
||||||
])
|
|
||||||
);
|
|
||||||
|
|
||||||
// Case B: child has more buckets than parent
|
|
||||||
// Parent: terms on text -> 2 buckets
|
|
||||||
// Child: histogram with small interval -> multiple buckets including empties
|
|
||||||
let agg_child_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_terms": {
|
|
||||||
"terms": {"field": "text"},
|
|
||||||
"aggs": {
|
|
||||||
"child_hist": {"histogram": {"field": "score", "interval": 10.0}}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
|
|
||||||
assert_eq!(
|
|
||||||
res["parent_terms"],
|
|
||||||
json!({
|
|
||||||
"buckets": [
|
|
||||||
{
|
|
||||||
"key": "cool",
|
|
||||||
"doc_count": 7,
|
|
||||||
"child_hist": {
|
|
||||||
"buckets": [
|
|
||||||
{"key": 0.0, "doc_count": 4},
|
|
||||||
{"key": 10.0, "doc_count": 2},
|
|
||||||
{"key": 20.0, "doc_count": 0},
|
|
||||||
{"key": 30.0, "doc_count": 0},
|
|
||||||
{"key": 40.0, "doc_count": 1}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"key": "nohit",
|
|
||||||
"doc_count": 2,
|
|
||||||
"child_hist": {
|
|
||||||
"buckets": [
|
|
||||||
{"key": 0.0, "doc_count": 1},
|
|
||||||
{"key": 10.0, "doc_count": 0},
|
|
||||||
{"key": 20.0, "doc_count": 0},
|
|
||||||
{"key": 30.0, "doc_count": 0},
|
|
||||||
{"key": 40.0, "doc_count": 1}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"doc_count_error_upper_bound": 0,
|
|
||||||
"sum_other_doc_count": 0
|
|
||||||
})
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_date_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
|
|
||||||
let index = get_test_index_2_segments(false)?;
|
|
||||||
|
|
||||||
// Case A: parent has more buckets than child
|
|
||||||
// Parent: range with several buckets
|
|
||||||
// Child: date_histogram with 30d -> single bucket per parent
|
|
||||||
let agg_parent_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_range": {
|
|
||||||
"range": {
|
|
||||||
"field": "score",
|
|
||||||
"ranges": [
|
|
||||||
{"to": 3.0},
|
|
||||||
{"from": 3.0, "to": 7.0},
|
|
||||||
{"from": 7.0, "to": 11.0},
|
|
||||||
{"from": 11.0, "to": 20.0},
|
|
||||||
{"from": 20.0}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"aggs": {
|
|
||||||
"child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "30d"}}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
|
|
||||||
let buckets = res["parent_range"]["buckets"].as_array().unwrap();
|
|
||||||
// Verify each parent bucket has exactly one child date bucket with matching doc_count
|
|
||||||
for bucket in buckets {
|
|
||||||
let parent_count = bucket["doc_count"].as_u64().unwrap();
|
|
||||||
let child_buckets = bucket["child_date_hist"]["buckets"].as_array().unwrap();
|
|
||||||
assert_eq!(child_buckets.len(), 1);
|
|
||||||
assert_eq!(child_buckets[0]["doc_count"], parent_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Case B: child has more buckets than parent
|
|
||||||
// Parent: terms on text (2 buckets)
|
|
||||||
// Child: date_histogram with 1d -> multiple buckets
|
|
||||||
let agg_child_more: Aggregations = serde_json::from_value(json!({
|
|
||||||
"parent_terms": {
|
|
||||||
"terms": {"field": "text"},
|
|
||||||
"aggs": {
|
|
||||||
"child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "1d"}}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
|
|
||||||
let buckets = res["parent_terms"]["buckets"].as_array().unwrap();
|
|
||||||
|
|
||||||
// cool bucket
|
|
||||||
assert_eq!(buckets[0]["key"], "cool");
|
|
||||||
let cool_buckets = buckets[0]["child_date_hist"]["buckets"].as_array().unwrap();
|
|
||||||
assert_eq!(cool_buckets.len(), 3);
|
|
||||||
assert_eq!(cool_buckets[0]["doc_count"], 1); // day 0
|
|
||||||
assert_eq!(cool_buckets[1]["doc_count"], 4); // day 1
|
|
||||||
assert_eq!(cool_buckets[2]["doc_count"], 2); // day 2
|
|
||||||
|
|
||||||
// nohit bucket
|
|
||||||
assert_eq!(buckets[1]["key"], "nohit");
|
|
||||||
let nohit_buckets = buckets[1]["child_date_hist"]["buckets"].as_array().unwrap();
|
|
||||||
assert_eq!(nohit_buckets.len(), 2);
|
|
||||||
assert_eq!(nohit_buckets[0]["doc_count"], 1); // day 1
|
|
||||||
assert_eq!(nohit_buckets[1]["doc_count"], 1); // day 2
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_avg_req(field_name: &str) -> Aggregation {
|
fn get_avg_req(field_name: &str) -> Aggregation {
|
||||||
serde_json::from_value(json!({
|
serde_json::from_value(json!({
|
||||||
"avg": {
|
"avg": {
|
||||||
@@ -451,10 +25,6 @@ fn get_collector(agg_req: Aggregations) -> AggregationCollector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// *** EVERY BUCKET-TYPE SHOULD BE TESTED HERE ***
|
// *** EVERY BUCKET-TYPE SHOULD BE TESTED HERE ***
|
||||||
// Note: The flushng part of these tests are outdated, since the buffering change after converting
|
|
||||||
// the collection into one collector per request instead of per bucket.
|
|
||||||
//
|
|
||||||
// However they are useful as they test a complex aggregation requests.
|
|
||||||
fn test_aggregation_flushing(
|
fn test_aggregation_flushing(
|
||||||
merge_segments: bool,
|
merge_segments: bool,
|
||||||
use_distributed_collector: bool,
|
use_distributed_collector: bool,
|
||||||
@@ -467,9 +37,8 @@ fn test_aggregation_flushing(
|
|||||||
|
|
||||||
let reader = index.reader()?;
|
let reader = index.reader()?;
|
||||||
|
|
||||||
assert_eq!(COLLECT_BLOCK_BUFFER_LEN, 64);
|
assert_eq!(DOC_BLOCK_SIZE, 64);
|
||||||
// In the tree we cache documents of COLLECT_BLOCK_BUFFER_LEN before passing them down as one
|
// In the tree we cache Documents of DOC_BLOCK_SIZE, before passing them down as one block.
|
||||||
// block.
|
|
||||||
//
|
//
|
||||||
// Build a request so that on the first level we have one full cache, which is then flushed.
|
// Build a request so that on the first level we have one full cache, which is then flushed.
|
||||||
// The same cache should have some residue docs at the end, which are flushed (Range 0-70)
|
// The same cache should have some residue docs at the end, which are flushed (Range 0-70)
|
||||||
|
|||||||
@@ -6,14 +6,10 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
|||||||
use crate::aggregation::agg_data::{
|
use crate::aggregation::agg_data::{
|
||||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||||
};
|
};
|
||||||
use crate::aggregation::cached_sub_aggs::{
|
|
||||||
CachedSubAggs, HighCardSubAggCache, LowCardSubAggCache, SubAggCache,
|
|
||||||
};
|
|
||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||||
};
|
};
|
||||||
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
|
use crate::aggregation::segment_agg_result::{CollectorClone, SegmentAggregationCollector};
|
||||||
use crate::aggregation::BucketId;
|
|
||||||
use crate::docset::DocSet;
|
use crate::docset::DocSet;
|
||||||
use crate::query::{AllQuery, EnableScoring, Query, QueryParser};
|
use crate::query::{AllQuery, EnableScoring, Query, QueryParser};
|
||||||
use crate::schema::Schema;
|
use crate::schema::Schema;
|
||||||
@@ -36,7 +32,7 @@ use crate::{DocId, SegmentReader, TantivyError};
|
|||||||
///
|
///
|
||||||
/// # Implementation Requirements
|
/// # Implementation Requirements
|
||||||
///
|
///
|
||||||
/// Implementers must:
|
/// Implementors must:
|
||||||
/// 1. Derive `Debug`, `Clone`, `Serialize`, and `Deserialize`
|
/// 1. Derive `Debug`, `Clone`, `Serialize`, and `Deserialize`
|
||||||
/// 2. Use `#[typetag::serde]` attribute on the impl block
|
/// 2. Use `#[typetag::serde]` attribute on the impl block
|
||||||
/// 3. Implement `build_query()` to construct the query from schema/tokenizers
|
/// 3. Implement `build_query()` to construct the query from schema/tokenizers
|
||||||
@@ -408,18 +404,15 @@ pub struct FilterAggReqData {
|
|||||||
pub evaluator: DocumentQueryEvaluator,
|
pub evaluator: DocumentQueryEvaluator,
|
||||||
/// Reusable buffer for matching documents to minimize allocations during collection
|
/// Reusable buffer for matching documents to minimize allocations during collection
|
||||||
pub matching_docs_buffer: Vec<DocId>,
|
pub matching_docs_buffer: Vec<DocId>,
|
||||||
/// True if this filter aggregation is at the top level of the aggregation tree (not nested).
|
|
||||||
pub is_top_level: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FilterAggReqData {
|
impl FilterAggReqData {
|
||||||
pub(crate) fn get_memory_consumption(&self) -> usize {
|
pub(crate) fn get_memory_consumption(&self) -> usize {
|
||||||
// Estimate: name + segment reader reference + bitset + buffer capacity
|
// Estimate: name + segment reader reference + bitset + buffer capacity
|
||||||
self.name.len()
|
self.name.len()
|
||||||
+ std::mem::size_of::<SegmentReader>()
|
+ std::mem::size_of::<SegmentReader>()
|
||||||
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
|
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
|
||||||
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
|
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
|
||||||
+ std::mem::size_of::<bool>()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -496,24 +489,17 @@ impl Debug for DocumentQueryEvaluator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Copy)]
|
|
||||||
struct DocCount {
|
|
||||||
doc_count: u64,
|
|
||||||
bucket_id: BucketId,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Segment collector for filter aggregation
|
/// Segment collector for filter aggregation
|
||||||
pub struct SegmentFilterCollector<C: SubAggCache> {
|
pub struct SegmentFilterCollector {
|
||||||
/// Document counts per parent bucket
|
/// Document count in this bucket
|
||||||
parent_buckets: Vec<DocCount>,
|
doc_count: u64,
|
||||||
/// Sub-aggregation collectors
|
/// Sub-aggregation collectors
|
||||||
sub_aggregations: Option<CachedSubAggs<C>>,
|
sub_aggregations: Option<Box<dyn SegmentAggregationCollector>>,
|
||||||
bucket_id_provider: BucketIdProvider,
|
|
||||||
/// Accessor index for this filter aggregation (to access FilterAggReqData)
|
/// Accessor index for this filter aggregation (to access FilterAggReqData)
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<C: SubAggCache> SegmentFilterCollector<C> {
|
impl SegmentFilterCollector {
|
||||||
/// Create a new filter segment collector following the new agg_data pattern
|
/// Create a new filter segment collector following the new agg_data pattern
|
||||||
pub(crate) fn from_req_and_validate(
|
pub(crate) fn from_req_and_validate(
|
||||||
req: &mut AggregationsSegmentCtx,
|
req: &mut AggregationsSegmentCtx,
|
||||||
@@ -525,75 +511,47 @@ impl<C: SubAggCache> SegmentFilterCollector<C> {
|
|||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
let sub_agg_collector = sub_agg_collector.map(CachedSubAggs::new);
|
|
||||||
|
|
||||||
Ok(SegmentFilterCollector {
|
Ok(SegmentFilterCollector {
|
||||||
parent_buckets: Vec::new(),
|
doc_count: 0,
|
||||||
sub_aggregations: sub_agg_collector,
|
sub_aggregations: sub_agg_collector,
|
||||||
accessor_idx: node.idx_in_req_data,
|
accessor_idx: node.idx_in_req_data,
|
||||||
bucket_id_provider: BucketIdProvider::default(),
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn build_segment_filter_collector(
|
impl Debug for SegmentFilterCollector {
|
||||||
req: &mut AggregationsSegmentCtx,
|
|
||||||
node: &AggRefNode,
|
|
||||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
|
||||||
let is_top_level = req.per_request.filter_req_data[node.idx_in_req_data]
|
|
||||||
.as_ref()
|
|
||||||
.expect("filter_req_data slot is empty")
|
|
||||||
.is_top_level;
|
|
||||||
|
|
||||||
if is_top_level {
|
|
||||||
Ok(Box::new(
|
|
||||||
SegmentFilterCollector::<LowCardSubAggCache>::from_req_and_validate(req, node)?,
|
|
||||||
))
|
|
||||||
} else {
|
|
||||||
Ok(Box::new(
|
|
||||||
SegmentFilterCollector::<HighCardSubAggCache>::from_req_and_validate(req, node)?,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<C: SubAggCache> Debug for SegmentFilterCollector<C> {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
f.debug_struct("SegmentFilterCollector")
|
f.debug_struct("SegmentFilterCollector")
|
||||||
.field("buckets", &self.parent_buckets)
|
.field("doc_count", &self.doc_count)
|
||||||
.field("has_sub_aggs", &self.sub_aggregations.is_some())
|
.field("has_sub_aggs", &self.sub_aggregations.is_some())
|
||||||
.field("accessor_idx", &self.accessor_idx)
|
.field("accessor_idx", &self.accessor_idx)
|
||||||
.finish()
|
.finish()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
|
impl CollectorClone for SegmentFilterCollector {
|
||||||
|
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector> {
|
||||||
|
// For now, panic - this needs proper implementation with weight recreation
|
||||||
|
panic!("SegmentFilterCollector cloning not yet implemented - requires weight recreation")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentAggregationCollector for SegmentFilterCollector {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let mut sub_results = IntermediateAggregationResults::default();
|
let mut sub_results = IntermediateAggregationResults::default();
|
||||||
let bucket_opt = self.parent_buckets.get(parent_bucket_id as usize);
|
|
||||||
|
|
||||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
if let Some(sub_aggs) = self.sub_aggregations {
|
||||||
sub_aggs
|
sub_aggs.add_intermediate_aggregation_result(agg_data, &mut sub_results)?;
|
||||||
.get_sub_agg_collector()
|
|
||||||
.add_intermediate_aggregation_result(
|
|
||||||
agg_data,
|
|
||||||
&mut sub_results,
|
|
||||||
// Here we create a new bucket ID for sub-aggregations if the bucket doesn't
|
|
||||||
// exist, so that sub-aggregations can still produce results (e.g., zero doc
|
|
||||||
// count)
|
|
||||||
bucket_opt
|
|
||||||
.map(|bucket| bucket.bucket_id)
|
|
||||||
.unwrap_or(self.bucket_id_provider.next_bucket_id()),
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the filter bucket result
|
// Create the filter bucket result
|
||||||
let filter_bucket_result = IntermediateBucketResult::Filter {
|
let filter_bucket_result = IntermediateBucketResult::Filter {
|
||||||
doc_count: bucket_opt.map(|b| b.doc_count).unwrap_or(0),
|
doc_count: self.doc_count,
|
||||||
sub_aggregations: sub_results,
|
sub_aggregations: sub_results,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -612,17 +570,32 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect(
|
fn collect(&mut self, doc: DocId, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||||
|
// Access the evaluator from FilterAggReqData
|
||||||
|
let req_data = agg_data.get_filter_req_data(self.accessor_idx);
|
||||||
|
|
||||||
|
// O(1) BitSet lookup to check if document matches filter
|
||||||
|
if req_data.evaluator.matches_document(doc) {
|
||||||
|
self.doc_count += 1;
|
||||||
|
|
||||||
|
// If we have sub-aggregations, collect on them for this filtered document
|
||||||
|
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||||
|
sub_aggs.collect(doc, agg_data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
docs: &[DocId],
|
||||||
docs: &[crate::DocId],
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
if docs.is_empty() {
|
if docs.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut bucket = self.parent_buckets[parent_bucket_id as usize];
|
|
||||||
// Take the request data to avoid borrow checker issues with sub-aggregations
|
// Take the request data to avoid borrow checker issues with sub-aggregations
|
||||||
let mut req = agg_data.take_filter_req_data(self.accessor_idx);
|
let mut req = agg_data.take_filter_req_data(self.accessor_idx);
|
||||||
|
|
||||||
@@ -631,24 +604,18 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
|
|||||||
req.evaluator
|
req.evaluator
|
||||||
.filter_batch(docs, &mut req.matching_docs_buffer);
|
.filter_batch(docs, &mut req.matching_docs_buffer);
|
||||||
|
|
||||||
bucket.doc_count += req.matching_docs_buffer.len() as u64;
|
self.doc_count += req.matching_docs_buffer.len() as u64;
|
||||||
|
|
||||||
// Batch process sub-aggregations if we have matches
|
// Batch process sub-aggregations if we have matches
|
||||||
if !req.matching_docs_buffer.is_empty() {
|
if !req.matching_docs_buffer.is_empty() {
|
||||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
||||||
for &doc_id in &req.matching_docs_buffer {
|
// Use collect_block for better sub-aggregation performance
|
||||||
sub_aggs.push(bucket.bucket_id, doc_id);
|
sub_aggs.collect_block(&req.matching_docs_buffer, agg_data)?;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Put the request data back
|
// Put the request data back
|
||||||
agg_data.put_back_filter_req_data(self.accessor_idx, req);
|
agg_data.put_back_filter_req_data(self.accessor_idx, req);
|
||||||
if let Some(sub_aggs) = &mut self.sub_aggregations {
|
|
||||||
sub_aggs.check_flush_local(agg_data)?;
|
|
||||||
}
|
|
||||||
// put back bucket
|
|
||||||
self.parent_buckets[parent_bucket_id as usize] = bucket;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -659,21 +626,6 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
|
|||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
|
||||||
max_bucket: BucketId,
|
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
while self.parent_buckets.len() <= max_bucket as usize {
|
|
||||||
let bucket_id = self.bucket_id_provider.next_bucket_id();
|
|
||||||
self.parent_buckets.push(DocCount {
|
|
||||||
doc_count: 0,
|
|
||||||
bucket_id,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Intermediate result for filter aggregation
|
/// Intermediate result for filter aggregation
|
||||||
@@ -687,14 +639,16 @@ pub struct IntermediateFilterBucketResult {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
use serde_json::{json, Value};
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::aggregation::agg_req::Aggregations;
|
use crate::aggregation::agg_req::Aggregations;
|
||||||
use crate::aggregation::agg_result::AggregationResults;
|
use crate::aggregation::agg_result::AggregationResults;
|
||||||
use crate::aggregation::{AggContextParams, AggregationCollector};
|
use crate::aggregation::{AggContextParams, AggregationCollector};
|
||||||
use crate::query::{AllQuery, TermQuery};
|
use crate::query::{AllQuery, QueryParser, TermQuery};
|
||||||
use crate::schema::{IndexRecordOption, Schema, Term, FAST, INDEXED, TEXT};
|
use crate::schema::{IndexRecordOption, Schema, Term, FAST, INDEXED, STORED, TEXT};
|
||||||
use crate::{doc, Index, IndexWriter};
|
use crate::{doc, Index, IndexWriter};
|
||||||
|
|
||||||
// Test helper functions
|
// Test helper functions
|
||||||
@@ -775,13 +729,12 @@ mod tests {
|
|||||||
|
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut writer: IndexWriter = index.writer_for_tests()?;
|
let mut writer: IndexWriter = index.writer(50_000_000)?;
|
||||||
|
|
||||||
writer.add_document(doc!(
|
writer.add_document(doc!(
|
||||||
category => "electronics", brand => "apple",
|
category => "electronics", brand => "apple",
|
||||||
price => 999u64, rating => 4.5f64, in_stock => true
|
price => 999u64, rating => 4.5f64, in_stock => true
|
||||||
))?;
|
))?;
|
||||||
writer.commit()?;
|
|
||||||
writer.add_document(doc!(
|
writer.add_document(doc!(
|
||||||
category => "electronics", brand => "samsung",
|
category => "electronics", brand => "samsung",
|
||||||
price => 799u64, rating => 4.2f64, in_stock => true
|
price => 799u64, rating => 4.2f64, in_stock => true
|
||||||
@@ -985,7 +938,7 @@ mod tests {
|
|||||||
let index = create_standard_test_index()?;
|
let index = create_standard_test_index()?;
|
||||||
let reader = index.reader()?;
|
let reader = index.reader()?;
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
assert_eq!(searcher.segment_readers().len(), 2);
|
|
||||||
let agg = json!({
|
let agg = json!({
|
||||||
"premium_electronics": {
|
"premium_electronics": {
|
||||||
"filter": "category:electronics AND price:[800 TO *]",
|
"filter": "category:electronics AND price:[800 TO *]",
|
||||||
@@ -1567,9 +1520,9 @@ mod tests {
|
|||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
let agg = json!({
|
let agg = json!({
|
||||||
"test": {
|
"test": {
|
||||||
"filter": deserialized,
|
"filter": deserialized,
|
||||||
"aggs": { "count": { "value_count": { "field": "brand" } } }
|
"aggs": { "count": { "value_count": { "field": "brand" } } }
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
use columnar::{Column, ColumnType};
|
use columnar::{Column, ColumnBlockAccessor, ColumnType};
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tantivy_bitpacker::minmax;
|
use tantivy_bitpacker::minmax;
|
||||||
@@ -8,14 +8,14 @@ use tantivy_bitpacker::minmax;
|
|||||||
use crate::aggregation::agg_data::{
|
use crate::aggregation::agg_data::{
|
||||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||||
};
|
};
|
||||||
|
use crate::aggregation::agg_limits::MemoryConsumption;
|
||||||
use crate::aggregation::agg_req::Aggregations;
|
use crate::aggregation::agg_req::Aggregations;
|
||||||
use crate::aggregation::agg_result::BucketEntry;
|
use crate::aggregation::agg_result::BucketEntry;
|
||||||
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
|
|
||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||||
IntermediateHistogramBucketEntry,
|
IntermediateHistogramBucketEntry,
|
||||||
};
|
};
|
||||||
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
use crate::aggregation::*;
|
use crate::aggregation::*;
|
||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
@@ -26,8 +26,13 @@ pub struct HistogramAggReqData {
|
|||||||
pub accessor: Column<u64>,
|
pub accessor: Column<u64>,
|
||||||
/// The field type of the fast field.
|
/// The field type of the fast field.
|
||||||
pub field_type: ColumnType,
|
pub field_type: ColumnType,
|
||||||
|
/// The column block accessor to access the fast field values.
|
||||||
|
pub column_block_accessor: ColumnBlockAccessor<u64>,
|
||||||
/// The name of the aggregation.
|
/// The name of the aggregation.
|
||||||
pub name: String,
|
pub name: String,
|
||||||
|
/// The sub aggregation blueprint, used to create sub aggregations for each bucket.
|
||||||
|
/// Will be filled during initialization of the collector.
|
||||||
|
pub sub_aggregation_blueprint: Option<Box<dyn SegmentAggregationCollector>>,
|
||||||
/// The histogram aggregation request.
|
/// The histogram aggregation request.
|
||||||
pub req: HistogramAggregation,
|
pub req: HistogramAggregation,
|
||||||
/// True if this is a date_histogram aggregation.
|
/// True if this is a date_histogram aggregation.
|
||||||
@@ -252,24 +257,18 @@ impl HistogramBounds {
|
|||||||
pub(crate) struct SegmentHistogramBucketEntry {
|
pub(crate) struct SegmentHistogramBucketEntry {
|
||||||
pub key: f64,
|
pub key: f64,
|
||||||
pub doc_count: u64,
|
pub doc_count: u64,
|
||||||
pub bucket_id: BucketId,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentHistogramBucketEntry {
|
impl SegmentHistogramBucketEntry {
|
||||||
pub(crate) fn into_intermediate_bucket_entry(
|
pub(crate) fn into_intermediate_bucket_entry(
|
||||||
self,
|
self,
|
||||||
sub_aggregation: &mut Option<HighCardCachedSubAggs>,
|
sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
) -> crate::Result<IntermediateHistogramBucketEntry> {
|
) -> crate::Result<IntermediateHistogramBucketEntry> {
|
||||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||||
if let Some(sub_aggregation) = sub_aggregation {
|
if let Some(sub_aggregation) = sub_aggregation {
|
||||||
sub_aggregation
|
sub_aggregation
|
||||||
.get_sub_agg_collector()
|
.add_intermediate_aggregation_result(agg_data, &mut sub_aggregation_res)?;
|
||||||
.add_intermediate_aggregation_result(
|
|
||||||
agg_data,
|
|
||||||
&mut sub_aggregation_res,
|
|
||||||
self.bucket_id,
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
Ok(IntermediateHistogramBucketEntry {
|
Ok(IntermediateHistogramBucketEntry {
|
||||||
key: self.key,
|
key: self.key,
|
||||||
@@ -279,38 +278,27 @@ impl SegmentHistogramBucketEntry {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Default)]
|
|
||||||
struct HistogramBuckets {
|
|
||||||
pub buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||||
/// the correct datatype.
|
/// the correct datatype.
|
||||||
#[derive(Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct SegmentHistogramCollector {
|
pub struct SegmentHistogramCollector {
|
||||||
/// The buckets containing the aggregation data.
|
/// The buckets containing the aggregation data.
|
||||||
/// One Histogram bucket per parent bucket id.
|
buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
|
||||||
parent_buckets: Vec<HistogramBuckets>,
|
sub_aggregations: FxHashMap<i64, Box<dyn SegmentAggregationCollector>>,
|
||||||
sub_agg: Option<HighCardCachedSubAggs>,
|
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
bucket_id_provider: BucketIdProvider,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentAggregationCollector for SegmentHistogramCollector {
|
impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let name = agg_data
|
let name = agg_data
|
||||||
.get_histogram_req_data(self.accessor_idx)
|
.get_histogram_req_data(self.accessor_idx)
|
||||||
.name
|
.name
|
||||||
.clone();
|
.clone();
|
||||||
// TODO: avoid prepare_max_bucket here and handle empty buckets.
|
let bucket = self.into_intermediate_bucket_result(agg_data)?;
|
||||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
|
||||||
let histogram = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
|
||||||
let bucket = self.add_intermediate_bucket_result(agg_data, histogram)?;
|
|
||||||
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
|
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -319,40 +307,44 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
docs: &[crate::DocId],
|
docs: &[crate::DocId],
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let req = agg_data.take_histogram_req_data(self.accessor_idx);
|
let mut req = agg_data.take_histogram_req_data(self.accessor_idx);
|
||||||
let mem_pre = self.get_memory_consumption();
|
let mem_pre = self.get_memory_consumption();
|
||||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize].buckets;
|
|
||||||
|
|
||||||
let bounds = req.bounds;
|
let bounds = req.bounds;
|
||||||
let interval = req.req.interval;
|
let interval = req.req.interval;
|
||||||
let offset = req.offset;
|
let offset = req.offset;
|
||||||
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
|
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
|
||||||
|
|
||||||
agg_data
|
req.column_block_accessor.fetch_block(docs, &req.accessor);
|
||||||
.column_block_accessor
|
for (doc, val) in req
|
||||||
.fetch_block(docs, &req.accessor);
|
|
||||||
for (doc, val) in agg_data
|
|
||||||
.column_block_accessor
|
.column_block_accessor
|
||||||
.iter_docid_vals(docs, &req.accessor)
|
.iter_docid_vals(docs, &req.accessor)
|
||||||
{
|
{
|
||||||
let val = f64_from_fastfield_u64(val, req.field_type);
|
let val = f64_from_fastfield_u64(val, &req.field_type);
|
||||||
let bucket_pos = get_bucket_pos(val);
|
let bucket_pos = get_bucket_pos(val);
|
||||||
if bounds.contains(val) {
|
if bounds.contains(val) {
|
||||||
let bucket = buckets.entry(bucket_pos).or_insert_with(|| {
|
let bucket = self.buckets.entry(bucket_pos).or_insert_with(|| {
|
||||||
let key = get_bucket_key_from_pos(bucket_pos as f64, interval, offset);
|
let key = get_bucket_key_from_pos(bucket_pos as f64, interval, offset);
|
||||||
SegmentHistogramBucketEntry {
|
SegmentHistogramBucketEntry { key, doc_count: 0 }
|
||||||
key,
|
|
||||||
doc_count: 0,
|
|
||||||
bucket_id: self.bucket_id_provider.next_bucket_id(),
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
bucket.doc_count += 1;
|
bucket.doc_count += 1;
|
||||||
if let Some(sub_agg) = &mut self.sub_agg {
|
if let Some(sub_aggregation_blueprint) = req.sub_aggregation_blueprint.as_ref() {
|
||||||
sub_agg.push(bucket.bucket_id, doc);
|
self.sub_aggregations
|
||||||
|
.entry(bucket_pos)
|
||||||
|
.or_insert_with(|| sub_aggregation_blueprint.clone())
|
||||||
|
.collect(doc, agg_data)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -366,30 +358,14 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
|||||||
.add_memory_consumed(mem_delta as u64)?;
|
.add_memory_consumed(mem_delta as u64)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(sub_agg) = &mut self.sub_agg {
|
|
||||||
sub_agg.check_flush_local(agg_data)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||||
if let Some(sub_aggregation) = &mut self.sub_agg {
|
for sub_aggregation in self.sub_aggregations.values_mut() {
|
||||||
sub_aggregation.flush(agg_data)?;
|
sub_aggregation.flush(agg_data)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
|
||||||
max_bucket: BucketId,
|
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
while self.parent_buckets.len() <= max_bucket as usize {
|
|
||||||
self.parent_buckets.push(HistogramBuckets {
|
|
||||||
buckets: FxHashMap::default(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -397,19 +373,22 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
|||||||
impl SegmentHistogramCollector {
|
impl SegmentHistogramCollector {
|
||||||
fn get_memory_consumption(&self) -> usize {
|
fn get_memory_consumption(&self) -> usize {
|
||||||
let self_mem = std::mem::size_of::<Self>();
|
let self_mem = std::mem::size_of::<Self>();
|
||||||
let buckets_mem = self.parent_buckets.len() * std::mem::size_of::<HistogramBuckets>();
|
let sub_aggs_mem = self.sub_aggregations.memory_consumption();
|
||||||
self_mem + buckets_mem
|
let buckets_mem = self.buckets.memory_consumption();
|
||||||
|
self_mem + sub_aggs_mem + buckets_mem
|
||||||
}
|
}
|
||||||
/// Converts the collector result into an intermediate bucket result.
|
/// Converts the collector result into an intermediate bucket result.
|
||||||
fn add_intermediate_bucket_result(
|
pub fn into_intermediate_bucket_result(
|
||||||
&mut self,
|
self,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
histogram: HistogramBuckets,
|
|
||||||
) -> crate::Result<IntermediateBucketResult> {
|
) -> crate::Result<IntermediateBucketResult> {
|
||||||
let mut buckets = Vec::with_capacity(histogram.buckets.len());
|
let mut buckets = Vec::with_capacity(self.buckets.len());
|
||||||
|
|
||||||
for bucket in histogram.buckets.into_values() {
|
for (bucket_pos, bucket) in self.buckets {
|
||||||
let bucket_res = bucket.into_intermediate_bucket_entry(&mut self.sub_agg, agg_data);
|
let bucket_res = bucket.into_intermediate_bucket_entry(
|
||||||
|
self.sub_aggregations.get(&bucket_pos).cloned(),
|
||||||
|
agg_data,
|
||||||
|
);
|
||||||
|
|
||||||
buckets.push(bucket_res?);
|
buckets.push(bucket_res?);
|
||||||
}
|
}
|
||||||
@@ -429,7 +408,7 @@ impl SegmentHistogramCollector {
|
|||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
node: &AggRefNode,
|
node: &AggRefNode,
|
||||||
) -> crate::Result<Self> {
|
) -> crate::Result<Self> {
|
||||||
let sub_agg = if !node.children.is_empty() {
|
let blueprint = if !node.children.is_empty() {
|
||||||
Some(build_segment_agg_collectors(agg_data, &node.children)?)
|
Some(build_segment_agg_collectors(agg_data, &node.children)?)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
@@ -444,13 +423,13 @@ impl SegmentHistogramCollector {
|
|||||||
max: f64::MAX,
|
max: f64::MAX,
|
||||||
});
|
});
|
||||||
req_data.offset = req_data.req.offset.unwrap_or(0.0);
|
req_data.offset = req_data.req.offset.unwrap_or(0.0);
|
||||||
let sub_agg = sub_agg.map(CachedSubAggs::new);
|
|
||||||
|
req_data.sub_aggregation_blueprint = blueprint;
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
parent_buckets: Default::default(),
|
buckets: Default::default(),
|
||||||
sub_agg,
|
sub_aggregations: Default::default(),
|
||||||
accessor_idx: node.idx_in_req_data,
|
accessor_idx: node.idx_in_req_data,
|
||||||
bucket_id_provider: BucketIdProvider::default(),
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,22 +1,18 @@
|
|||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
|
||||||
use columnar::{Column, ColumnType};
|
use columnar::{Column, ColumnBlockAccessor, ColumnType};
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::aggregation::agg_data::{
|
use crate::aggregation::agg_data::{
|
||||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||||
};
|
};
|
||||||
use crate::aggregation::agg_limits::AggregationLimitsGuard;
|
|
||||||
use crate::aggregation::cached_sub_aggs::{
|
|
||||||
CachedSubAggs, HighCardSubAggCache, LowCardCachedSubAggs, LowCardSubAggCache, SubAggCache,
|
|
||||||
};
|
|
||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||||
IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
|
IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
|
||||||
};
|
};
|
||||||
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
use crate::aggregation::*;
|
use crate::aggregation::*;
|
||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
@@ -27,12 +23,12 @@ pub struct RangeAggReqData {
|
|||||||
pub accessor: Column<u64>,
|
pub accessor: Column<u64>,
|
||||||
/// The type of the fast field.
|
/// The type of the fast field.
|
||||||
pub field_type: ColumnType,
|
pub field_type: ColumnType,
|
||||||
|
/// The column block accessor to access the fast field values.
|
||||||
|
pub column_block_accessor: ColumnBlockAccessor<u64>,
|
||||||
/// The range aggregation request.
|
/// The range aggregation request.
|
||||||
pub req: RangeAggregation,
|
pub req: RangeAggregation,
|
||||||
/// The name of the aggregation.
|
/// The name of the aggregation.
|
||||||
pub name: String,
|
pub name: String,
|
||||||
/// Whether this is a top-level aggregation.
|
|
||||||
pub is_top_level: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RangeAggReqData {
|
impl RangeAggReqData {
|
||||||
@@ -155,47 +151,19 @@ pub(crate) struct SegmentRangeAndBucketEntry {
|
|||||||
|
|
||||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||||
/// the correct datatype.
|
/// the correct datatype.
|
||||||
pub struct SegmentRangeCollector<C: SubAggCache> {
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct SegmentRangeCollector {
|
||||||
/// The buckets containing the aggregation data.
|
/// The buckets containing the aggregation data.
|
||||||
/// One for each ParentBucketId
|
buckets: Vec<SegmentRangeAndBucketEntry>,
|
||||||
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
|
|
||||||
column_type: ColumnType,
|
column_type: ColumnType,
|
||||||
pub(crate) accessor_idx: usize,
|
pub(crate) accessor_idx: usize,
|
||||||
sub_agg: Option<CachedSubAggs<C>>,
|
|
||||||
/// Here things get a bit weird. We need to assign unique bucket ids across all
|
|
||||||
/// parent buckets. So we keep track of the next available bucket id here.
|
|
||||||
/// This allows a kind of flattening of the bucket ids across all parent buckets.
|
|
||||||
/// E.g. in nested aggregations:
|
|
||||||
/// Term Agg -> Range aggregation -> Stats aggregation
|
|
||||||
/// E.g. the Term Agg creates 3 buckets ["INFO", "ERROR", "WARN"], each of these has a Range
|
|
||||||
/// aggregation with 4 buckets. The Range aggregation will create buckets with ids:
|
|
||||||
/// - INFO: 0,1,2,3
|
|
||||||
/// - ERROR: 4,5,6,7
|
|
||||||
/// - WARN: 8,9,10,11
|
|
||||||
///
|
|
||||||
/// This allows the Stats aggregation to have unique bucket ids to refer to.
|
|
||||||
bucket_id_provider: BucketIdProvider,
|
|
||||||
limits: AggregationLimitsGuard,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<C: SubAggCache> Debug for SegmentRangeCollector<C> {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
f.debug_struct("SegmentRangeCollector")
|
|
||||||
.field("parent_buckets_len", &self.parent_buckets.len())
|
|
||||||
.field("column_type", &self.column_type)
|
|
||||||
.field("accessor_idx", &self.accessor_idx)
|
|
||||||
.field("has_sub_agg", &self.sub_agg.is_some())
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// TODO: Bad naming, there's also SegmentRangeAndBucketEntry
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub(crate) struct SegmentRangeBucketEntry {
|
pub(crate) struct SegmentRangeBucketEntry {
|
||||||
pub key: Key,
|
pub key: Key,
|
||||||
pub doc_count: u64,
|
pub doc_count: u64,
|
||||||
// pub sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
|
pub sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
|
||||||
pub bucket_id: BucketId,
|
|
||||||
/// The from range of the bucket. Equals `f64::MIN` when `None`.
|
/// The from range of the bucket. Equals `f64::MIN` when `None`.
|
||||||
pub from: Option<f64>,
|
pub from: Option<f64>,
|
||||||
/// The to range of the bucket. Equals `f64::MAX` when `None`. Open interval, `to` is not
|
/// The to range of the bucket. Equals `f64::MAX` when `None`. Open interval, `to` is not
|
||||||
@@ -216,50 +184,48 @@ impl Debug for SegmentRangeBucketEntry {
|
|||||||
impl SegmentRangeBucketEntry {
|
impl SegmentRangeBucketEntry {
|
||||||
pub(crate) fn into_intermediate_bucket_entry(
|
pub(crate) fn into_intermediate_bucket_entry(
|
||||||
self,
|
self,
|
||||||
|
agg_data: &AggregationsSegmentCtx,
|
||||||
) -> crate::Result<IntermediateRangeBucketEntry> {
|
) -> crate::Result<IntermediateRangeBucketEntry> {
|
||||||
let sub_aggregation = IntermediateAggregationResults::default();
|
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||||
|
if let Some(sub_aggregation) = self.sub_aggregation {
|
||||||
|
sub_aggregation
|
||||||
|
.add_intermediate_aggregation_result(agg_data, &mut sub_aggregation_res)?
|
||||||
|
} else {
|
||||||
|
Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
Ok(IntermediateRangeBucketEntry {
|
Ok(IntermediateRangeBucketEntry {
|
||||||
key: self.key.into(),
|
key: self.key.into(),
|
||||||
doc_count: self.doc_count,
|
doc_count: self.doc_count,
|
||||||
sub_aggregation_res: sub_aggregation,
|
sub_aggregation: sub_aggregation_res,
|
||||||
from: self.from,
|
from: self.from,
|
||||||
to: self.to,
|
to: self.to,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<C: SubAggCache> SegmentAggregationCollector for SegmentRangeCollector<C> {
|
impl SegmentAggregationCollector for SegmentRangeCollector {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
|
||||||
let field_type = self.column_type;
|
let field_type = self.column_type;
|
||||||
let name = agg_data
|
let name = agg_data
|
||||||
.get_range_req_data(self.accessor_idx)
|
.get_range_req_data(self.accessor_idx)
|
||||||
.name
|
.name
|
||||||
.to_string();
|
.to_string();
|
||||||
|
|
||||||
let buckets = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
|
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
|
||||||
|
.buckets
|
||||||
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = buckets
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|range_bucket| {
|
.map(move |range_bucket| {
|
||||||
let bucket_id = range_bucket.bucket.bucket_id;
|
Ok((
|
||||||
let mut agg = range_bucket.bucket.into_intermediate_bucket_entry()?;
|
range_to_string(&range_bucket.range, &field_type)?,
|
||||||
if let Some(sub_aggregation) = &mut self.sub_agg {
|
range_bucket
|
||||||
sub_aggregation
|
.bucket
|
||||||
.get_sub_agg_collector()
|
.into_intermediate_bucket_entry(agg_data)?,
|
||||||
.add_intermediate_aggregation_result(
|
))
|
||||||
agg_data,
|
|
||||||
&mut agg.sub_aggregation_res,
|
|
||||||
bucket_id,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
Ok((range_to_string(&range_bucket.range, &field_type)?, agg))
|
|
||||||
})
|
})
|
||||||
.collect::<crate::Result<_>>()?;
|
.collect::<crate::Result<_>>()?;
|
||||||
|
|
||||||
@@ -276,114 +242,73 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentRangeCollector<C> {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
docs: &[crate::DocId],
|
docs: &[crate::DocId],
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let req = agg_data.take_range_req_data(self.accessor_idx);
|
// Take request data to avoid borrow conflicts during sub-aggregation
|
||||||
|
let mut req = agg_data.take_range_req_data(self.accessor_idx);
|
||||||
|
|
||||||
agg_data
|
req.column_block_accessor.fetch_block(docs, &req.accessor);
|
||||||
.column_block_accessor
|
|
||||||
.fetch_block(docs, &req.accessor);
|
|
||||||
|
|
||||||
let buckets = &mut self.parent_buckets[parent_bucket_id as usize];
|
for (doc, val) in req
|
||||||
|
|
||||||
for (doc, val) in agg_data
|
|
||||||
.column_block_accessor
|
.column_block_accessor
|
||||||
.iter_docid_vals(docs, &req.accessor)
|
.iter_docid_vals(docs, &req.accessor)
|
||||||
{
|
{
|
||||||
let bucket_pos = get_bucket_pos(val, buckets);
|
let bucket_pos = self.get_bucket_pos(val);
|
||||||
let bucket = &mut buckets[bucket_pos];
|
let bucket = &mut self.buckets[bucket_pos];
|
||||||
bucket.bucket.doc_count += 1;
|
bucket.bucket.doc_count += 1;
|
||||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
|
||||||
sub_agg.push(bucket.bucket.bucket_id, doc);
|
sub_agg.collect(doc, agg_data)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
agg_data.put_back_range_req_data(self.accessor_idx, req);
|
agg_data.put_back_range_req_data(self.accessor_idx, req);
|
||||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
|
||||||
sub_agg.check_flush_local(agg_data)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
for bucket in self.buckets.iter_mut() {
|
||||||
sub_agg.flush(agg_data)?;
|
if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
|
||||||
|
sub_agg.flush(agg_data)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
|
||||||
max_bucket: BucketId,
|
|
||||||
agg_data: &AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
while self.parent_buckets.len() <= max_bucket as usize {
|
|
||||||
let new_buckets = self.create_new_buckets(agg_data)?;
|
|
||||||
self.parent_buckets.push(new_buckets);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Build a concrete `SegmentRangeCollector` with either a Vec- or HashMap-backed
|
|
||||||
/// bucket storage, depending on the column type and aggregation level.
|
|
||||||
pub(crate) fn build_segment_range_collector(
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
|
||||||
node: &AggRefNode,
|
|
||||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
|
||||||
let accessor_idx = node.idx_in_req_data;
|
|
||||||
let req_data = agg_data.get_range_req_data(node.idx_in_req_data);
|
|
||||||
let field_type = req_data.field_type;
|
|
||||||
|
|
||||||
// TODO: A better metric instead of is_top_level would be the number of buckets expected.
|
|
||||||
// E.g. If range agg is not top level, but the parent is a bucket agg with less than 10 buckets,
|
|
||||||
// we are still in low cardinality territory.
|
|
||||||
let is_low_card = req_data.is_top_level && req_data.req.ranges.len() <= 64;
|
|
||||||
|
|
||||||
let sub_agg = if !node.children.is_empty() {
|
|
||||||
Some(build_segment_agg_collectors(agg_data, &node.children)?)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
if is_low_card {
|
|
||||||
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggCache> {
|
|
||||||
sub_agg: sub_agg.map(LowCardCachedSubAggs::new),
|
|
||||||
column_type: field_type,
|
|
||||||
accessor_idx,
|
|
||||||
parent_buckets: Vec::new(),
|
|
||||||
bucket_id_provider: BucketIdProvider::default(),
|
|
||||||
limits: agg_data.context.limits.clone(),
|
|
||||||
}))
|
|
||||||
} else {
|
|
||||||
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggCache> {
|
|
||||||
sub_agg: sub_agg.map(CachedSubAggs::new),
|
|
||||||
column_type: field_type,
|
|
||||||
accessor_idx,
|
|
||||||
parent_buckets: Vec::new(),
|
|
||||||
bucket_id_provider: BucketIdProvider::default(),
|
|
||||||
limits: agg_data.context.limits.clone(),
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<C: SubAggCache> SegmentRangeCollector<C> {
|
impl SegmentRangeCollector {
|
||||||
pub(crate) fn create_new_buckets(
|
pub(crate) fn from_req_and_validate(
|
||||||
&mut self,
|
req_data: &mut AggregationsSegmentCtx,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
node: &AggRefNode,
|
||||||
) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
|
) -> crate::Result<Self> {
|
||||||
let field_type = self.column_type;
|
let accessor_idx = node.idx_in_req_data;
|
||||||
let req_data = agg_data.get_range_req_data(self.accessor_idx);
|
let (field_type, ranges) = {
|
||||||
|
let req_view = req_data.get_range_req_data(node.idx_in_req_data);
|
||||||
|
(req_view.field_type, req_view.req.ranges.clone())
|
||||||
|
};
|
||||||
|
|
||||||
// The range input on the request is f64.
|
// The range input on the request is f64.
|
||||||
// We need to convert to u64 ranges, because we read the values as u64.
|
// We need to convert to u64 ranges, because we read the values as u64.
|
||||||
// The mapping from the conversion is monotonic so ordering is preserved.
|
// The mapping from the conversion is monotonic so ordering is preserved.
|
||||||
let buckets: Vec<_> = extend_validate_ranges(&req_data.req.ranges, &field_type)?
|
let sub_agg_prototype = if !node.children.is_empty() {
|
||||||
|
Some(build_segment_agg_collectors(req_data, &node.children)?)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let buckets: Vec<_> = extend_validate_ranges(&ranges, &field_type)?
|
||||||
.iter()
|
.iter()
|
||||||
.map(|range| {
|
.map(|range| {
|
||||||
let bucket_id = self.bucket_id_provider.next_bucket_id();
|
|
||||||
let key = range
|
let key = range
|
||||||
.key
|
.key
|
||||||
.clone()
|
.clone()
|
||||||
@@ -392,20 +317,20 @@ impl<C: SubAggCache> SegmentRangeCollector<C> {
|
|||||||
let to = if range.range.end == u64::MAX {
|
let to = if range.range.end == u64::MAX {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some(f64_from_fastfield_u64(range.range.end, field_type))
|
Some(f64_from_fastfield_u64(range.range.end, &field_type))
|
||||||
};
|
};
|
||||||
let from = if range.range.start == u64::MIN {
|
let from = if range.range.start == u64::MIN {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some(f64_from_fastfield_u64(range.range.start, field_type))
|
Some(f64_from_fastfield_u64(range.range.start, &field_type))
|
||||||
};
|
};
|
||||||
// let sub_aggregation = sub_agg_prototype.clone();
|
let sub_aggregation = sub_agg_prototype.clone();
|
||||||
|
|
||||||
Ok(SegmentRangeAndBucketEntry {
|
Ok(SegmentRangeAndBucketEntry {
|
||||||
range: range.range.clone(),
|
range: range.range.clone(),
|
||||||
bucket: SegmentRangeBucketEntry {
|
bucket: SegmentRangeBucketEntry {
|
||||||
doc_count: 0,
|
doc_count: 0,
|
||||||
bucket_id,
|
sub_aggregation,
|
||||||
key,
|
key,
|
||||||
from,
|
from,
|
||||||
to,
|
to,
|
||||||
@@ -414,19 +339,26 @@ impl<C: SubAggCache> SegmentRangeCollector<C> {
|
|||||||
})
|
})
|
||||||
.collect::<crate::Result<_>>()?;
|
.collect::<crate::Result<_>>()?;
|
||||||
|
|
||||||
self.limits.add_memory_consumed(
|
req_data.context.limits.add_memory_consumed(
|
||||||
buckets.len() as u64 * std::mem::size_of::<SegmentRangeAndBucketEntry>() as u64,
|
buckets.len() as u64 * std::mem::size_of::<SegmentRangeAndBucketEntry>() as u64,
|
||||||
)?;
|
)?;
|
||||||
Ok(buckets)
|
|
||||||
|
Ok(SegmentRangeCollector {
|
||||||
|
buckets,
|
||||||
|
column_type: field_type,
|
||||||
|
accessor_idx,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn get_bucket_pos(&self, val: u64) -> usize {
|
||||||
|
let pos = self
|
||||||
|
.buckets
|
||||||
|
.binary_search_by_key(&val, |probe| probe.range.start)
|
||||||
|
.unwrap_or_else(|pos| pos - 1);
|
||||||
|
debug_assert!(self.buckets[pos].range.contains(&val));
|
||||||
|
pos
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#[inline]
|
|
||||||
fn get_bucket_pos(val: u64, buckets: &[SegmentRangeAndBucketEntry]) -> usize {
|
|
||||||
let pos = buckets
|
|
||||||
.binary_search_by_key(&val, |probe| probe.range.start)
|
|
||||||
.unwrap_or_else(|pos| pos - 1);
|
|
||||||
debug_assert!(buckets[pos].range.contains(&val));
|
|
||||||
pos
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts the user provided f64 range value to fast field value space.
|
/// Converts the user provided f64 range value to fast field value space.
|
||||||
@@ -524,7 +456,7 @@ pub(crate) fn range_to_string(
|
|||||||
let val = i64::from_u64(val);
|
let val = i64::from_u64(val);
|
||||||
format_date(val)
|
format_date(val)
|
||||||
} else {
|
} else {
|
||||||
Ok(f64_from_fastfield_u64(val, *field_type).to_string())
|
Ok(f64_from_fastfield_u64(val, field_type).to_string())
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -554,7 +486,7 @@ mod tests {
|
|||||||
pub fn get_collector_from_ranges(
|
pub fn get_collector_from_ranges(
|
||||||
ranges: Vec<RangeAggregationRange>,
|
ranges: Vec<RangeAggregationRange>,
|
||||||
field_type: ColumnType,
|
field_type: ColumnType,
|
||||||
) -> SegmentRangeCollector<HighCardSubAggCache> {
|
) -> SegmentRangeCollector {
|
||||||
let req = RangeAggregation {
|
let req = RangeAggregation {
|
||||||
field: "dummy".to_string(),
|
field: "dummy".to_string(),
|
||||||
ranges,
|
ranges,
|
||||||
@@ -574,33 +506,30 @@ mod tests {
|
|||||||
let to = if range.range.end == u64::MAX {
|
let to = if range.range.end == u64::MAX {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some(f64_from_fastfield_u64(range.range.end, field_type))
|
Some(f64_from_fastfield_u64(range.range.end, &field_type))
|
||||||
};
|
};
|
||||||
let from = if range.range.start == u64::MIN {
|
let from = if range.range.start == u64::MIN {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some(f64_from_fastfield_u64(range.range.start, field_type))
|
Some(f64_from_fastfield_u64(range.range.start, &field_type))
|
||||||
};
|
};
|
||||||
SegmentRangeAndBucketEntry {
|
SegmentRangeAndBucketEntry {
|
||||||
range: range.range.clone(),
|
range: range.range.clone(),
|
||||||
bucket: SegmentRangeBucketEntry {
|
bucket: SegmentRangeBucketEntry {
|
||||||
doc_count: 0,
|
doc_count: 0,
|
||||||
|
sub_aggregation: None,
|
||||||
key,
|
key,
|
||||||
from,
|
from,
|
||||||
to,
|
to,
|
||||||
bucket_id: 0,
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
SegmentRangeCollector {
|
SegmentRangeCollector {
|
||||||
parent_buckets: vec![buckets],
|
buckets,
|
||||||
column_type: field_type,
|
column_type: field_type,
|
||||||
accessor_idx: 0,
|
accessor_idx: 0,
|
||||||
sub_agg: None,
|
|
||||||
bucket_id_provider: Default::default(),
|
|
||||||
limits: AggregationLimitsGuard::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -847,7 +776,7 @@ mod tests {
|
|||||||
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
||||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||||
|
|
||||||
let buckets = collector.parent_buckets[0].clone();
|
let buckets = collector.buckets;
|
||||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||||
@@ -870,7 +799,7 @@ mod tests {
|
|||||||
];
|
];
|
||||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||||
|
|
||||||
let buckets = collector.parent_buckets[0].clone();
|
let buckets = collector.buckets;
|
||||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||||
@@ -885,7 +814,7 @@ mod tests {
|
|||||||
let buckets = vec![(-10f64..-1f64).into()];
|
let buckets = vec![(-10f64..-1f64).into()];
|
||||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||||
|
|
||||||
let buckets = collector.parent_buckets[0].clone();
|
let buckets = collector.buckets;
|
||||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
|
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
|
||||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
|
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
|
||||||
}
|
}
|
||||||
@@ -894,7 +823,7 @@ mod tests {
|
|||||||
let buckets = vec![(0f64..10f64).into()];
|
let buckets = vec![(0f64..10f64).into()];
|
||||||
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
|
||||||
|
|
||||||
let buckets = collector.parent_buckets[0].clone();
|
let buckets = collector.buckets;
|
||||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
|
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
|
||||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
|
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
|
||||||
}
|
}
|
||||||
@@ -903,7 +832,7 @@ mod tests {
|
|||||||
fn range_binary_search_test_u64() {
|
fn range_binary_search_test_u64() {
|
||||||
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
|
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
|
||||||
let collector = get_collector_from_ranges(ranges, ColumnType::U64);
|
let collector = get_collector_from_ranges(ranges, ColumnType::U64);
|
||||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
let search = |val: u64| collector.get_bucket_pos(val);
|
||||||
|
|
||||||
assert_eq!(search(u64::MIN), 0);
|
assert_eq!(search(u64::MIN), 0);
|
||||||
assert_eq!(search(9), 0);
|
assert_eq!(search(9), 0);
|
||||||
@@ -949,7 +878,7 @@ mod tests {
|
|||||||
let ranges = vec![(10.0..100.0).into()];
|
let ranges = vec![(10.0..100.0).into()];
|
||||||
|
|
||||||
let collector = get_collector_from_ranges(ranges, ColumnType::F64);
|
let collector = get_collector_from_ranges(ranges, ColumnType::F64);
|
||||||
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
|
let search = |val: u64| collector.get_bucket_pos(val);
|
||||||
|
|
||||||
assert_eq!(search(u64::MIN), 0);
|
assert_eq!(search(u64::MIN), 0);
|
||||||
assert_eq!(search(9f64.to_u64()), 0);
|
assert_eq!(search(9f64.to_u64()), 0);
|
||||||
@@ -961,3 +890,63 @@ mod tests {
|
|||||||
// the max value
|
// the max value
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(all(test, feature = "unstable"))]
|
||||||
|
mod bench {
|
||||||
|
|
||||||
|
use itertools::Itertools;
|
||||||
|
use rand::seq::SliceRandom;
|
||||||
|
use rand::thread_rng;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use crate::aggregation::bucket::range::tests::get_collector_from_ranges;
|
||||||
|
|
||||||
|
const TOTAL_DOCS: u64 = 1_000_000u64;
|
||||||
|
const NUM_DOCS: u64 = 50_000u64;
|
||||||
|
|
||||||
|
fn get_collector_with_buckets(num_buckets: u64, num_docs: u64) -> SegmentRangeCollector {
|
||||||
|
let bucket_size = num_docs / num_buckets;
|
||||||
|
let mut buckets: Vec<RangeAggregationRange> = vec![];
|
||||||
|
for i in 0..num_buckets {
|
||||||
|
let bucket_start = (i * bucket_size) as f64;
|
||||||
|
buckets.push((bucket_start..bucket_start + bucket_size as f64).into())
|
||||||
|
}
|
||||||
|
|
||||||
|
get_collector_from_ranges(buckets, ColumnType::U64)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_rand_docs(total_docs: u64, num_docs_returned: u64) -> Vec<u64> {
|
||||||
|
let mut rng = thread_rng();
|
||||||
|
|
||||||
|
let all_docs = (0..total_docs - 1).collect_vec();
|
||||||
|
let mut vals = all_docs
|
||||||
|
.as_slice()
|
||||||
|
.choose_multiple(&mut rng, num_docs_returned as usize)
|
||||||
|
.cloned()
|
||||||
|
.collect_vec();
|
||||||
|
vals.sort();
|
||||||
|
vals
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bench_range_binary_search(b: &mut test::Bencher, num_buckets: u64) {
|
||||||
|
let collector = get_collector_with_buckets(num_buckets, TOTAL_DOCS);
|
||||||
|
let vals = get_rand_docs(TOTAL_DOCS, NUM_DOCS);
|
||||||
|
b.iter(|| {
|
||||||
|
let mut bucket_pos = 0;
|
||||||
|
for val in &vals {
|
||||||
|
bucket_pos = collector.get_bucket_pos(*val);
|
||||||
|
}
|
||||||
|
bucket_pos
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_range_100_buckets(b: &mut test::Bencher) {
|
||||||
|
bench_range_binary_search(b, 100)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_range_10_buckets(b: &mut test::Bencher) {
|
||||||
|
bench_range_binary_search(b, 10)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
196
src/aggregation/bucket/term_agg/default_impl.rs
Normal file
196
src/aggregation/bucket/term_agg/default_impl.rs
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
use std::fmt::Debug;
|
||||||
|
|
||||||
|
use columnar::ColumnType;
|
||||||
|
use rustc_hash::FxHashMap;
|
||||||
|
|
||||||
|
use super::OrderTarget;
|
||||||
|
use crate::aggregation::agg_data::{
|
||||||
|
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||||
|
};
|
||||||
|
use crate::aggregation::agg_limits::MemoryConsumption;
|
||||||
|
use crate::aggregation::bucket::get_agg_name_and_property;
|
||||||
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
|
IntermediateAggregationResult, IntermediateAggregationResults,
|
||||||
|
};
|
||||||
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
|
use crate::TantivyError;
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
/// Container to store term_ids/or u64 values and their buckets.
|
||||||
|
struct TermBuckets {
|
||||||
|
pub(crate) entries: FxHashMap<u64, u32>,
|
||||||
|
pub(crate) sub_aggs: FxHashMap<u64, Box<dyn SegmentAggregationCollector>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TermBuckets {
|
||||||
|
fn get_memory_consumption(&self) -> usize {
|
||||||
|
let sub_aggs_mem = self.sub_aggs.memory_consumption();
|
||||||
|
let buckets_mem = self.entries.memory_consumption();
|
||||||
|
sub_aggs_mem + buckets_mem
|
||||||
|
}
|
||||||
|
|
||||||
|
fn force_flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||||
|
for sub_aggregations in &mut self.sub_aggs.values_mut() {
|
||||||
|
sub_aggregations.as_mut().flush(agg_data)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||||
|
/// the correct datatype.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct SegmentTermCollector {
|
||||||
|
/// The buckets containing the aggregation data.
|
||||||
|
term_buckets: TermBuckets,
|
||||||
|
accessor_idx: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentAggregationCollector for SegmentTermCollector {
|
||||||
|
fn add_intermediate_aggregation_result(
|
||||||
|
self: Box<Self>,
|
||||||
|
agg_data: &AggregationsSegmentCtx,
|
||||||
|
results: &mut IntermediateAggregationResults,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let name = agg_data.get_term_req_data(self.accessor_idx).name.clone();
|
||||||
|
|
||||||
|
let entries: Vec<(u64, u32)> = self.term_buckets.entries.into_iter().collect();
|
||||||
|
let bucket = super::into_intermediate_bucket_result(
|
||||||
|
self.accessor_idx,
|
||||||
|
entries,
|
||||||
|
self.term_buckets.sub_aggs,
|
||||||
|
agg_data,
|
||||||
|
)?;
|
||||||
|
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect(
|
||||||
|
&mut self,
|
||||||
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let mut req_data = agg_data.take_term_req_data(self.accessor_idx);
|
||||||
|
|
||||||
|
let mem_pre = self.get_memory_consumption();
|
||||||
|
|
||||||
|
if let Some(missing) = req_data.missing_value_for_accessor {
|
||||||
|
req_data.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&req_data.accessor,
|
||||||
|
missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
req_data
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &req_data.accessor);
|
||||||
|
}
|
||||||
|
|
||||||
|
for term_id in req_data.column_block_accessor.iter_vals() {
|
||||||
|
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
|
||||||
|
if !allowed_bs.contains(term_id as u32) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let entry = self.term_buckets.entries.entry(term_id).or_default();
|
||||||
|
*entry += 1;
|
||||||
|
}
|
||||||
|
// has subagg
|
||||||
|
if let Some(blueprint) = req_data.sub_aggregation_blueprint.as_ref() {
|
||||||
|
for (doc, term_id) in req_data
|
||||||
|
.column_block_accessor
|
||||||
|
.iter_docid_vals(docs, &req_data.accessor)
|
||||||
|
{
|
||||||
|
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
|
||||||
|
if !allowed_bs.contains(term_id as u32) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let sub_aggregations = self
|
||||||
|
.term_buckets
|
||||||
|
.sub_aggs
|
||||||
|
.entry(term_id)
|
||||||
|
.or_insert_with(|| blueprint.clone());
|
||||||
|
sub_aggregations.collect(doc, agg_data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||||
|
if mem_delta > 0 {
|
||||||
|
agg_data
|
||||||
|
.context
|
||||||
|
.limits
|
||||||
|
.add_memory_consumed(mem_delta as u64)?;
|
||||||
|
}
|
||||||
|
agg_data.put_back_term_req_data(self.accessor_idx, req_data);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||||
|
self.term_buckets.force_flush(agg_data)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentTermCollector {
|
||||||
|
pub fn from_req_and_validate(
|
||||||
|
req_data: &mut AggregationsSegmentCtx,
|
||||||
|
node: &AggRefNode,
|
||||||
|
) -> crate::Result<Self> {
|
||||||
|
let terms_req_data = req_data.get_term_req_data(node.idx_in_req_data);
|
||||||
|
let column_type = terms_req_data.column_type;
|
||||||
|
let accessor_idx = node.idx_in_req_data;
|
||||||
|
if column_type == ColumnType::Bytes {
|
||||||
|
return Err(TantivyError::InvalidArgument(format!(
|
||||||
|
"terms aggregation is not supported for column type {column_type:?}"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
let term_buckets = TermBuckets::default();
|
||||||
|
|
||||||
|
// Validate sub aggregation exists
|
||||||
|
if let OrderTarget::SubAggregation(sub_agg_name) = &terms_req_data.req.order.target {
|
||||||
|
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
||||||
|
|
||||||
|
node.get_sub_agg(agg_name, &req_data.per_request)
|
||||||
|
.ok_or_else(|| {
|
||||||
|
TantivyError::InvalidArgument(format!(
|
||||||
|
"could not find aggregation with name {agg_name} in metric \
|
||||||
|
sub_aggregations"
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let has_sub_aggregations = !node.children.is_empty();
|
||||||
|
let blueprint = if has_sub_aggregations {
|
||||||
|
let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
|
||||||
|
Some(sub_aggregation)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
let terms_req_data = req_data.get_term_req_data_mut(node.idx_in_req_data);
|
||||||
|
terms_req_data.sub_aggregation_blueprint = blueprint;
|
||||||
|
|
||||||
|
Ok(SegmentTermCollector {
|
||||||
|
term_buckets,
|
||||||
|
accessor_idx,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_memory_consumption(&self) -> usize {
|
||||||
|
let self_mem = std::mem::size_of::<Self>();
|
||||||
|
let term_buckets_mem = self.term_buckets.get_memory_consumption();
|
||||||
|
self_mem + term_buckets_mem
|
||||||
|
}
|
||||||
|
}
|
||||||
228
src/aggregation/bucket/term_agg/low_cardinality_impl.rs
Normal file
228
src/aggregation/bucket/term_agg/low_cardinality_impl.rs
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
use std::vec;
|
||||||
|
|
||||||
|
use rustc_hash::FxHashMap;
|
||||||
|
|
||||||
|
use crate::aggregation::agg_data::{
|
||||||
|
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||||
|
};
|
||||||
|
use crate::aggregation::bucket::{get_agg_name_and_property, OrderTarget};
|
||||||
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
|
IntermediateAggregationResult, IntermediateAggregationResults,
|
||||||
|
};
|
||||||
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
|
use crate::{DocId, TantivyError};
|
||||||
|
|
||||||
|
/// Largest number of documents handled by one `collect_block` pass; also the
/// capacity each per-bucket doc buffer is created with.
const MAX_BATCH_SIZE: usize = 1_024;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct LowCardTermBuckets {
|
||||||
|
entries: Box<[u32]>,
|
||||||
|
sub_aggs: Vec<Box<dyn SegmentAggregationCollector>>,
|
||||||
|
doc_buffers: Box<[Vec<DocId>]>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LowCardTermBuckets {
|
||||||
|
pub fn with_num_buckets(
|
||||||
|
num_buckets: usize,
|
||||||
|
sub_aggs_blueprint_opt: Option<&Box<dyn SegmentAggregationCollector>>,
|
||||||
|
) -> Self {
|
||||||
|
let sub_aggs = sub_aggs_blueprint_opt
|
||||||
|
.as_ref()
|
||||||
|
.map(|blueprint| {
|
||||||
|
std::iter::repeat_with(|| blueprint.clone_box())
|
||||||
|
.take(num_buckets)
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
})
|
||||||
|
.unwrap_or_default();
|
||||||
|
Self {
|
||||||
|
entries: vec![0; num_buckets].into_boxed_slice(),
|
||||||
|
sub_aggs,
|
||||||
|
doc_buffers: std::iter::repeat_with(|| Vec::with_capacity(MAX_BATCH_SIZE))
|
||||||
|
.take(num_buckets)
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.into_boxed_slice(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_memory_consumption(&self) -> usize {
|
||||||
|
std::mem::size_of::<Self>()
|
||||||
|
+ self.entries.len() * std::mem::size_of::<u32>()
|
||||||
|
+ self.doc_buffers.len()
|
||||||
|
* (std::mem::size_of::<Vec<DocId>>()
|
||||||
|
+ std::mem::size_of::<DocId>() * MAX_BATCH_SIZE)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct LowCardSegmentTermCollector {
|
||||||
|
term_buckets: LowCardTermBuckets,
|
||||||
|
accessor_idx: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LowCardSegmentTermCollector {
|
||||||
|
pub fn from_req_and_validate(
|
||||||
|
req_data: &mut AggregationsSegmentCtx,
|
||||||
|
node: &AggRefNode,
|
||||||
|
) -> crate::Result<Self> {
|
||||||
|
let terms_req_data = req_data.get_term_req_data(node.idx_in_req_data);
|
||||||
|
let accessor_idx = node.idx_in_req_data;
|
||||||
|
let cardinality = terms_req_data
|
||||||
|
.accessor
|
||||||
|
.max_value()
|
||||||
|
.max(terms_req_data.missing_value_for_accessor.unwrap_or(0))
|
||||||
|
+ 1;
|
||||||
|
assert!(cardinality <= super::LOW_CARDINALITY_THRESHOLD);
|
||||||
|
|
||||||
|
// Validate sub aggregation exists
|
||||||
|
if let OrderTarget::SubAggregation(sub_agg_name) = &terms_req_data.req.order.target {
|
||||||
|
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
||||||
|
|
||||||
|
node.get_sub_agg(agg_name, &req_data.per_request)
|
||||||
|
.ok_or_else(|| {
|
||||||
|
TantivyError::InvalidArgument(format!(
|
||||||
|
"could not find aggregation with name {agg_name} in metric \
|
||||||
|
sub_aggregations"
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let has_sub_aggregations = !node.children.is_empty();
|
||||||
|
let blueprint = if has_sub_aggregations {
|
||||||
|
let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
|
||||||
|
Some(sub_aggregation)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
let terms_req_data = req_data.get_term_req_data_mut(node.idx_in_req_data);
|
||||||
|
|
||||||
|
let term_buckets =
|
||||||
|
LowCardTermBuckets::with_num_buckets(cardinality as usize, blueprint.as_ref());
|
||||||
|
|
||||||
|
terms_req_data.sub_aggregation_blueprint = blueprint;
|
||||||
|
|
||||||
|
Ok(LowCardSegmentTermCollector {
|
||||||
|
term_buckets,
|
||||||
|
accessor_idx,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_memory_consumption(&self) -> usize {
|
||||||
|
let self_mem = std::mem::size_of::<Self>();
|
||||||
|
let term_buckets_mem = self.term_buckets.get_memory_consumption();
|
||||||
|
self_mem + term_buckets_mem
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentAggregationCollector for LowCardSegmentTermCollector {
|
||||||
|
fn add_intermediate_aggregation_result(
|
||||||
|
self: Box<Self>,
|
||||||
|
agg_data: &AggregationsSegmentCtx,
|
||||||
|
results: &mut IntermediateAggregationResults,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let name = agg_data.get_term_req_data(self.accessor_idx).name.clone();
|
||||||
|
let sub_aggs: FxHashMap<u64, Box<dyn SegmentAggregationCollector>> = self
|
||||||
|
.term_buckets
|
||||||
|
.sub_aggs
|
||||||
|
.into_iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(bucket_id, _sub_agg)| self.term_buckets.entries[*bucket_id] > 0)
|
||||||
|
.map(|(bucket_id, sub_agg)| (bucket_id as u64, sub_agg))
|
||||||
|
.collect();
|
||||||
|
let entries: Vec<(u64, u32)> = self
|
||||||
|
.term_buckets
|
||||||
|
.entries
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(_, count)| **count > 0)
|
||||||
|
.map(|(bucket_id, count)| (bucket_id as u64, *count))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let bucket =
|
||||||
|
super::into_intermediate_bucket_result(self.accessor_idx, entries, sub_aggs, agg_data)?;
|
||||||
|
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
if docs.len() > MAX_BATCH_SIZE {
|
||||||
|
for batch in docs.chunks(MAX_BATCH_SIZE) {
|
||||||
|
self.collect_block(batch, agg_data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut req_data = agg_data.take_term_req_data(self.accessor_idx);
|
||||||
|
|
||||||
|
let mem_pre = self.get_memory_consumption();
|
||||||
|
|
||||||
|
if let Some(missing) = req_data.missing_value_for_accessor {
|
||||||
|
req_data.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&req_data.accessor,
|
||||||
|
missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
req_data
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &req_data.accessor);
|
||||||
|
}
|
||||||
|
|
||||||
|
// has subagg
|
||||||
|
if req_data.sub_aggregation_blueprint.is_some() {
|
||||||
|
for (doc, term_id) in req_data
|
||||||
|
.column_block_accessor
|
||||||
|
.iter_docid_vals(docs, &req_data.accessor)
|
||||||
|
{
|
||||||
|
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
|
||||||
|
if !allowed_bs.contains(term_id as u32) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.term_buckets.doc_buffers[term_id as usize].push(doc);
|
||||||
|
}
|
||||||
|
for (bucket_id, docs) in self.term_buckets.doc_buffers.iter_mut().enumerate() {
|
||||||
|
self.term_buckets.entries[bucket_id] += docs.len() as u32;
|
||||||
|
self.term_buckets.sub_aggs[bucket_id].collect_block(&docs[..], agg_data)?;
|
||||||
|
docs.clear();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for term_id in req_data.column_block_accessor.iter_vals() {
|
||||||
|
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
|
||||||
|
if !allowed_bs.contains(term_id as u32) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.term_buckets.entries[term_id as usize] += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mem_delta = self.get_memory_consumption() - mem_pre;
|
||||||
|
if mem_delta > 0 {
|
||||||
|
agg_data
|
||||||
|
.context
|
||||||
|
.limits
|
||||||
|
.add_memory_consumed(mem_delta as u64)?;
|
||||||
|
}
|
||||||
|
agg_data.put_back_term_req_data(self.accessor_idx, req_data);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(
|
||||||
|
&mut self,
|
||||||
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||||
|
for sub_aggregations in &mut self.term_buckets.sub_aggs.iter_mut() {
|
||||||
|
sub_aggregations.as_mut().flush(agg_data)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -5,13 +5,11 @@ use crate::aggregation::agg_data::{
|
|||||||
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
|
||||||
};
|
};
|
||||||
use crate::aggregation::bucket::term_agg::TermsAggregation;
|
use crate::aggregation::bucket::term_agg::TermsAggregation;
|
||||||
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
|
|
||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||||
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
|
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
|
||||||
};
|
};
|
||||||
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
use crate::aggregation::BucketId;
|
|
||||||
|
|
||||||
/// Special aggregation to handle missing values for term aggregations.
|
/// Special aggregation to handle missing values for term aggregations.
|
||||||
/// This missing aggregation will check multiple columns for existence.
|
/// This missing aggregation will check multiple columns for existence.
|
||||||
@@ -37,55 +35,41 @@ impl MissingTermAggReqData {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Debug, Clone)]
|
|
||||||
struct MissingCount {
|
|
||||||
missing_count: u32,
|
|
||||||
bucket_id: BucketId,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The specialized missing term aggregation.
|
/// The specialized missing term aggregation.
|
||||||
#[derive(Default, Debug)]
|
#[derive(Default, Debug, Clone)]
|
||||||
pub struct TermMissingAgg {
|
pub struct TermMissingAgg {
|
||||||
|
missing_count: u32,
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
sub_agg: Option<HighCardCachedSubAggs>,
|
sub_agg: Option<Box<dyn SegmentAggregationCollector>>,
|
||||||
/// Idx = parent bucket id, Value = missing count for that bucket
|
|
||||||
missing_count_per_bucket: Vec<MissingCount>,
|
|
||||||
bucket_id_provider: BucketIdProvider,
|
|
||||||
}
|
}
|
||||||
impl TermMissingAgg {
|
impl TermMissingAgg {
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
req_data: &mut AggregationsSegmentCtx,
|
||||||
node: &AggRefNode,
|
node: &AggRefNode,
|
||||||
) -> crate::Result<Self> {
|
) -> crate::Result<Self> {
|
||||||
let has_sub_aggregations = !node.children.is_empty();
|
let has_sub_aggregations = !node.children.is_empty();
|
||||||
let accessor_idx = node.idx_in_req_data;
|
let accessor_idx = node.idx_in_req_data;
|
||||||
let sub_agg = if has_sub_aggregations {
|
let sub_agg = if has_sub_aggregations {
|
||||||
let sub_aggregation = build_segment_agg_collectors(agg_data, &node.children)?;
|
let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
|
||||||
Some(sub_aggregation)
|
Some(sub_aggregation)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
let sub_agg = sub_agg.map(CachedSubAggs::new);
|
|
||||||
let bucket_id_provider = BucketIdProvider::default();
|
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
accessor_idx,
|
accessor_idx,
|
||||||
sub_agg,
|
sub_agg,
|
||||||
missing_count_per_bucket: Vec::new(),
|
..Default::default()
|
||||||
bucket_id_provider,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentAggregationCollector for TermMissingAgg {
|
impl SegmentAggregationCollector for TermMissingAgg {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
|
||||||
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
|
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
|
||||||
let term_agg = &req_data.req;
|
let term_agg = &req_data.req;
|
||||||
let missing = term_agg
|
let missing = term_agg
|
||||||
@@ -96,16 +80,13 @@ impl SegmentAggregationCollector for TermMissingAgg {
|
|||||||
let mut entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> =
|
let mut entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> =
|
||||||
Default::default();
|
Default::default();
|
||||||
|
|
||||||
let missing_count = &self.missing_count_per_bucket[parent_bucket_id as usize];
|
|
||||||
let mut missing_entry = IntermediateTermBucketEntry {
|
let mut missing_entry = IntermediateTermBucketEntry {
|
||||||
doc_count: missing_count.missing_count,
|
doc_count: self.missing_count,
|
||||||
sub_aggregation: Default::default(),
|
sub_aggregation: Default::default(),
|
||||||
};
|
};
|
||||||
if let Some(sub_agg) = &mut self.sub_agg {
|
if let Some(sub_agg) = self.sub_agg {
|
||||||
let mut res = IntermediateAggregationResults::default();
|
let mut res = IntermediateAggregationResults::default();
|
||||||
sub_agg
|
sub_agg.add_intermediate_aggregation_result(agg_data, &mut res)?;
|
||||||
.get_sub_agg_collector()
|
|
||||||
.add_intermediate_aggregation_result(agg_data, &mut res, missing_count.bucket_id)?;
|
|
||||||
missing_entry.sub_aggregation = res;
|
missing_entry.sub_aggregation = res;
|
||||||
}
|
}
|
||||||
entries.insert(missing.into(), missing_entry);
|
entries.insert(missing.into(), missing_entry);
|
||||||
@@ -128,52 +109,30 @@ impl SegmentAggregationCollector for TermMissingAgg {
|
|||||||
|
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
|
||||||
|
let has_value = req_data
|
||||||
|
.accessors
|
||||||
|
.iter()
|
||||||
|
.any(|(acc, _)| acc.index.has_value(doc));
|
||||||
|
if !has_value {
|
||||||
|
self.missing_count += 1;
|
||||||
|
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
||||||
|
sub_agg.collect(doc, agg_data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
docs: &[crate::DocId],
|
docs: &[crate::DocId],
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let bucket = &mut self.missing_count_per_bucket[parent_bucket_id as usize];
|
|
||||||
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
|
|
||||||
|
|
||||||
for doc in docs {
|
for doc in docs {
|
||||||
let doc = *doc;
|
self.collect(*doc, agg_data)?;
|
||||||
let has_value = req_data
|
|
||||||
.accessors
|
|
||||||
.iter()
|
|
||||||
.any(|(acc, _)| acc.index.has_value(doc));
|
|
||||||
if !has_value {
|
|
||||||
bucket.missing_count += 1;
|
|
||||||
|
|
||||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
|
||||||
sub_agg.push(bucket.bucket_id, doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
|
||||||
sub_agg.check_flush_local(agg_data)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
|
||||||
max_bucket: BucketId,
|
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
while self.missing_count_per_bucket.len() <= max_bucket as usize {
|
|
||||||
let bucket_id = self.bucket_id_provider.next_bucket_id();
|
|
||||||
self.missing_count_per_bucket.push(MissingCount {
|
|
||||||
missing_count: 0,
|
|
||||||
bucket_id,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
|
||||||
if let Some(sub_agg) = self.sub_agg.as_mut() {
|
|
||||||
sub_agg.flush(agg_data)?;
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
83
src/aggregation/buf_collector.rs
Normal file
83
src/aggregation/buf_collector.rs
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
|
use super::segment_agg_result::SegmentAggregationCollector;
|
||||||
|
use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
||||||
|
use crate::DocId;
|
||||||
|
|
||||||
|
/// Number of doc ids `BufAggregationCollector` stages before passing them on as one block.
pub(crate) const DOC_BLOCK_SIZE: usize = 64;
|
||||||
|
/// Fixed-size buffer with room for `DOC_BLOCK_SIZE` doc ids.
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
|
||||||
|
|
||||||
|
/// BufAggregationCollector buffers documents before calling collect_block().
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub(crate) struct BufAggregationCollector {
|
||||||
|
pub(crate) collector: Box<dyn SegmentAggregationCollector>,
|
||||||
|
staged_docs: DocBlock,
|
||||||
|
num_staged_docs: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for BufAggregationCollector {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("SegmentAggregationResultsCollector")
|
||||||
|
.field("staged_docs", &&self.staged_docs[..self.num_staged_docs])
|
||||||
|
.field("num_staged_docs", &self.num_staged_docs)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BufAggregationCollector {
|
||||||
|
pub fn new(collector: Box<dyn SegmentAggregationCollector>) -> Self {
|
||||||
|
Self {
|
||||||
|
collector,
|
||||||
|
num_staged_docs: 0,
|
||||||
|
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentAggregationCollector for BufAggregationCollector {
|
||||||
|
#[inline]
|
||||||
|
fn add_intermediate_aggregation_result(
|
||||||
|
self: Box<Self>,
|
||||||
|
agg_data: &AggregationsSegmentCtx,
|
||||||
|
results: &mut IntermediateAggregationResults,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
Box::new(self.collector).add_intermediate_aggregation_result(agg_data, results)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect(
|
||||||
|
&mut self,
|
||||||
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.staged_docs[self.num_staged_docs] = doc;
|
||||||
|
self.num_staged_docs += 1;
|
||||||
|
if self.num_staged_docs == self.staged_docs.len() {
|
||||||
|
self.collector
|
||||||
|
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_data)?;
|
||||||
|
self.num_staged_docs = 0;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collector.collect_block(docs, agg_data)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
||||||
|
self.collector
|
||||||
|
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_data)?;
|
||||||
|
self.num_staged_docs = 0;
|
||||||
|
|
||||||
|
self.collector.flush(agg_data)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,245 +0,0 @@
|
|||||||
use std::fmt::Debug;
|
|
||||||
|
|
||||||
use super::segment_agg_result::SegmentAggregationCollector;
|
|
||||||
use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
|
||||||
use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
|
|
||||||
use crate::aggregation::BucketId;
|
|
||||||
use crate::DocId;
|
|
||||||
|
|
||||||
/// A cache for sub-aggregations, storing doc ids per bucket id.
|
|
||||||
/// Depending on the cardinality of the parent aggregation, we use different
|
|
||||||
/// storage strategies.
|
|
||||||
///
|
|
||||||
/// ## Low Cardinality
|
|
||||||
/// Cardinality here refers to the number of unique flattened buckets that can be created
|
|
||||||
/// by the parent aggregation.
|
|
||||||
/// Flattened buckets are the result of combining all buckets per collector
|
|
||||||
/// into a single list of buckets, where each bucket is identified by its BucketId.
|
|
||||||
///
|
|
||||||
/// ## Usage
|
|
||||||
/// Since this is caching for sub-aggregations, it is only used by bucket
|
|
||||||
/// aggregations.
|
|
||||||
///
|
|
||||||
/// TODO: consider using a more advanced data structure for high cardinality
|
|
||||||
/// aggregations.
|
|
||||||
/// What this datastructure does in general is to group docs by bucket id.
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) struct CachedSubAggs<C: SubAggCache> {
|
|
||||||
cache: C,
|
|
||||||
sub_agg_collector: Box<dyn SegmentAggregationCollector>,
|
|
||||||
num_docs: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// [`CachedSubAggs`] backed by [`LowCardSubAggCache`].
pub type LowCardCachedSubAggs = CachedSubAggs<LowCardSubAggCache>;
|
|
||||||
/// [`CachedSubAggs`] backed by [`HighCardSubAggCache`].
pub type HighCardCachedSubAggs = CachedSubAggs<HighCardSubAggCache>;
|
|
||||||
|
|
||||||
/// Number of cached docs at which `CachedSubAggs::check_flush_local` flushes the cache.
const FLUSH_THRESHOLD: usize = 2048;
|
|
||||||
|
|
||||||
/// A trait for caching sub-aggregation doc ids per bucket id.
|
|
||||||
/// Different implementations can be used depending on the cardinality
|
|
||||||
/// of the parent aggregation.
|
|
||||||
pub trait SubAggCache: Debug {
|
|
||||||
fn new() -> Self;
|
|
||||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId);
|
|
||||||
fn flush_local(
|
|
||||||
&mut self,
|
|
||||||
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
|
||||||
force: bool,
|
|
||||||
) -> crate::Result<()>;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Backend: SubAggCache + Debug> CachedSubAggs<Backend> {
|
|
||||||
pub fn new(sub_agg: Box<dyn SegmentAggregationCollector>) -> Self {
|
|
||||||
Self {
|
|
||||||
cache: Backend::new(),
|
|
||||||
sub_agg_collector: sub_agg,
|
|
||||||
num_docs: 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_sub_agg_collector(&mut self) -> &mut Box<dyn SegmentAggregationCollector> {
|
|
||||||
&mut self.sub_agg_collector
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
pub fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
|
||||||
self.cache.push(bucket_id, doc_id);
|
|
||||||
self.num_docs += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if we need to flush based on the number of documents cached.
|
|
||||||
/// If so, flushes the cache to the provided aggregation collector.
|
|
||||||
pub fn check_flush_local(
|
|
||||||
&mut self,
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
if self.num_docs >= FLUSH_THRESHOLD {
|
|
||||||
self.cache
|
|
||||||
.flush_local(&mut self.sub_agg_collector, agg_data, false)?;
|
|
||||||
self.num_docs = 0;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Note: this _does_ flush the sub aggregations.
|
|
||||||
pub fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
|
|
||||||
if self.num_docs != 0 {
|
|
||||||
self.cache
|
|
||||||
.flush_local(&mut self.sub_agg_collector, agg_data, true)?;
|
|
||||||
self.num_docs = 0;
|
|
||||||
}
|
|
||||||
self.sub_agg_collector.flush(agg_data)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Number of partitions of the high cardinality sub-aggregation cache.
/// A bucket id is stored in partition `bucket_id % NUM_PARTITIONS`.
|
|
||||||
const NUM_PARTITIONS: usize = 16;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) struct HighCardSubAggCache {
|
|
||||||
/// This weird partitioning is used to do some cheap grouping on the bucket ids.
|
|
||||||
/// bucket ids are dense, e.g. when we don't detect the cardinality as low cardinality,
|
|
||||||
/// but there are just 16 bucket ids, each bucket id will go to its own partition.
|
|
||||||
///
|
|
||||||
/// We want to keep this cheap, because high cardinality aggregations can have a lot of
|
|
||||||
/// buckets, and there may be nothing to group.
|
|
||||||
partitions: Box<[PartitionEntry; NUM_PARTITIONS]>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl HighCardSubAggCache {
|
|
||||||
#[inline]
|
|
||||||
fn clear(&mut self) {
|
|
||||||
for partition in self.partitions.iter_mut() {
|
|
||||||
partition.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default)]
|
|
||||||
struct PartitionEntry {
|
|
||||||
bucket_ids: Vec<BucketId>,
|
|
||||||
docs: Vec<DocId>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartitionEntry {
|
|
||||||
#[inline]
|
|
||||||
fn clear(&mut self) {
|
|
||||||
self.bucket_ids.clear();
|
|
||||||
self.docs.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SubAggCache for HighCardSubAggCache {
|
|
||||||
fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
partitions: Box::new(core::array::from_fn(|_| PartitionEntry::default())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
|
||||||
let idx = bucket_id % NUM_PARTITIONS as u32;
|
|
||||||
let slot = &mut self.partitions[idx as usize];
|
|
||||||
slot.bucket_ids.push(bucket_id);
|
|
||||||
slot.docs.push(doc_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn flush_local(
|
|
||||||
&mut self,
|
|
||||||
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
|
||||||
_force: bool,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
let mut max_bucket = 0u32;
|
|
||||||
for partition in self.partitions.iter() {
|
|
||||||
if let Some(&local_max) = partition.bucket_ids.iter().max() {
|
|
||||||
max_bucket = max_bucket.max(local_max);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
|
|
||||||
|
|
||||||
for slot in self.partitions.iter() {
|
|
||||||
if !slot.bucket_ids.is_empty() {
|
|
||||||
// Reduce dynamic dispatch overhead by collecting a full partition in one call.
|
|
||||||
sub_agg.collect_multiple(&slot.bucket_ids, &slot.docs, agg_data)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.clear();
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) struct LowCardSubAggCache {
|
|
||||||
/// Cache doc ids per bucket for sub-aggregations.
|
|
||||||
///
|
|
||||||
/// The outer Vec is indexed by BucketId.
|
|
||||||
per_bucket_docs: Vec<Vec<DocId>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LowCardSubAggCache {
|
|
||||||
#[inline]
|
|
||||||
fn clear(&mut self) {
|
|
||||||
for v in &mut self.per_bucket_docs {
|
|
||||||
v.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SubAggCache for LowCardSubAggCache {
|
|
||||||
fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
per_bucket_docs: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
|
|
||||||
let idx = bucket_id as usize;
|
|
||||||
if self.per_bucket_docs.len() <= idx {
|
|
||||||
self.per_bucket_docs.resize_with(idx + 1, Vec::new);
|
|
||||||
}
|
|
||||||
self.per_bucket_docs[idx].push(doc_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn flush_local(
|
|
||||||
&mut self,
|
|
||||||
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
|
||||||
force: bool,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
// Pre-aggregated: call collect per bucket.
|
|
||||||
let max_bucket = (self.per_bucket_docs.len() as BucketId).saturating_sub(1);
|
|
||||||
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
|
|
||||||
// The threshold above which we flush buckets individually.
|
|
||||||
// Note: We need to make sure that we don't lock ourselves into a situation where we hit
|
|
||||||
// the FLUSH_THRESHOLD, but never flush any buckets. (except the final flush)
|
|
||||||
let mut bucket_treshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
|
|
||||||
const _: () = {
|
|
||||||
// MAX_NUM_TERMS_FOR_VEC threshold is used for term aggregations
|
|
||||||
// Note: There may be other flexible values, for other aggregations, but we can use the
|
|
||||||
// const value here as a upper bound. (better than nothing)
|
|
||||||
let bucket_treshold_limit = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
|
|
||||||
assert!(
|
|
||||||
bucket_treshold_limit > 0,
|
|
||||||
"Bucket threshold must be greater than 0"
|
|
||||||
);
|
|
||||||
};
|
|
||||||
if force {
|
|
||||||
bucket_treshold = 0;
|
|
||||||
}
|
|
||||||
for (bucket_id, docs) in self
|
|
||||||
.per_bucket_docs
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.filter(|(_, docs)| docs.len() > bucket_treshold)
|
|
||||||
{
|
|
||||||
sub_agg.collect(bucket_id as BucketId, docs, agg_data)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.clear();
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,9 +1,9 @@
|
|||||||
use super::agg_req::Aggregations;
|
use super::agg_req::Aggregations;
|
||||||
use super::agg_result::AggregationResults;
|
use super::agg_result::AggregationResults;
|
||||||
use super::cached_sub_aggs::LowCardCachedSubAggs;
|
use super::buf_collector::BufAggregationCollector;
|
||||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
|
use super::segment_agg_result::SegmentAggregationCollector;
|
||||||
use super::AggContextParams;
|
use super::AggContextParams;
|
||||||
// group buffering strategy is chosen explicitly by callers; no need to hash-group on the fly.
|
|
||||||
use crate::aggregation::agg_data::{
|
use crate::aggregation::agg_data::{
|
||||||
build_aggregations_data_from_req, build_segment_agg_collectors_root, AggregationsSegmentCtx,
|
build_aggregations_data_from_req, build_segment_agg_collectors_root, AggregationsSegmentCtx,
|
||||||
};
|
};
|
||||||
@@ -136,7 +136,7 @@ fn merge_fruits(
|
|||||||
/// `AggregationSegmentCollector` does the aggregation collection on a segment.
|
/// `AggregationSegmentCollector` does the aggregation collection on a segment.
|
||||||
pub struct AggregationSegmentCollector {
|
pub struct AggregationSegmentCollector {
|
||||||
aggs_with_accessor: AggregationsSegmentCtx,
|
aggs_with_accessor: AggregationsSegmentCtx,
|
||||||
agg_collector: LowCardCachedSubAggs,
|
agg_collector: BufAggregationCollector,
|
||||||
error: Option<TantivyError>,
|
error: Option<TantivyError>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,11 +151,8 @@ impl AggregationSegmentCollector {
|
|||||||
) -> crate::Result<Self> {
|
) -> crate::Result<Self> {
|
||||||
let mut agg_data =
|
let mut agg_data =
|
||||||
build_aggregations_data_from_req(agg, reader, segment_ordinal, context.clone())?;
|
build_aggregations_data_from_req(agg, reader, segment_ordinal, context.clone())?;
|
||||||
let mut result =
|
let result =
|
||||||
LowCardCachedSubAggs::new(build_segment_agg_collectors_root(&mut agg_data)?);
|
BufAggregationCollector::new(build_segment_agg_collectors_root(&mut agg_data)?);
|
||||||
result
|
|
||||||
.get_sub_agg_collector()
|
|
||||||
.prepare_max_bucket(0, &agg_data)?; // prepare for bucket zero
|
|
||||||
|
|
||||||
Ok(AggregationSegmentCollector {
|
Ok(AggregationSegmentCollector {
|
||||||
aggs_with_accessor: agg_data,
|
aggs_with_accessor: agg_data,
|
||||||
@@ -173,31 +170,26 @@ impl SegmentCollector for AggregationSegmentCollector {
|
|||||||
if self.error.is_some() {
|
if self.error.is_some() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
self.agg_collector.push(0, doc);
|
if let Err(err) = self
|
||||||
match self
|
|
||||||
.agg_collector
|
.agg_collector
|
||||||
.check_flush_local(&mut self.aggs_with_accessor)
|
.collect(doc, &mut self.aggs_with_accessor)
|
||||||
{
|
{
|
||||||
Ok(_) => {}
|
self.error = Some(err);
|
||||||
Err(e) => {
|
|
||||||
self.error = Some(e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The query pushes the documents to the collector via this method.
|
||||||
|
///
|
||||||
|
/// Only valid for Collectors that ignore docs
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
fn collect_block(&mut self, docs: &[DocId]) {
|
||||||
if self.error.is_some() {
|
if self.error.is_some() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if let Err(err) = self
|
||||||
match self.agg_collector.get_sub_agg_collector().collect(
|
.agg_collector
|
||||||
0,
|
.collect_block(docs, &mut self.aggs_with_accessor)
|
||||||
docs,
|
{
|
||||||
&mut self.aggs_with_accessor,
|
self.error = Some(err);
|
||||||
) {
|
|
||||||
Ok(_) => {}
|
|
||||||
Err(e) => {
|
|
||||||
self.error = Some(e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -208,13 +200,10 @@ impl SegmentCollector for AggregationSegmentCollector {
|
|||||||
self.agg_collector.flush(&mut self.aggs_with_accessor)?;
|
self.agg_collector.flush(&mut self.aggs_with_accessor)?;
|
||||||
|
|
||||||
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
let mut sub_aggregation_res = IntermediateAggregationResults::default();
|
||||||
self.agg_collector
|
Box::new(self.agg_collector).add_intermediate_aggregation_result(
|
||||||
.get_sub_agg_collector()
|
&self.aggs_with_accessor,
|
||||||
.add_intermediate_aggregation_result(
|
&mut sub_aggregation_res,
|
||||||
&self.aggs_with_accessor,
|
)?;
|
||||||
&mut sub_aggregation_res,
|
|
||||||
0,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(sub_aggregation_res)
|
Ok(sub_aggregation_res)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -792,7 +792,7 @@ pub struct IntermediateRangeBucketEntry {
|
|||||||
/// The number of documents in the bucket.
|
/// The number of documents in the bucket.
|
||||||
pub doc_count: u64,
|
pub doc_count: u64,
|
||||||
/// The sub_aggregation in this bucket.
|
/// The sub_aggregation in this bucket.
|
||||||
pub sub_aggregation_res: IntermediateAggregationResults,
|
pub sub_aggregation: IntermediateAggregationResults,
|
||||||
/// The from range of the bucket. Equals `f64::MIN` when `None`.
|
/// The from range of the bucket. Equals `f64::MIN` when `None`.
|
||||||
pub from: Option<f64>,
|
pub from: Option<f64>,
|
||||||
/// The to range of the bucket. Equals `f64::MAX` when `None`.
|
/// The to range of the bucket. Equals `f64::MAX` when `None`.
|
||||||
@@ -811,7 +811,7 @@ impl IntermediateRangeBucketEntry {
|
|||||||
key: self.key.into(),
|
key: self.key.into(),
|
||||||
doc_count: self.doc_count,
|
doc_count: self.doc_count,
|
||||||
sub_aggregation: self
|
sub_aggregation: self
|
||||||
.sub_aggregation_res
|
.sub_aggregation
|
||||||
.into_final_result_internal(req, limits)?,
|
.into_final_result_internal(req, limits)?,
|
||||||
to: self.to,
|
to: self.to,
|
||||||
from: self.from,
|
from: self.from,
|
||||||
@@ -857,8 +857,7 @@ impl MergeFruits for IntermediateTermBucketEntry {
|
|||||||
impl MergeFruits for IntermediateRangeBucketEntry {
|
impl MergeFruits for IntermediateRangeBucketEntry {
|
||||||
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) -> crate::Result<()> {
|
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) -> crate::Result<()> {
|
||||||
self.doc_count += other.doc_count;
|
self.doc_count += other.doc_count;
|
||||||
self.sub_aggregation_res
|
self.sub_aggregation.merge_fruits(other.sub_aggregation)?;
|
||||||
.merge_fruits(other.sub_aggregation_res)?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -888,7 +887,7 @@ mod tests {
|
|||||||
IntermediateRangeBucketEntry {
|
IntermediateRangeBucketEntry {
|
||||||
key: IntermediateKey::Str(key.to_string()),
|
key: IntermediateKey::Str(key.to_string()),
|
||||||
doc_count: *doc_count,
|
doc_count: *doc_count,
|
||||||
sub_aggregation_res: Default::default(),
|
sub_aggregation: Default::default(),
|
||||||
from: None,
|
from: None,
|
||||||
to: None,
|
to: None,
|
||||||
},
|
},
|
||||||
@@ -921,7 +920,7 @@ mod tests {
|
|||||||
doc_count: *doc_count,
|
doc_count: *doc_count,
|
||||||
from: None,
|
from: None,
|
||||||
to: None,
|
to: None,
|
||||||
sub_aggregation_res: get_sub_test_tree(&[(
|
sub_aggregation: get_sub_test_tree(&[(
|
||||||
sub_aggregation_key.to_string(),
|
sub_aggregation_key.to_string(),
|
||||||
*sub_aggregation_count,
|
*sub_aggregation_count,
|
||||||
)]),
|
)]),
|
||||||
|
|||||||
@@ -52,8 +52,10 @@ pub struct IntermediateAverage {
|
|||||||
|
|
||||||
impl IntermediateAverage {
|
impl IntermediateAverage {
|
||||||
/// Creates a new [`IntermediateAverage`] instance from a [`SegmentStatsCollector`].
|
/// Creates a new [`IntermediateAverage`] instance from a [`SegmentStatsCollector`].
|
||||||
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
|
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
|
||||||
Self { stats }
|
Self {
|
||||||
|
stats: collector.stats,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/// Merges the other intermediate result into self.
|
/// Merges the other intermediate result into self.
|
||||||
pub fn merge_fruits(&mut self, other: IntermediateAverage) {
|
pub fn merge_fruits(&mut self, other: IntermediateAverage) {
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use std::collections::hash_map::DefaultHasher;
|
|||||||
use std::hash::{BuildHasher, Hasher};
|
use std::hash::{BuildHasher, Hasher};
|
||||||
|
|
||||||
use columnar::column_values::CompactSpaceU64Accessor;
|
use columnar::column_values::CompactSpaceU64Accessor;
|
||||||
use columnar::{Column, ColumnType, Dictionary, StrColumn};
|
use columnar::{Column, ColumnBlockAccessor, ColumnType, Dictionary, StrColumn};
|
||||||
use common::f64_to_u64;
|
use common::f64_to_u64;
|
||||||
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
|
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
|
||||||
use rustc_hash::FxHashSet;
|
use rustc_hash::FxHashSet;
|
||||||
@@ -106,6 +106,8 @@ pub struct CardinalityAggReqData {
|
|||||||
pub str_dict_column: Option<StrColumn>,
|
pub str_dict_column: Option<StrColumn>,
|
||||||
/// The missing value normalized to the internal u64 representation of the field type.
|
/// The missing value normalized to the internal u64 representation of the field type.
|
||||||
pub missing_value_for_accessor: Option<u64>,
|
pub missing_value_for_accessor: Option<u64>,
|
||||||
|
/// The column block accessor to access the fast field values.
|
||||||
|
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
|
||||||
/// The name of the aggregation.
|
/// The name of the aggregation.
|
||||||
pub name: String,
|
pub name: String,
|
||||||
/// The aggregation request.
|
/// The aggregation request.
|
||||||
@@ -133,34 +135,45 @@ impl CardinalityAggregationReq {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
pub(crate) struct SegmentCardinalityCollector {
|
pub(crate) struct SegmentCardinalityCollector {
|
||||||
buckets: Vec<SegmentCardinalityCollectorBucket>,
|
|
||||||
accessor_idx: usize,
|
|
||||||
/// The column accessor to access the fast field values.
|
|
||||||
accessor: Column<u64>,
|
|
||||||
/// The column_type of the field.
|
|
||||||
column_type: ColumnType,
|
|
||||||
/// The missing value normalized to the internal u64 representation of the field type.
|
|
||||||
missing_value_for_accessor: Option<u64>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq, Default)]
|
|
||||||
pub(crate) struct SegmentCardinalityCollectorBucket {
|
|
||||||
cardinality: CardinalityCollector,
|
cardinality: CardinalityCollector,
|
||||||
entries: FxHashSet<u64>,
|
entries: FxHashSet<u64>,
|
||||||
|
accessor_idx: usize,
|
||||||
}
|
}
|
||||||
impl SegmentCardinalityCollectorBucket {
|
|
||||||
pub fn new(column_type: ColumnType) -> Self {
|
impl SegmentCardinalityCollector {
|
||||||
|
pub fn from_req(column_type: ColumnType, accessor_idx: usize) -> Self {
|
||||||
Self {
|
Self {
|
||||||
cardinality: CardinalityCollector::new(column_type as u8),
|
cardinality: CardinalityCollector::new(column_type as u8),
|
||||||
entries: FxHashSet::default(),
|
entries: Default::default(),
|
||||||
|
accessor_idx,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn fetch_block_with_field(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_data: &mut CardinalityAggReqData,
|
||||||
|
) {
|
||||||
|
if let Some(missing) = agg_data.missing_value_for_accessor {
|
||||||
|
agg_data.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&agg_data.accessor,
|
||||||
|
missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
agg_data
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &agg_data.accessor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn into_intermediate_metric_result(
|
fn into_intermediate_metric_result(
|
||||||
mut self,
|
mut self,
|
||||||
req_data: &CardinalityAggReqData,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
) -> crate::Result<IntermediateMetricResult> {
|
) -> crate::Result<IntermediateMetricResult> {
|
||||||
|
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
|
||||||
if req_data.column_type == ColumnType::Str {
|
if req_data.column_type == ColumnType::Str {
|
||||||
let fallback_dict = Dictionary::empty();
|
let fallback_dict = Dictionary::empty();
|
||||||
let dict = req_data
|
let dict = req_data
|
||||||
@@ -181,7 +194,6 @@ impl SegmentCardinalityCollectorBucket {
|
|||||||
term_ids.push(term_ord as u32);
|
term_ids.push(term_ord as u32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
term_ids.sort_unstable();
|
term_ids.sort_unstable();
|
||||||
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
|
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
|
||||||
self.cardinality.sketch.insert_any(&term);
|
self.cardinality.sketch.insert_any(&term);
|
||||||
@@ -215,49 +227,16 @@ impl SegmentCardinalityCollectorBucket {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentCardinalityCollector {
|
|
||||||
pub fn from_req(
|
|
||||||
column_type: ColumnType,
|
|
||||||
accessor_idx: usize,
|
|
||||||
accessor: Column<u64>,
|
|
||||||
missing_value_for_accessor: Option<u64>,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
|
||||||
buckets: vec![SegmentCardinalityCollectorBucket::new(column_type); 1],
|
|
||||||
column_type,
|
|
||||||
accessor_idx,
|
|
||||||
accessor,
|
|
||||||
missing_value_for_accessor,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fetch_block_with_field(
|
|
||||||
&mut self,
|
|
||||||
docs: &[crate::DocId],
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
|
||||||
) {
|
|
||||||
agg_data.column_block_accessor.fetch_block_with_missing(
|
|
||||||
docs,
|
|
||||||
&self.accessor,
|
|
||||||
self.missing_value_for_accessor,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
|
||||||
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
|
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
|
||||||
let name = req_data.name.to_string();
|
let name = req_data.name.to_string();
|
||||||
// take the bucket in buckets and replace it with a new empty one
|
|
||||||
let bucket = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
|
|
||||||
|
|
||||||
let intermediate_result = bucket.into_intermediate_metric_result(req_data)?;
|
let intermediate_result = self.into_intermediate_metric_result(agg_data)?;
|
||||||
results.push(
|
results.push(
|
||||||
name,
|
name,
|
||||||
IntermediateAggregationResult::Metric(intermediate_result),
|
IntermediateAggregationResult::Metric(intermediate_result),
|
||||||
@@ -268,20 +247,27 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
|||||||
|
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
docs: &[crate::DocId],
|
docs: &[crate::DocId],
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
self.fetch_block_with_field(docs, agg_data);
|
let req_data = agg_data.get_cardinality_req_data_mut(self.accessor_idx);
|
||||||
let bucket = &mut self.buckets[parent_bucket_id as usize];
|
self.fetch_block_with_field(docs, req_data);
|
||||||
|
|
||||||
let col_block_accessor = &agg_data.column_block_accessor;
|
let col_block_accessor = &req_data.column_block_accessor;
|
||||||
if self.column_type == ColumnType::Str {
|
if req_data.column_type == ColumnType::Str {
|
||||||
for term_ord in col_block_accessor.iter_vals() {
|
for term_ord in col_block_accessor.iter_vals() {
|
||||||
bucket.entries.insert(term_ord);
|
self.entries.insert(term_ord);
|
||||||
}
|
}
|
||||||
} else if self.column_type == ColumnType::IpAddr {
|
} else if req_data.column_type == ColumnType::IpAddr {
|
||||||
let compact_space_accessor = self
|
let compact_space_accessor = req_data
|
||||||
.accessor
|
.accessor
|
||||||
.values
|
.values
|
||||||
.clone()
|
.clone()
|
||||||
@@ -296,29 +282,16 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
|||||||
})?;
|
})?;
|
||||||
for val in col_block_accessor.iter_vals() {
|
for val in col_block_accessor.iter_vals() {
|
||||||
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||||
bucket.cardinality.sketch.insert_any(&val);
|
self.cardinality.sketch.insert_any(&val);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for val in col_block_accessor.iter_vals() {
|
for val in col_block_accessor.iter_vals() {
|
||||||
bucket.cardinality.sketch.insert_any(&val);
|
self.cardinality.sketch.insert_any(&val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
|
||||||
max_bucket: BucketId,
|
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
if max_bucket as usize >= self.buckets.len() {
|
|
||||||
self.buckets.resize_with(max_bucket as usize + 1, || {
|
|
||||||
SegmentCardinalityCollectorBucket::new(self.column_type)
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
|
|||||||
@@ -52,8 +52,10 @@ pub struct IntermediateCount {
|
|||||||
|
|
||||||
impl IntermediateCount {
|
impl IntermediateCount {
|
||||||
/// Creates a new [`IntermediateCount`] instance from a [`SegmentStatsCollector`].
|
/// Creates a new [`IntermediateCount`] instance from a [`SegmentStatsCollector`].
|
||||||
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
|
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
|
||||||
Self { stats }
|
Self {
|
||||||
|
stats: collector.stats,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/// Merges the other intermediate result into self.
|
/// Merges the other intermediate result into self.
|
||||||
pub fn merge_fruits(&mut self, other: IntermediateCount) {
|
pub fn merge_fruits(&mut self, other: IntermediateCount) {
|
||||||
|
|||||||
@@ -8,9 +8,10 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
|||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
||||||
};
|
};
|
||||||
|
use crate::aggregation::metric::MetricAggReqData;
|
||||||
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
use crate::aggregation::*;
|
use crate::aggregation::*;
|
||||||
use crate::TantivyError;
|
use crate::{DocId, TantivyError};
|
||||||
|
|
||||||
/// A multi-value metric aggregation that computes a collection of extended statistics
|
/// A multi-value metric aggregation that computes a collection of extended statistics
|
||||||
/// on numeric values that are extracted
|
/// on numeric values that are extracted
|
||||||
@@ -61,7 +62,7 @@ impl ExtendedStatsAggregation {
|
|||||||
|
|
||||||
/// Extended stats contains a collection of statistics
|
/// Extended stats contains a collection of statistics
|
||||||
/// they extends stats adding variance, standard deviation
|
/// they extends stats adding variance, standard deviation
|
||||||
/// and bound information
|
/// and bound informations
|
||||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct ExtendedStats {
|
pub struct ExtendedStats {
|
||||||
/// The number of documents.
|
/// The number of documents.
|
||||||
@@ -317,28 +318,51 @@ impl IntermediateExtendedStats {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
pub(crate) struct SegmentExtendedStatsCollector {
|
pub(crate) struct SegmentExtendedStatsCollector {
|
||||||
name: String,
|
|
||||||
missing: Option<u64>,
|
missing: Option<u64>,
|
||||||
field_type: ColumnType,
|
field_type: ColumnType,
|
||||||
accessor: columnar::Column<u64>,
|
pub(crate) extended_stats: IntermediateExtendedStats,
|
||||||
buckets: Vec<IntermediateExtendedStats>,
|
pub(crate) accessor_idx: usize,
|
||||||
sigma: Option<f64>,
|
val_cache: Vec<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentExtendedStatsCollector {
|
impl SegmentExtendedStatsCollector {
|
||||||
pub fn from_req(req: &MetricAggReqData, sigma: Option<f64>) -> Self {
|
pub fn from_req(
|
||||||
let missing = req
|
field_type: ColumnType,
|
||||||
.missing
|
sigma: Option<f64>,
|
||||||
.and_then(|val| f64_to_fastfield_u64(val, &req.field_type));
|
accessor_idx: usize,
|
||||||
|
missing: Option<f64>,
|
||||||
|
) -> Self {
|
||||||
|
let missing = missing.and_then(|val| f64_to_fastfield_u64(val, &field_type));
|
||||||
Self {
|
Self {
|
||||||
name: req.name.clone(),
|
field_type,
|
||||||
field_type: req.field_type,
|
extended_stats: IntermediateExtendedStats::with_sigma(sigma),
|
||||||
accessor: req.accessor.clone(),
|
accessor_idx,
|
||||||
missing,
|
missing,
|
||||||
buckets: vec![IntermediateExtendedStats::with_sigma(sigma); 16],
|
val_cache: Default::default(),
|
||||||
sigma,
|
}
|
||||||
|
}
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn collect_block_with_field(
|
||||||
|
&mut self,
|
||||||
|
docs: &[DocId],
|
||||||
|
req_data: &mut MetricAggReqData,
|
||||||
|
) {
|
||||||
|
if let Some(missing) = self.missing.as_ref() {
|
||||||
|
req_data.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&req_data.accessor,
|
||||||
|
*missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
req_data
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &req_data.accessor);
|
||||||
|
}
|
||||||
|
for val in req_data.column_block_accessor.iter_vals() {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &self.field_type);
|
||||||
|
self.extended_stats.collect(val1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -346,18 +370,15 @@ impl SegmentExtendedStatsCollector {
|
|||||||
impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
|
impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let name = self.name.clone();
|
let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone();
|
||||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
|
||||||
let extended_stats = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
|
|
||||||
results.push(
|
results.push(
|
||||||
name,
|
name,
|
||||||
IntermediateAggregationResult::Metric(IntermediateMetricResult::ExtendedStats(
|
IntermediateAggregationResult::Metric(IntermediateMetricResult::ExtendedStats(
|
||||||
extended_stats,
|
self.extended_stats,
|
||||||
)),
|
)),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
@@ -367,36 +388,39 @@ impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
docs: &[crate::DocId],
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let mut extended_stats = self.buckets[parent_bucket_id as usize].clone();
|
let req_data = agg_data.get_metric_req_data(self.accessor_idx);
|
||||||
|
if let Some(missing) = self.missing {
|
||||||
agg_data
|
let mut has_val = false;
|
||||||
.column_block_accessor
|
for val in req_data.accessor.values_for_doc(doc) {
|
||||||
.fetch_block_with_missing(docs, &self.accessor, self.missing);
|
let val1 = f64_from_fastfield_u64(val, &self.field_type);
|
||||||
for val in agg_data.column_block_accessor.iter_vals() {
|
self.extended_stats.collect(val1);
|
||||||
let val1 = f64_from_fastfield_u64(val, self.field_type);
|
has_val = true;
|
||||||
extended_stats.collect(val1);
|
}
|
||||||
|
if !has_val {
|
||||||
|
self.extended_stats
|
||||||
|
.collect(f64_from_fastfield_u64(missing, &self.field_type));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for val in req_data.accessor.values_for_doc(doc) {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &self.field_type);
|
||||||
|
self.extended_stats.collect(val1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// store back
|
|
||||||
self.buckets[parent_bucket_id as usize] = extended_stats;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
&mut self,
|
&mut self,
|
||||||
max_bucket: BucketId,
|
docs: &[crate::DocId],
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
if self.buckets.len() <= max_bucket as usize {
|
let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
|
||||||
self.buckets.resize_with(max_bucket as usize + 1, || {
|
self.collect_block_with_field(docs, req_data);
|
||||||
IntermediateExtendedStats::with_sigma(self.sigma)
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,8 +52,10 @@ pub struct IntermediateMax {
|
|||||||
|
|
||||||
impl IntermediateMax {
|
impl IntermediateMax {
|
||||||
/// Creates a new [`IntermediateMax`] instance from a [`SegmentStatsCollector`].
|
/// Creates a new [`IntermediateMax`] instance from a [`SegmentStatsCollector`].
|
||||||
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
|
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
|
||||||
Self { stats }
|
Self {
|
||||||
|
stats: collector.stats,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/// Merges the other intermediate result into self.
|
/// Merges the other intermediate result into self.
|
||||||
pub fn merge_fruits(&mut self, other: IntermediateMax) {
|
pub fn merge_fruits(&mut self, other: IntermediateMax) {
|
||||||
|
|||||||
@@ -52,8 +52,10 @@ pub struct IntermediateMin {
|
|||||||
|
|
||||||
impl IntermediateMin {
|
impl IntermediateMin {
|
||||||
/// Creates a new [`IntermediateMin`] instance from a [`SegmentStatsCollector`].
|
/// Creates a new [`IntermediateMin`] instance from a [`SegmentStatsCollector`].
|
||||||
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
|
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
|
||||||
Self { stats }
|
Self {
|
||||||
|
stats: collector.stats,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/// Merges the other intermediate result into self.
|
/// Merges the other intermediate result into self.
|
||||||
pub fn merge_fruits(&mut self, other: IntermediateMin) {
|
pub fn merge_fruits(&mut self, other: IntermediateMin) {
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ use std::collections::HashMap;
|
|||||||
|
|
||||||
pub use average::*;
|
pub use average::*;
|
||||||
pub use cardinality::*;
|
pub use cardinality::*;
|
||||||
use columnar::{Column, ColumnType};
|
use columnar::{Column, ColumnBlockAccessor, ColumnType};
|
||||||
pub use count::*;
|
pub use count::*;
|
||||||
pub use extended_stats::*;
|
pub use extended_stats::*;
|
||||||
pub use max::*;
|
pub use max::*;
|
||||||
@@ -55,6 +55,8 @@ pub struct MetricAggReqData {
|
|||||||
pub field_type: ColumnType,
|
pub field_type: ColumnType,
|
||||||
/// The missing value normalized to the internal u64 representation of the field type.
|
/// The missing value normalized to the internal u64 representation of the field type.
|
||||||
pub missing_u64: Option<u64>,
|
pub missing_u64: Option<u64>,
|
||||||
|
/// The column block accessor to access the fast field values.
|
||||||
|
pub column_block_accessor: ColumnBlockAccessor<u64>,
|
||||||
/// The column accessor to access the fast field values.
|
/// The column accessor to access the fast field values.
|
||||||
pub accessor: Column<u64>,
|
pub accessor: Column<u64>,
|
||||||
/// Used when converting to intermediate result
|
/// Used when converting to intermediate result
|
||||||
|
|||||||
@@ -7,9 +7,10 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
|||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
||||||
};
|
};
|
||||||
|
use crate::aggregation::metric::MetricAggReqData;
|
||||||
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
use crate::aggregation::*;
|
use crate::aggregation::*;
|
||||||
use crate::TantivyError;
|
use crate::{DocId, TantivyError};
|
||||||
|
|
||||||
/// # Percentiles
|
/// # Percentiles
|
||||||
///
|
///
|
||||||
@@ -130,16 +131,10 @@ impl PercentilesAggregationReq {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
pub(crate) struct SegmentPercentilesCollector {
|
pub(crate) struct SegmentPercentilesCollector {
|
||||||
pub(crate) buckets: Vec<PercentilesCollector>,
|
pub(crate) percentiles: PercentilesCollector,
|
||||||
pub(crate) accessor_idx: usize,
|
pub(crate) accessor_idx: usize,
|
||||||
/// The type of the field.
|
|
||||||
pub field_type: ColumnType,
|
|
||||||
/// The missing value normalized to the internal u64 representation of the field type.
|
|
||||||
pub missing_u64: Option<u64>,
|
|
||||||
/// The column accessor to access the fast field values.
|
|
||||||
pub accessor: Column<u64>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Serialize, Deserialize)]
|
#[derive(Clone, Serialize, Deserialize)]
|
||||||
@@ -234,18 +229,33 @@ impl PercentilesCollector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentPercentilesCollector {
|
impl SegmentPercentilesCollector {
|
||||||
pub fn from_req_and_validate(
|
pub fn from_req_and_validate(accessor_idx: usize) -> crate::Result<Self> {
|
||||||
field_type: ColumnType,
|
Ok(Self {
|
||||||
missing_u64: Option<u64>,
|
percentiles: PercentilesCollector::new(),
|
||||||
accessor: Column<u64>,
|
|
||||||
accessor_idx: usize,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
|
||||||
buckets: Vec::with_capacity(64),
|
|
||||||
field_type,
|
|
||||||
missing_u64,
|
|
||||||
accessor,
|
|
||||||
accessor_idx,
|
accessor_idx,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
#[inline]
|
||||||
|
pub(crate) fn collect_block_with_field(
|
||||||
|
&mut self,
|
||||||
|
docs: &[DocId],
|
||||||
|
req_data: &mut MetricAggReqData,
|
||||||
|
) {
|
||||||
|
if let Some(missing) = req_data.missing_u64.as_ref() {
|
||||||
|
req_data.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&req_data.accessor,
|
||||||
|
*missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
req_data
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &req_data.accessor);
|
||||||
|
}
|
||||||
|
|
||||||
|
for val in req_data.column_block_accessor.iter_vals() {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
|
||||||
|
self.percentiles.collect(val1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -253,18 +263,12 @@ impl SegmentPercentilesCollector {
|
|||||||
impl SegmentAggregationCollector for SegmentPercentilesCollector {
|
impl SegmentAggregationCollector for SegmentPercentilesCollector {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone();
|
let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone();
|
||||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
let intermediate_metric_result = IntermediateMetricResult::Percentiles(self.percentiles);
|
||||||
// Swap collector with an empty one to avoid cloning
|
|
||||||
let percentiles_collector = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
|
|
||||||
|
|
||||||
let intermediate_metric_result =
|
|
||||||
IntermediateMetricResult::Percentiles(percentiles_collector);
|
|
||||||
|
|
||||||
results.push(
|
results.push(
|
||||||
name,
|
name,
|
||||||
@@ -277,33 +281,40 @@ impl SegmentAggregationCollector for SegmentPercentilesCollector {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
docs: &[crate::DocId],
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let percentiles = &mut self.buckets[parent_bucket_id as usize];
|
let req_data = agg_data.get_metric_req_data(self.accessor_idx);
|
||||||
agg_data.column_block_accessor.fetch_block_with_missing(
|
|
||||||
docs,
|
|
||||||
&self.accessor,
|
|
||||||
self.missing_u64,
|
|
||||||
);
|
|
||||||
|
|
||||||
for val in agg_data.column_block_accessor.iter_vals() {
|
if let Some(missing) = req_data.missing_u64 {
|
||||||
let val1 = f64_from_fastfield_u64(val, self.field_type);
|
let mut has_val = false;
|
||||||
percentiles.collect(val1);
|
for val in req_data.accessor.values_for_doc(doc) {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
|
||||||
|
self.percentiles.collect(val1);
|
||||||
|
has_val = true;
|
||||||
|
}
|
||||||
|
if !has_val {
|
||||||
|
self.percentiles
|
||||||
|
.collect(f64_from_fastfield_u64(missing, &req_data.field_type));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for val in req_data.accessor.values_for_doc(doc) {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
|
||||||
|
self.percentiles.collect(val1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
&mut self,
|
&mut self,
|
||||||
max_bucket: BucketId,
|
docs: &[crate::DocId],
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
while self.buckets.len() <= max_bucket as usize {
|
let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
|
||||||
self.buckets.push(PercentilesCollector::new());
|
self.collect_block_with_field(docs, req_data);
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
|
||||||
use columnar::{Column, ColumnType};
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -8,9 +7,10 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
|||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
||||||
};
|
};
|
||||||
|
use crate::aggregation::metric::MetricAggReqData;
|
||||||
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
use crate::aggregation::*;
|
use crate::aggregation::*;
|
||||||
use crate::TantivyError;
|
use crate::{DocId, TantivyError};
|
||||||
|
|
||||||
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that
|
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that
|
||||||
/// are extracted from the aggregated documents.
|
/// are extracted from the aggregated documents.
|
||||||
@@ -83,7 +83,7 @@ impl Stats {
|
|||||||
|
|
||||||
/// Intermediate result of the stats aggregation that can be combined with other intermediate
|
/// Intermediate result of the stats aggregation that can be combined with other intermediate
|
||||||
/// results.
|
/// results.
|
||||||
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct IntermediateStats {
|
pub struct IntermediateStats {
|
||||||
/// The number of extracted values.
|
/// The number of extracted values.
|
||||||
pub(crate) count: u64,
|
pub(crate) count: u64,
|
||||||
@@ -187,75 +187,75 @@ pub enum StatsType {
|
|||||||
Percentiles,
|
Percentiles,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn create_collector<const TYPE_ID: u8>(
|
#[derive(Clone, Debug)]
|
||||||
req: &MetricAggReqData,
|
pub(crate) struct SegmentStatsCollector {
|
||||||
) -> Box<dyn SegmentAggregationCollector> {
|
pub(crate) stats: IntermediateStats,
|
||||||
Box::new(SegmentStatsCollector::<TYPE_ID> {
|
pub(crate) accessor_idx: usize,
|
||||||
name: req.name.clone(),
|
|
||||||
collecting_for: req.collecting_for,
|
|
||||||
is_number_or_date_type: req.is_number_or_date_type,
|
|
||||||
missing_u64: req.missing_u64,
|
|
||||||
accessor: req.accessor.clone(),
|
|
||||||
buckets: vec![IntermediateStats::default()],
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build a concrete `SegmentStatsCollector` depending on the column type.
|
impl SegmentStatsCollector {
|
||||||
pub(crate) fn build_segment_stats_collector(
|
pub fn from_req(accessor_idx: usize) -> Self {
|
||||||
req: &MetricAggReqData,
|
Self {
|
||||||
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
|
stats: IntermediateStats::default(),
|
||||||
match req.field_type {
|
accessor_idx,
|
||||||
ColumnType::I64 => Ok(create_collector::<{ ColumnType::I64 as u8 }>(req)),
|
}
|
||||||
ColumnType::U64 => Ok(create_collector::<{ ColumnType::U64 as u8 }>(req)),
|
}
|
||||||
ColumnType::F64 => Ok(create_collector::<{ ColumnType::F64 as u8 }>(req)),
|
#[inline]
|
||||||
ColumnType::Bool => Ok(create_collector::<{ ColumnType::Bool as u8 }>(req)),
|
pub(crate) fn collect_block_with_field(
|
||||||
ColumnType::DateTime => Ok(create_collector::<{ ColumnType::DateTime as u8 }>(req)),
|
&mut self,
|
||||||
ColumnType::Bytes => Ok(create_collector::<{ ColumnType::Bytes as u8 }>(req)),
|
docs: &[DocId],
|
||||||
ColumnType::Str => Ok(create_collector::<{ ColumnType::Str as u8 }>(req)),
|
req_data: &mut MetricAggReqData,
|
||||||
ColumnType::IpAddr => Ok(create_collector::<{ ColumnType::IpAddr as u8 }>(req)),
|
) {
|
||||||
|
if let Some(missing) = req_data.missing_u64.as_ref() {
|
||||||
|
req_data.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&req_data.accessor,
|
||||||
|
*missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
req_data
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &req_data.accessor);
|
||||||
|
}
|
||||||
|
if req_data.is_number_or_date_type {
|
||||||
|
for val in req_data.column_block_accessor.iter_vals() {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
|
||||||
|
self.stats.collect(val1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for _val in req_data.column_block_accessor.iter_vals() {
|
||||||
|
// we ignore the value and simply record that we got something
|
||||||
|
self.stats.collect(0.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[repr(C)]
|
impl SegmentAggregationCollector for SegmentStatsCollector {
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub(crate) struct SegmentStatsCollector<const COLUMN_TYPE_ID: u8> {
|
|
||||||
pub(crate) missing_u64: Option<u64>,
|
|
||||||
pub(crate) accessor: Column<u64>,
|
|
||||||
pub(crate) is_number_or_date_type: bool,
|
|
||||||
pub(crate) buckets: Vec<IntermediateStats>,
|
|
||||||
pub(crate) name: String,
|
|
||||||
pub(crate) collecting_for: StatsType,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<const COLUMN_TYPE_ID: u8> SegmentAggregationCollector
|
|
||||||
for SegmentStatsCollector<COLUMN_TYPE_ID>
|
|
||||||
{
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let name = self.name.clone();
|
let req = agg_data.get_metric_req_data(self.accessor_idx);
|
||||||
|
let name = req.name.clone();
|
||||||
|
|
||||||
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
|
let intermediate_metric_result = match req.collecting_for {
|
||||||
let stats = self.buckets[parent_bucket_id as usize];
|
|
||||||
let intermediate_metric_result = match self.collecting_for {
|
|
||||||
StatsType::Average => {
|
StatsType::Average => {
|
||||||
IntermediateMetricResult::Average(IntermediateAverage::from_stats(stats))
|
IntermediateMetricResult::Average(IntermediateAverage::from_collector(*self))
|
||||||
}
|
}
|
||||||
StatsType::Count => {
|
StatsType::Count => {
|
||||||
IntermediateMetricResult::Count(IntermediateCount::from_stats(stats))
|
IntermediateMetricResult::Count(IntermediateCount::from_collector(*self))
|
||||||
}
|
}
|
||||||
StatsType::Max => IntermediateMetricResult::Max(IntermediateMax::from_stats(stats)),
|
StatsType::Max => IntermediateMetricResult::Max(IntermediateMax::from_collector(*self)),
|
||||||
StatsType::Min => IntermediateMetricResult::Min(IntermediateMin::from_stats(stats)),
|
StatsType::Min => IntermediateMetricResult::Min(IntermediateMin::from_collector(*self)),
|
||||||
StatsType::Stats => IntermediateMetricResult::Stats(stats),
|
StatsType::Stats => IntermediateMetricResult::Stats(self.stats),
|
||||||
StatsType::Sum => IntermediateMetricResult::Sum(IntermediateSum::from_stats(stats)),
|
StatsType::Sum => IntermediateMetricResult::Sum(IntermediateSum::from_collector(*self)),
|
||||||
_ => {
|
_ => {
|
||||||
return Err(TantivyError::InvalidArgument(format!(
|
return Err(TantivyError::InvalidArgument(format!(
|
||||||
"Unsupported stats type for stats aggregation: {:?}",
|
"Unsupported stats type for stats aggregation: {:?}",
|
||||||
self.collecting_for
|
req.collecting_for
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -271,67 +271,41 @@ impl<const COLUMN_TYPE_ID: u8> SegmentAggregationCollector
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let req_data = agg_data.get_metric_req_data(self.accessor_idx);
|
||||||
|
if let Some(missing) = req_data.missing_u64 {
|
||||||
|
let mut has_val = false;
|
||||||
|
for val in req_data.accessor.values_for_doc(doc) {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
|
||||||
|
self.stats.collect(val1);
|
||||||
|
has_val = true;
|
||||||
|
}
|
||||||
|
if !has_val {
|
||||||
|
self.stats
|
||||||
|
.collect(f64_from_fastfield_u64(missing, &req_data.field_type));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for val in req_data.accessor.values_for_doc(doc) {
|
||||||
|
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
|
||||||
|
self.stats.collect(val1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
docs: &[crate::DocId],
|
docs: &[crate::DocId],
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
// TODO: remove once we fetch all values for all bucket ids in one go
|
let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
|
||||||
if docs.len() == 1 && self.missing_u64.is_none() {
|
self.collect_block_with_field(docs, req_data);
|
||||||
collect_stats::<COLUMN_TYPE_ID>(
|
|
||||||
&mut self.buckets[parent_bucket_id as usize],
|
|
||||||
self.accessor.values_for_doc(docs[0]),
|
|
||||||
self.is_number_or_date_type,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
agg_data.column_block_accessor.fetch_block_with_missing(
|
|
||||||
docs,
|
|
||||||
&self.accessor,
|
|
||||||
self.missing_u64,
|
|
||||||
);
|
|
||||||
collect_stats::<COLUMN_TYPE_ID>(
|
|
||||||
&mut self.buckets[parent_bucket_id as usize],
|
|
||||||
agg_data.column_block_accessor.iter_vals(),
|
|
||||||
self.is_number_or_date_type,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
|
||||||
max_bucket: BucketId,
|
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
let required_buckets = (max_bucket as usize) + 1;
|
|
||||||
if self.buckets.len() < required_buckets {
|
|
||||||
self.buckets
|
|
||||||
.resize_with(required_buckets, IntermediateStats::default);
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
fn collect_stats<const COLUMN_TYPE_ID: u8>(
|
|
||||||
stats: &mut IntermediateStats,
|
|
||||||
vals: impl Iterator<Item = u64>,
|
|
||||||
is_number_or_date_type: bool,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
if is_number_or_date_type {
|
|
||||||
for val in vals {
|
|
||||||
let val1 = convert_to_f64::<COLUMN_TYPE_ID>(val);
|
|
||||||
stats.collect(val1);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for _val in vals {
|
|
||||||
// we ignore the value and simply record that we got something
|
|
||||||
stats.collect(0.0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -52,8 +52,10 @@ pub struct IntermediateSum {
|
|||||||
|
|
||||||
impl IntermediateSum {
|
impl IntermediateSum {
|
||||||
/// Creates a new [`IntermediateSum`] instance from a [`SegmentStatsCollector`].
|
/// Creates a new [`IntermediateSum`] instance from a [`SegmentStatsCollector`].
|
||||||
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
|
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
|
||||||
Self { stats }
|
Self {
|
||||||
|
stats: collector.stats,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/// Merges the other intermediate result into self.
|
/// Merges the other intermediate result into self.
|
||||||
pub fn merge_fruits(&mut self, other: IntermediateSum) {
|
pub fn merge_fruits(&mut self, other: IntermediateSum) {
|
||||||
|
|||||||
@@ -15,11 +15,11 @@ use crate::aggregation::intermediate_agg_result::{
|
|||||||
IntermediateAggregationResult, IntermediateMetricResult,
|
IntermediateAggregationResult, IntermediateMetricResult,
|
||||||
};
|
};
|
||||||
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
use crate::aggregation::{AggregationError, BucketId};
|
use crate::aggregation::AggregationError;
|
||||||
use crate::collector::sort_key::ReverseComparator;
|
|
||||||
use crate::collector::TopNComputer;
|
use crate::collector::TopNComputer;
|
||||||
use crate::schema::OwnedValue;
|
use crate::schema::OwnedValue;
|
||||||
use crate::{DocAddress, DocId, SegmentOrdinal};
|
use crate::{DocAddress, DocId, SegmentOrdinal};
|
||||||
|
// duplicate import removed; already imported above
|
||||||
|
|
||||||
/// Contains all information required by the TopHitsSegmentCollector to perform the
|
/// Contains all information required by the TopHitsSegmentCollector to perform the
|
||||||
/// top_hits aggregation on a segment.
|
/// top_hits aggregation on a segment.
|
||||||
@@ -458,7 +458,7 @@ impl Eq for DocSortValuesAndFields {}
|
|||||||
#[derive(Clone, Serialize, Deserialize, Debug)]
|
#[derive(Clone, Serialize, Deserialize, Debug)]
|
||||||
pub struct TopHitsTopNComputer {
|
pub struct TopHitsTopNComputer {
|
||||||
req: TopHitsAggregationReq,
|
req: TopHitsAggregationReq,
|
||||||
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, ReverseComparator>,
|
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::cmp::PartialEq for TopHitsTopNComputer {
|
impl std::cmp::PartialEq for TopHitsTopNComputer {
|
||||||
@@ -471,10 +471,7 @@ impl TopHitsTopNComputer {
|
|||||||
/// Create a new TopHitsCollector
|
/// Create a new TopHitsCollector
|
||||||
pub fn new(req: &TopHitsAggregationReq) -> Self {
|
pub fn new(req: &TopHitsAggregationReq) -> Self {
|
||||||
Self {
|
Self {
|
||||||
top_n: TopNComputer::new_with_comparator(
|
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
||||||
req.size + req.from.unwrap_or(0),
|
|
||||||
ReverseComparator,
|
|
||||||
),
|
|
||||||
req: req.clone(),
|
req: req.clone(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -485,7 +482,7 @@ impl TopHitsTopNComputer {
|
|||||||
|
|
||||||
pub(crate) fn merge_fruits(&mut self, other_fruit: Self) -> crate::Result<()> {
|
pub(crate) fn merge_fruits(&mut self, other_fruit: Self) -> crate::Result<()> {
|
||||||
for doc in other_fruit.top_n.into_vec() {
|
for doc in other_fruit.top_n.into_vec() {
|
||||||
self.collect(doc.sort_key, doc.doc);
|
self.collect(doc.feature, doc.doc);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -497,9 +494,9 @@ impl TopHitsTopNComputer {
|
|||||||
.into_sorted_vec()
|
.into_sorted_vec()
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|doc| TopHitsVecEntry {
|
.map(|doc| TopHitsVecEntry {
|
||||||
sort: doc.sort_key.sorts.iter().map(|f| f.value).collect(),
|
sort: doc.feature.sorts.iter().map(|f| f.value).collect(),
|
||||||
doc_value_fields: doc
|
doc_value_fields: doc
|
||||||
.sort_key
|
.feature
|
||||||
.doc_value_fields
|
.doc_value_fields
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(k, v)| (k, v.into()))
|
.map(|(k, v)| (k, v.into()))
|
||||||
@@ -520,8 +517,7 @@ impl TopHitsTopNComputer {
|
|||||||
pub(crate) struct TopHitsSegmentCollector {
|
pub(crate) struct TopHitsSegmentCollector {
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
buckets: Vec<TopNComputer<Vec<DocValueAndOrder>, DocAddress, ReverseComparator>>,
|
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
|
||||||
num_hits: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TopHitsSegmentCollector {
|
impl TopHitsSegmentCollector {
|
||||||
@@ -530,35 +526,25 @@ impl TopHitsSegmentCollector {
|
|||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let num_hits = req.size + req.from.unwrap_or(0);
|
|
||||||
Self {
|
Self {
|
||||||
num_hits,
|
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
||||||
segment_ordinal,
|
segment_ordinal,
|
||||||
accessor_idx,
|
accessor_idx,
|
||||||
buckets: vec![TopNComputer::new_with_comparator(num_hits, ReverseComparator); 1],
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn get_top_hits_computer(
|
fn into_top_hits_collector(
|
||||||
&mut self,
|
self,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
|
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
|
||||||
req: &TopHitsAggregationReq,
|
req: &TopHitsAggregationReq,
|
||||||
) -> TopHitsTopNComputer {
|
) -> TopHitsTopNComputer {
|
||||||
if parent_bucket_id as usize >= self.buckets.len() {
|
|
||||||
return TopHitsTopNComputer::new(req);
|
|
||||||
}
|
|
||||||
let top_n = std::mem::replace(
|
|
||||||
&mut self.buckets[parent_bucket_id as usize],
|
|
||||||
TopNComputer::new(0),
|
|
||||||
);
|
|
||||||
let mut top_hits_computer = TopHitsTopNComputer::new(req);
|
let mut top_hits_computer = TopHitsTopNComputer::new(req);
|
||||||
let top_results = top_n.into_vec();
|
let top_results = self.top_n.into_vec();
|
||||||
|
|
||||||
for res in top_results {
|
for res in top_results {
|
||||||
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
|
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
|
||||||
top_hits_computer.collect(
|
top_hits_computer.collect(
|
||||||
DocSortValuesAndFields {
|
DocSortValuesAndFields {
|
||||||
sorts: res.sort_key,
|
sorts: res.feature,
|
||||||
doc_value_fields,
|
doc_value_fields,
|
||||||
},
|
},
|
||||||
res.doc,
|
res.doc,
|
||||||
@@ -567,24 +553,54 @@ impl TopHitsSegmentCollector {
|
|||||||
|
|
||||||
top_hits_computer
|
top_hits_computer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// TODO add a specialized variant for a single sort field
|
||||||
|
fn collect_with(
|
||||||
|
&mut self,
|
||||||
|
doc_id: crate::DocId,
|
||||||
|
req: &TopHitsAggregationReq,
|
||||||
|
accessors: &[(Column<u64>, ColumnType)],
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let sorts: Vec<DocValueAndOrder> = req
|
||||||
|
.sort
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(idx, KeyOrder { order, .. })| {
|
||||||
|
let order = *order;
|
||||||
|
let value = accessors
|
||||||
|
.get(idx)
|
||||||
|
.expect("could not find field in accessors")
|
||||||
|
.0
|
||||||
|
.values_for_doc(doc_id)
|
||||||
|
.next();
|
||||||
|
DocValueAndOrder { value, order }
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
self.top_n.push(
|
||||||
|
sorts,
|
||||||
|
DocAddress {
|
||||||
|
segment_ord: self.segment_ordinal,
|
||||||
|
doc_id,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentAggregationCollector for TopHitsSegmentCollector {
|
impl SegmentAggregationCollector for TopHitsSegmentCollector {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
|
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
|
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
|
||||||
|
|
||||||
let value_accessors = &req_data.value_accessors;
|
let value_accessors = &req_data.value_accessors;
|
||||||
|
|
||||||
let intermediate_result = IntermediateMetricResult::TopHits(self.get_top_hits_computer(
|
let intermediate_result = IntermediateMetricResult::TopHits(
|
||||||
parent_bucket_id,
|
self.into_top_hits_collector(value_accessors, &req_data.req),
|
||||||
value_accessors,
|
);
|
||||||
&req_data.req,
|
|
||||||
));
|
|
||||||
results.push(
|
results.push(
|
||||||
req_data.name.to_string(),
|
req_data.name.to_string(),
|
||||||
IntermediateAggregationResult::Metric(intermediate_result),
|
IntermediateAggregationResult::Metric(intermediate_result),
|
||||||
@@ -594,54 +610,24 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
|
|||||||
/// TODO: Consider a caching layer to reduce the call overhead
|
/// TODO: Consider a caching layer to reduce the call overhead
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc_id: crate::DocId,
|
||||||
docs: &[crate::DocId],
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let top_n = &mut self.buckets[parent_bucket_id as usize];
|
|
||||||
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
|
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
|
||||||
let req = &req_data.req;
|
self.collect_with(doc_id, &req_data.req, &req_data.accessors)?;
|
||||||
let accessors = &req_data.accessors;
|
|
||||||
for &doc_id in docs {
|
|
||||||
// TODO: this is terrible, a new vec is allocated for every doc
|
|
||||||
// We can fetch blocks instead
|
|
||||||
// We don't need to store the order for every value
|
|
||||||
let sorts: Vec<DocValueAndOrder> = req
|
|
||||||
.sort
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(idx, KeyOrder { order, .. })| {
|
|
||||||
let order = *order;
|
|
||||||
let value = accessors
|
|
||||||
.get(idx)
|
|
||||||
.expect("could not find field in accessors")
|
|
||||||
.0
|
|
||||||
.values_for_doc(doc_id)
|
|
||||||
.next();
|
|
||||||
DocValueAndOrder { value, order }
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
top_n.push(
|
|
||||||
sorts,
|
|
||||||
DocAddress {
|
|
||||||
segment_ord: self.segment_ordinal,
|
|
||||||
doc_id,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
fn collect_block(
|
||||||
&mut self,
|
&mut self,
|
||||||
max_bucket: BucketId,
|
docs: &[crate::DocId],
|
||||||
_agg_data: &AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
self.buckets.resize(
|
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
|
||||||
(max_bucket as usize) + 1,
|
// TODO: Consider getting fields with the column block accessor.
|
||||||
TopNComputer::new_with_comparator(self.num_hits, ReverseComparator),
|
for doc in docs {
|
||||||
);
|
self.collect_with(*doc, &req_data.req, &req_data.accessors)?;
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -659,7 +645,6 @@ mod tests {
|
|||||||
use crate::aggregation::bucket::tests::get_test_index_from_docs;
|
use crate::aggregation::bucket::tests::get_test_index_from_docs;
|
||||||
use crate::aggregation::tests::get_test_index_from_values;
|
use crate::aggregation::tests::get_test_index_from_values;
|
||||||
use crate::aggregation::AggregationCollector;
|
use crate::aggregation::AggregationCollector;
|
||||||
use crate::collector::sort_key::ReverseComparator;
|
|
||||||
use crate::collector::ComparableDoc;
|
use crate::collector::ComparableDoc;
|
||||||
use crate::query::AllQuery;
|
use crate::query::AllQuery;
|
||||||
use crate::schema::OwnedValue;
|
use crate::schema::OwnedValue;
|
||||||
@@ -675,7 +660,7 @@ mod tests {
|
|||||||
|
|
||||||
fn collector_with_capacity(capacity: usize) -> super::TopHitsTopNComputer {
|
fn collector_with_capacity(capacity: usize) -> super::TopHitsTopNComputer {
|
||||||
super::TopHitsTopNComputer {
|
super::TopHitsTopNComputer {
|
||||||
top_n: super::TopNComputer::new_with_comparator(capacity, ReverseComparator),
|
top_n: super::TopNComputer::new(capacity),
|
||||||
req: Default::default(),
|
req: Default::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -759,7 +744,7 @@ mod tests {
|
|||||||
],
|
],
|
||||||
"from": 0,
|
"from": 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}))
|
}))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
@@ -789,12 +774,12 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_top_hits_collector_single_feature() -> crate::Result<()> {
|
fn test_top_hits_collector_single_feature() -> crate::Result<()> {
|
||||||
let docs = vec![
|
let docs = vec![
|
||||||
ComparableDoc::<_, _> {
|
ComparableDoc::<_, _, false> {
|
||||||
doc: crate::DocAddress {
|
doc: crate::DocAddress {
|
||||||
segment_ord: 0,
|
segment_ord: 0,
|
||||||
doc_id: 0,
|
doc_id: 0,
|
||||||
},
|
},
|
||||||
sort_key: DocSortValuesAndFields {
|
feature: DocSortValuesAndFields {
|
||||||
sorts: vec![DocValueAndOrder {
|
sorts: vec![DocValueAndOrder {
|
||||||
value: Some(1),
|
value: Some(1),
|
||||||
order: Order::Asc,
|
order: Order::Asc,
|
||||||
@@ -807,7 +792,7 @@ mod tests {
|
|||||||
segment_ord: 0,
|
segment_ord: 0,
|
||||||
doc_id: 2,
|
doc_id: 2,
|
||||||
},
|
},
|
||||||
sort_key: DocSortValuesAndFields {
|
feature: DocSortValuesAndFields {
|
||||||
sorts: vec![DocValueAndOrder {
|
sorts: vec![DocValueAndOrder {
|
||||||
value: Some(3),
|
value: Some(3),
|
||||||
order: Order::Asc,
|
order: Order::Asc,
|
||||||
@@ -820,7 +805,7 @@ mod tests {
|
|||||||
segment_ord: 0,
|
segment_ord: 0,
|
||||||
doc_id: 1,
|
doc_id: 1,
|
||||||
},
|
},
|
||||||
sort_key: DocSortValuesAndFields {
|
feature: DocSortValuesAndFields {
|
||||||
sorts: vec![DocValueAndOrder {
|
sorts: vec![DocValueAndOrder {
|
||||||
value: Some(5),
|
value: Some(5),
|
||||||
order: Order::Asc,
|
order: Order::Asc,
|
||||||
@@ -832,7 +817,7 @@ mod tests {
|
|||||||
|
|
||||||
let mut collector = collector_with_capacity(3);
|
let mut collector = collector_with_capacity(3);
|
||||||
for doc in docs.clone() {
|
for doc in docs.clone() {
|
||||||
collector.collect(doc.sort_key, doc.doc);
|
collector.collect(doc.feature, doc.doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
let res = collector.into_final_result();
|
let res = collector.into_final_result();
|
||||||
@@ -842,15 +827,15 @@ mod tests {
|
|||||||
super::TopHitsMetricResult {
|
super::TopHitsMetricResult {
|
||||||
hits: vec![
|
hits: vec![
|
||||||
super::TopHitsVecEntry {
|
super::TopHitsVecEntry {
|
||||||
sort: vec![docs[0].sort_key.sorts[0].value],
|
sort: vec![docs[0].feature.sorts[0].value],
|
||||||
doc_value_fields: Default::default(),
|
doc_value_fields: Default::default(),
|
||||||
},
|
},
|
||||||
super::TopHitsVecEntry {
|
super::TopHitsVecEntry {
|
||||||
sort: vec![docs[1].sort_key.sorts[0].value],
|
sort: vec![docs[1].feature.sorts[0].value],
|
||||||
doc_value_fields: Default::default(),
|
doc_value_fields: Default::default(),
|
||||||
},
|
},
|
||||||
super::TopHitsVecEntry {
|
super::TopHitsVecEntry {
|
||||||
sort: vec![docs[2].sort_key.sorts[0].value],
|
sort: vec![docs[2].feature.sorts[0].value],
|
||||||
doc_value_fields: Default::default(),
|
doc_value_fields: Default::default(),
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
@@ -888,7 +873,7 @@ mod tests {
|
|||||||
"mixed.*",
|
"mixed.*",
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}))?;
|
}))?;
|
||||||
|
|
||||||
let collector = AggregationCollector::from_aggs(d, Default::default());
|
let collector = AggregationCollector::from_aggs(d, Default::default());
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ mod agg_limits;
|
|||||||
pub mod agg_req;
|
pub mod agg_req;
|
||||||
pub mod agg_result;
|
pub mod agg_result;
|
||||||
pub mod bucket;
|
pub mod bucket;
|
||||||
pub(crate) mod cached_sub_aggs;
|
mod buf_collector;
|
||||||
mod collector;
|
mod collector;
|
||||||
mod date;
|
mod date;
|
||||||
mod error;
|
mod error;
|
||||||
@@ -162,19 +162,6 @@ use serde::{Deserialize, Deserializer, Serialize};
|
|||||||
|
|
||||||
use crate::tokenizer::TokenizerManager;
|
use crate::tokenizer::TokenizerManager;
|
||||||
|
|
||||||
/// A bucket id is a dense identifier for a bucket within an aggregation.
|
|
||||||
/// It is used to index into a Vec that hold per-bucket data.
|
|
||||||
///
|
|
||||||
/// For example, in a terms aggregation, each unique term will be assigned a incremental BucketId.
|
|
||||||
/// This BucketId will be forwarded to sub-aggregations to identify the parent bucket.
|
|
||||||
///
|
|
||||||
/// This allows to have a single AggregationCollector instance per aggregation,
|
|
||||||
/// that can handle multiple buckets efficiently.
|
|
||||||
///
|
|
||||||
/// The API to call sub-aggregations is therefore a &[(BucketId, &[DocId])].
|
|
||||||
/// For that we'll need a buffer. One Vec per bucket aggregation is needed.
|
|
||||||
pub type BucketId = u32;
|
|
||||||
|
|
||||||
/// Context parameters for aggregation execution
|
/// Context parameters for aggregation execution
|
||||||
///
|
///
|
||||||
/// This struct holds shared resources needed during aggregation execution:
|
/// This struct holds shared resources needed during aggregation execution:
|
||||||
@@ -348,37 +335,19 @@ impl Display for Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn convert_to_f64<const COLUMN_TYPE_ID: u8>(val: u64) -> f64 {
|
|
||||||
if COLUMN_TYPE_ID == ColumnType::U64 as u8 {
|
|
||||||
val as f64
|
|
||||||
} else if COLUMN_TYPE_ID == ColumnType::I64 as u8
|
|
||||||
|| COLUMN_TYPE_ID == ColumnType::DateTime as u8
|
|
||||||
{
|
|
||||||
i64::from_u64(val) as f64
|
|
||||||
} else if COLUMN_TYPE_ID == ColumnType::F64 as u8 {
|
|
||||||
f64::from_u64(val)
|
|
||||||
} else if COLUMN_TYPE_ID == ColumnType::Bool as u8 {
|
|
||||||
val as f64
|
|
||||||
} else {
|
|
||||||
panic!(
|
|
||||||
"ColumnType ID {} cannot be converted to f64 metric",
|
|
||||||
COLUMN_TYPE_ID
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Inverse of `to_fastfield_u64`. Used to convert to `f64` for metrics.
|
/// Inverse of `to_fastfield_u64`. Used to convert to `f64` for metrics.
|
||||||
///
|
///
|
||||||
/// # Panics
|
/// # Panics
|
||||||
/// Only `u64`, `f64`, `date`, and `i64` are supported.
|
/// Only `u64`, `f64`, `date`, and `i64` are supported.
|
||||||
pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: ColumnType) -> f64 {
|
pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &ColumnType) -> f64 {
|
||||||
match field_type {
|
match field_type {
|
||||||
ColumnType::U64 => convert_to_f64::<{ ColumnType::U64 as u8 }>(val),
|
ColumnType::U64 => val as f64,
|
||||||
ColumnType::I64 => convert_to_f64::<{ ColumnType::I64 as u8 }>(val),
|
ColumnType::I64 | ColumnType::DateTime => i64::from_u64(val) as f64,
|
||||||
ColumnType::F64 => convert_to_f64::<{ ColumnType::F64 as u8 }>(val),
|
ColumnType::F64 => f64::from_u64(val),
|
||||||
ColumnType::Bool => convert_to_f64::<{ ColumnType::Bool as u8 }>(val),
|
ColumnType::Bool => val as f64,
|
||||||
ColumnType::DateTime => convert_to_f64::<{ ColumnType::DateTime as u8 }>(val),
|
_ => {
|
||||||
_ => panic!("unexpected type {field_type:?}. This should not happen"),
|
panic!("unexpected type {field_type:?}. This should not happen")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,67 +8,28 @@ use std::fmt::Debug;
|
|||||||
pub(crate) use super::agg_limits::AggregationLimitsGuard;
|
pub(crate) use super::agg_limits::AggregationLimitsGuard;
|
||||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
use crate::aggregation::agg_data::AggregationsSegmentCtx;
|
||||||
use crate::aggregation::BucketId;
|
|
||||||
|
|
||||||
/// Monotonically increasing provider of BucketIds.
|
|
||||||
#[derive(Debug, Clone, Default)]
|
|
||||||
pub struct BucketIdProvider(u32);
|
|
||||||
impl BucketIdProvider {
|
|
||||||
/// Get the next BucketId.
|
|
||||||
pub fn next_bucket_id(&mut self) -> BucketId {
|
|
||||||
let bucket_id = self.0;
|
|
||||||
self.0 += 1;
|
|
||||||
bucket_id
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A SegmentAggregationCollector is used to collect aggregation results.
|
/// A SegmentAggregationCollector is used to collect aggregation results.
|
||||||
pub trait SegmentAggregationCollector: Debug {
|
pub trait SegmentAggregationCollector: CollectorClone + Debug {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()>;
|
) -> crate::Result<()>;
|
||||||
|
|
||||||
/// Note: The caller needs to call `prepare_max_bucket` before calling `collect`.
|
#[inline]
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
docs: &[crate::DocId],
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()>;
|
|
||||||
|
|
||||||
/// Collect docs for multiple buckets in one call.
|
|
||||||
/// Minimizes dynamic dispatch overhead when collecting many buckets.
|
|
||||||
///
|
|
||||||
/// Note: The caller needs to call `prepare_max_bucket` before calling `collect`.
|
|
||||||
fn collect_multiple(
|
|
||||||
&mut self,
|
|
||||||
bucket_ids: &[BucketId],
|
|
||||||
docs: &[crate::DocId],
|
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
debug_assert_eq!(bucket_ids.len(), docs.len());
|
self.collect_block(&[doc], agg_data)
|
||||||
let mut start = 0;
|
|
||||||
while start < bucket_ids.len() {
|
|
||||||
let bucket_id = bucket_ids[start];
|
|
||||||
let mut end = start + 1;
|
|
||||||
while end < bucket_ids.len() && bucket_ids[end] == bucket_id {
|
|
||||||
end += 1;
|
|
||||||
}
|
|
||||||
self.collect(bucket_id, &docs[start..end], agg_data)?;
|
|
||||||
start = end;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Prepare the collector for collecting up to BucketId `max_bucket`.
|
fn collect_block(
|
||||||
/// This is useful so we can split allocation ahead of time of collecting.
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
&mut self,
|
||||||
max_bucket: BucketId,
|
docs: &[crate::DocId],
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()>;
|
) -> crate::Result<()>;
|
||||||
|
|
||||||
/// Finalize method. Some Aggregator collect blocks of docs before calling `collect_block`.
|
/// Finalize method. Some Aggregator collect blocks of docs before calling `collect_block`.
|
||||||
@@ -78,7 +39,26 @@ pub trait SegmentAggregationCollector: Debug {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
/// A helper trait to enable cloning of Box<dyn SegmentAggregationCollector>
|
||||||
|
pub trait CollectorClone {
|
||||||
|
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> CollectorClone for T
|
||||||
|
where T: 'static + SegmentAggregationCollector + Clone
|
||||||
|
{
|
||||||
|
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector> {
|
||||||
|
Box::new(self.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Clone for Box<dyn SegmentAggregationCollector> {
|
||||||
|
fn clone(&self) -> Box<dyn SegmentAggregationCollector> {
|
||||||
|
self.clone_box()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Default)]
|
||||||
/// The GenericSegmentAggregationResultsCollector is the generic version of the collector, which
|
/// The GenericSegmentAggregationResultsCollector is the generic version of the collector, which
|
||||||
/// can handle arbitrary complexity of sub-aggregations. Ideally we never have to pick this one
|
/// can handle arbitrary complexity of sub-aggregations. Ideally we never have to pick this one
|
||||||
/// and can provide specialized versions instead, that remove some of its overhead.
|
/// and can provide specialized versions instead, that remove some of its overhead.
|
||||||
@@ -96,13 +76,12 @@ impl Debug for GenericSegmentAggregationResultsCollector {
|
|||||||
|
|
||||||
impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
|
impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
&mut self,
|
self: Box<Self>,
|
||||||
agg_data: &AggregationsSegmentCtx,
|
agg_data: &AggregationsSegmentCtx,
|
||||||
results: &mut IntermediateAggregationResults,
|
results: &mut IntermediateAggregationResults,
|
||||||
parent_bucket_id: BucketId,
|
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
for agg in &mut self.aggs {
|
for agg in self.aggs {
|
||||||
agg.add_intermediate_aggregation_result(agg_data, results, parent_bucket_id)?;
|
agg.add_intermediate_aggregation_result(agg_data, results)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -110,13 +89,23 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
|
|||||||
|
|
||||||
fn collect(
|
fn collect(
|
||||||
&mut self,
|
&mut self,
|
||||||
parent_bucket_id: BucketId,
|
doc: crate::DocId,
|
||||||
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_data)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
docs: &[crate::DocId],
|
docs: &[crate::DocId],
|
||||||
agg_data: &mut AggregationsSegmentCtx,
|
agg_data: &mut AggregationsSegmentCtx,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
for collector in &mut self.aggs {
|
for collector in &mut self.aggs {
|
||||||
collector.collect(parent_bucket_id, docs, agg_data)?;
|
collector.collect_block(docs, agg_data)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -126,15 +115,4 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
|
|||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_max_bucket(
|
|
||||||
&mut self,
|
|
||||||
max_bucket: BucketId,
|
|
||||||
agg_data: &AggregationsSegmentCtx,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
for collector in &mut self.aggs {
|
|
||||||
collector.prepare_max_bucket(max_bucket, agg_data)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
121
src/collector/custom_score_top_collector.rs
Normal file
121
src/collector/custom_score_top_collector.rs
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
|
||||||
|
use crate::collector::{Collector, SegmentCollector};
|
||||||
|
use crate::{DocAddress, DocId, Score, SegmentReader};
|
||||||
|
|
||||||
|
pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> {
|
||||||
|
custom_scorer: TCustomScorer,
|
||||||
|
collector: TopCollector<TScore>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
|
||||||
|
where TScore: Clone + PartialOrd
|
||||||
|
{
|
||||||
|
pub(crate) fn new(
|
||||||
|
custom_scorer: TCustomScorer,
|
||||||
|
collector: TopCollector<TScore>,
|
||||||
|
) -> CustomScoreTopCollector<TCustomScorer, TScore> {
|
||||||
|
CustomScoreTopCollector {
|
||||||
|
custom_scorer,
|
||||||
|
collector,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A custom segment scorer makes it possible to define any kind of score
|
||||||
|
/// for a given document belonging to a specific segment.
|
||||||
|
///
|
||||||
|
/// It is the segment local version of the [`CustomScorer`].
|
||||||
|
pub trait CustomSegmentScorer<TScore>: 'static {
|
||||||
|
/// Computes the score of a specific `doc`.
|
||||||
|
fn score(&mut self, doc: DocId) -> TScore;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `CustomScorer` makes it possible to define any kind of score.
|
||||||
|
///
|
||||||
|
/// The `CustomerScorer` itself does not make much of the computation itself.
|
||||||
|
/// Instead, it helps constructing `Self::Child` instances that will compute
|
||||||
|
/// the score at a segment scale.
|
||||||
|
pub trait CustomScorer<TScore>: Sync {
|
||||||
|
/// Type of the associated [`CustomSegmentScorer`].
|
||||||
|
type Child: CustomSegmentScorer<TScore>;
|
||||||
|
/// Builds a child scorer for a specific segment. The child scorer is associated with
|
||||||
|
/// a specific segment.
|
||||||
|
fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result<Self::Child>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
|
||||||
|
where
|
||||||
|
TCustomScorer: CustomScorer<TScore> + Send + Sync,
|
||||||
|
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||||
|
{
|
||||||
|
type Fruit = Vec<(TScore, DocAddress)>;
|
||||||
|
|
||||||
|
type Child = CustomScoreTopSegmentCollector<TCustomScorer::Child, TScore>;
|
||||||
|
|
||||||
|
fn for_segment(
|
||||||
|
&self,
|
||||||
|
segment_local_id: u32,
|
||||||
|
segment_reader: &SegmentReader,
|
||||||
|
) -> crate::Result<Self::Child> {
|
||||||
|
let segment_collector = self.collector.for_segment(segment_local_id, segment_reader);
|
||||||
|
let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?;
|
||||||
|
Ok(CustomScoreTopSegmentCollector {
|
||||||
|
segment_collector,
|
||||||
|
segment_scorer,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> crate::Result<Self::Fruit> {
|
||||||
|
self.collector.merge_fruits(segment_fruits)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct CustomScoreTopSegmentCollector<T, TScore>
|
||||||
|
where
|
||||||
|
TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
|
||||||
|
T: CustomSegmentScorer<TScore>,
|
||||||
|
{
|
||||||
|
segment_collector: TopSegmentCollector<TScore>,
|
||||||
|
segment_scorer: T,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T, TScore> SegmentCollector for CustomScoreTopSegmentCollector<T, TScore>
|
||||||
|
where
|
||||||
|
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||||
|
T: 'static + CustomSegmentScorer<TScore>,
|
||||||
|
{
|
||||||
|
type Fruit = Vec<(TScore, DocAddress)>;
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||||
|
let score = self.segment_scorer.score(doc);
|
||||||
|
self.segment_collector.collect(doc, score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest(self) -> Vec<(TScore, DocAddress)> {
|
||||||
|
self.segment_collector.harvest()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<F, TScore, T> CustomScorer<TScore> for F
|
||||||
|
where
|
||||||
|
F: 'static + Send + Sync + Fn(&SegmentReader) -> T,
|
||||||
|
T: CustomSegmentScorer<TScore>,
|
||||||
|
{
|
||||||
|
type Child = T;
|
||||||
|
|
||||||
|
fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result<Self::Child> {
|
||||||
|
Ok((self)(segment_reader))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<F, TScore> CustomSegmentScorer<TScore> for F
|
||||||
|
where F: 'static + FnMut(DocId) -> TScore
|
||||||
|
{
|
||||||
|
fn score(&mut self, doc: DocId) -> TScore {
|
||||||
|
(self)(doc)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -12,7 +12,6 @@ use std::marker::PhantomData;
|
|||||||
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
|
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
|
||||||
|
|
||||||
use crate::collector::{Collector, SegmentCollector};
|
use crate::collector::{Collector, SegmentCollector};
|
||||||
use crate::schema::Schema;
|
|
||||||
use crate::{DocId, Score, SegmentReader};
|
use crate::{DocId, Score, SegmentReader};
|
||||||
|
|
||||||
/// The `FilterCollector` filters docs using a fast field value and a predicate.
|
/// The `FilterCollector` filters docs using a fast field value and a predicate.
|
||||||
@@ -50,13 +49,13 @@ use crate::{DocId, Score, SegmentReader};
|
|||||||
///
|
///
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
/// let query = query_parser.parse_query("diary")?;
|
/// let query = query_parser.parse_query("diary")?;
|
||||||
/// let no_filter_collector = FilterCollector::new("price".to_string(), |value: u64| value > 20_120u64, TopDocs::with_limit(2).order_by_score());
|
/// let no_filter_collector = FilterCollector::new("price".to_string(), |value: u64| value > 20_120u64, TopDocs::with_limit(2));
|
||||||
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
|
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
|
||||||
///
|
///
|
||||||
/// assert_eq!(top_docs.len(), 1);
|
/// assert_eq!(top_docs.len(), 1);
|
||||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||||
///
|
///
|
||||||
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new("price".to_string(), |value| value < 5u64, TopDocs::with_limit(2).order_by_score());
|
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new("price".to_string(), |value| value < 5u64, TopDocs::with_limit(2));
|
||||||
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
|
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
|
||||||
///
|
///
|
||||||
/// assert_eq!(filtered_top_docs.len(), 0);
|
/// assert_eq!(filtered_top_docs.len(), 0);
|
||||||
@@ -105,11 +104,6 @@ where
|
|||||||
|
|
||||||
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
|
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.collector.check_schema(schema)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: u32,
|
segment_local_id: u32,
|
||||||
@@ -126,7 +120,6 @@ where
|
|||||||
segment_collector,
|
segment_collector,
|
||||||
predicate: self.predicate.clone(),
|
predicate: self.predicate.clone(),
|
||||||
t_predicate_value: PhantomData,
|
t_predicate_value: PhantomData,
|
||||||
filtered_docs: Vec::with_capacity(crate::COLLECT_BLOCK_BUFFER_LEN),
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -147,7 +140,6 @@ pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue
|
|||||||
segment_collector: TSegmentCollector,
|
segment_collector: TSegmentCollector,
|
||||||
predicate: TPredicate,
|
predicate: TPredicate,
|
||||||
t_predicate_value: PhantomData<TPredicateValue>,
|
t_predicate_value: PhantomData<TPredicateValue>,
|
||||||
filtered_docs: Vec<DocId>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TSegmentCollector, TPredicate, TPredicateValue>
|
impl<TSegmentCollector, TPredicate, TPredicateValue>
|
||||||
@@ -184,20 +176,6 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
|
||||||
self.filtered_docs.clear();
|
|
||||||
for &doc in docs {
|
|
||||||
// TODO: `accept_document` could be further optimized to do batch lookups of column
|
|
||||||
// values for single-valued columns.
|
|
||||||
if self.accept_document(doc) {
|
|
||||||
self.filtered_docs.push(doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !self.filtered_docs.is_empty() {
|
|
||||||
self.segment_collector.collect_block(&self.filtered_docs);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> TSegmentCollector::Fruit {
|
fn harvest(self) -> TSegmentCollector::Fruit {
|
||||||
self.segment_collector.harvest()
|
self.segment_collector.harvest()
|
||||||
}
|
}
|
||||||
@@ -240,7 +218,7 @@ where
|
|||||||
///
|
///
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
/// let query = query_parser.parse_query("diary")?;
|
/// let query = query_parser.parse_query("diary")?;
|
||||||
/// let filter_collector = BytesFilterCollector::new("barcode".to_string(), |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2).order_by_score());
|
/// let filter_collector = BytesFilterCollector::new("barcode".to_string(), |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
|
||||||
/// let top_docs = searcher.search(&query, &filter_collector)?;
|
/// let top_docs = searcher.search(&query, &filter_collector)?;
|
||||||
///
|
///
|
||||||
/// assert_eq!(top_docs.len(), 1);
|
/// assert_eq!(top_docs.len(), 1);
|
||||||
@@ -280,10 +258,6 @@ where
|
|||||||
|
|
||||||
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
|
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.collector.check_schema(schema)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: u32,
|
segment_local_id: u32,
|
||||||
@@ -300,7 +274,6 @@ where
|
|||||||
segment_collector,
|
segment_collector,
|
||||||
predicate: self.predicate.clone(),
|
predicate: self.predicate.clone(),
|
||||||
buffer: Vec::new(),
|
buffer: Vec::new(),
|
||||||
filtered_docs: Vec::with_capacity(crate::COLLECT_BLOCK_BUFFER_LEN),
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -323,7 +296,6 @@ where TPredicate: 'static
|
|||||||
segment_collector: TSegmentCollector,
|
segment_collector: TSegmentCollector,
|
||||||
predicate: TPredicate,
|
predicate: TPredicate,
|
||||||
buffer: Vec<u8>,
|
buffer: Vec<u8>,
|
||||||
filtered_docs: Vec<DocId>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
||||||
@@ -362,20 +334,6 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
|
||||||
self.filtered_docs.clear();
|
|
||||||
for &doc in docs {
|
|
||||||
// TODO: `accept_document` could be further optimized to do batch lookups of column
|
|
||||||
// values for single-valued columns.
|
|
||||||
if self.accept_document(doc) {
|
|
||||||
self.filtered_docs.push(doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !self.filtered_docs.is_empty() {
|
|
||||||
self.segment_collector.collect_block(&self.filtered_docs);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> TSegmentCollector::Fruit {
|
fn harvest(self) -> TSegmentCollector::Fruit {
|
||||||
self.segment_collector.harvest()
|
self.segment_collector.harvest()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -57,7 +57,7 @@
|
|||||||
//! # let query_parser = QueryParser::for_index(&index, vec![title]);
|
//! # let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
//! # let query = query_parser.parse_query("diary")?;
|
//! # let query = query_parser.parse_query("diary")?;
|
||||||
//! let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
//! let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||||
//! searcher.search(&query, &(Count, TopDocs::with_limit(2).order_by_score()))?;
|
//! searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
|
||||||
//! # Ok(())
|
//! # Ok(())
|
||||||
//! # }
|
//! # }
|
||||||
//! ```
|
//! ```
|
||||||
@@ -83,15 +83,11 @@
|
|||||||
|
|
||||||
use downcast_rs::impl_downcast;
|
use downcast_rs::impl_downcast;
|
||||||
|
|
||||||
use crate::schema::Schema;
|
|
||||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||||
|
|
||||||
mod count_collector;
|
mod count_collector;
|
||||||
pub use self::count_collector::Count;
|
pub use self::count_collector::Count;
|
||||||
|
|
||||||
/// Sort keys
|
|
||||||
pub mod sort_key;
|
|
||||||
|
|
||||||
mod histogram_collector;
|
mod histogram_collector;
|
||||||
pub use histogram_collector::HistogramCollector;
|
pub use histogram_collector::HistogramCollector;
|
||||||
|
|
||||||
@@ -99,13 +95,16 @@ mod multi_collector;
|
|||||||
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
|
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
|
||||||
|
|
||||||
mod top_collector;
|
mod top_collector;
|
||||||
pub use self::top_collector::ComparableDoc;
|
|
||||||
|
|
||||||
mod top_score_collector;
|
mod top_score_collector;
|
||||||
|
pub use self::top_collector::ComparableDoc;
|
||||||
pub use self::top_score_collector::{TopDocs, TopNComputer};
|
pub use self::top_score_collector::{TopDocs, TopNComputer};
|
||||||
|
|
||||||
mod sort_key_top_collector;
|
mod custom_score_top_collector;
|
||||||
pub use self::sort_key::{SegmentSortKeyComputer, SortKeyComputer};
|
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
|
||||||
|
|
||||||
|
mod tweak_score_top_collector;
|
||||||
|
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
|
||||||
mod facet_collector;
|
mod facet_collector;
|
||||||
pub use self::facet_collector::{FacetCollector, FacetCounts};
|
pub use self::facet_collector::{FacetCollector, FacetCounts};
|
||||||
use crate::query::Weight;
|
use crate::query::Weight;
|
||||||
@@ -146,11 +145,6 @@ pub trait Collector: Sync + Send {
|
|||||||
/// Type of the `SegmentCollector` associated with this collector.
|
/// Type of the `SegmentCollector` associated with this collector.
|
||||||
type Child: SegmentCollector;
|
type Child: SegmentCollector;
|
||||||
|
|
||||||
/// Returns an error if the schema is not compatible with the collector.
|
|
||||||
fn check_schema(&self, _schema: &Schema) -> crate::Result<()> {
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `set_segment` is called before beginning to enumerate
|
/// `set_segment` is called before beginning to enumerate
|
||||||
/// on this segment.
|
/// on this segment.
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
@@ -176,50 +170,41 @@ pub trait Collector: Sync + Send {
|
|||||||
segment_ord: u32,
|
segment_ord: u32,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
|
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
|
||||||
let with_scoring = self.requires_scoring();
|
|
||||||
let mut segment_collector = self.for_segment(segment_ord, reader)?;
|
let mut segment_collector = self.for_segment(segment_ord, reader)?;
|
||||||
default_collect_segment_impl(&mut segment_collector, weight, reader, with_scoring)?;
|
|
||||||
|
match (reader.alive_bitset(), self.requires_scoring()) {
|
||||||
|
(Some(alive_bitset), true) => {
|
||||||
|
weight.for_each(reader, &mut |doc, score| {
|
||||||
|
if alive_bitset.is_alive(doc) {
|
||||||
|
segment_collector.collect(doc, score);
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
(Some(alive_bitset), false) => {
|
||||||
|
weight.for_each_no_score(reader, &mut |docs| {
|
||||||
|
for doc in docs.iter().cloned() {
|
||||||
|
if alive_bitset.is_alive(doc) {
|
||||||
|
segment_collector.collect(doc, 0.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
(None, true) => {
|
||||||
|
weight.for_each(reader, &mut |doc, score| {
|
||||||
|
segment_collector.collect(doc, score);
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
(None, false) => {
|
||||||
|
weight.for_each_no_score(reader, &mut |docs| {
|
||||||
|
segment_collector.collect_block(docs);
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(segment_collector.harvest())
|
Ok(segment_collector.harvest())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn default_collect_segment_impl<TSegmentCollector: SegmentCollector>(
|
|
||||||
segment_collector: &mut TSegmentCollector,
|
|
||||||
weight: &dyn Weight,
|
|
||||||
reader: &SegmentReader,
|
|
||||||
with_scoring: bool,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
match (reader.alive_bitset(), with_scoring) {
|
|
||||||
(Some(alive_bitset), true) => {
|
|
||||||
weight.for_each(reader, &mut |doc, score| {
|
|
||||||
if alive_bitset.is_alive(doc) {
|
|
||||||
segment_collector.collect(doc, score);
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
(Some(alive_bitset), false) => {
|
|
||||||
weight.for_each_no_score(reader, &mut |docs| {
|
|
||||||
for doc in docs.iter().cloned() {
|
|
||||||
if alive_bitset.is_alive(doc) {
|
|
||||||
segment_collector.collect(doc, 0.0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
(None, true) => {
|
|
||||||
weight.for_each(reader, &mut |doc, score| {
|
|
||||||
segment_collector.collect(doc, score);
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
(None, false) => {
|
|
||||||
weight.for_each_no_score(reader, &mut |docs| {
|
|
||||||
segment_collector.collect_block(docs);
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<TSegmentCollector: SegmentCollector> SegmentCollector for Option<TSegmentCollector> {
|
impl<TSegmentCollector: SegmentCollector> SegmentCollector for Option<TSegmentCollector> {
|
||||||
type Fruit = Option<TSegmentCollector::Fruit>;
|
type Fruit = Option<TSegmentCollector::Fruit>;
|
||||||
|
|
||||||
@@ -229,12 +214,6 @@ impl<TSegmentCollector: SegmentCollector> SegmentCollector for Option<TSegmentCo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
|
||||||
if let Some(segment_collector) = self {
|
|
||||||
segment_collector.collect_block(docs);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> Self::Fruit {
|
fn harvest(self) -> Self::Fruit {
|
||||||
self.map(|segment_collector| segment_collector.harvest())
|
self.map(|segment_collector| segment_collector.harvest())
|
||||||
}
|
}
|
||||||
@@ -245,13 +224,6 @@ impl<TCollector: Collector> Collector for Option<TCollector> {
|
|||||||
|
|
||||||
type Child = Option<<TCollector as Collector>::Child>;
|
type Child = Option<<TCollector as Collector>::Child>;
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
if let Some(underlying_collector) = self {
|
|
||||||
underlying_collector.check_schema(schema)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: SegmentOrdinal,
|
segment_local_id: SegmentOrdinal,
|
||||||
@@ -327,12 +299,6 @@ where
|
|||||||
type Fruit = (Left::Fruit, Right::Fruit);
|
type Fruit = (Left::Fruit, Right::Fruit);
|
||||||
type Child = (Left::Child, Right::Child);
|
type Child = (Left::Child, Right::Child);
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)?;
|
|
||||||
self.1.check_schema(schema)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: u32,
|
segment_local_id: u32,
|
||||||
@@ -376,11 +342,6 @@ where
|
|||||||
self.1.collect(doc, score);
|
self.1.collect(doc, score);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
|
||||||
self.0.collect_block(docs);
|
|
||||||
self.1.collect_block(docs);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||||
(self.0.harvest(), self.1.harvest())
|
(self.0.harvest(), self.1.harvest())
|
||||||
}
|
}
|
||||||
@@ -397,13 +358,6 @@ where
|
|||||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
||||||
type Child = (One::Child, Two::Child, Three::Child);
|
type Child = (One::Child, Two::Child, Three::Child);
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)?;
|
|
||||||
self.1.check_schema(schema)?;
|
|
||||||
self.2.check_schema(schema)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: u32,
|
segment_local_id: u32,
|
||||||
@@ -453,12 +407,6 @@ where
|
|||||||
self.2.collect(doc, score);
|
self.2.collect(doc, score);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
|
||||||
self.0.collect_block(docs);
|
|
||||||
self.1.collect_block(docs);
|
|
||||||
self.2.collect_block(docs);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||||
(self.0.harvest(), self.1.harvest(), self.2.harvest())
|
(self.0.harvest(), self.1.harvest(), self.2.harvest())
|
||||||
}
|
}
|
||||||
@@ -476,14 +424,6 @@ where
|
|||||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
||||||
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
|
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)?;
|
|
||||||
self.1.check_schema(schema)?;
|
|
||||||
self.2.check_schema(schema)?;
|
|
||||||
self.3.check_schema(schema)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: u32,
|
segment_local_id: u32,
|
||||||
@@ -542,13 +482,6 @@ where
|
|||||||
self.3.collect(doc, score);
|
self.3.collect(doc, score);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
|
||||||
self.0.collect_block(docs);
|
|
||||||
self.1.collect_block(docs);
|
|
||||||
self.2.collect_block(docs);
|
|
||||||
self.3.collect_block(docs);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||||
(
|
(
|
||||||
self.0.harvest(),
|
self.0.harvest(),
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use std::ops::Deref;
|
|||||||
|
|
||||||
use super::{Collector, SegmentCollector};
|
use super::{Collector, SegmentCollector};
|
||||||
use crate::collector::Fruit;
|
use crate::collector::Fruit;
|
||||||
use crate::schema::Schema;
|
|
||||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||||
|
|
||||||
/// MultiFruit keeps Fruits from every nested Collector
|
/// MultiFruit keeps Fruits from every nested Collector
|
||||||
@@ -17,10 +16,6 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
|||||||
type Fruit = Box<dyn Fruit>;
|
type Fruit = Box<dyn Fruit>;
|
||||||
type Child = Box<dyn BoxableSegmentCollector>;
|
type Child = Box<dyn BoxableSegmentCollector>;
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: u32,
|
segment_local_id: u32,
|
||||||
@@ -152,7 +147,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
|||||||
/// let searcher = reader.searcher();
|
/// let searcher = reader.searcher();
|
||||||
///
|
///
|
||||||
/// let mut collectors = MultiCollector::new();
|
/// let mut collectors = MultiCollector::new();
|
||||||
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2).order_by_score());
|
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
|
||||||
/// let count_handle = collectors.add_collector(Count);
|
/// let count_handle = collectors.add_collector(Count);
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
/// let query = query_parser.parse_query("diary").unwrap();
|
/// let query = query_parser.parse_query("diary").unwrap();
|
||||||
@@ -199,13 +194,6 @@ impl Collector for MultiCollector<'_> {
|
|||||||
type Fruit = MultiFruit;
|
type Fruit = MultiFruit;
|
||||||
type Child = MultiCollectorChild;
|
type Child = MultiCollectorChild;
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
for collector in &self.collector_wrappers {
|
|
||||||
collector.check_schema(schema)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
segment_local_id: SegmentOrdinal,
|
segment_local_id: SegmentOrdinal,
|
||||||
@@ -262,12 +250,6 @@ impl SegmentCollector for MultiCollectorChild {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_block(&mut self, docs: &[DocId]) {
|
|
||||||
for child in &mut self.children {
|
|
||||||
child.collect_block(docs);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> MultiFruit {
|
fn harvest(self) -> MultiFruit {
|
||||||
MultiFruit {
|
MultiFruit {
|
||||||
sub_fruits: self
|
sub_fruits: self
|
||||||
@@ -311,7 +293,7 @@ mod tests {
|
|||||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||||
|
|
||||||
let mut collectors = MultiCollector::new();
|
let mut collectors = MultiCollector::new();
|
||||||
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2).order_by_score());
|
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
|
||||||
let count_handler = collectors.add_collector(Count);
|
let count_handler = collectors.add_collector(Count);
|
||||||
let mut multifruits = searcher.search(&query, &collectors).unwrap();
|
let mut multifruits = searcher.search(&query, &collectors).unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -1,454 +0,0 @@
|
|||||||
mod order;
|
|
||||||
mod sort_by_erased_type;
|
|
||||||
mod sort_by_score;
|
|
||||||
mod sort_by_static_fast_value;
|
|
||||||
mod sort_by_string;
|
|
||||||
mod sort_key_computer;
|
|
||||||
|
|
||||||
pub use order::*;
|
|
||||||
pub use sort_by_erased_type::SortByErasedType;
|
|
||||||
pub use sort_by_score::SortBySimilarityScore;
|
|
||||||
pub use sort_by_static_fast_value::SortByStaticFastValue;
|
|
||||||
pub use sort_by_string::SortByString;
|
|
||||||
pub use sort_key_computer::{SegmentSortKeyComputer, SortKeyComputer};
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
pub(crate) mod tests {
|
|
||||||
|
|
||||||
// By spec, regardless of whether ascending or descending order was requested, in presence of a
|
|
||||||
// tie, we sort by ascending doc id/doc address.
|
|
||||||
pub(crate) fn sort_hits<TSortKey: Ord, D: Ord>(
|
|
||||||
hits: &mut [ComparableDoc<TSortKey, D>],
|
|
||||||
order: Order,
|
|
||||||
) {
|
|
||||||
if order.is_asc() {
|
|
||||||
hits.sort_by(|l, r| l.sort_key.cmp(&r.sort_key).then(l.doc.cmp(&r.doc)));
|
|
||||||
} else {
|
|
||||||
hits.sort_by(|l, r| {
|
|
||||||
l.sort_key
|
|
||||||
.cmp(&r.sort_key)
|
|
||||||
.reverse() // This is descending
|
|
||||||
.then(l.doc.cmp(&r.doc))
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::ops::Range;
|
|
||||||
|
|
||||||
use crate::collector::sort_key::{
|
|
||||||
SortByErasedType, SortBySimilarityScore, SortByStaticFastValue, SortByString,
|
|
||||||
};
|
|
||||||
use crate::collector::{ComparableDoc, DocSetCollector, TopDocs};
|
|
||||||
use crate::indexer::NoMergePolicy;
|
|
||||||
use crate::query::{AllQuery, QueryParser};
|
|
||||||
use crate::schema::{OwnedValue, Schema, FAST, TEXT};
|
|
||||||
use crate::{DocAddress, Document, Index, Order, Score, Searcher};
|
|
||||||
|
|
||||||
fn make_index() -> crate::Result<Index> {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let id = schema_builder.add_u64_field("id", FAST);
|
|
||||||
let city = schema_builder.add_text_field("city", TEXT | FAST);
|
|
||||||
let catchphrase = schema_builder.add_text_field("catchphrase", TEXT);
|
|
||||||
let altitude = schema_builder.add_f64_field("altitude", FAST);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
|
|
||||||
fn create_segment(index: &Index, docs: Vec<impl Document>) -> crate::Result<()> {
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
|
||||||
for doc in docs {
|
|
||||||
index_writer.add_document(doc)?;
|
|
||||||
}
|
|
||||||
index_writer.commit()?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
create_segment(
|
|
||||||
&index,
|
|
||||||
vec![
|
|
||||||
doc!(
|
|
||||||
id => 0_u64,
|
|
||||||
city => "austin",
|
|
||||||
catchphrase => "Hills, Barbeque, Glow",
|
|
||||||
altitude => 149.0,
|
|
||||||
),
|
|
||||||
doc!(
|
|
||||||
id => 1_u64,
|
|
||||||
city => "greenville",
|
|
||||||
catchphrase => "Grow, Glow, Glow",
|
|
||||||
altitude => 27.0,
|
|
||||||
),
|
|
||||||
],
|
|
||||||
)?;
|
|
||||||
create_segment(
|
|
||||||
&index,
|
|
||||||
vec![doc!(
|
|
||||||
id => 2_u64,
|
|
||||||
city => "tokyo",
|
|
||||||
catchphrase => "Glow, Glow, Glow",
|
|
||||||
altitude => 40.0,
|
|
||||||
)],
|
|
||||||
)?;
|
|
||||||
create_segment(
|
|
||||||
&index,
|
|
||||||
vec![doc!(
|
|
||||||
id => 3_u64,
|
|
||||||
catchphrase => "No, No, No",
|
|
||||||
altitude => 0.0,
|
|
||||||
)],
|
|
||||||
)?;
|
|
||||||
Ok(index)
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOTE: You cannot determine the SegmentIds that will be generated for Segments
|
|
||||||
// ahead of time, so DocAddresses must be mapped back to a unique id for each Searcher.
|
|
||||||
fn id_mapping(searcher: &Searcher) -> HashMap<DocAddress, u64> {
|
|
||||||
searcher
|
|
||||||
.search(&AllQuery, &DocSetCollector)
|
|
||||||
.unwrap()
|
|
||||||
.into_iter()
|
|
||||||
.map(|doc_address| {
|
|
||||||
let column = searcher.segment_readers()[doc_address.segment_ord as usize]
|
|
||||||
.fast_fields()
|
|
||||||
.u64("id")
|
|
||||||
.unwrap();
|
|
||||||
(doc_address, column.first(doc_address.doc_id).unwrap())
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
fn test_order_by_string() -> crate::Result<()> {
    let index = make_index()?;

    // Shorthand for the sort key of a document that has a `city` value.
    fn city(name: &str) -> Option<String> {
        Some(name.to_owned())
    }

    // Sorts all documents by `city` in the given `order`, keeps the hits selected by
    // `doc_range`, and checks the resulting (sort key, id) pairs against `expected`.
    #[track_caller]
    fn assert_query(
        index: &Index,
        order: Order,
        doc_range: Range<usize>,
        expected: Vec<(Option<String>, u64)>,
    ) -> crate::Result<()> {
        let searcher = index.reader()?.searcher();
        let ids = id_mapping(&searcher);

        let collector =
            TopDocs::for_doc_range(doc_range).order_by((SortByString::for_field("city"), order));
        let hits = searcher.search(&AllQuery, &collector)?;
        let actual: Vec<_> = hits
            .into_iter()
            .map(|(sort_key, doc_address)| (sort_key, ids[&doc_address]))
            .collect();
        assert_eq!(actual, expected);
        Ok(())
    }

    // Ascending: documents without a city come last.
    assert_query(
        &index,
        Order::Asc,
        0..4,
        vec![
            (city("austin"), 0),
            (city("greenville"), 1),
            (city("tokyo"), 2),
            (None, 3),
        ],
    )?;

    assert_query(
        &index,
        Order::Asc,
        0..3,
        vec![
            (city("austin"), 0),
            (city("greenville"), 1),
            (city("tokyo"), 2),
        ],
    )?;

    assert_query(
        &index,
        Order::Asc,
        0..2,
        vec![(city("austin"), 0), (city("greenville"), 1)],
    )?;

    assert_query(&index, Order::Asc, 0..1, vec![(city("austin"), 0)])?;

    assert_query(
        &index,
        Order::Asc,
        1..3,
        vec![(city("greenville"), 1), (city("tokyo"), 2)],
    )?;

    // Descending: documents without a city still come last.
    assert_query(
        &index,
        Order::Desc,
        0..4,
        vec![
            (city("tokyo"), 2),
            (city("greenville"), 1),
            (city("austin"), 0),
            (None, 3),
        ],
    )?;

    assert_query(
        &index,
        Order::Desc,
        1..3,
        vec![(city("greenville"), 1), (city("austin"), 0)],
    )?;

    assert_query(&index, Order::Desc, 0..1, vec![(city("tokyo"), 2)])?;

    Ok(())
}
|
|
||||||
|
|
||||||
#[test]
fn test_order_by_f64() -> crate::Result<()> {
    let index = make_index()?;

    // Collects the top 3 documents sorted by the `altitude` fast field in the given `order`
    // and checks the resulting (altitude, id) pairs against `expected`.
    //
    // `#[track_caller]` makes a failed assertion point at the calling `assert_query(...)`
    // line, as the equivalent helper in `test_order_by_string` already does.
    #[track_caller]
    fn assert_query(
        index: &Index,
        order: Order,
        expected: Vec<(Option<f64>, u64)>,
    ) -> crate::Result<()> {
        let searcher = index.reader()?.searcher();
        let ids = id_mapping(&searcher);

        // Try as primitive.
        let top_collector = TopDocs::with_limit(3)
            .order_by((SortByStaticFastValue::<f64>::for_field("altitude"), order));
        let actual = searcher
            .search(&AllQuery, &top_collector)?
            .into_iter()
            .map(|(altitude_opt, doc)| (altitude_opt, ids[&doc]))
            .collect::<Vec<_>>();
        assert_eq!(actual, expected);

        Ok(())
    }

    assert_query(
        &index,
        Order::Asc,
        vec![(Some(0.0), 3), (Some(27.0), 1), (Some(40.0), 2)],
    )?;

    assert_query(
        &index,
        Order::Desc,
        vec![(Some(149.0), 0), (Some(40.0), 2), (Some(27.0), 1)],
    )?;

    Ok(())
}
|
|
||||||
|
|
||||||
#[test]
fn test_order_by_score() -> crate::Result<()> {
    let index = make_index()?;

    // Runs the text query `glow` against the `catchphrase` field and returns up to 4
    // (score, id) pairs sorted by similarity score in the given `order`.
    fn query(index: &Index, order: Order) -> crate::Result<Vec<(Score, u64)>> {
        let searcher = index.reader()?.searcher();
        let ids = id_mapping(&searcher);

        let collector = TopDocs::with_limit(4).order_by((SortBySimilarityScore, order));
        let catchphrase = index.schema().get_field("catchphrase").unwrap();
        let parser = QueryParser::for_index(index, vec![catchphrase]);
        let text_query = parser.parse_query("glow")?;

        let hits = searcher.search(&text_query, &collector)?;
        let mut scored_ids = Vec::new();
        for (score, doc_address) in hits {
            scored_ids.push((score, ids[&doc_address]));
        }
        Ok(scored_ids)
    }

    let descending = query(&index, Order::Desc)?;
    assert_eq!(
        &descending,
        &[(0.5604893, 2), (0.4904281, 1), (0.35667497, 0),]
    );

    let ascending = query(&index, Order::Asc)?;
    assert_eq!(
        &ascending,
        &[(0.35667497, 0), (0.4904281, 1), (0.5604893, 2),]
    );

    Ok(())
}
|
|
||||||
|
|
||||||
#[test]
fn test_order_by_score_then_string() -> crate::Result<()> {
    let index = make_index()?;

    type SortKey = (Score, Option<String>);

    // Shorthand for the expected sort key of a document: the score asserted below is always
    // 1.0, followed by the optional city.
    fn key(city: Option<&str>) -> SortKey {
        (1.0, city.map(str::to_owned))
    }

    // Sorts all documents by score first, then by `city`, and returns up to 4
    // (sort key, id) pairs.
    fn query(
        index: &Index,
        score_order: Order,
        city_order: Order,
    ) -> crate::Result<Vec<(SortKey, u64)>> {
        let searcher = index.reader()?.searcher();
        let ids = id_mapping(&searcher);

        let by_score = (SortBySimilarityScore, score_order);
        let by_city = (SortByString::for_field("city"), city_order);
        let collector = TopDocs::with_limit(4).order_by((by_score, by_city));

        let hits: Vec<(SortKey, DocAddress)> = searcher.search(&AllQuery, &collector)?;
        let mut keyed_ids = Vec::new();
        for (sort_key, doc_address) in hits {
            keyed_ids.push((sort_key, ids[&doc_address]));
        }
        Ok(keyed_ids)
    }

    assert_eq!(
        &query(&index, Order::Asc, Order::Asc)?,
        &[
            (key(Some("austin")), 0),
            (key(Some("greenville")), 1),
            (key(Some("tokyo")), 2),
            (key(None), 3),
        ]
    );

    assert_eq!(
        &query(&index, Order::Asc, Order::Desc)?,
        &[
            (key(Some("tokyo")), 2),
            (key(Some("greenville")), 1),
            (key(Some("austin")), 0),
            (key(None), 3),
        ]
    );
    Ok(())
}
|
|
||||||
|
|
||||||
#[test]
fn test_order_by_score_then_owned_value() -> crate::Result<()> {
    let index = make_index()?;

    type SortKey = (Score, OwnedValue);

    // Shorthand for the expected sort key of a document that has a `city` value; the score
    // asserted below is always 1.0.
    fn with_city(city: &str) -> SortKey {
        (1.0, OwnedValue::Str(city.to_owned()))
    }

    // Sorts all documents by score first, then by the type-erased `city` value, and returns
    // up to 4 (sort key, id) pairs.
    fn query(
        index: &Index,
        score_order: Order,
        city_order: Order,
    ) -> crate::Result<Vec<(SortKey, u64)>> {
        let searcher = index.reader()?.searcher();
        let ids = id_mapping(&searcher);

        let by_score = (SortBySimilarityScore, score_order);
        let by_city = (SortByErasedType::for_field("city"), city_order);
        let collector = TopDocs::with_limit(4).order_by::<(Score, OwnedValue)>((by_score, by_city));

        let hits: Vec<(SortKey, DocAddress)> = searcher.search(&AllQuery, &collector)?;
        let mut keyed_ids = Vec::new();
        for (sort_key, doc_address) in hits {
            keyed_ids.push((sort_key, ids[&doc_address]));
        }
        Ok(keyed_ids)
    }

    assert_eq!(
        &query(&index, Order::Asc, Order::Asc)?,
        &[
            (with_city("austin"), 0),
            (with_city("greenville"), 1),
            (with_city("tokyo"), 2),
            ((1.0, OwnedValue::Null), 3),
        ]
    );

    assert_eq!(
        &query(&index, Order::Asc, Order::Desc)?,
        &[
            (with_city("tokyo"), 2),
            (with_city("greenville"), 1),
            (with_city("austin"), 0),
            ((1.0, OwnedValue::Null), 3),
        ]
    );
    Ok(())
}
|
|
||||||
|
|
||||||
use proptest::prelude::*;
|
|
||||||
|
|
||||||
proptest! {
    // Property: collecting with `TopDocs` ordered by the `city` string fast field gives the
    // same result as sorting every document with `sort_hits`, skipping `offset` entries and
    // taking `limit` entries.
    #[test]
    fn test_order_by_string_prop(
        order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
        limit in 1..64_usize,
        offset in 0..64_usize,
        segments_terms in
            proptest::collection::vec(
                proptest::collection::vec(0..32_u8, 1..32_usize),
                0..8_usize,
            )
    ) {
        let mut schema_builder = Schema::builder();
        let city = schema_builder.add_text_field("city", TEXT | FAST);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;

        // `segments_terms` is a Vec<Vec<u8>>: the outer Vec represents segments (one commit
        // each), the inner Vec represents the terms of that segment (one document each).
        for segment_terms in segments_terms {
            for term in segment_terms {
                // Zero-pad to three characters.
                let term = format!("{term:0>3}");
                index_writer.add_document(doc!(
                    city => term,
                ))?;
            }
            index_writer.commit()?;
        }

        let searcher = index.reader()?.searcher();
        let collector = TopDocs::with_limit(limit)
            .and_offset(offset)
            .order_by_string_fast_field("city", order);
        let top_n_results = searcher.search(&AllQuery, &collector)?;

        // Build the reference result: look up the term of every document, sort, then page.
        let mut comparable_docs: Vec<ComparableDoc<_, _>> = searcher
            .search(&AllQuery, &DocSetCollector)?
            .into_iter()
            .map(|doc_address| {
                let segment_reader =
                    &searcher.segment_readers()[doc_address.segment_ord as usize];
                let column = segment_reader.fast_fields().str("city").unwrap().unwrap();
                let sort_key = column.term_ords(doc_address.doc_id).next().map(|term_ord| {
                    let mut term_bytes = Vec::new();
                    column.dictionary().ord_to_term(term_ord, &mut term_bytes).unwrap();
                    String::try_from(term_bytes).unwrap()
                });
                ComparableDoc { sort_key, doc: doc_address }
            })
            .collect();
        sort_hits(&mut comparable_docs, order);

        let expected_docs = comparable_docs
            .into_iter()
            .map(|cd| (cd.sort_key, cd.doc))
            .skip(offset)
            .take(limit)
            .collect::<Vec<_>>();
        prop_assert_eq!(
            expected_docs,
            top_n_results
        );
    }
}
|
|
||||||
}
|
|
||||||
@@ -1,567 +0,0 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
|
|
||||||
use columnar::MonotonicallyMappableToU64;
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
|
|
||||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
|
|
||||||
use crate::schema::{OwnedValue, Schema};
|
|
||||||
use crate::{DocId, Order, Score};
|
|
||||||
|
|
||||||
fn compare_owned_value<const NULLS_FIRST: bool>(lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
|
|
||||||
match (lhs, rhs) {
|
|
||||||
(OwnedValue::Null, OwnedValue::Null) => Ordering::Equal,
|
|
||||||
(OwnedValue::Null, _) => {
|
|
||||||
if NULLS_FIRST {
|
|
||||||
Ordering::Less
|
|
||||||
} else {
|
|
||||||
Ordering::Greater
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(_, OwnedValue::Null) => {
|
|
||||||
if NULLS_FIRST {
|
|
||||||
Ordering::Greater
|
|
||||||
} else {
|
|
||||||
Ordering::Less
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(OwnedValue::Str(a), OwnedValue::Str(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::PreTokStr(a), OwnedValue::PreTokStr(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::U64(a), OwnedValue::U64(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::I64(a), OwnedValue::I64(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::F64(a), OwnedValue::F64(b)) => a.to_u64().cmp(&b.to_u64()),
|
|
||||||
(OwnedValue::Bool(a), OwnedValue::Bool(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::Date(a), OwnedValue::Date(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::Facet(a), OwnedValue::Facet(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::Bytes(a), OwnedValue::Bytes(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::IpAddr(a), OwnedValue::IpAddr(b)) => a.cmp(b),
|
|
||||||
(OwnedValue::U64(a), OwnedValue::I64(b)) => {
|
|
||||||
if *b < 0 {
|
|
||||||
Ordering::Greater
|
|
||||||
} else {
|
|
||||||
a.cmp(&(*b as u64))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(OwnedValue::I64(a), OwnedValue::U64(b)) => {
|
|
||||||
if *a < 0 {
|
|
||||||
Ordering::Less
|
|
||||||
} else {
|
|
||||||
(*a as u64).cmp(b)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(OwnedValue::U64(a), OwnedValue::F64(b)) => (*a as f64).to_u64().cmp(&b.to_u64()),
|
|
||||||
(OwnedValue::F64(a), OwnedValue::U64(b)) => a.to_u64().cmp(&(*b as f64).to_u64()),
|
|
||||||
(OwnedValue::I64(a), OwnedValue::F64(b)) => (*a as f64).to_u64().cmp(&b.to_u64()),
|
|
||||||
(OwnedValue::F64(a), OwnedValue::I64(b)) => a.to_u64().cmp(&(*b as f64).to_u64()),
|
|
||||||
(a, b) => {
|
|
||||||
let ord = a.discriminant_value().cmp(&b.discriminant_value());
|
|
||||||
// If the discriminant is equal, it's because a new type was added, but hasn't been
|
|
||||||
// included in this `match` statement.
|
|
||||||
assert!(
|
|
||||||
ord != Ordering::Equal,
|
|
||||||
"Unimplemented comparison for type of {a:?}, {b:?}"
|
|
||||||
);
|
|
||||||
ord
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Defines the order in which documents are ranked, by comparing their sort keys.
///
/// Implementors must be thread-safe (`Send + Sync`), printable with `Debug`, and
/// constructible through `Default`.
pub trait Comparator<T>: Default + std::fmt::Debug + Send + Sync {
    /// Returns how `lhs` is ordered relative to `rhs` under this comparator.
    fn compare(&self, lhs: &T, rhs: &T) -> Ordering;
}
|
|
||||||
|
|
||||||
/// Compare values naturally (e.g. 1 < 2).
|
|
||||||
///
|
|
||||||
/// When used with `TopDocs`, which reverses the order, this results in a
|
|
||||||
/// "Descending" sort (Greatest values first).
|
|
||||||
///
|
|
||||||
/// `None` (or Null for `OwnedValue`) values are considered to be smaller than any other value,
|
|
||||||
/// and will therefore appear last in a descending sort (e.g. `[Some(20), Some(10), None]`).
|
|
||||||
#[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
|
|
||||||
pub struct NaturalComparator;
|
|
||||||
|
|
||||||
impl<T: PartialOrd> Comparator<T> for NaturalComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
|
|
||||||
lhs.partial_cmp(rhs).unwrap_or(Ordering::Equal)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A (partial) implementation of comparison for OwnedValue.
|
|
||||||
///
|
|
||||||
/// Intended for use within columns of homogenous types, and so will panic for OwnedValues with
|
|
||||||
/// mismatched types. The one exception is Null, for which we do define all comparisons.
|
|
||||||
impl Comparator<OwnedValue> for NaturalComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
|
|
||||||
compare_owned_value::</* NULLS_FIRST= */ true>(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compare values in reverse (e.g. 2 < 1).
|
|
||||||
///
|
|
||||||
/// When used with `TopDocs`, which reverses the order, this results in an
|
|
||||||
/// "Ascending" sort (Smallest values first).
|
|
||||||
///
|
|
||||||
/// `None` is considered smaller than `Some` in the underlying comparator, but because the
|
|
||||||
/// comparison is reversed, `None` is effectively treated as the lowest value in the resulting
|
|
||||||
/// Ascending sort (e.g. `[None, Some(10), Some(20)]`).
|
|
||||||
///
|
|
||||||
/// The ReverseComparator does not necessarily imply that the sort order is reversed compared
|
|
||||||
/// to the NaturalComparator. In presence of a tie on the sort key, documents will always be
|
|
||||||
/// sorted by ascending `DocId`/`DocAddress` in TopN results, regardless of the sort key's order.
|
|
||||||
#[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
|
|
||||||
pub struct ReverseComparator;
|
|
||||||
|
|
||||||
impl<T> Comparator<T> for ReverseComparator
|
|
||||||
where NaturalComparator: Comparator<T>
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
|
|
||||||
NaturalComparator.compare(rhs, lhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compare values in reverse, but treating `None` as lower than `Some`.
|
|
||||||
///
|
|
||||||
/// When used with `TopDocs`, which reverses the order, this results in an
|
|
||||||
/// "Ascending" sort (Smallest values first), but with `None` values appearing last
|
|
||||||
/// (e.g. `[Some(10), Some(20), None]`).
|
|
||||||
///
|
|
||||||
/// This is usually what is wanted when sorting by a field in an ascending order.
|
|
||||||
/// For instance, in an e-commerce website, if sorting by price ascending,
|
|
||||||
/// the cheapest items would appear first, and items without a price would appear last.
|
|
||||||
#[derive(Debug, Copy, Clone, Default)]
|
|
||||||
pub struct ReverseNoneIsLowerComparator;
|
|
||||||
|
|
||||||
impl<T> Comparator<Option<T>> for ReverseNoneIsLowerComparator
|
|
||||||
where ReverseComparator: Comparator<T>
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs_opt: &Option<T>, rhs_opt: &Option<T>) -> Ordering {
|
|
||||||
match (lhs_opt, rhs_opt) {
|
|
||||||
(None, None) => Ordering::Equal,
|
|
||||||
(None, Some(_)) => Ordering::Less,
|
|
||||||
(Some(_), None) => Ordering::Greater,
|
|
||||||
(Some(lhs), Some(rhs)) => ReverseComparator.compare(lhs, rhs),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<u32> for ReverseNoneIsLowerComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
|
|
||||||
ReverseComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<u64> for ReverseNoneIsLowerComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
|
|
||||||
ReverseComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<f64> for ReverseNoneIsLowerComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
|
|
||||||
ReverseComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<f32> for ReverseNoneIsLowerComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
|
|
||||||
ReverseComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<i64> for ReverseNoneIsLowerComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
|
|
||||||
ReverseComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<String> for ReverseNoneIsLowerComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
|
|
||||||
ReverseComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<OwnedValue> for ReverseNoneIsLowerComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
|
|
||||||
compare_owned_value::</* NULLS_FIRST= */ false>(rhs, lhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compare values naturally, but treating `None` as higher than `Some`.
|
|
||||||
///
|
|
||||||
/// When used with `TopDocs`, which reverses the order, this results in a
|
|
||||||
/// "Descending" sort (Greatest values first), but with `None` values appearing first
|
|
||||||
/// (e.g. `[None, Some(20), Some(10)]`).
|
|
||||||
#[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
|
|
||||||
pub struct NaturalNoneIsHigherComparator;
|
|
||||||
|
|
||||||
impl<T> Comparator<Option<T>> for NaturalNoneIsHigherComparator
|
|
||||||
where NaturalComparator: Comparator<T>
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs_opt: &Option<T>, rhs_opt: &Option<T>) -> Ordering {
|
|
||||||
match (lhs_opt, rhs_opt) {
|
|
||||||
(None, None) => Ordering::Equal,
|
|
||||||
(None, Some(_)) => Ordering::Greater,
|
|
||||||
(Some(_), None) => Ordering::Less,
|
|
||||||
(Some(lhs), Some(rhs)) => NaturalComparator.compare(lhs, rhs),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<u32> for NaturalNoneIsHigherComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
|
|
||||||
NaturalComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<u64> for NaturalNoneIsHigherComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
|
|
||||||
NaturalComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<f64> for NaturalNoneIsHigherComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
|
|
||||||
NaturalComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<f32> for NaturalNoneIsHigherComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
|
|
||||||
NaturalComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<i64> for NaturalNoneIsHigherComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
|
|
||||||
NaturalComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<String> for NaturalNoneIsHigherComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
|
|
||||||
NaturalComparator.compare(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Comparator<OwnedValue> for NaturalNoneIsHigherComparator {
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
|
|
||||||
compare_owned_value::</* NULLS_FIRST= */ false>(lhs, rhs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// An enum representing the different sort orders.
|
|
||||||
#[derive(Debug, Clone, Copy, Eq, PartialEq, Default)]
|
|
||||||
pub enum ComparatorEnum {
|
|
||||||
/// Natural order (See [NaturalComparator])
|
|
||||||
#[default]
|
|
||||||
Natural,
|
|
||||||
/// Reverse order (See [ReverseComparator])
|
|
||||||
Reverse,
|
|
||||||
/// Reverse order by treating None as the lowest value. (See [ReverseNoneLowerComparator])
|
|
||||||
ReverseNoneLower,
|
|
||||||
/// Natural order but treating None as the highest value. (See [NaturalNoneIsHigherComparator])
|
|
||||||
NaturalNoneHigher,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Order> for ComparatorEnum {
|
|
||||||
fn from(order: Order) -> Self {
|
|
||||||
match order {
|
|
||||||
Order::Asc => ComparatorEnum::ReverseNoneLower,
|
|
||||||
Order::Desc => ComparatorEnum::Natural,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Comparator<T> for ComparatorEnum
|
|
||||||
where
|
|
||||||
ReverseNoneIsLowerComparator: Comparator<T>,
|
|
||||||
NaturalComparator: Comparator<T>,
|
|
||||||
ReverseComparator: Comparator<T>,
|
|
||||||
NaturalNoneIsHigherComparator: Comparator<T>,
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
|
|
||||||
match self {
|
|
||||||
ComparatorEnum::Natural => NaturalComparator.compare(lhs, rhs),
|
|
||||||
ComparatorEnum::Reverse => ReverseComparator.compare(lhs, rhs),
|
|
||||||
ComparatorEnum::ReverseNoneLower => ReverseNoneIsLowerComparator.compare(lhs, rhs),
|
|
||||||
ComparatorEnum::NaturalNoneHigher => NaturalNoneIsHigherComparator.compare(lhs, rhs),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Head, Tail, LeftComparator, RightComparator> Comparator<(Head, Tail)>
|
|
||||||
for (LeftComparator, RightComparator)
|
|
||||||
where
|
|
||||||
LeftComparator: Comparator<Head>,
|
|
||||||
RightComparator: Comparator<Tail>,
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &(Head, Tail), rhs: &(Head, Tail)) -> Ordering {
|
|
||||||
self.0
|
|
||||||
.compare(&lhs.0, &rhs.0)
|
|
||||||
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, (Type2, Type3))>
|
|
||||||
for (Comparator1, Comparator2, Comparator3)
|
|
||||||
where
|
|
||||||
Comparator1: Comparator<Type1>,
|
|
||||||
Comparator2: Comparator<Type2>,
|
|
||||||
Comparator3: Comparator<Type3>,
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &(Type1, (Type2, Type3)), rhs: &(Type1, (Type2, Type3))) -> Ordering {
|
|
||||||
self.0
|
|
||||||
.compare(&lhs.0, &rhs.0)
|
|
||||||
.then_with(|| self.1.compare(&lhs.1 .0, &rhs.1 .0))
|
|
||||||
.then_with(|| self.2.compare(&lhs.1 .1, &rhs.1 .1))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, Type2, Type3)>
|
|
||||||
for (Comparator1, Comparator2, Comparator3)
|
|
||||||
where
|
|
||||||
Comparator1: Comparator<Type1>,
|
|
||||||
Comparator2: Comparator<Type2>,
|
|
||||||
Comparator3: Comparator<Type3>,
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(&self, lhs: &(Type1, Type2, Type3), rhs: &(Type1, Type2, Type3)) -> Ordering {
|
|
||||||
self.0
|
|
||||||
.compare(&lhs.0, &rhs.0)
|
|
||||||
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
|
|
||||||
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
|
|
||||||
Comparator<(Type1, (Type2, (Type3, Type4)))>
|
|
||||||
for (Comparator1, Comparator2, Comparator3, Comparator4)
|
|
||||||
where
|
|
||||||
Comparator1: Comparator<Type1>,
|
|
||||||
Comparator2: Comparator<Type2>,
|
|
||||||
Comparator3: Comparator<Type3>,
|
|
||||||
Comparator4: Comparator<Type4>,
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(
|
|
||||||
&self,
|
|
||||||
lhs: &(Type1, (Type2, (Type3, Type4))),
|
|
||||||
rhs: &(Type1, (Type2, (Type3, Type4))),
|
|
||||||
) -> Ordering {
|
|
||||||
self.0
|
|
||||||
.compare(&lhs.0, &rhs.0)
|
|
||||||
.then_with(|| self.1.compare(&lhs.1 .0, &rhs.1 .0))
|
|
||||||
.then_with(|| self.2.compare(&lhs.1 .1 .0, &rhs.1 .1 .0))
|
|
||||||
.then_with(|| self.3.compare(&lhs.1 .1 .1, &rhs.1 .1 .1))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
|
|
||||||
Comparator<(Type1, Type2, Type3, Type4)>
|
|
||||||
for (Comparator1, Comparator2, Comparator3, Comparator4)
|
|
||||||
where
|
|
||||||
Comparator1: Comparator<Type1>,
|
|
||||||
Comparator2: Comparator<Type2>,
|
|
||||||
Comparator3: Comparator<Type3>,
|
|
||||||
Comparator4: Comparator<Type4>,
|
|
||||||
{
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare(
|
|
||||||
&self,
|
|
||||||
lhs: &(Type1, Type2, Type3, Type4),
|
|
||||||
rhs: &(Type1, Type2, Type3, Type4),
|
|
||||||
) -> Ordering {
|
|
||||||
self.0
|
|
||||||
.compare(&lhs.0, &rhs.0)
|
|
||||||
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
|
|
||||||
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
|
|
||||||
.then_with(|| self.3.compare(&lhs.3, &rhs.3))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<TSortKeyComputer> SortKeyComputer for (TSortKeyComputer, ComparatorEnum)
|
|
||||||
where
|
|
||||||
TSortKeyComputer: SortKeyComputer,
|
|
||||||
ComparatorEnum: Comparator<TSortKeyComputer::SortKey>,
|
|
||||||
ComparatorEnum: Comparator<
|
|
||||||
<<TSortKeyComputer as SortKeyComputer>::Child as SegmentSortKeyComputer>::SegmentSortKey,
|
|
||||||
>,
|
|
||||||
{
|
|
||||||
type SortKey = TSortKeyComputer::SortKey;
|
|
||||||
|
|
||||||
type Child = SegmentSortKeyComputerWithComparator<TSortKeyComputer::Child, Self::Comparator>;
|
|
||||||
|
|
||||||
type Comparator = ComparatorEnum;
|
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn comparator(&self) -> Self::Comparator {
|
|
||||||
self.1
|
|
||||||
}
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(
|
|
||||||
&self,
|
|
||||||
segment_reader: &crate::SegmentReader,
|
|
||||||
) -> crate::Result<Self::Child> {
|
|
||||||
let child = self.0.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Ok(SegmentSortKeyComputerWithComparator {
|
|
||||||
segment_sort_key_computer: child,
|
|
||||||
comparator: self.comparator(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<TSortKeyComputer> SortKeyComputer for (TSortKeyComputer, Order)
|
|
||||||
where
|
|
||||||
TSortKeyComputer: SortKeyComputer,
|
|
||||||
ComparatorEnum: Comparator<TSortKeyComputer::SortKey>,
|
|
||||||
ComparatorEnum: Comparator<
|
|
||||||
<<TSortKeyComputer as SortKeyComputer>::Child as SegmentSortKeyComputer>::SegmentSortKey,
|
|
||||||
>,
|
|
||||||
{
|
|
||||||
type SortKey = TSortKeyComputer::SortKey;
|
|
||||||
|
|
||||||
type Child = SegmentSortKeyComputerWithComparator<TSortKeyComputer::Child, Self::Comparator>;
|
|
||||||
|
|
||||||
type Comparator = ComparatorEnum;
|
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn comparator(&self) -> Self::Comparator {
|
|
||||||
self.1.into()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(
|
|
||||||
&self,
|
|
||||||
segment_reader: &crate::SegmentReader,
|
|
||||||
) -> crate::Result<Self::Child> {
|
|
||||||
let child = self.0.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Ok(SegmentSortKeyComputerWithComparator {
|
|
||||||
segment_sort_key_computer: child,
|
|
||||||
comparator: self.comparator(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A segment sort key computer with a custom ordering.
|
|
||||||
pub struct SegmentSortKeyComputerWithComparator<TSegmentSortKeyComputer, TComparator> {
|
|
||||||
segment_sort_key_computer: TSegmentSortKeyComputer,
|
|
||||||
comparator: TComparator,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<TSegmentSortKeyComputer, TSegmentSortKey, TComparator> SegmentSortKeyComputer
|
|
||||||
for SegmentSortKeyComputerWithComparator<TSegmentSortKeyComputer, TComparator>
|
|
||||||
where
|
|
||||||
TSegmentSortKeyComputer: SegmentSortKeyComputer<SegmentSortKey = TSegmentSortKey>,
|
|
||||||
TSegmentSortKey: Clone + 'static + Sync + Send,
|
|
||||||
TComparator: Comparator<TSegmentSortKey> + 'static + Sync + Send,
|
|
||||||
{
|
|
||||||
type SortKey = TSegmentSortKeyComputer::SortKey;
|
|
||||||
type SegmentSortKey = TSegmentSortKey;
|
|
||||||
type SegmentComparator = TComparator;
|
|
||||||
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
|
|
||||||
self.segment_sort_key_computer.segment_sort_key(doc, score)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare_segment_sort_key(
|
|
||||||
&self,
|
|
||||||
left: &Self::SegmentSortKey,
|
|
||||||
right: &Self::SegmentSortKey,
|
|
||||||
) -> Ordering {
|
|
||||||
self.comparator.compare(left, right)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
|
|
||||||
self.segment_sort_key_computer
|
|
||||||
.convert_segment_sort_key(sort_key)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::schema::OwnedValue;

    /// `NaturalNoneIsHigherComparator` orders present values naturally and puts `None` above
    /// every present value.
    #[test]
    fn test_natural_none_is_higher() {
        let comp = NaturalNoneIsHigherComparator;
        let null = None;
        let v1 = Some(1_u64);
        let v2 = Some(2_u64);

        // NaturalNoneIsHigherComparator logic:
        // 1. Delegates to NaturalComparator for non-nulls.
        // NaturalComparator compare(2, 1) -> 2.cmp(1) -> Greater.
        assert_eq!(comp.compare(&v2, &v1), Ordering::Greater);

        // 2. Treats None (Null) as Greater than any value.
        // compare(None, Some(2)) should be Greater.
        assert_eq!(comp.compare(&null, &v2), Ordering::Greater);

        // compare(Some(1), None) should be Less.
        assert_eq!(comp.compare(&v1, &null), Ordering::Less);

        // compare(None, None) should be Equal.
        assert_eq!(comp.compare(&null, &null), Ordering::Equal);
    }

    /// `OwnedValue`s of different numeric types are compared by value, and a string is
    /// ordered before the numeric types.
    #[test]
    fn test_mixed_ownedvalue_compare() {
        let u = OwnedValue::U64(10);
        let i = OwnedValue::I64(10);
        let f = OwnedValue::F64(10.0);

        let nc = NaturalComparator;
        assert_eq!(nc.compare(&u, &i), Ordering::Equal);
        assert_eq!(nc.compare(&u, &f), Ordering::Equal);
        assert_eq!(nc.compare(&i, &f), Ordering::Equal);

        let u2 = OwnedValue::U64(11);
        assert_eq!(nc.compare(&u2, &f), Ordering::Greater);

        let s = OwnedValue::Str("a".to_string());
        // Str < U64
        assert_eq!(nc.compare(&s, &u), Ordering::Less);
        // Str < I64
        assert_eq!(nc.compare(&s, &i), Ordering::Less);
        // Str < F64
        assert_eq!(nc.compare(&s, &f), Ordering::Less);
    }
}
|
|
||||||
@@ -1,361 +0,0 @@
|
|||||||
use columnar::{ColumnType, MonotonicallyMappableToU64};
|
|
||||||
|
|
||||||
use crate::collector::sort_key::{
|
|
||||||
NaturalComparator, SortBySimilarityScore, SortByStaticFastValue, SortByString,
|
|
||||||
};
|
|
||||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
|
|
||||||
use crate::fastfield::FastFieldNotAvailableError;
|
|
||||||
use crate::schema::OwnedValue;
|
|
||||||
use crate::{DateTime, DocId, Score};
|
|
||||||
|
|
||||||
/// Sorts by the boxed / `OwnedValue` representation of either a fast field or the score.
///
/// The `OwnedValue` representation erases the concrete type, which is useful when the sort
/// order is only known at runtime. It comes with a performance cost: wherever possible,
/// prefer a `SortKeyComputer` implementation whose type is known at compile time.
#[derive(Debug, Clone)]
pub enum SortByErasedType {
    /// Sort by the fast field with the given column name.
    Field(String),
    /// Sort by score.
    Score,
}

impl SortByErasedType {
    /// Creates a type-erased sort key computer that sorts by the given fast field column.
    pub fn for_field(column_name: impl ToString) -> Self {
        let column_name = column_name.to_string();
        SortByErasedType::Field(column_name)
    }

    /// Creates a type-erased sort key computer that sorts by score.
    pub fn for_score() -> Self {
        SortByErasedType::Score
    }
}
|
|
||||||
|
|
||||||
trait ErasedSegmentSortKeyComputer: Send + Sync {
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64>;
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Bundles a typed segment sort key computer with the function that turns its final sort
/// key into an `OwnedValue`.
struct ErasedSegmentSortKeyComputerWrapper<C, F> {
    /// The typed computer being wrapped.
    inner: C,
    /// Maps the typed sort key produced by `inner` to its erased form.
    converter: F,
}
|
|
||||||
|
|
||||||
impl<C, F> ErasedSegmentSortKeyComputer for ErasedSegmentSortKeyComputerWrapper<C, F>
|
|
||||||
where
|
|
||||||
C: SegmentSortKeyComputer<SegmentSortKey = Option<u64>> + Send + Sync,
|
|
||||||
F: Fn(C::SortKey) -> OwnedValue + Send + Sync + 'static,
|
|
||||||
{
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
|
|
||||||
self.inner.segment_sort_key(doc, score)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
|
|
||||||
let val = self.inner.convert_segment_sort_key(sort_key);
|
|
||||||
(self.converter)(val)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ScoreSegmentSortKeyComputer {
|
|
||||||
segment_computer: SortBySimilarityScore,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ErasedSegmentSortKeyComputer for ScoreSegmentSortKeyComputer {
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
|
|
||||||
let score_value: f64 = self.segment_computer.segment_sort_key(doc, score).into();
|
|
||||||
Some(score_value.to_u64())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
|
|
||||||
let score_value: u64 = sort_key.expect("This implementation always produces a score.");
|
|
||||||
OwnedValue::F64(f64::from_u64(score_value))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SortKeyComputer for SortByErasedType {
|
|
||||||
type SortKey = OwnedValue;
|
|
||||||
type Child = ErasedColumnSegmentSortKeyComputer;
|
|
||||||
type Comparator = NaturalComparator;
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
matches!(self, Self::Score)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(
|
|
||||||
&self,
|
|
||||||
segment_reader: &crate::SegmentReader,
|
|
||||||
) -> crate::Result<Self::Child> {
|
|
||||||
let inner: Box<dyn ErasedSegmentSortKeyComputer> = match self {
|
|
||||||
Self::Field(column_name) => {
|
|
||||||
let fast_fields = segment_reader.fast_fields();
|
|
||||||
// TODO: We currently double-open the column to avoid relying on the implementation
|
|
||||||
// details of `SortByString` or `SortByStaticFastValue`. Once
|
|
||||||
// https://github.com/quickwit-oss/tantivy/issues/2776 is resolved, we should
|
|
||||||
// consider directly constructing the appropriate `SegmentSortKeyComputer` type for
|
|
||||||
// the column that we open here.
|
|
||||||
let (_column, column_type) =
|
|
||||||
fast_fields.u64_lenient(column_name)?.ok_or_else(|| {
|
|
||||||
FastFieldNotAvailableError {
|
|
||||||
field_name: column_name.to_owned(),
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
|
|
||||||
match column_type {
|
|
||||||
ColumnType::Str => {
|
|
||||||
let computer = SortByString::for_field(column_name);
|
|
||||||
let inner = computer.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Box::new(ErasedSegmentSortKeyComputerWrapper {
|
|
||||||
inner,
|
|
||||||
converter: |val: Option<String>| {
|
|
||||||
val.map(OwnedValue::Str).unwrap_or(OwnedValue::Null)
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
ColumnType::U64 => {
|
|
||||||
let computer = SortByStaticFastValue::<u64>::for_field(column_name);
|
|
||||||
let inner = computer.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Box::new(ErasedSegmentSortKeyComputerWrapper {
|
|
||||||
inner,
|
|
||||||
converter: |val: Option<u64>| {
|
|
||||||
val.map(OwnedValue::U64).unwrap_or(OwnedValue::Null)
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
ColumnType::I64 => {
|
|
||||||
let computer = SortByStaticFastValue::<i64>::for_field(column_name);
|
|
||||||
let inner = computer.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Box::new(ErasedSegmentSortKeyComputerWrapper {
|
|
||||||
inner,
|
|
||||||
converter: |val: Option<i64>| {
|
|
||||||
val.map(OwnedValue::I64).unwrap_or(OwnedValue::Null)
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
ColumnType::F64 => {
|
|
||||||
let computer = SortByStaticFastValue::<f64>::for_field(column_name);
|
|
||||||
let inner = computer.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Box::new(ErasedSegmentSortKeyComputerWrapper {
|
|
||||||
inner,
|
|
||||||
converter: |val: Option<f64>| {
|
|
||||||
val.map(OwnedValue::F64).unwrap_or(OwnedValue::Null)
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
ColumnType::Bool => {
|
|
||||||
let computer = SortByStaticFastValue::<bool>::for_field(column_name);
|
|
||||||
let inner = computer.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Box::new(ErasedSegmentSortKeyComputerWrapper {
|
|
||||||
inner,
|
|
||||||
converter: |val: Option<bool>| {
|
|
||||||
val.map(OwnedValue::Bool).unwrap_or(OwnedValue::Null)
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
ColumnType::DateTime => {
|
|
||||||
let computer = SortByStaticFastValue::<DateTime>::for_field(column_name);
|
|
||||||
let inner = computer.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Box::new(ErasedSegmentSortKeyComputerWrapper {
|
|
||||||
inner,
|
|
||||||
converter: |val: Option<DateTime>| {
|
|
||||||
val.map(OwnedValue::Date).unwrap_or(OwnedValue::Null)
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
column_type => {
|
|
||||||
return Err(crate::TantivyError::SchemaError(format!(
|
|
||||||
"Field `{}` is of type {column_type:?}, which is not supported for \
|
|
||||||
sorting by owned value yet.",
|
|
||||||
column_name
|
|
||||||
)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Self::Score => Box::new(ScoreSegmentSortKeyComputer {
|
|
||||||
segment_computer: SortBySimilarityScore,
|
|
||||||
}),
|
|
||||||
};
|
|
||||||
Ok(ErasedColumnSegmentSortKeyComputer { inner })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct ErasedColumnSegmentSortKeyComputer {
|
|
||||||
inner: Box<dyn ErasedSegmentSortKeyComputer>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentSortKeyComputer for ErasedColumnSegmentSortKeyComputer {
|
|
||||||
type SortKey = OwnedValue;
|
|
||||||
type SegmentSortKey = Option<u64>;
|
|
||||||
type SegmentComparator = NaturalComparator;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
|
|
||||||
self.inner.segment_sort_key(doc, score)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, segment_sort_key: Self::SegmentSortKey) -> OwnedValue {
|
|
||||||
self.inner.convert_segment_sort_key(segment_sort_key)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use crate::collector::sort_key::{ComparatorEnum, SortByErasedType};
    use crate::collector::TopDocs;
    use crate::query::AllQuery;
    use crate::schema::{OwnedValue, Schema, FAST, TEXT};
    use crate::Index;

    /// Unwraps an `OwnedValue::F64`, panicking on any other variant.
    fn expect_f64(key: OwnedValue) -> f64 {
        match key {
            OwnedValue::F64(val) => val,
            _ => panic!("Wrong type {key:?}"),
        }
    }

    #[test]
    fn test_sort_by_owned_u64() {
        let mut schema_builder = Schema::builder();
        let id_field = schema_builder.add_u64_field("id", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(id_field => 10u64)).unwrap();
        index_writer.add_document(doc!(id_field => 2u64)).unwrap();
        index_writer.add_document(doc!()).unwrap();
        index_writer.commit().unwrap();

        let searcher = index.reader().unwrap().searcher();

        // Natural order: largest first, missing value last.
        let natural = TopDocs::with_limit(10)
            .order_by((SortByErasedType::for_field("id"), ComparatorEnum::Natural));
        let sort_keys: Vec<OwnedValue> = searcher
            .search(&AllQuery, &natural)
            .unwrap()
            .into_iter()
            .map(|(sort_key, _doc_address)| sort_key)
            .collect();
        assert_eq!(
            sort_keys,
            vec![OwnedValue::U64(10), OwnedValue::U64(2), OwnedValue::Null]
        );

        // Reversed order with missing values still last.
        let reversed = TopDocs::with_limit(10).order_by((
            SortByErasedType::for_field("id"),
            ComparatorEnum::ReverseNoneLower,
        ));
        let sort_keys: Vec<OwnedValue> = searcher
            .search(&AllQuery, &reversed)
            .unwrap()
            .into_iter()
            .map(|(sort_key, _doc_address)| sort_key)
            .collect();
        assert_eq!(
            sort_keys,
            vec![OwnedValue::U64(2), OwnedValue::U64(10), OwnedValue::Null]
        );
    }

    #[test]
    fn test_sort_by_owned_string() {
        let mut schema_builder = Schema::builder();
        let city_field = schema_builder.add_text_field("city", FAST | TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer
            .add_document(doc!(city_field => "tokyo"))
            .unwrap();
        index_writer
            .add_document(doc!(city_field => "austin"))
            .unwrap();
        index_writer.add_document(doc!()).unwrap();
        index_writer.commit().unwrap();

        let searcher = index.reader().unwrap().searcher();

        let reversed = TopDocs::with_limit(10).order_by((
            SortByErasedType::for_field("city"),
            ComparatorEnum::ReverseNoneLower,
        ));
        let sort_keys: Vec<OwnedValue> = searcher
            .search(&AllQuery, &reversed)
            .unwrap()
            .into_iter()
            .map(|(sort_key, _doc_address)| sort_key)
            .collect();
        assert_eq!(
            sort_keys,
            vec![
                OwnedValue::Str("austin".to_string()),
                OwnedValue::Str("tokyo".to_string()),
                OwnedValue::Null
            ]
        );
    }

    #[test]
    fn test_sort_by_owned_reverse() {
        let mut schema_builder = Schema::builder();
        let id_field = schema_builder.add_u64_field("id", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(id_field => 10u64)).unwrap();
        index_writer.add_document(doc!(id_field => 2u64)).unwrap();
        index_writer.add_document(doc!()).unwrap();
        index_writer.commit().unwrap();

        let searcher = index.reader().unwrap().searcher();

        // Plain reverse puts the missing value first.
        let reversed = TopDocs::with_limit(10)
            .order_by((SortByErasedType::for_field("id"), ComparatorEnum::Reverse));
        let sort_keys: Vec<OwnedValue> = searcher
            .search(&AllQuery, &reversed)
            .unwrap()
            .into_iter()
            .map(|(sort_key, _doc_address)| sort_key)
            .collect();
        assert_eq!(
            sort_keys,
            vec![OwnedValue::Null, OwnedValue::U64(2), OwnedValue::U64(10)]
        );
    }

    #[test]
    fn test_sort_by_owned_score() {
        let mut schema_builder = Schema::builder();
        let body_field = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(body_field => "a a")).unwrap();
        index_writer.add_document(doc!(body_field => "a")).unwrap();
        index_writer.commit().unwrap();

        let searcher = index.reader().unwrap().searcher();
        let query = crate::query::QueryParser::for_index(&index, vec![body_field])
            .parse_query("a")
            .unwrap();

        // Sort by score descending (Natural)
        let descending = TopDocs::with_limit(10)
            .order_by((SortByErasedType::for_score(), ComparatorEnum::Natural));
        let scores: Vec<f64> = searcher
            .search(&query, &descending)
            .unwrap()
            .into_iter()
            .map(|(key, _)| expect_f64(key))
            .collect();
        assert_eq!(scores.len(), 2);
        assert!(scores[0] > scores[1]);

        // Sort by score ascending (ReverseNoneLower)
        let ascending = TopDocs::with_limit(10).order_by((
            SortByErasedType::for_score(),
            ComparatorEnum::ReverseNoneLower,
        ));
        let scores: Vec<f64> = searcher
            .search(&query, &ascending)
            .unwrap()
            .into_iter()
            .map(|(key, _)| expect_f64(key))
            .collect();
        assert_eq!(scores.len(), 2);
        assert!(scores[0] < scores[1]);
    }
}
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
use crate::collector::sort_key::NaturalComparator;
|
|
||||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer};
|
|
||||||
use crate::{DocAddress, DocId, Score};
|
|
||||||
|
|
||||||
/// Sort key computer that sorts documents by their similarity score.
#[derive(Debug, Clone, Copy)]
pub struct SortBySimilarityScore;
|
|
||||||
|
|
||||||
impl SortKeyComputer for SortBySimilarityScore {
|
|
||||||
type SortKey = Score;
|
|
||||||
|
|
||||||
type Child = SortBySimilarityScore;
|
|
||||||
|
|
||||||
type Comparator = NaturalComparator;
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(
|
|
||||||
&self,
|
|
||||||
_segment_reader: &crate::SegmentReader,
|
|
||||||
) -> crate::Result<Self::Child> {
|
|
||||||
Ok(SortBySimilarityScore)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sorting by score is special in that it allows for the Block-Wand optimization.
|
|
||||||
fn collect_segment_top_k(
|
|
||||||
&self,
|
|
||||||
k: usize,
|
|
||||||
weight: &dyn crate::query::Weight,
|
|
||||||
reader: &crate::SegmentReader,
|
|
||||||
segment_ord: u32,
|
|
||||||
) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
|
|
||||||
let mut top_n: TopNComputer<Score, DocId, Self::Comparator> =
|
|
||||||
TopNComputer::new_with_comparator(k, self.comparator());
|
|
||||||
|
|
||||||
if let Some(alive_bitset) = reader.alive_bitset() {
|
|
||||||
let mut threshold = Score::MIN;
|
|
||||||
top_n.threshold = Some(threshold);
|
|
||||||
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
|
|
||||||
if alive_bitset.is_deleted(doc) {
|
|
||||||
return threshold;
|
|
||||||
}
|
|
||||||
top_n.push(score, doc);
|
|
||||||
threshold = top_n.threshold.unwrap_or(Score::MIN);
|
|
||||||
threshold
|
|
||||||
})?;
|
|
||||||
} else {
|
|
||||||
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
|
|
||||||
top_n.push(score, doc);
|
|
||||||
top_n.threshold.unwrap_or(Score::MIN)
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(top_n
|
|
||||||
.into_vec()
|
|
||||||
.into_iter()
|
|
||||||
.map(|cid| (cid.sort_key, DocAddress::new(segment_ord, cid.doc)))
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentSortKeyComputer for SortBySimilarityScore {
|
|
||||||
type SortKey = Score;
|
|
||||||
type SegmentSortKey = Score;
|
|
||||||
type SegmentComparator = NaturalComparator;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn segment_sort_key(&mut self, _doc: DocId, score: Score) -> Score {
|
|
||||||
score
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, score: Score) -> Score {
|
|
||||||
score
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
use std::marker::PhantomData;
|
|
||||||
|
|
||||||
use columnar::Column;
|
|
||||||
|
|
||||||
use crate::collector::sort_key::NaturalComparator;
|
|
||||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
|
|
||||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
|
||||||
use crate::{DocId, Score, SegmentReader};
|
|
||||||
|
|
||||||
/// Sorts by a fast value (u64, i64, f64, bool).
|
|
||||||
///
|
|
||||||
/// The field must appear explicitly in the schema, with the right type, and declared as
|
|
||||||
/// a fast field..
|
|
||||||
///
|
|
||||||
/// If the field is multivalued, only the first value is considered.
|
|
||||||
///
|
|
||||||
/// Documents that do not have this value are still considered.
|
|
||||||
/// Their sort key will simply be `None`.
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct SortByStaticFastValue<T: FastValue> {
|
|
||||||
field: String,
|
|
||||||
typ: PhantomData<T>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: FastValue> SortByStaticFastValue<T> {
|
|
||||||
/// Creates a new `SortByStaticFastValue` instance for the given field.
|
|
||||||
pub fn for_field(column_name: impl ToString) -> SortByStaticFastValue<T> {
|
|
||||||
Self {
|
|
||||||
field: column_name.to_string(),
|
|
||||||
typ: PhantomData,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: FastValue> SortKeyComputer for SortByStaticFastValue<T> {
|
|
||||||
type Child = SortByFastValueSegmentSortKeyComputer<T>;
|
|
||||||
type SortKey = Option<T>;
|
|
||||||
type Comparator = NaturalComparator;
|
|
||||||
|
|
||||||
fn check_schema(&self, schema: &crate::schema::Schema) -> crate::Result<()> {
|
|
||||||
// At the segment sort key computer level, we rely on the u64 representation.
|
|
||||||
// The mapping is monotonic, so it is sufficient to compute our top-K docs.
|
|
||||||
let field = schema.get_field(&self.field)?;
|
|
||||||
let field_entry = schema.get_field_entry(field);
|
|
||||||
if !field_entry.is_fast() {
|
|
||||||
return Err(crate::TantivyError::SchemaError(format!(
|
|
||||||
"Field `{}` is not a fast field.",
|
|
||||||
self.field,
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
let schema_type = field_entry.field_type().value_type();
|
|
||||||
if schema_type != T::to_type() {
|
|
||||||
return Err(crate::TantivyError::SchemaError(format!(
|
|
||||||
"Field `{}` is of type {schema_type:?}, not of the type {:?}.",
|
|
||||||
&self.field,
|
|
||||||
T::to_type()
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(
|
|
||||||
&self,
|
|
||||||
segment_reader: &SegmentReader,
|
|
||||||
) -> crate::Result<Self::Child> {
|
|
||||||
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
|
|
||||||
let (sort_column, _sort_column_type) =
|
|
||||||
sort_column_opt.ok_or_else(|| FastFieldNotAvailableError {
|
|
||||||
field_name: self.field.clone(),
|
|
||||||
})?;
|
|
||||||
Ok(SortByFastValueSegmentSortKeyComputer {
|
|
||||||
sort_column,
|
|
||||||
typ: PhantomData,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct SortByFastValueSegmentSortKeyComputer<T> {
|
|
||||||
sort_column: Column<u64>,
|
|
||||||
typ: PhantomData<T>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: FastValue> SegmentSortKeyComputer for SortByFastValueSegmentSortKeyComputer<T> {
|
|
||||||
type SortKey = Option<T>;
|
|
||||||
type SegmentSortKey = Option<u64>;
|
|
||||||
type SegmentComparator = NaturalComparator;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Self::SegmentSortKey {
|
|
||||||
self.sort_column.first(doc)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
|
|
||||||
sort_key.map(T::from_u64)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
use columnar::StrColumn;
|
|
||||||
|
|
||||||
use crate::collector::sort_key::NaturalComparator;
|
|
||||||
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
|
|
||||||
use crate::termdict::TermOrdinal;
|
|
||||||
use crate::{DocId, Score};
|
|
||||||
|
|
||||||
/// Sort by the first value of a string column.
///
/// The string can be dynamic (coming from a json field)
/// or static (being specifically defined in the configuration).
///
/// If the field is multivalued, only the first value is considered.
///
/// Documents that do not have this value are still considered.
/// Their sort key will simply be `None`.
#[derive(Debug, Clone)]
pub struct SortByString {
    /// Name of the string column to sort by.
    column_name: String,
}

impl SortByString {
    /// Creates a new sort by string sort key computer.
    pub fn for_field(column_name: impl ToString) -> Self {
        let column_name = column_name.to_string();
        Self { column_name }
    }
}
|
|
||||||
|
|
||||||
impl SortKeyComputer for SortByString {
|
|
||||||
type SortKey = Option<String>;
|
|
||||||
type Child = ByStringColumnSegmentSortKeyComputer;
|
|
||||||
type Comparator = NaturalComparator;
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(
|
|
||||||
&self,
|
|
||||||
segment_reader: &crate::SegmentReader,
|
|
||||||
) -> crate::Result<Self::Child> {
|
|
||||||
let str_column_opt = segment_reader.fast_fields().str(&self.column_name)?;
|
|
||||||
Ok(ByStringColumnSegmentSortKeyComputer { str_column_opt })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct ByStringColumnSegmentSortKeyComputer {
|
|
||||||
str_column_opt: Option<StrColumn>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
|
|
||||||
type SortKey = Option<String>;
|
|
||||||
type SegmentSortKey = Option<TermOrdinal>;
|
|
||||||
type SegmentComparator = NaturalComparator;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Option<TermOrdinal> {
|
|
||||||
let str_column = self.str_column_opt.as_ref()?;
|
|
||||||
str_column.ords().first(doc)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, term_ord_opt: Option<TermOrdinal>) -> Option<String> {
|
|
||||||
// TODO: Individual lookups to the dictionary like this are very likely to repeatedly
|
|
||||||
// decompress the same blocks. See https://github.com/quickwit-oss/tantivy/issues/2776
|
|
||||||
let term_ord = term_ord_opt?;
|
|
||||||
let str_column = self.str_column_opt.as_ref()?;
|
|
||||||
let mut bytes = Vec::new();
|
|
||||||
str_column
|
|
||||||
.dictionary()
|
|
||||||
.ord_to_term(term_ord, &mut bytes)
|
|
||||||
.ok()?;
|
|
||||||
String::try_from(bytes).ok()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,643 +0,0 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
|
|
||||||
use crate::collector::sort_key::{Comparator, NaturalComparator};
|
|
||||||
use crate::collector::sort_key_top_collector::TopBySortKeySegmentCollector;
|
|
||||||
use crate::collector::{default_collect_segment_impl, SegmentCollector as _, TopNComputer};
|
|
||||||
use crate::schema::Schema;
|
|
||||||
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
|
|
||||||
|
|
||||||
/// A `SegmentSortKeyComputer` makes it possible to modify the default score
|
|
||||||
/// for a given document belonging to a specific segment.
|
|
||||||
///
|
|
||||||
/// It is the segment local version of the [`SortKeyComputer`].
|
|
||||||
pub trait SegmentSortKeyComputer: 'static {
|
|
||||||
/// The final score being emitted.
|
|
||||||
type SortKey: 'static + Send + Sync + Clone;
|
|
||||||
|
|
||||||
/// Sort key used by at the segment level by the `SegmentSortKeyComputer`.
|
|
||||||
///
|
|
||||||
/// It is typically small like a `u64`, and is meant to be converted
|
|
||||||
/// to the final score at the end of the collection of the segment.
|
|
||||||
type SegmentSortKey: 'static + Clone + Send + Sync + Clone;
|
|
||||||
|
|
||||||
/// Comparator type.
|
|
||||||
type SegmentComparator: Comparator<Self::SegmentSortKey> + 'static;
|
|
||||||
|
|
||||||
/// Returns the segment sort key comparator.
|
|
||||||
fn segment_comparator(&self) -> Self::SegmentComparator {
|
|
||||||
Self::SegmentComparator::default()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Computes the sort key for the given document and score.
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey;
|
|
||||||
|
|
||||||
/// Computes the sort key and pushes the document in a TopN Computer.
|
|
||||||
///
|
|
||||||
/// When using a tuple as the sorting key, the sort key is evaluated in a lazy manner.
|
|
||||||
#[inline(always)]
|
|
||||||
fn compute_sort_key_and_collect<C: Comparator<Self::SegmentSortKey>>(
|
|
||||||
&mut self,
|
|
||||||
doc: DocId,
|
|
||||||
score: Score,
|
|
||||||
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
|
|
||||||
) {
|
|
||||||
let sort_key = self.segment_sort_key(doc, score);
|
|
||||||
top_n_computer.push(sort_key, doc);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A SegmentSortKeyComputer maps to a SegmentSortKey, but it can also decide on
|
|
||||||
/// its ordering.
|
|
||||||
///
|
|
||||||
/// This method must be consistent with the `SortKey` ordering.
|
|
||||||
#[inline(always)]
|
|
||||||
fn compare_segment_sort_key(
|
|
||||||
&self,
|
|
||||||
left: &Self::SegmentSortKey,
|
|
||||||
right: &Self::SegmentSortKey,
|
|
||||||
) -> Ordering {
|
|
||||||
self.segment_comparator().compare(left, right)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Implementing this method makes it possible to avoid computing
|
|
||||||
/// a sort_key entirely if we can assess that it won't pass a threshold
|
|
||||||
/// with a partial computation.
|
|
||||||
///
|
|
||||||
/// This is currently used for lexicographic sorting.
|
|
||||||
fn accept_sort_key_lazy(
|
|
||||||
&mut self,
|
|
||||||
doc_id: DocId,
|
|
||||||
score: Score,
|
|
||||||
threshold: &Self::SegmentSortKey,
|
|
||||||
) -> Option<(Ordering, Self::SegmentSortKey)> {
|
|
||||||
let sort_key = self.segment_sort_key(doc_id, score);
|
|
||||||
let cmp = self.compare_segment_sort_key(&sort_key, threshold);
|
|
||||||
if cmp == Ordering::Less {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some((cmp, sort_key))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Convert a segment level sort key into the global sort key.
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `SortKeyComputer` defines the sort key to be used by a TopK Collector.
|
|
||||||
///
|
|
||||||
/// The `SortKeyComputer` itself does not make much of the computation itself.
|
|
||||||
/// Instead, it helps constructing `Self::Child` instances that will compute
|
|
||||||
/// the sort key at a segment scale.
|
|
||||||
pub trait SortKeyComputer: Sync {
|
|
||||||
/// The sort key type.
|
|
||||||
type SortKey: 'static + Send + Sync + Clone + std::fmt::Debug;
|
|
||||||
/// Type of the associated [`SegmentSortKeyComputer`].
|
|
||||||
type Child: SegmentSortKeyComputer<SortKey = Self::SortKey>;
|
|
||||||
/// Comparator type.
|
|
||||||
type Comparator: Comparator<Self::SortKey>
|
|
||||||
+ Comparator<<Self::Child as SegmentSortKeyComputer>::SegmentSortKey>
|
|
||||||
+ 'static;
|
|
||||||
|
|
||||||
/// Checks whether the schema is compatible with the sort key computer.
|
|
||||||
fn check_schema(&self, _schema: &Schema) -> crate::Result<()> {
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the sort key comparator.
|
|
||||||
fn comparator(&self) -> Self::Comparator {
|
|
||||||
Self::Comparator::default()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Indicates whether the sort key actually uses the similarity score (by default BM25).
|
|
||||||
/// If set to false, the similary score might not be computed (as an optimization),
|
|
||||||
/// and the score fed in the segment sort key computer could take any value.
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Sorting by score has a overriding implementation for BM25 scores, using Block-WAND.
|
|
||||||
fn collect_segment_top_k(
|
|
||||||
&self,
|
|
||||||
k: usize,
|
|
||||||
weight: &dyn crate::query::Weight,
|
|
||||||
reader: &crate::SegmentReader,
|
|
||||||
segment_ord: u32,
|
|
||||||
) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
|
|
||||||
let with_scoring = self.requires_scoring();
|
|
||||||
let segment_sort_key_computer = self.segment_sort_key_computer(reader)?;
|
|
||||||
let topn_computer = TopNComputer::new_with_comparator(k, self.comparator());
|
|
||||||
let mut segment_top_key_collector = TopBySortKeySegmentCollector {
|
|
||||||
topn_computer,
|
|
||||||
segment_ord,
|
|
||||||
segment_sort_key_computer,
|
|
||||||
};
|
|
||||||
default_collect_segment_impl(&mut segment_top_key_collector, weight, reader, with_scoring)?;
|
|
||||||
Ok(segment_top_key_collector.harvest())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Builds a child sort key computer for a specific segment.
|
|
||||||
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<HeadSortKeyComputer, TailSortKeyComputer> SortKeyComputer
|
|
||||||
for (HeadSortKeyComputer, TailSortKeyComputer)
|
|
||||||
where
|
|
||||||
HeadSortKeyComputer: SortKeyComputer,
|
|
||||||
TailSortKeyComputer: SortKeyComputer,
|
|
||||||
{
|
|
||||||
type SortKey = (HeadSortKeyComputer::SortKey, TailSortKeyComputer::SortKey);
|
|
||||||
type Child = (HeadSortKeyComputer::Child, TailSortKeyComputer::Child);
|
|
||||||
|
|
||||||
type Comparator = (
|
|
||||||
HeadSortKeyComputer::Comparator,
|
|
||||||
TailSortKeyComputer::Comparator,
|
|
||||||
);
|
|
||||||
|
|
||||||
fn comparator(&self) -> Self::Comparator {
|
|
||||||
(self.0.comparator(), self.1.comparator())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
Ok((
|
|
||||||
self.0.segment_sort_key_computer(segment_reader)?,
|
|
||||||
self.1.segment_sort_key_computer(segment_reader)?,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Checks whether the schema is compatible with the sort key computer.
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)?;
|
|
||||||
self.1.check_schema(schema)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Indicates whether the sort key actually uses the similarity score (by default BM25).
|
|
||||||
/// If set to false, the similary score might not be computed (as an optimization),
|
|
||||||
/// and the score fed in the segment sort key computer could take any value.
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring() || self.1.requires_scoring()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<HeadSegmentSortKeyComputer, TailSegmentSortKeyComputer> SegmentSortKeyComputer
|
|
||||||
for (HeadSegmentSortKeyComputer, TailSegmentSortKeyComputer)
|
|
||||||
where
|
|
||||||
HeadSegmentSortKeyComputer: SegmentSortKeyComputer,
|
|
||||||
TailSegmentSortKeyComputer: SegmentSortKeyComputer,
|
|
||||||
{
|
|
||||||
type SortKey = (
|
|
||||||
HeadSegmentSortKeyComputer::SortKey,
|
|
||||||
TailSegmentSortKeyComputer::SortKey,
|
|
||||||
);
|
|
||||||
type SegmentSortKey = (
|
|
||||||
HeadSegmentSortKeyComputer::SegmentSortKey,
|
|
||||||
TailSegmentSortKeyComputer::SegmentSortKey,
|
|
||||||
);
|
|
||||||
|
|
||||||
type SegmentComparator = (
|
|
||||||
HeadSegmentSortKeyComputer::SegmentComparator,
|
|
||||||
TailSegmentSortKeyComputer::SegmentComparator,
|
|
||||||
);
|
|
||||||
|
|
||||||
/// A SegmentSortKeyComputer maps to a SegmentSortKey, but it can also decide on
|
|
||||||
/// its ordering.
|
|
||||||
///
|
|
||||||
/// By default, it uses the natural ordering.
|
|
||||||
#[inline]
|
|
||||||
fn compare_segment_sort_key(
|
|
||||||
&self,
|
|
||||||
left: &Self::SegmentSortKey,
|
|
||||||
right: &Self::SegmentSortKey,
|
|
||||||
) -> Ordering {
|
|
||||||
self.0
|
|
||||||
.compare_segment_sort_key(&left.0, &right.0)
|
|
||||||
.then_with(|| self.1.compare_segment_sort_key(&left.1, &right.1))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn compute_sort_key_and_collect<C: Comparator<Self::SegmentSortKey>>(
|
|
||||||
&mut self,
|
|
||||||
doc: DocId,
|
|
||||||
score: Score,
|
|
||||||
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
|
|
||||||
) {
|
|
||||||
let sort_key: Self::SegmentSortKey;
|
|
||||||
if let Some(threshold) = &top_n_computer.threshold {
|
|
||||||
if let Some((_cmp, lazy_sort_key)) = self.accept_sort_key_lazy(doc, score, threshold) {
|
|
||||||
sort_key = lazy_sort_key;
|
|
||||||
} else {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
sort_key = self.segment_sort_key(doc, score);
|
|
||||||
};
|
|
||||||
top_n_computer.append_doc(doc, sort_key);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
|
|
||||||
let head_sort_key = self.0.segment_sort_key(doc, score);
|
|
||||||
let tail_sort_key = self.1.segment_sort_key(doc, score);
|
|
||||||
(head_sort_key, tail_sort_key)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn accept_sort_key_lazy(
|
|
||||||
&mut self,
|
|
||||||
doc_id: DocId,
|
|
||||||
score: Score,
|
|
||||||
threshold: &Self::SegmentSortKey,
|
|
||||||
) -> Option<(Ordering, Self::SegmentSortKey)> {
|
|
||||||
let (head_threshold, tail_threshold) = threshold;
|
|
||||||
let (head_cmp, head_sort_key) =
|
|
||||||
self.0.accept_sort_key_lazy(doc_id, score, head_threshold)?;
|
|
||||||
if head_cmp == Ordering::Equal {
|
|
||||||
let (tail_cmp, tail_sort_key) =
|
|
||||||
self.1.accept_sort_key_lazy(doc_id, score, tail_threshold)?;
|
|
||||||
Some((tail_cmp, (head_sort_key, tail_sort_key)))
|
|
||||||
} else {
|
|
||||||
let tail_sort_key = self.1.segment_sort_key(doc_id, score);
|
|
||||||
Some((head_cmp, (head_sort_key, tail_sort_key)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
|
|
||||||
let (head_sort_key, tail_sort_key) = sort_key;
|
|
||||||
(
|
|
||||||
self.0.convert_segment_sort_key(head_sort_key),
|
|
||||||
self.1.convert_segment_sort_key(tail_sort_key),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This struct is used as an adapter to take a sort key computer and map its score to another
|
|
||||||
/// new sort key.
|
|
||||||
pub struct MappedSegmentSortKeyComputer<T, PreviousSortKey, NewSortKey> {
|
|
||||||
sort_key_computer: T,
|
|
||||||
map: fn(PreviousSortKey) -> NewSortKey,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T, PreviousScore, NewScore> SegmentSortKeyComputer
|
|
||||||
for MappedSegmentSortKeyComputer<T, PreviousScore, NewScore>
|
|
||||||
where
|
|
||||||
T: SegmentSortKeyComputer<SortKey = PreviousScore>,
|
|
||||||
PreviousScore: 'static + Clone + Send + Sync,
|
|
||||||
NewScore: 'static + Clone + Send + Sync,
|
|
||||||
{
|
|
||||||
type SortKey = NewScore;
|
|
||||||
type SegmentSortKey = T::SegmentSortKey;
|
|
||||||
type SegmentComparator = T::SegmentComparator;
|
|
||||||
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
|
|
||||||
self.sort_key_computer.segment_sort_key(doc, score)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn accept_sort_key_lazy(
|
|
||||||
&mut self,
|
|
||||||
doc_id: DocId,
|
|
||||||
score: Score,
|
|
||||||
threshold: &Self::SegmentSortKey,
|
|
||||||
) -> Option<(Ordering, Self::SegmentSortKey)> {
|
|
||||||
self.sort_key_computer
|
|
||||||
.accept_sort_key_lazy(doc_id, score, threshold)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn compute_sort_key_and_collect<C: Comparator<Self::SegmentSortKey>>(
|
|
||||||
&mut self,
|
|
||||||
doc: DocId,
|
|
||||||
score: Score,
|
|
||||||
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
|
|
||||||
) {
|
|
||||||
self.sort_key_computer
|
|
||||||
.compute_sort_key_and_collect(doc, score, top_n_computer);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn convert_segment_sort_key(&self, segment_sort_key: Self::SegmentSortKey) -> Self::SortKey {
|
|
||||||
(self.map)(
|
|
||||||
self.sort_key_computer
|
|
||||||
.convert_segment_sort_key(segment_sort_key),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// We then re-use our (head, tail) implementation and our mapper by treating any tuple (a, b, c,
|
|
||||||
// ...) as the chain (a, (b, (c, ...)))
|
|
||||||
|
|
||||||
impl<SortKeyComputer1, SortKeyComputer2, SortKeyComputer3> SortKeyComputer
|
|
||||||
for (SortKeyComputer1, SortKeyComputer2, SortKeyComputer3)
|
|
||||||
where
|
|
||||||
SortKeyComputer1: SortKeyComputer,
|
|
||||||
SortKeyComputer2: SortKeyComputer,
|
|
||||||
SortKeyComputer3: SortKeyComputer,
|
|
||||||
{
|
|
||||||
type SortKey = (
|
|
||||||
SortKeyComputer1::SortKey,
|
|
||||||
SortKeyComputer2::SortKey,
|
|
||||||
SortKeyComputer3::SortKey,
|
|
||||||
);
|
|
||||||
type Child = MappedSegmentSortKeyComputer<
|
|
||||||
<(SortKeyComputer1, (SortKeyComputer2, SortKeyComputer3)) as SortKeyComputer>::Child,
|
|
||||||
(
|
|
||||||
SortKeyComputer1::SortKey,
|
|
||||||
(SortKeyComputer2::SortKey, SortKeyComputer3::SortKey),
|
|
||||||
),
|
|
||||||
Self::SortKey,
|
|
||||||
>;
|
|
||||||
|
|
||||||
type Comparator = (
|
|
||||||
SortKeyComputer1::Comparator,
|
|
||||||
SortKeyComputer2::Comparator,
|
|
||||||
SortKeyComputer3::Comparator,
|
|
||||||
);
|
|
||||||
|
|
||||||
fn comparator(&self) -> Self::Comparator {
|
|
||||||
(
|
|
||||||
self.0.comparator(),
|
|
||||||
self.1.comparator(),
|
|
||||||
self.2.comparator(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
|
|
||||||
let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
|
|
||||||
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
|
|
||||||
let map = |(sort_key1, (sort_key2, sort_key3))| (sort_key1, sort_key2, sort_key3);
|
|
||||||
Ok(MappedSegmentSortKeyComputer {
|
|
||||||
sort_key_computer: (sort_key_computer1, (sort_key_computer2, sort_key_computer3)),
|
|
||||||
map,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)?;
|
|
||||||
self.1.check_schema(schema)?;
|
|
||||||
self.2.check_schema(schema)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<SortKeyComputer1, SortKeyComputer2, SortKeyComputer3, SortKeyComputer4> SortKeyComputer
|
|
||||||
for (
|
|
||||||
SortKeyComputer1,
|
|
||||||
SortKeyComputer2,
|
|
||||||
SortKeyComputer3,
|
|
||||||
SortKeyComputer4,
|
|
||||||
)
|
|
||||||
where
|
|
||||||
SortKeyComputer1: SortKeyComputer,
|
|
||||||
SortKeyComputer2: SortKeyComputer,
|
|
||||||
SortKeyComputer3: SortKeyComputer,
|
|
||||||
SortKeyComputer4: SortKeyComputer,
|
|
||||||
{
|
|
||||||
type Child = MappedSegmentSortKeyComputer<
|
|
||||||
<(
|
|
||||||
SortKeyComputer1,
|
|
||||||
(SortKeyComputer2, (SortKeyComputer3, SortKeyComputer4)),
|
|
||||||
) as SortKeyComputer>::Child,
|
|
||||||
(
|
|
||||||
SortKeyComputer1::SortKey,
|
|
||||||
(
|
|
||||||
SortKeyComputer2::SortKey,
|
|
||||||
(SortKeyComputer3::SortKey, SortKeyComputer4::SortKey),
|
|
||||||
),
|
|
||||||
),
|
|
||||||
Self::SortKey,
|
|
||||||
>;
|
|
||||||
type SortKey = (
|
|
||||||
SortKeyComputer1::SortKey,
|
|
||||||
SortKeyComputer2::SortKey,
|
|
||||||
SortKeyComputer3::SortKey,
|
|
||||||
SortKeyComputer4::SortKey,
|
|
||||||
);
|
|
||||||
type Comparator = (
|
|
||||||
SortKeyComputer1::Comparator,
|
|
||||||
SortKeyComputer2::Comparator,
|
|
||||||
SortKeyComputer3::Comparator,
|
|
||||||
SortKeyComputer4::Comparator,
|
|
||||||
);
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
|
|
||||||
let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
|
|
||||||
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
|
|
||||||
let sort_key_computer4 = self.3.segment_sort_key_computer(segment_reader)?;
|
|
||||||
Ok(MappedSegmentSortKeyComputer {
|
|
||||||
sort_key_computer: (
|
|
||||||
sort_key_computer1,
|
|
||||||
(sort_key_computer2, (sort_key_computer3, sort_key_computer4)),
|
|
||||||
),
|
|
||||||
map: |(sort_key1, (sort_key2, (sort_key3, sort_key4)))| {
|
|
||||||
(sort_key1, sort_key2, sort_key3, sort_key4)
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.0.check_schema(schema)?;
|
|
||||||
self.1.check_schema(schema)?;
|
|
||||||
self.2.check_schema(schema)?;
|
|
||||||
self.3.check_schema(schema)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring()
|
|
||||||
|| self.1.requires_scoring()
|
|
||||||
|| self.2.requires_scoring()
|
|
||||||
|| self.3.requires_scoring()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<F, SegmentF, TSortKey> SortKeyComputer for F
|
|
||||||
where
|
|
||||||
F: 'static + Send + Sync + Fn(&SegmentReader) -> SegmentF,
|
|
||||||
SegmentF: 'static + FnMut(DocId) -> TSortKey,
|
|
||||||
TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug,
|
|
||||||
{
|
|
||||||
type SortKey = TSortKey;
|
|
||||||
type Child = SegmentF;
|
|
||||||
type Comparator = NaturalComparator;
|
|
||||||
|
|
||||||
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
Ok((self)(segment_reader))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<F, TSortKey> SegmentSortKeyComputer for F
|
|
||||||
where
|
|
||||||
F: 'static + FnMut(DocId) -> TSortKey,
|
|
||||||
TSortKey: 'static + PartialOrd + Clone + Send + Sync,
|
|
||||||
{
|
|
||||||
type SortKey = TSortKey;
|
|
||||||
type SegmentSortKey = TSortKey;
|
|
||||||
type SegmentComparator = NaturalComparator;
|
|
||||||
|
|
||||||
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> TSortKey {
|
|
||||||
(self)(doc)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Convert a segment level score into the global level score.
|
|
||||||
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
|
|
||||||
sort_key
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use std::cmp::Ordering;
    use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
    use std::sync::Arc;

    use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
    use crate::schema::Schema;
    use crate::{DocId, Index, Order, SegmentReader};

    /// Creates an in-RAM index holding one empty document in a single committed segment.
    fn build_test_index() -> Index {
        let index = Index::create_in_ram(Schema::builder().build());
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer
            .add_document(crate::TantivyDocument::default())
            .unwrap();
        index_writer.commit().unwrap();
        index
    }

    /// The secondary key counts its own invocations, so that the test can check when
    /// the tail of a `(head, tail)` computer is actually evaluated.
    #[test]
    fn test_lazy_score_computer() {
        let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
        let call_count = Arc::new(AtomicUsize::new(0));
        let call_count_clone = call_count.clone();
        let score_computer_secondary = move |_segment_reader: &SegmentReader| {
            let call_count_new_clone = call_count_clone.clone();
            move |_doc: DocId| {
                call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);
                "b"
            }
        };
        let lazy_score_computer = (score_computer_primary, score_computer_secondary);
        let index = build_test_index();
        let searcher = index.reader().unwrap().searcher();
        let mut segment_sort_key_computer = lazy_score_computer
            .segment_sort_key_computer(searcher.segment_reader(0))
            .unwrap();
        let expected_sort_key: (u32, &str) = (200, "b");

        // (threshold, expected ordering if accepted, cumulated calls to the secondary key)
        let cases: [((u32, &str), Option<Ordering>, usize); 7] = [
            ((100u32, "a"), Some(Ordering::Greater), 1),
            ((100u32, "c"), Some(Ordering::Greater), 2),
            ((200u32, "a"), Some(Ordering::Greater), 3),
            ((200u32, "c"), None, 4),
            ((300u32, "a"), None, 4),
            ((300u32, "c"), None, 4),
            (expected_sort_key, Some(Ordering::Equal), 5),
        ];
        for &(threshold, expected_ordering, expected_calls) in &cases {
            let sort_key_opt =
                segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &threshold);
            assert_eq!(
                sort_key_opt,
                expected_ordering.map(|ordering| (ordering, expected_sort_key)),
                "threshold: {:?}",
                threshold
            );
            assert_eq!(
                call_count.load(AtomicOrdering::SeqCst),
                expected_calls,
                "threshold: {:?}",
                threshold
            );
        }
    }

    /// Same scenario as above, with an explicit `Order` attached to each key.
    #[test]
    fn test_lazy_score_computer_dynamic_ordering() {
        let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
        let call_count = Arc::new(AtomicUsize::new(0));
        let call_count_clone = call_count.clone();
        let score_computer_secondary = move |_segment_reader: &SegmentReader| {
            let call_count_new_clone = call_count_clone.clone();
            move |_doc: DocId| {
                call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);
                2u32
            }
        };
        let lazy_score_computer = (
            (score_computer_primary, Order::Desc),
            (score_computer_secondary, Order::Asc),
        );
        let index = build_test_index();
        let searcher = index.reader().unwrap().searcher();
        let mut segment_sort_key_computer = lazy_score_computer
            .segment_sort_key_computer(searcher.segment_reader(0))
            .unwrap();
        let expected_sort_key: (u32, u32) = (200, 2u32);

        // (threshold, expected ordering if accepted, cumulated calls to the secondary key)
        let cases: [((u32, u32), Option<Ordering>, usize); 7] = [
            ((100u32, 1u32), Some(Ordering::Greater), 1),
            ((100u32, 3u32), Some(Ordering::Greater), 2),
            ((200u32, 1u32), None, 3),
            ((200u32, 3u32), Some(Ordering::Greater), 4),
            ((300u32, 1u32), None, 4),
            ((300u32, 3u32), None, 4),
            (expected_sort_key, Some(Ordering::Equal), 5),
        ];
        for &(threshold, expected_ordering, expected_calls) in &cases {
            let sort_key_opt =
                segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &threshold);
            assert_eq!(
                sort_key_opt,
                expected_ordering.map(|ordering| (ordering, expected_sort_key)),
                "threshold: {:?}",
                threshold
            );
            assert_eq!(
                call_count.load(AtomicOrdering::SeqCst),
                expected_calls,
                "threshold: {:?}",
                threshold
            );
        }

        assert_eq!(
            segment_sort_key_computer.convert_segment_sort_key(expected_sort_key),
            (200u32, 2u32)
        );
    }
}
|
|
||||||
@@ -1,193 +0,0 @@
|
|||||||
use std::ops::Range;
|
|
||||||
|
|
||||||
use crate::collector::sort_key::{Comparator, SegmentSortKeyComputer, SortKeyComputer};
|
|
||||||
use crate::collector::{Collector, SegmentCollector, TopNComputer};
|
|
||||||
use crate::query::Weight;
|
|
||||||
use crate::schema::Schema;
|
|
||||||
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
|
|
||||||
|
|
||||||
/// Collector returning the documents that rank within `doc_range` according to the
/// sort key produced by `sort_key_computer`.
pub(crate) struct TopBySortKeyCollector<TSortKeyComputer> {
    /// Produces the sort keys and the comparator used to rank documents.
    sort_key_computer: TSortKeyComputer,
    /// Range of ranks to return: `start` hits are skipped, `end` bounds the top-k size.
    doc_range: Range<usize>,
}

impl<TSortKeyComputer> TopBySortKeyCollector<TSortKeyComputer> {
    /// Creates a collector from a sort key computer and the range of ranks to return.
    pub fn new(sort_key_computer: TSortKeyComputer, doc_range: Range<usize>) -> Self {
        Self {
            sort_key_computer,
            doc_range,
        }
    }
}
|
|
||||||
|
|
||||||
impl<TSortKeyComputer> Collector for TopBySortKeyCollector<TSortKeyComputer>
|
|
||||||
where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static
|
|
||||||
{
|
|
||||||
type Fruit = Vec<(TSortKeyComputer::SortKey, DocAddress)>;
|
|
||||||
|
|
||||||
type Child =
|
|
||||||
TopBySortKeySegmentCollector<TSortKeyComputer::Child, TSortKeyComputer::Comparator>;
|
|
||||||
|
|
||||||
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
|
|
||||||
self.sort_key_computer.check_schema(schema)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn for_segment(&self, segment_ord: u32, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
let segment_sort_key_computer = self
|
|
||||||
.sort_key_computer
|
|
||||||
.segment_sort_key_computer(segment_reader)?;
|
|
||||||
let topn_computer = TopNComputer::new_with_comparator(
|
|
||||||
self.doc_range.end,
|
|
||||||
self.sort_key_computer.comparator(),
|
|
||||||
);
|
|
||||||
Ok(TopBySortKeySegmentCollector {
|
|
||||||
topn_computer,
|
|
||||||
segment_ord,
|
|
||||||
segment_sort_key_computer,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.sort_key_computer.requires_scoring()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
|
||||||
Ok(merge_top_k(
|
|
||||||
segment_fruits.into_iter().flatten(),
|
|
||||||
self.doc_range.clone(),
|
|
||||||
self.sort_key_computer.comparator(),
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn collect_segment(
|
|
||||||
&self,
|
|
||||||
weight: &dyn Weight,
|
|
||||||
segment_ord: u32,
|
|
||||||
reader: &SegmentReader,
|
|
||||||
) -> crate::Result<Vec<(TSortKeyComputer::SortKey, DocAddress)>> {
|
|
||||||
let k = self.doc_range.end;
|
|
||||||
let docs = self
|
|
||||||
.sort_key_computer
|
|
||||||
.collect_segment_top_k(k, weight, reader, segment_ord)?;
|
|
||||||
Ok(docs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_top_k<D: Ord, TSortKey: Clone + std::fmt::Debug, C: Comparator<TSortKey>>(
|
|
||||||
sort_key_docs: impl Iterator<Item = (TSortKey, D)>,
|
|
||||||
doc_range: Range<usize>,
|
|
||||||
comparator: C,
|
|
||||||
) -> Vec<(TSortKey, D)> {
|
|
||||||
if doc_range.is_empty() {
|
|
||||||
return Vec::new();
|
|
||||||
}
|
|
||||||
let mut top_collector: TopNComputer<TSortKey, D, C> =
|
|
||||||
TopNComputer::new_with_comparator(doc_range.end, comparator);
|
|
||||||
for (sort_key, doc) in sort_key_docs {
|
|
||||||
top_collector.push(sort_key, doc);
|
|
||||||
}
|
|
||||||
top_collector
|
|
||||||
.into_sorted_vec()
|
|
||||||
.into_iter()
|
|
||||||
.skip(doc_range.start)
|
|
||||||
.map(|cdoc| (cdoc.sort_key, cdoc.doc))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct TopBySortKeySegmentCollector<TSegmentSortKeyComputer, C>
|
|
||||||
where
|
|
||||||
TSegmentSortKeyComputer: SegmentSortKeyComputer,
|
|
||||||
C: Comparator<TSegmentSortKeyComputer::SegmentSortKey>,
|
|
||||||
{
|
|
||||||
pub(crate) topn_computer: TopNComputer<TSegmentSortKeyComputer::SegmentSortKey, DocId, C>,
|
|
||||||
pub(crate) segment_ord: u32,
|
|
||||||
pub(crate) segment_sort_key_computer: TSegmentSortKeyComputer,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<TSegmentSortKeyComputer, C> SegmentCollector
|
|
||||||
for TopBySortKeySegmentCollector<TSegmentSortKeyComputer, C>
|
|
||||||
where
|
|
||||||
TSegmentSortKeyComputer: 'static + SegmentSortKeyComputer,
|
|
||||||
C: Comparator<TSegmentSortKeyComputer::SegmentSortKey> + 'static,
|
|
||||||
{
|
|
||||||
type Fruit = Vec<(TSegmentSortKeyComputer::SortKey, DocAddress)>;
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
|
||||||
self.segment_sort_key_computer.compute_sort_key_and_collect(
|
|
||||||
doc,
|
|
||||||
score,
|
|
||||||
&mut self.topn_computer,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> Self::Fruit {
|
|
||||||
let segment_ord = self.segment_ord;
|
|
||||||
let segment_hits: Vec<(TSegmentSortKeyComputer::SortKey, DocAddress)> = self
|
|
||||||
.topn_computer
|
|
||||||
.into_vec()
|
|
||||||
.into_iter()
|
|
||||||
.map(|comparable_doc| {
|
|
||||||
let sort_key = self
|
|
||||||
.segment_sort_key_computer
|
|
||||||
.convert_segment_sort_key(comparable_doc.sort_key);
|
|
||||||
(
|
|
||||||
sort_key,
|
|
||||||
DocAddress {
|
|
||||||
segment_ord,
|
|
||||||
doc_id: comparable_doc.doc,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
segment_hits
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use std::ops::Range;

    use rand;
    use rand::seq::SliceRandom as _;

    use super::merge_top_k;
    use crate::collector::sort_key::ComparatorEnum;
    use crate::Order;

    /// Shuffles the pairs `(0.0, 0)..=(9.0, 9)`, merges them with the given order and
    /// range, and checks the outcome against `expected`.
    fn test_merge_top_k_aux(
        order: Order,
        doc_range: Range<usize>,
        expected: &[(crate::Score, usize)],
    ) {
        let mut shuffled: Vec<(crate::Score, usize)> = Vec::with_capacity(10);
        for val in 0..10usize {
            shuffled.push((val as f32, val));
        }
        shuffled.shuffle(&mut rand::thread_rng());
        let comparator = ComparatorEnum::from(order);
        let vals_merged = merge_top_k(shuffled.into_iter(), doc_range, comparator);
        assert_eq!(&vals_merged, expected);
    }

    #[test]
    fn test_merge_top_k() {
        // Empty ranges yield nothing, wherever they start.
        test_merge_top_k_aux(Order::Asc, 0..0, &[]);
        test_merge_top_k_aux(Order::Asc, 3..3, &[]);

        test_merge_top_k_aux(Order::Asc, 0..3, &[(0.0f32, 0), (1.0f32, 1), (2.0f32, 2)]);

        // A range wider than the input returns every entry, in ascending order.
        let all_ascending: Vec<(crate::Score, usize)> =
            (0..10usize).map(|val| (val as f32, val)).collect();
        test_merge_top_k_aux(Order::Asc, 0..11, &all_ascending);

        // A non-zero start skips the best entries.
        test_merge_top_k_aux(Order::Asc, 1..3, &[(1.0f32, 1), (2.0f32, 2)]);
        test_merge_top_k_aux(Order::Desc, 0..2, &[(9.0f32, 9), (8.0f32, 8)]);
        test_merge_top_k_aux(Order::Desc, 2..4, &[(7.0f32, 7), (6.0f32, 6)]);
    }
}
|
|
||||||
@@ -40,7 +40,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
|||||||
let filter_some_collector = FilterCollector::new(
|
let filter_some_collector = FilterCollector::new(
|
||||||
"price".to_string(),
|
"price".to_string(),
|
||||||
&|value: u64| value > 20_120u64,
|
&|value: u64| value > 20_120u64,
|
||||||
TopDocs::with_limit(2).order_by_score(),
|
TopDocs::with_limit(2),
|
||||||
);
|
);
|
||||||
let top_docs = searcher.search(&query, &filter_some_collector)?;
|
let top_docs = searcher.search(&query, &filter_some_collector)?;
|
||||||
|
|
||||||
@@ -50,7 +50,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
|||||||
let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(
|
let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(
|
||||||
"price".to_string(),
|
"price".to_string(),
|
||||||
&|value| value < 5u64,
|
&|value| value < 5u64,
|
||||||
TopDocs::with_limit(2).order_by_score(),
|
TopDocs::with_limit(2),
|
||||||
);
|
);
|
||||||
let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
|
let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
|
||||||
|
|
||||||
@@ -62,11 +62,8 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
|||||||
> 0
|
> 0
|
||||||
}
|
}
|
||||||
|
|
||||||
let filter_dates_collector = FilterCollector::new(
|
let filter_dates_collector =
|
||||||
"date".to_string(),
|
FilterCollector::new("date".to_string(), &date_filter, TopDocs::with_limit(5));
|
||||||
&date_filter,
|
|
||||||
TopDocs::with_limit(5).order_by_score(),
|
|
||||||
);
|
|
||||||
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
|
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
|
||||||
|
|
||||||
assert_eq!(filtered_date_docs.len(), 2);
|
assert_eq!(filtered_date_docs.len(), 2);
|
||||||
|
|||||||
@@ -1,22 +1,374 @@
|
|||||||
|
use std::cmp::Ordering;
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use super::top_score_collector::TopNComputer;
|
||||||
|
use crate::index::SegmentReader;
|
||||||
|
use crate::{DocAddress, DocId, SegmentOrdinal};
|
||||||
|
|
||||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||||
///
|
///
|
||||||
/// Used only by TopNComputer, which implements the actual comparison via a `Comparator`.
|
/// It guarantees stable sorting: in case of a tie on the feature, the document
|
||||||
#[derive(Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
|
/// address is used.
|
||||||
pub struct ComparableDoc<T, D> {
|
///
|
||||||
|
/// The REVERSE_ORDER generic parameter controls whether the by-feature order
|
||||||
|
/// should be reversed, which is useful for achieving for example largest-first
|
||||||
|
/// semantics without having to wrap the feature in a `Reverse`.
|
||||||
|
#[derive(Clone, Default, Serialize, Deserialize)]
|
||||||
|
pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
|
||||||
/// The feature of the document. In practice, this is
|
/// The feature of the document. In practice, this is
|
||||||
/// is a type which can be compared with a `Comparator<T>`.
|
/// is any type that implements `PartialOrd`.
|
||||||
pub sort_key: T,
|
pub feature: T,
|
||||||
/// The document address. In practice, this is either a `DocId` or `DocAddress`.
|
/// The document address. In practice, this is any
|
||||||
|
/// type that implements `PartialOrd`, and is guaranteed
|
||||||
|
/// to be unique for each document.
|
||||||
pub doc: D,
|
pub doc: D,
|
||||||
}
|
}
|
||||||
|
impl<T: std::fmt::Debug, D: std::fmt::Debug, const R: bool> std::fmt::Debug
|
||||||
impl<T: std::fmt::Debug, D: std::fmt::Debug> std::fmt::Debug for ComparableDoc<T, D> {
|
for ComparableDoc<T, D, R>
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
{
|
||||||
f.debug_struct("ComparableDoc")
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
.field("feature", &self.sort_key)
|
f.debug_struct(format!("ComparableDoc<_, _ {R}").as_str())
|
||||||
|
.field("feature", &self.feature)
|
||||||
.field("doc", &self.doc)
|
.field("doc", &self.doc)
|
||||||
.finish()
|
.finish()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T: PartialOrd, D: PartialOrd, const R: bool> PartialOrd for ComparableDoc<T, D, R> {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: PartialOrd, D: PartialOrd, const R: bool> Ord for ComparableDoc<T, D, R> {
|
||||||
|
#[inline]
|
||||||
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
|
let by_feature = self
|
||||||
|
.feature
|
||||||
|
.partial_cmp(&other.feature)
|
||||||
|
.map(|ord| if R { ord.reverse() } else { ord })
|
||||||
|
.unwrap_or(Ordering::Equal);
|
||||||
|
|
||||||
|
let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
|
||||||
|
|
||||||
|
// In case of a tie on the feature, we sort by ascending
|
||||||
|
// `DocAddress` in order to ensure a stable sorting of the
|
||||||
|
// documents.
|
||||||
|
by_feature.then_with(lazy_by_doc_address)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: PartialOrd, D: PartialOrd, const R: bool> PartialEq for ComparableDoc<T, D, R> {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.cmp(other) == Ordering::Equal
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Marker impl: `Eq` adds no methods; equality itself comes from the `PartialEq` impl.
impl<T: PartialOrd, D: PartialOrd, const R: bool> Eq for ComparableDoc<T, D, R> {}
|
||||||
|
|
||||||
|
/// Configuration of a top-K collection: how many documents to return and
/// how many leading documents to skip.
///
/// `T` is the type of the feature the documents are ranked by. No value of
/// type `T` is stored; the parameter is only carried through `PhantomData`.
pub(crate) struct TopCollector<T> {
    /// Maximum number of documents returned.
    pub limit: usize,
    /// Number of leading documents skipped before the returned ones.
    pub offset: usize,
    _marker: PhantomData<T>,
}
|
||||||
|
|
||||||
|
impl<T> TopCollector<T>
|
||||||
|
where T: PartialOrd + Clone
|
||||||
|
{
|
||||||
|
/// Creates a top collector, with a number of documents equal to "limit".
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
/// The method panics if limit is 0
|
||||||
|
pub fn with_limit(limit: usize) -> TopCollector<T> {
|
||||||
|
assert!(limit >= 1, "Limit must be strictly greater than 0.");
|
||||||
|
Self {
|
||||||
|
limit,
|
||||||
|
offset: 0,
|
||||||
|
_marker: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Skip the first "offset" documents when collecting.
|
||||||
|
///
|
||||||
|
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
|
||||||
|
/// Lucene's TopDocsCollector.
|
||||||
|
pub fn and_offset(mut self, offset: usize) -> TopCollector<T> {
|
||||||
|
self.offset = offset;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn merge_fruits(
|
||||||
|
&self,
|
||||||
|
children: Vec<Vec<(T, DocAddress)>>,
|
||||||
|
) -> crate::Result<Vec<(T, DocAddress)>> {
|
||||||
|
if self.limit == 0 {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
let mut top_collector: TopNComputer<_, _> = TopNComputer::new(self.limit + self.offset);
|
||||||
|
for child_fruit in children {
|
||||||
|
for (feature, doc) in child_fruit {
|
||||||
|
top_collector.push(feature, doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(top_collector
|
||||||
|
.into_sorted_vec()
|
||||||
|
.into_iter()
|
||||||
|
.skip(self.offset)
|
||||||
|
.map(|cdoc| (cdoc.feature, cdoc.doc))
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn for_segment<F: PartialOrd + Clone>(
|
||||||
|
&self,
|
||||||
|
segment_id: SegmentOrdinal,
|
||||||
|
_: &SegmentReader,
|
||||||
|
) -> TopSegmentCollector<F> {
|
||||||
|
TopSegmentCollector::new(segment_id, self.limit + self.offset)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new TopCollector with the same limit and offset.
|
||||||
|
///
|
||||||
|
/// Ideally we would use Into but the blanket implementation seems to cause the Scorer traits
|
||||||
|
/// to fail.
|
||||||
|
#[doc(hidden)]
|
||||||
|
pub(crate) fn into_tscore<TScore: PartialOrd + Clone>(self) -> TopCollector<TScore> {
|
||||||
|
TopCollector {
|
||||||
|
limit: self.limit,
|
||||||
|
offset: self.offset,
|
||||||
|
_marker: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The Top Collector keeps track of the K documents
|
||||||
|
/// sorted by type `T`.
|
||||||
|
///
|
||||||
|
/// The implementation is based on a repeatedly truncating on the median after K * 2 documents
|
||||||
|
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||||
|
/// is `O(n + K)`.
|
||||||
|
pub(crate) struct TopSegmentCollector<T> {
|
||||||
|
/// We reverse the order of the feature in order to
|
||||||
|
/// have top-semantics instead of bottom semantics.
|
||||||
|
topn_computer: TopNComputer<T, DocId>,
|
||||||
|
segment_ord: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||||
|
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
|
||||||
|
TopSegmentCollector {
|
||||||
|
topn_computer: TopNComputer::new(limit),
|
||||||
|
segment_ord,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||||
|
pub fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||||
|
let segment_ord = self.segment_ord;
|
||||||
|
self.topn_computer
|
||||||
|
.into_sorted_vec()
|
||||||
|
.into_iter()
|
||||||
|
.map(|comparable_doc| {
|
||||||
|
(
|
||||||
|
comparable_doc.feature,
|
||||||
|
DocAddress {
|
||||||
|
segment_ord,
|
||||||
|
doc_id: comparable_doc.doc,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collects a document scored by the given feature
|
||||||
|
///
|
||||||
|
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
|
||||||
|
/// will compare the lowest scoring item with the given one and keep whichever is greater.
|
||||||
|
#[inline]
|
||||||
|
pub fn collect(&mut self, doc: DocId, feature: T) {
|
||||||
|
self.topn_computer.push(feature, doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::{TopCollector, TopSegmentCollector};
    use crate::{DocAddress, DocId};

    /// Shorthand for the address of document `doc_id` in segment 0.
    fn addr(doc_id: DocId) -> DocAddress {
        DocAddress::new(0, doc_id)
    }

    #[test]
    fn test_top_collector_not_at_capacity() {
        let mut segment_collector = TopSegmentCollector::new(0, 4);
        for (doc, score) in [(1, 0.8), (3, 0.2), (5, 0.3)] {
            segment_collector.collect(doc, score);
        }

        let expected = vec![(0.8, addr(1)), (0.3, addr(5)), (0.2, addr(3))];
        assert_eq!(segment_collector.harvest(), expected);
    }

    #[test]
    fn test_top_collector_at_capacity() {
        let mut segment_collector = TopSegmentCollector::new(0, 4);
        for (doc, score) in [(1, 0.8), (3, 0.2), (5, 0.3), (7, 0.9), (9, -0.2)] {
            segment_collector.collect(doc, score);
        }

        // The lowest score (-0.2) does not fit into the 4 available slots.
        let expected = vec![
            (0.9, addr(7)),
            (0.8, addr(1)),
            (0.3, addr(5)),
            (0.2, addr(3)),
        ];
        assert_eq!(segment_collector.harvest(), expected);
    }

    #[test]
    fn test_top_segment_collector_stable_ordering_for_equal_feature() {
        // given that the documents are collected in ascending doc id order,
        // when harvesting we have to guarantee stable sorting in case of a tie
        // on the score
        let score = 3.3f32;
        let harvest_with_limit = |limit: usize| {
            let mut segment_collector = TopSegmentCollector::new(0, limit);
            for doc in [4, 5, 6] {
                segment_collector.collect(doc, score);
            }
            segment_collector.harvest()
        };

        let top_2 = harvest_with_limit(2);
        let top_3 = harvest_with_limit(3);

        assert_eq!(top_2, top_3[..2].to_vec());
    }

    #[test]
    fn test_top_collector_with_limit_and_offset() {
        let collector = TopCollector::with_limit(2).and_offset(1);
        let segment_fruit = vec![
            (0.9, addr(1)),
            (0.8, addr(2)),
            (0.7, addr(3)),
            (0.6, addr(4)),
            (0.5, addr(5)),
        ];

        let results = collector.merge_fruits(vec![segment_fruit]).unwrap();

        assert_eq!(results, vec![(0.8, addr(2)), (0.7, addr(3))]);
    }

    #[test]
    fn test_top_collector_with_limit_larger_than_set_and_offset() {
        let collector = TopCollector::with_limit(2).and_offset(1);
        let segment_fruit = vec![(0.9, addr(1)), (0.8, addr(2))];

        let results = collector.merge_fruits(vec![segment_fruit]).unwrap();

        assert_eq!(results, vec![(0.8, addr(2))]);
    }

    #[test]
    fn test_top_collector_with_limit_and_offset_larger_than_set() {
        let collector = TopCollector::with_limit(2).and_offset(20);
        let segment_fruit = vec![(0.9, addr(1)), (0.8, addr(2))];

        let results = collector.merge_fruits(vec![segment_fruit]).unwrap();

        assert_eq!(results, vec![]);
    }
}
|
||||||
|
|
||||||
|
#[cfg(all(test, feature = "unstable"))]
mod bench {
    use test::Bencher;

    use super::TopSegmentCollector;

    /// Pushes 100 tied documents per iteration into a collector of capacity 400.
    #[bench]
    fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
        let mut segment_collector = TopSegmentCollector::new(0, 400);

        b.iter(|| (0..100).for_each(|doc| segment_collector.collect(doc, 0.8)));
    }

    /// Pushes 100 tied documents per iteration into a collector of capacity
    /// 100 that already received 100 documents.
    #[bench]
    fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
        let mut segment_collector = TopSegmentCollector::new(0, 100);
        (0..100).for_each(|doc| segment_collector.collect(doc, 0.8));

        b.iter(|| (0..100).for_each(|doc| segment_collector.collect(doc, 0.8)));
    }

    /// Collects 100 documents sharing one score, then harvests.
    #[bench]
    fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
        b.iter(|| {
            let mut segment_collector = TopSegmentCollector::new(0, 100);
            (0..100).for_each(|doc| segment_collector.collect(doc, 0.8));

            // it would be nice to be able to do the setup N times but still
            // measure only harvest(). We can't since harvest() consumes
            // the top_collector.
            segment_collector.harvest()
        });
    }

    /// Collects 100 documents with strictly increasing scores, then harvests.
    #[bench]
    fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
        b.iter(|| {
            let mut segment_collector = TopSegmentCollector::new(0, 100);
            let mut current_score = 1.0;

            for doc in 0..100 {
                current_score += 1.0;
                segment_collector.collect(doc, current_score);
            }

            // it would be nice to be able to do the setup N times but still
            // measure only harvest(). We can't since harvest() consumes
            // the top_collector.
            segment_collector.harvest()
        });
    }
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
124
src/collector/tweak_score_top_collector.rs
Normal file
124
src/collector/tweak_score_top_collector.rs
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
|
||||||
|
use crate::collector::{Collector, SegmentCollector};
|
||||||
|
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
|
||||||
|
|
||||||
|
pub(crate) struct TweakedScoreTopCollector<TScoreTweaker, TScore = Score> {
|
||||||
|
score_tweaker: TScoreTweaker,
|
||||||
|
collector: TopCollector<TScore>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TScoreTweaker, TScore> TweakedScoreTopCollector<TScoreTweaker, TScore>
|
||||||
|
where TScore: Clone + PartialOrd
|
||||||
|
{
|
||||||
|
pub fn new(
|
||||||
|
score_tweaker: TScoreTweaker,
|
||||||
|
collector: TopCollector<TScore>,
|
||||||
|
) -> TweakedScoreTopCollector<TScoreTweaker, TScore> {
|
||||||
|
TweakedScoreTopCollector {
|
||||||
|
score_tweaker,
|
||||||
|
collector,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A `ScoreSegmentTweaker` makes it possible to modify the default score
|
||||||
|
/// for a given document belonging to a specific segment.
|
||||||
|
///
|
||||||
|
/// It is the segment local version of the [`ScoreTweaker`].
|
||||||
|
pub trait ScoreSegmentTweaker<TScore>: 'static {
|
||||||
|
/// Tweak the given `score` for the document `doc`.
|
||||||
|
fn score(&mut self, doc: DocId, score: Score) -> TScore;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `ScoreTweaker` makes it possible to tweak the score
|
||||||
|
/// emitted by the scorer into another one.
|
||||||
|
///
|
||||||
|
/// The `ScoreTweaker` itself does not make much of the computation itself.
|
||||||
|
/// Instead, it helps constructing `Self::Child` instances that will compute
|
||||||
|
/// the score at a segment scale.
|
||||||
|
pub trait ScoreTweaker<TScore>: Sync {
|
||||||
|
/// Type of the associated [`ScoreSegmentTweaker`].
|
||||||
|
type Child: ScoreSegmentTweaker<TScore>;
|
||||||
|
|
||||||
|
/// Builds a child tweaker for a specific segment. The child scorer is associated with
|
||||||
|
/// a specific segment.
|
||||||
|
fn segment_tweaker(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore>
|
||||||
|
where
|
||||||
|
TScoreTweaker: ScoreTweaker<TScore> + Send + Sync,
|
||||||
|
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||||
|
{
|
||||||
|
type Fruit = Vec<(TScore, DocAddress)>;
|
||||||
|
|
||||||
|
type Child = TopTweakedScoreSegmentCollector<TScoreTweaker::Child, TScore>;
|
||||||
|
|
||||||
|
fn for_segment(
|
||||||
|
&self,
|
||||||
|
segment_local_id: u32,
|
||||||
|
segment_reader: &SegmentReader,
|
||||||
|
) -> Result<Self::Child> {
|
||||||
|
let segment_scorer = self.score_tweaker.segment_tweaker(segment_reader)?;
|
||||||
|
let segment_collector = self.collector.for_segment(segment_local_id, segment_reader);
|
||||||
|
Ok(TopTweakedScoreSegmentCollector {
|
||||||
|
segment_collector,
|
||||||
|
segment_scorer,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||||
|
self.collector.merge_fruits(segment_fruits)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct TopTweakedScoreSegmentCollector<TSegmentScoreTweaker, TScore>
|
||||||
|
where
|
||||||
|
TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
|
||||||
|
TSegmentScoreTweaker: ScoreSegmentTweaker<TScore>,
|
||||||
|
{
|
||||||
|
segment_collector: TopSegmentCollector<TScore>,
|
||||||
|
segment_scorer: TSegmentScoreTweaker,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TSegmentScoreTweaker, TScore> SegmentCollector
|
||||||
|
for TopTweakedScoreSegmentCollector<TSegmentScoreTweaker, TScore>
|
||||||
|
where
|
||||||
|
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||||
|
TSegmentScoreTweaker: 'static + ScoreSegmentTweaker<TScore>,
|
||||||
|
{
|
||||||
|
type Fruit = Vec<(TScore, DocAddress)>;
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
|
let score = self.segment_scorer.score(doc, score);
|
||||||
|
self.segment_collector.collect(doc, score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest(self) -> Vec<(TScore, DocAddress)> {
|
||||||
|
self.segment_collector.harvest()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<F, TScore, TSegmentScoreTweaker> ScoreTweaker<TScore> for F
|
||||||
|
where
|
||||||
|
F: 'static + Send + Sync + Fn(&SegmentReader) -> TSegmentScoreTweaker,
|
||||||
|
TSegmentScoreTweaker: ScoreSegmentTweaker<TScore>,
|
||||||
|
{
|
||||||
|
type Child = TSegmentScoreTweaker;
|
||||||
|
|
||||||
|
fn segment_tweaker(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
||||||
|
Ok((self)(segment_reader))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<F, TScore> ScoreSegmentTweaker<TScore> for F
|
||||||
|
where F: 'static + FnMut(DocId, Score) -> TScore
|
||||||
|
{
|
||||||
|
fn score(&mut self, doc: DocId, score: Score) -> TScore {
|
||||||
|
(self)(doc, score)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -69,7 +69,7 @@ fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecis
|
|||||||
.parse_query("dateformat")
|
.parse_query("dateformat")
|
||||||
.expect("Failed to parse query");
|
.expect("Failed to parse query");
|
||||||
let top_docs = searcher
|
let top_docs = searcher
|
||||||
.search(&query, &TopDocs::with_limit(1).order_by_score())
|
.search(&query, &TopDocs::with_limit(1))
|
||||||
.expect("Search failed");
|
.expect("Search failed");
|
||||||
|
|
||||||
assert_eq!(top_docs.len(), 1, "Expected 1 search result");
|
assert_eq!(top_docs.len(), 1, "Expected 1 search result");
|
||||||
|
|||||||
@@ -48,15 +48,7 @@ impl Executor {
|
|||||||
F: Sized + Sync + Fn(A) -> crate::Result<R>,
|
F: Sized + Sync + Fn(A) -> crate::Result<R>,
|
||||||
{
|
{
|
||||||
match self {
|
match self {
|
||||||
Executor::SingleThread => {
|
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
|
||||||
// Avoid `collect`, since the stacktrace is blown up by it, which makes profiling
|
|
||||||
// harder.
|
|
||||||
let mut result = Vec::with_capacity(args.size_hint().0);
|
|
||||||
for arg in args {
|
|
||||||
result.push(f(arg)?);
|
|
||||||
}
|
|
||||||
Ok(result)
|
|
||||||
}
|
|
||||||
Executor::ThreadPool(pool) => {
|
Executor::ThreadPool(pool) => {
|
||||||
let args: Vec<A> = args.collect();
|
let args: Vec<A> = args.collect();
|
||||||
let num_fruits = args.len();
|
let num_fruits = args.len();
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
|||||||
use common::{replace_in_place, JsonPathWriter};
|
use common::{replace_in_place, JsonPathWriter};
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
|
|
||||||
use crate::indexer::indexing_term::IndexingTerm;
|
|
||||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||||
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
|
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
|
||||||
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
|
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
|
||||||
@@ -78,7 +77,7 @@ fn index_json_object<'a, V: Value<'a>>(
|
|||||||
doc: DocId,
|
doc: DocId,
|
||||||
json_visitor: V::ObjectIter,
|
json_visitor: V::ObjectIter,
|
||||||
text_analyzer: &mut TextAnalyzer,
|
text_analyzer: &mut TextAnalyzer,
|
||||||
term_buffer: &mut IndexingTerm,
|
term_buffer: &mut Term,
|
||||||
json_path_writer: &mut JsonPathWriter,
|
json_path_writer: &mut JsonPathWriter,
|
||||||
postings_writer: &mut dyn PostingsWriter,
|
postings_writer: &mut dyn PostingsWriter,
|
||||||
ctx: &mut IndexingContext,
|
ctx: &mut IndexingContext,
|
||||||
@@ -108,17 +107,17 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
|||||||
doc: DocId,
|
doc: DocId,
|
||||||
json_value: V,
|
json_value: V,
|
||||||
text_analyzer: &mut TextAnalyzer,
|
text_analyzer: &mut TextAnalyzer,
|
||||||
term_buffer: &mut IndexingTerm,
|
term_buffer: &mut Term,
|
||||||
json_path_writer: &mut JsonPathWriter,
|
json_path_writer: &mut JsonPathWriter,
|
||||||
postings_writer: &mut dyn PostingsWriter,
|
postings_writer: &mut dyn PostingsWriter,
|
||||||
ctx: &mut IndexingContext,
|
ctx: &mut IndexingContext,
|
||||||
positions_per_path: &mut IndexingPositionsPerPath,
|
positions_per_path: &mut IndexingPositionsPerPath,
|
||||||
) {
|
) {
|
||||||
let set_path_id = |term_buffer: &mut IndexingTerm, unordered_id: u32| {
|
let set_path_id = |term_buffer: &mut Term, unordered_id: u32| {
|
||||||
term_buffer.truncate_value_bytes(0);
|
term_buffer.truncate_value_bytes(0);
|
||||||
term_buffer.append_bytes(&unordered_id.to_be_bytes());
|
term_buffer.append_bytes(&unordered_id.to_be_bytes());
|
||||||
};
|
};
|
||||||
let set_type = |term_buffer: &mut IndexingTerm, typ: Type| {
|
let set_type = |term_buffer: &mut Term, typ: Type| {
|
||||||
term_buffer.append_bytes(&[typ.to_code()]);
|
term_buffer.append_bytes(&[typ.to_code()]);
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -406,7 +405,7 @@ mod tests {
|
|||||||
let mut term = Term::from_field_json_path(field, "color", false);
|
let mut term = Term::from_field_json_path(field, "color", false);
|
||||||
term.append_type_and_str("red");
|
term.append_type_and_str("red");
|
||||||
|
|
||||||
assert_eq!(term.serialized_value_bytes(), b"color\x00sred".to_vec())
|
assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred")
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -416,8 +415,8 @@ mod tests {
|
|||||||
term.append_type_and_fast_value(-4i64);
|
term.append_type_and_fast_value(-4i64);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
term.serialized_value_bytes(),
|
term.serialized_term(),
|
||||||
b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc".to_vec()
|
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -428,8 +427,8 @@ mod tests {
|
|||||||
term.append_type_and_fast_value(4u64);
|
term.append_type_and_fast_value(4u64);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
term.serialized_value_bytes(),
|
term.serialized_term(),
|
||||||
b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04".to_vec()
|
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -439,8 +438,8 @@ mod tests {
|
|||||||
let mut term = Term::from_field_json_path(field, "color", false);
|
let mut term = Term::from_field_json_path(field, "color", false);
|
||||||
term.append_type_and_fast_value(4.0f64);
|
term.append_type_and_fast_value(4.0f64);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
term.serialized_value_bytes(),
|
term.serialized_term(),
|
||||||
b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00".to_vec()
|
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -450,8 +449,8 @@ mod tests {
|
|||||||
let mut term = Term::from_field_json_path(field, "color", false);
|
let mut term = Term::from_field_json_path(field, "color", false);
|
||||||
term.append_type_and_fast_value(true);
|
term.append_type_and_fast_value(true);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
term.serialized_value_bytes(),
|
term.serialized_term(),
|
||||||
b"color\x00o\x00\x00\x00\x00\x00\x00\x00\x01".to_vec()
|
b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -225,7 +225,6 @@ impl Searcher {
|
|||||||
enabled_scoring: EnableScoring,
|
enabled_scoring: EnableScoring,
|
||||||
) -> crate::Result<C::Fruit> {
|
) -> crate::Result<C::Fruit> {
|
||||||
let weight = query.weight(enabled_scoring)?;
|
let weight = query.weight(enabled_scoring)?;
|
||||||
collector.check_schema(self.schema())?;
|
|
||||||
let segment_readers = self.segment_readers();
|
let segment_readers = self.segment_readers();
|
||||||
let fruits = executor.map(
|
let fruits = executor.map(
|
||||||
|(segment_ord, segment_reader)| {
|
|(segment_ord, segment_reader)| {
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use std::ops::Range;
|
|||||||
use common::{BinarySerializable, CountingWriter, HasLen, VInt};
|
use common::{BinarySerializable, CountingWriter, HasLen, VInt};
|
||||||
|
|
||||||
use crate::directory::{FileSlice, TerminatingWrite, WritePtr};
|
use crate::directory::{FileSlice, TerminatingWrite, WritePtr};
|
||||||
use crate::schema::{Field, Schema};
|
use crate::schema::Field;
|
||||||
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||||
@@ -167,11 +167,10 @@ impl CompositeFile {
|
|||||||
.map(|byte_range| self.data.slice(byte_range.clone()))
|
.map(|byte_range| self.data.slice(byte_range.clone()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
|
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||||
let mut fields = Vec::new();
|
let mut fields = Vec::new();
|
||||||
for (&field_addr, byte_range) in &self.offsets_index {
|
for (&field_addr, byte_range) in &self.offsets_index {
|
||||||
let field_name = schema.get_field_name(field_addr.field).to_string();
|
let mut field_usage = FieldUsage::empty(field_addr.field);
|
||||||
let mut field_usage = FieldUsage::empty(field_name);
|
|
||||||
field_usage.add_field_idx(field_addr.idx, byte_range.len().into());
|
field_usage.add_field_idx(field_addr.idx, byte_range.len().into());
|
||||||
fields.push(field_usage);
|
fields.push(field_usage);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -108,7 +108,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
|||||||
/// Opens a file and returns a boxed `FileHandle`.
|
/// Opens a file and returns a boxed `FileHandle`.
|
||||||
///
|
///
|
||||||
/// Users of `Directory` should typically call `Directory::open_read(...)`,
|
/// Users of `Directory` should typically call `Directory::open_read(...)`,
|
||||||
/// while `Directory` implementer should implement `get_file_handle()`.
|
/// while `Directory` implementor should implement `get_file_handle()`.
|
||||||
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError>;
|
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError>;
|
||||||
|
|
||||||
/// Once a virtual file is open, its data may not
|
/// Once a virtual file is open, its data may not
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
mod file_watcher;
|
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::fs::{self, File, OpenOptions};
|
use std::fs::{self, File, OpenOptions};
|
||||||
@@ -9,7 +7,6 @@ use std::path::{Path, PathBuf};
|
|||||||
use std::sync::{Arc, RwLock, Weak};
|
use std::sync::{Arc, RwLock, Weak};
|
||||||
|
|
||||||
use common::StableDeref;
|
use common::StableDeref;
|
||||||
use file_watcher::FileWatcher;
|
|
||||||
use fs4::fs_std::FileExt;
|
use fs4::fs_std::FileExt;
|
||||||
#[cfg(all(feature = "mmap", unix))]
|
#[cfg(all(feature = "mmap", unix))]
|
||||||
pub use memmap2::Advice;
|
pub use memmap2::Advice;
|
||||||
@@ -21,6 +18,7 @@ use crate::core::META_FILEPATH;
|
|||||||
use crate::directory::error::{
|
use crate::directory::error::{
|
||||||
DeleteError, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
|
DeleteError, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
|
||||||
};
|
};
|
||||||
|
use crate::directory::file_watcher::FileWatcher;
|
||||||
use crate::directory::{
|
use crate::directory::{
|
||||||
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
|
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
|
||||||
WatchCallback, WatchHandle, WritePtr,
|
WatchCallback, WatchHandle, WritePtr,
|
||||||
@@ -5,6 +5,7 @@ mod mmap_directory;
|
|||||||
|
|
||||||
mod directory;
|
mod directory;
|
||||||
mod directory_lock;
|
mod directory_lock;
|
||||||
|
mod file_watcher;
|
||||||
pub mod footer;
|
pub mod footer;
|
||||||
mod managed_directory;
|
mod managed_directory;
|
||||||
mod ram_directory;
|
mod ram_directory;
|
||||||
|
|||||||
@@ -40,8 +40,6 @@ pub trait DocSet: Send {
|
|||||||
/// of `DocSet` should support it.
|
/// of `DocSet` should support it.
|
||||||
///
|
///
|
||||||
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
|
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
|
||||||
///
|
|
||||||
/// `target` has to be larger or equal to `.doc()` when calling `seek`.
|
|
||||||
fn seek(&mut self, target: DocId) -> DocId {
|
fn seek(&mut self, target: DocId) -> DocId {
|
||||||
let mut doc = self.doc();
|
let mut doc = self.doc();
|
||||||
debug_assert!(doc <= target);
|
debug_assert!(doc <= target);
|
||||||
@@ -51,33 +49,6 @@ pub trait DocSet: Send {
|
|||||||
doc
|
doc
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Seeks to the target if possible and returns true if the target is in the DocSet.
|
|
||||||
///
|
|
||||||
/// DocSets that already have an efficient `seek` method don't need to implement
|
|
||||||
/// `seek_into_the_danger_zone`. All wrapper DocSets should forward
|
|
||||||
/// `seek_into_the_danger_zone` to the underlying DocSet.
|
|
||||||
///
|
|
||||||
/// ## API Behaviour
|
|
||||||
/// If `seek_into_the_danger_zone` is returning true, a call to `doc()` has to return target.
|
|
||||||
/// If `seek_into_the_danger_zone` is returning false, a call to `doc()` may return any doc
|
|
||||||
/// between the last doc that matched and target or a doc that is a valid next hit after
|
|
||||||
/// target. The DocSet is considered to be in an invalid state until
|
|
||||||
/// `seek_into_the_danger_zone` returns true again.
|
|
||||||
///
|
|
||||||
/// `target` needs to be equal or larger than `doc` when in a valid state.
|
|
||||||
///
|
|
||||||
/// Consecutive calls are not allowed to have decreasing `target` values.
|
|
||||||
///
|
|
||||||
/// # Warning
|
|
||||||
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
|
|
||||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
|
||||||
let current_doc = self.doc();
|
|
||||||
if current_doc < target {
|
|
||||||
self.seek(target);
|
|
||||||
}
|
|
||||||
self.doc() == target
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Fills a given mutable buffer with the next doc ids from the
|
/// Fills a given mutable buffer with the next doc ids from the
|
||||||
/// `DocSet`
|
/// `DocSet`
|
||||||
///
|
///
|
||||||
@@ -123,15 +94,6 @@ pub trait DocSet: Send {
|
|||||||
/// which would be the number of documents in the DocSet.
|
/// which would be the number of documents in the DocSet.
|
||||||
///
|
///
|
||||||
/// By default this returns `size_hint()`.
|
/// By default this returns `size_hint()`.
|
||||||
///
|
|
||||||
/// DocSets may have vastly different cost depending on their type,
|
|
||||||
/// e.g. an intersection with 10 hits is much cheaper than
|
|
||||||
/// a phrase search with 10 hits, since it needs to load positions.
|
|
||||||
///
|
|
||||||
/// ### Future Work
|
|
||||||
/// We may want to differentiate `DocSet` costs more more granular, e.g.
|
|
||||||
/// creation_cost, advance_cost, seek_cost on to get a good estimation
|
|
||||||
/// what query types to choose.
|
|
||||||
fn cost(&self) -> u64 {
|
fn cost(&self) -> u64 {
|
||||||
self.size_hint() as u64
|
self.size_hint() as u64
|
||||||
}
|
}
|
||||||
@@ -175,10 +137,6 @@ impl DocSet for &mut dyn DocSet {
|
|||||||
(**self).seek(target)
|
(**self).seek(target)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
|
||||||
(**self).seek_into_the_danger_zone(target)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn doc(&self) -> u32 {
|
fn doc(&self) -> u32 {
|
||||||
(**self).doc()
|
(**self).doc()
|
||||||
}
|
}
|
||||||
@@ -211,11 +169,6 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
|||||||
unboxed.seek(target)
|
unboxed.seek(target)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
|
||||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
|
||||||
unboxed.seek_into_the_danger_zone(target)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
|
||||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||||
unboxed.fill_buffer(buffer)
|
unboxed.fill_buffer(buffer)
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ pub enum TantivyError {
|
|||||||
#[error("{0:?}")]
|
#[error("{0:?}")]
|
||||||
IncompatibleIndex(Incompatibility),
|
IncompatibleIndex(Incompatibility),
|
||||||
/// An internal error occurred. This is are internal states that should not be reached.
|
/// An internal error occurred. This is are internal states that should not be reached.
|
||||||
/// e.g. a datastructure is incorrectly initialized.
|
/// e.g. a datastructure is incorrectly inititalized.
|
||||||
#[error("Internal error: '{0}'")]
|
#[error("Internal error: '{0}'")]
|
||||||
InternalError(String),
|
InternalError(String),
|
||||||
#[error("Deserialize error: {0}")]
|
#[error("Deserialize error: {0}")]
|
||||||
|
|||||||
@@ -726,22 +726,22 @@ mod tests {
|
|||||||
.column_opt::<DateTime>("multi_date")
|
.column_opt::<DateTime>("multi_date")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
let mut dates = Vec::new();
|
||||||
{
|
{
|
||||||
assert_eq!(date_fast_field.get_val(0).into_timestamp_nanos(), 1i64);
|
assert_eq!(date_fast_field.get_val(0).into_timestamp_nanos(), 1i64);
|
||||||
let dates: Vec<DateTime> = dates_fast_field.values_for_doc(0u32).collect();
|
dates_fast_field.fill_vals(0u32, &mut dates);
|
||||||
assert_eq!(dates.len(), 2);
|
assert_eq!(dates.len(), 2);
|
||||||
assert_eq!(dates[0].into_timestamp_nanos(), 2i64);
|
assert_eq!(dates[0].into_timestamp_nanos(), 2i64);
|
||||||
assert_eq!(dates[1].into_timestamp_nanos(), 3i64);
|
assert_eq!(dates[1].into_timestamp_nanos(), 3i64);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
assert_eq!(date_fast_field.get_val(1).into_timestamp_nanos(), 4i64);
|
assert_eq!(date_fast_field.get_val(1).into_timestamp_nanos(), 4i64);
|
||||||
let dates: Vec<DateTime> = dates_fast_field.values_for_doc(1u32).collect();
|
dates_fast_field.fill_vals(1u32, &mut dates);
|
||||||
assert!(dates.is_empty());
|
assert!(dates.is_empty());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
assert_eq!(date_fast_field.get_val(2).into_timestamp_nanos(), 0i64);
|
assert_eq!(date_fast_field.get_val(2).into_timestamp_nanos(), 0i64);
|
||||||
let dates: Vec<DateTime> = dates_fast_field.values_for_doc(2u32).collect();
|
dates_fast_field.fill_vals(2u32, &mut dates);
|
||||||
assert_eq!(dates.len(), 2);
|
assert_eq!(dates.len(), 2);
|
||||||
assert_eq!(dates[0].into_timestamp_nanos(), 5i64);
|
assert_eq!(dates[0].into_timestamp_nanos(), 5i64);
|
||||||
assert_eq!(dates[1].into_timestamp_nanos(), 6i64);
|
assert_eq!(dates[1].into_timestamp_nanos(), 6i64);
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use columnar::{
|
|||||||
};
|
};
|
||||||
use common::ByteCount;
|
use common::ByteCount;
|
||||||
|
|
||||||
use crate::core::json_utils::{encode_column_name, json_path_sep_to_dot};
|
use crate::core::json_utils::encode_column_name;
|
||||||
use crate::directory::FileSlice;
|
use crate::directory::FileSlice;
|
||||||
use crate::schema::{Field, FieldEntry, FieldType, Schema};
|
use crate::schema::{Field, FieldEntry, FieldType, Schema};
|
||||||
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
||||||
@@ -39,15 +39,19 @@ impl FastFieldReaders {
|
|||||||
self.resolve_column_name_given_default_field(column_name, default_field_opt)
|
self.resolve_column_name_given_default_field(column_name, default_field_opt)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn space_usage(&self) -> io::Result<PerFieldSpaceUsage> {
|
pub(crate) fn space_usage(&self, schema: &Schema) -> io::Result<PerFieldSpaceUsage> {
|
||||||
let mut per_field_usages: Vec<FieldUsage> = Default::default();
|
let mut per_field_usages: Vec<FieldUsage> = Default::default();
|
||||||
for (mut field_name, column_handle) in self.columnar.iter_columns()? {
|
for (field, field_entry) in schema.fields() {
|
||||||
json_path_sep_to_dot(&mut field_name);
|
let column_handles = self.columnar.read_columns(field_entry.name())?;
|
||||||
let space_usage = column_handle.space_usage()?;
|
let num_bytes: ByteCount = column_handles
|
||||||
let mut field_usage = FieldUsage::empty(field_name);
|
.iter()
|
||||||
field_usage.set_column_usage(space_usage);
|
.map(|column_handle| column_handle.num_bytes())
|
||||||
|
.sum();
|
||||||
|
let mut field_usage = FieldUsage::empty(field);
|
||||||
|
field_usage.add_field_idx(0, num_bytes);
|
||||||
per_field_usages.push(field_usage);
|
per_field_usages.push(field_usage);
|
||||||
}
|
}
|
||||||
|
// TODO fix space usage for JSON fields.
|
||||||
Ok(PerFieldSpaceUsage::new(per_field_usages))
|
Ok(PerFieldSpaceUsage::new(per_field_usages))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||||
use crate::directory::{CompositeFile, FileSlice, OwnedBytes};
|
use crate::directory::{CompositeFile, FileSlice, OwnedBytes};
|
||||||
use crate::schema::{Field, Schema};
|
use crate::schema::Field;
|
||||||
use crate::space_usage::PerFieldSpaceUsage;
|
use crate::space_usage::PerFieldSpaceUsage;
|
||||||
use crate::DocId;
|
use crate::DocId;
|
||||||
|
|
||||||
@@ -37,8 +37,8 @@ impl FieldNormReaders {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Return a break down of the space usage per field.
|
/// Return a break down of the space usage per field.
|
||||||
pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
|
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||||
self.data.space_usage(schema)
|
self.data.space_usage()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a handle to inner file
|
/// Returns a handle to inner file
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ use crate::store::Compressor;
|
|||||||
use crate::{Inventory, Opstamp, TrackedObject};
|
use crate::{Inventory, Opstamp, TrackedObject};
|
||||||
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
pub struct DeleteMeta {
|
struct DeleteMeta {
|
||||||
num_deleted_docs: u32,
|
num_deleted_docs: u32,
|
||||||
pub opstamp: Opstamp,
|
opstamp: Opstamp,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Default)]
|
#[derive(Clone, Default)]
|
||||||
@@ -213,7 +213,7 @@ impl SegmentMeta {
|
|||||||
struct InnerSegmentMeta {
|
struct InnerSegmentMeta {
|
||||||
segment_id: SegmentId,
|
segment_id: SegmentId,
|
||||||
max_doc: u32,
|
max_doc: u32,
|
||||||
pub deletes: Option<DeleteMeta>,
|
deletes: Option<DeleteMeta>,
|
||||||
/// If you want to avoid the SegmentComponent::TempStore file to be covered by
|
/// If you want to avoid the SegmentComponent::TempStore file to be covered by
|
||||||
/// garbage collection and deleted, set this to true. This is used during merge.
|
/// garbage collection and deleted, set this to true. This is used during merge.
|
||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
@@ -276,14 +276,13 @@ impl Default for IndexSettings {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The order to sort by
|
/// The order to sort by
|
||||||
#[derive(Clone, Copy, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||||
pub enum Order {
|
pub enum Order {
|
||||||
/// Ascending Order
|
/// Ascending Order
|
||||||
Asc,
|
Asc,
|
||||||
/// Descending Order
|
/// Descending Order
|
||||||
Desc,
|
Desc,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Order {
|
impl Order {
|
||||||
/// return if the Order is ascending
|
/// return if the Order is ascending
|
||||||
pub fn is_asc(&self) -> bool {
|
pub fn is_asc(&self) -> bool {
|
||||||
@@ -404,10 +403,7 @@ mod tests {
|
|||||||
schema_builder.build()
|
schema_builder.build()
|
||||||
};
|
};
|
||||||
let index_metas = IndexMeta {
|
let index_metas = IndexMeta {
|
||||||
index_settings: IndexSettings {
|
index_settings: IndexSettings::default(),
|
||||||
docstore_compression: Compressor::None,
|
|
||||||
..Default::default()
|
|
||||||
},
|
|
||||||
segments: Vec::new(),
|
segments: Vec::new(),
|
||||||
schema,
|
schema,
|
||||||
opstamp: 0u64,
|
opstamp: 0u64,
|
||||||
@@ -416,7 +412,7 @@ mod tests {
|
|||||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
json,
|
json,
|
||||||
r#"{"index_settings":{"docstore_compression":"none","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||||
);
|
);
|
||||||
|
|
||||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||||
@@ -497,8 +493,6 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
#[cfg(feature = "lz4-compression")]
|
#[cfg(feature = "lz4-compression")]
|
||||||
fn test_index_settings_default() {
|
fn test_index_settings_default() {
|
||||||
use crate::store::Compressor;
|
|
||||||
|
|
||||||
let mut index_settings = IndexSettings::default();
|
let mut index_settings = IndexSettings::default();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
index_settings,
|
index_settings,
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ impl Segment {
|
|||||||
///
|
///
|
||||||
/// This method is only used when updating `max_doc` from 0
|
/// This method is only used when updating `max_doc` from 0
|
||||||
/// as we finalize a fresh new segment.
|
/// as we finalize a fresh new segment.
|
||||||
pub fn with_max_doc(self, max_doc: u32) -> Segment {
|
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
|
||||||
Segment {
|
Segment {
|
||||||
index: self.index,
|
index: self.index,
|
||||||
meta: self.meta.with_max_doc(max_doc),
|
meta: self.meta.with_max_doc(max_doc),
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user