Compare commits


1 Commit

Author: Paul Masurel
SHA1: b2573a3b16
Message: low cardinality optimisation
Date: 2025-11-19 18:41:10 +01:00
168 changed files with 4275 additions and 9687 deletions

View File

@@ -15,11 +15,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Install Rust
-        run: rustup toolchain install nightly-2025-12-01 --profile minimal --component llvm-tools-preview
+        run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
       - uses: Swatinem/rust-cache@v2
       - uses: taiki-e/install-action@cargo-llvm-cov
       - name: Generate code coverage
-        run: cargo +nightly-2025-12-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
+        run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
         continue-on-error: true

View File

@@ -39,11 +39,11 @@ jobs:
       - name: Check Formatting
         run: cargo +nightly fmt --all -- --check
       - name: Check Stable Compilation
         run: cargo build --all-features
       - name: Check Bench Compilation
         run: cargo +nightly bench --no-run --profile=dev --all-features
@@ -59,10 +59,10 @@ jobs:
     strategy:
       matrix:
-        features:
-          - { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints,stemmer" }
-          - { label: "quickwit", flags: "mmap,quickwit,failpoints" }
-          - { label: "none", flags: "" }
+        features: [
+          { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
+          { label: "quickwit", flags: "mmap,quickwit,failpoints" }
+        ]
     name: test-${{ matrix.features.label}}
@@ -80,21 +80,7 @@ jobs:
       - uses: Swatinem/rust-cache@v2
       - name: Run tests
-        run: |
-          # if matrix.feature.flags is empty then run on --lib to avoid compiling examples
-          # (as most of them rely on mmap) otherwise run all
-          if [ -z "${{ matrix.features.flags }}" ]; then
-            cargo +stable nextest run --lib --no-default-features --verbose --workspace
-          else
-            cargo +stable nextest run --features ${{ matrix.features.flags }} --no-default-features --verbose --workspace
-          fi
+        run: cargo +stable nextest run --features ${{ matrix.features.flags }} --verbose --workspace
       - name: Run doctests
-        run: |
-          # if matrix.feature.flags is empty then run on --lib to avoid compiling examples
-          # (as most of them rely on mmap) otherwise run all
-          if [ -z "${{ matrix.features.flags }}" ]; then
-            echo "no doctest for no feature flag"
-          else
-            cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
-          fi
+        run: cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace

View File

@@ -78,7 +78,7 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
 - **Store DateTime as nanoseconds in doc store** DateTime in the doc store was truncated to microseconds previously. This removes this truncation, while still keeping backwards compatibility. [#2486](https://github.com/quickwit-oss/tantivy/pull/2486)(@PSeitz)
-- **Performance/Memory**
+- **Performace/Memory**
   - lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
   - Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
   - Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
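Context for the Vec-vs-BTreeMap item above: an object value typically holds only a handful of keys, and at that size a linear scan over a flat `Vec` of pairs is cache-friendlier and allocates less than a `BTreeMap`. A hypothetical sketch of the pattern (illustrative only, not the actual `OwnedValue` code):

```rust
// Hypothetical object value backed by a flat Vec of (key, value) pairs.
struct ObjectValue {
    fields: Vec<(String, String)>,
}

impl ObjectValue {
    // For a handful of keys, a linear scan beats a tree's pointer chasing.
    fn get(&self, key: &str) -> Option<&str> {
        self.fields
            .iter()
            .find(|(k, _)| k == key)
            .map(|(_, v)| v.as_str())
    }
}

fn main() {
    let obj = ObjectValue {
        fields: vec![("title".into(), "tantivy".into())],
    };
    assert_eq!(obj.get("title"), Some("tantivy"));
}
```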

View File

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.26.0"
+version = "0.25.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -37,9 +37,9 @@ fs4 = { version = "0.13.1", optional = true }
 levenshtein_automata = "0.2.1"
 uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
-rust-stemmers = { version = "1.2.0", optional = true }
+rust-stemmers = "1.2.0"
 downcast-rs = "2.0.1"
-bitpacking = { version = "0.9.3", default-features = false, features = [
+bitpacking = { version = "0.9.2", default-features = false, features = [
   "bitpacker4x",
 ] }
 census = "0.4.2"
@@ -75,12 +75,12 @@ typetag = "0.2.21"
 winapi = "0.3.9"
 [dev-dependencies]
-binggan = "0.14.2"
+binggan = "0.14.0"
 rand = "0.8.5"
 maplit = "1.0.2"
 matches = "0.1.9"
 pretty_assertions = "1.2.1"
-proptest = "1.7.0"
+proptest = "1.0.0"
 test-log = "0.2.10"
 futures = "0.3.21"
 paste = "1.0.11"
@@ -113,8 +113,7 @@ debug-assertions = true
 overflow-checks = true
 [features]
-default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression", "stemmer"]
-stemmer = ["rust-stemmers"]
+default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression"]
 mmap = ["fs4", "tempfile", "memmap2"]
 stopwords = []
@@ -174,18 +173,6 @@ harness = false
 name = "exists_json"
 harness = false
-[[bench]]
-name = "range_query"
-harness = false
 [[bench]]
 name = "and_or_queries"
 harness = false
-[[bench]]
-name = "range_queries"
-harness = false
-[[bench]]
-name = "bool_queries_with_range"
-harness = false
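The `rust-stemmers` hunk above is the standard optional-dependency pattern: with `optional = true` plus a `stemmer = ["rust-stemmers"]` feature, the dependency only compiles when the feature is enabled. A minimal sketch of the consuming side (assuming a crate with that feature wired up; not tantivy's actual module layout):

```rust
// Compiled only when built with `--features stemmer`, which pulls in
// the optional `rust-stemmers` dependency declared in Cargo.toml.
#[cfg(feature = "stemmer")]
fn stem(word: &str) -> String {
    use rust_stemmers::{Algorithm, Stemmer};
    Stemmer::create(Algorithm::English).stem(word).to_string()
}

// Without the feature, fall back to the unstemmed token.
#[cfg(not(feature = "stemmer"))]
fn stem(word: &str) -> String {
    word.to_string()
}
```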

View File

@@ -123,7 +123,6 @@ You can also find other bindings on [GitHub](https://github.com/search?q=tantivy
 - [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer
 - [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby
 - [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API
-- [Bichon](https://github.com/rustmailer/bichon): A lightweight, high-performance Rust email archiver with WebUI
 - and [more](https://github.com/search?q=tantivy)!
 ### On average, how much faster is Tantivy compared to Lucene?

View File

@@ -10,7 +10,7 @@ rename FastFieldReaders::open to load
 remove fast field reader
 find a way to unify the two DateTime.
-re-add type check in the filter wrapper
+readd type check in the filter wrapper
 add unit test on columnar list columns.

View File

@@ -1,6 +1,5 @@
 use binggan::plugins::PeakMemAllocPlugin;
 use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
-use rand::distributions::WeightedIndex;
 use rand::prelude::SliceRandom;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
@@ -54,33 +53,25 @@ fn bench_agg(mut group: InputGroup<Index>) {
     register!(group, stats_f64);
     register!(group, extendedstats_f64);
     register!(group, percentiles_f64);
-    register!(group, terms_7);
-    register!(group, terms_all_unique);
-    register!(group, terms_150_000);
+    register!(group, terms_few);
+    register!(group, terms_many);
     register!(group, terms_many_top_1000);
     register!(group, terms_many_order_by_term);
     register!(group, terms_many_with_top_hits);
-    register!(group, terms_all_unique_with_avg_sub_agg);
     register!(group, terms_many_with_avg_sub_agg);
-    register!(group, terms_status_with_avg_sub_agg);
-    register!(group, terms_status_with_histogram);
-    register!(group, terms_zipf_1000);
-    register!(group, terms_zipf_1000_with_histogram);
-    register!(group, terms_zipf_1000_with_avg_sub_agg);
     register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
     register!(group, cardinality_agg);
-    register!(group, terms_status_with_cardinality_agg);
+    register!(group, terms_few_with_cardinality_agg);
     register!(group, range_agg);
     register!(group, range_agg_with_avg_sub_agg);
-    register!(group, range_agg_with_term_agg_status);
+    register!(group, range_agg_with_term_agg_few);
     register!(group, range_agg_with_term_agg_many);
     register!(group, histogram);
     register!(group, histogram_hard_bounds);
     register!(group, histogram_with_avg_sub_agg);
-    register!(group, histogram_with_term_agg_status);
+    register!(group, histogram_with_term_agg_few);
     register!(group, avg_and_range_with_avg_sub_agg);
     // Filter aggregation benchmarks
@@ -139,12 +130,12 @@ fn extendedstats_f64(index: &Index) {
 }
 fn percentiles_f64(index: &Index) {
     let agg_req = json!({
         "mypercentiles": {
             "percentiles": {
                 "field": "score_f64",
                 "percents": [ 95, 99, 99.9 ]
             }
         }
     });
     execute_agg(index, agg_req);
 }
@@ -159,10 +150,10 @@ fn cardinality_agg(index: &Index) {
     });
     execute_agg(index, agg_req);
 }
-fn terms_status_with_cardinality_agg(index: &Index) {
+fn terms_few_with_cardinality_agg(index: &Index) {
     let agg_req = json!({
         "my_texts": {
-            "terms": { "field": "text_few_terms_status" },
+            "terms": { "field": "text_few_terms" },
             "aggs": {
                 "cardinality": {
                     "cardinality": {
@@ -175,20 +166,13 @@ fn terms_status_with_cardinality_agg(index: &Index) {
     execute_agg(index, agg_req);
 }
-fn terms_7(index: &Index) {
+fn terms_few(index: &Index) {
     let agg_req = json!({
-        "my_texts": { "terms": { "field": "text_few_terms_status" } },
+        "my_texts": { "terms": { "field": "text_few_terms" } },
     });
     execute_agg(index, agg_req);
 }
-fn terms_all_unique(index: &Index) {
-    let agg_req = json!({
-        "my_texts": { "terms": { "field": "text_all_unique_terms" } },
-    });
-    execute_agg(index, agg_req);
-}
-fn terms_150_000(index: &Index) {
+fn terms_many(index: &Index) {
     let agg_req = json!({
         "my_texts": { "terms": { "field": "text_many_terms" } },
     });
@@ -236,72 +220,6 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
     });
     execute_agg(index, agg_req);
 }
-fn terms_all_unique_with_avg_sub_agg(index: &Index) {
-    let agg_req = json!({
-        "my_texts": {
-            "terms": { "field": "text_all_unique_terms" },
-            "aggs": {
-                "average_f64": { "avg": { "field": "score_f64" } }
-            }
-        },
-    });
-    execute_agg(index, agg_req);
-}
-fn terms_status_with_histogram(index: &Index) {
-    let agg_req = json!({
-        "my_texts": {
-            "terms": { "field": "text_few_terms_status" },
-            "aggs": {
-                "histo": {"histogram": { "field": "score_f64", "interval": 10 }}
-            }
-        }
-    });
-    execute_agg(index, agg_req);
-}
-fn terms_zipf_1000_with_histogram(index: &Index) {
-    let agg_req = json!({
-        "my_texts": {
-            "terms": { "field": "text_1000_terms_zipf" },
-            "aggs": {
-                "histo": {"histogram": { "field": "score_f64", "interval": 10 }}
-            }
-        }
-    });
-    execute_agg(index, agg_req);
-}
-fn terms_status_with_avg_sub_agg(index: &Index) {
-    let agg_req = json!({
-        "my_texts": {
-            "terms": { "field": "text_few_terms_status" },
-            "aggs": {
-                "average_f64": { "avg": { "field": "score_f64" } }
-            }
-        },
-    });
-    execute_agg(index, agg_req);
-}
-fn terms_zipf_1000_with_avg_sub_agg(index: &Index) {
-    let agg_req = json!({
-        "my_texts": {
-            "terms": { "field": "text_1000_terms_zipf" },
-            "aggs": {
-                "average_f64": { "avg": { "field": "score_f64" } }
-            }
-        },
-    });
-    execute_agg(index, agg_req);
-}
-fn terms_zipf_1000(index: &Index) {
-    let agg_req = json!({
-        "my_texts": { "terms": { "field": "text_1000_terms_zipf" } },
-    });
-    execute_agg(index, agg_req);
-}
 fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
     let agg_req = json!({
         "my_texts": {
@@ -357,7 +275,7 @@ fn range_agg_with_avg_sub_agg(index: &Index) {
     execute_agg(index, agg_req);
 }
-fn range_agg_with_term_agg_status(index: &Index) {
+fn range_agg_with_term_agg_few(index: &Index) {
     let agg_req = json!({
         "rangef64": {
             "range": {
@@ -372,7 +290,7 @@ fn range_agg_with_term_agg_status(index: &Index) {
             ]
         },
         "aggs": {
-            "my_texts": { "terms": { "field": "text_few_terms_status" } },
+            "my_texts": { "terms": { "field": "text_few_terms" } },
         }
     },
 });
@@ -428,12 +346,12 @@ fn histogram_with_avg_sub_agg(index: &Index) {
     });
     execute_agg(index, agg_req);
 }
-fn histogram_with_term_agg_status(index: &Index) {
+fn histogram_with_term_agg_few(index: &Index) {
     let agg_req = json!({
         "rangef64": {
             "histogram": { "field": "score_f64", "interval": 10 },
             "aggs": {
-                "my_texts": { "terms": { "field": "text_few_terms_status" } }
+                "my_texts": { "terms": { "field": "text_few_terms" } }
             }
         }
     });
@@ -478,13 +396,6 @@ fn get_collector(agg_req: Aggregations) -> AggregationCollector {
 }
 fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
-    // Flag to use existing index
-    let reuse_index = std::env::var("REUSE_AGG_BENCH_INDEX").is_ok();
-    if reuse_index && std::path::Path::new("agg_bench").exists() {
-        return Index::open_in_dir("agg_bench");
-    }
-    // crreate dir
-    std::fs::create_dir_all("agg_bench")?;
     let mut schema_builder = Schema::builder();
     let text_fieldtype = tantivy::schema::TextOptions::default()
         .set_indexing_options(
@@ -493,47 +404,20 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
         .set_stored();
     let text_field = schema_builder.add_text_field("text", text_fieldtype);
     let json_field = schema_builder.add_json_field("json", FAST);
-    let text_field_all_unique_terms =
-        schema_builder.add_text_field("text_all_unique_terms", STRING | FAST);
     let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
-    let text_field_few_terms_status =
-        schema_builder.add_text_field("text_few_terms_status", STRING | FAST);
-    let text_field_1000_terms_zipf =
-        schema_builder.add_text_field("text_1000_terms_zipf", STRING | FAST);
+    let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
     let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
     let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
     let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
     let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
-    // use tmp dir
-    let index = if reuse_index {
-        Index::create_in_dir("agg_bench", schema_builder.build())?
-    } else {
-        Index::create_from_tempdir(schema_builder.build())?
-    };
-    // Approximate log proportions
-    let status_field_data = [
-        ("INFO", 8000),
-        ("ERROR", 300),
-        ("WARN", 1200),
-        ("DEBUG", 500),
-        ("OK", 500),
-        ("CRITICAL", 20),
-        ("EMERGENCY", 1),
-    ];
-    let log_level_distribution =
-        WeightedIndex::new(status_field_data.iter().map(|item| item.1)).unwrap();
+    let index = Index::create_from_tempdir(schema_builder.build())?;
+    let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
     let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
     let many_terms_data = (0..150_000)
         .map(|num| format!("author{num}"))
         .collect::<Vec<_>>();
-    // Prepare 1000 unique terms sampled using a Zipf distribution.
-    // Exponent ~1.1 approximates top-20 terms covering around ~20%.
-    let terms_1000: Vec<String> = (1..=1000).map(|i| format!("term_{i}")).collect();
-    let zipf_1000 = rand_distr::Zipf::new(1000, 1.1f64).unwrap();
     {
         let mut rng = StdRng::from_seed([1u8; 32]);
         let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
@@ -543,25 +427,15 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
             index_writer.add_document(doc!())?;
         }
         if cardinality == Cardinality::Multivalued {
-            let log_level_sample_a = status_field_data[log_level_distribution.sample(&mut rng)].0;
-            let log_level_sample_b = status_field_data[log_level_distribution.sample(&mut rng)].0;
-            let idx_a = zipf_1000.sample(&mut rng) as usize - 1;
-            let idx_b = zipf_1000.sample(&mut rng) as usize - 1;
-            let term_1000_a = &terms_1000[idx_a];
-            let term_1000_b = &terms_1000[idx_b];
             index_writer.add_document(doc!(
                 json_field => json!({"mixed_type": 10.0}),
                 json_field => json!({"mixed_type": 10.0}),
                 text_field => "cool",
                 text_field => "cool",
-                text_field_all_unique_terms => "cool",
-                text_field_all_unique_terms => "coolo",
                 text_field_many_terms => "cool",
                 text_field_many_terms => "cool",
-                text_field_few_terms_status => log_level_sample_a,
-                text_field_few_terms_status => log_level_sample_b,
-                text_field_1000_terms_zipf => term_1000_a.as_str(),
-                text_field_1000_terms_zipf => term_1000_b.as_str(),
+                text_field_few_terms => "cool",
+                text_field_few_terms => "cool",
                 score_field => 1u64,
                 score_field => 1u64,
                 score_field_f64 => lg_norm.sample(&mut rng),
@@ -586,10 +460,8 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
             index_writer.add_document(doc!(
                 text_field => "cool",
                 json_field => json,
-                text_field_all_unique_terms => format!("unique_term_{}", rng.gen::<u64>()),
                 text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
-                text_field_few_terms_status => status_field_data[log_level_distribution.sample(&mut rng)].0,
-                text_field_1000_terms_zipf => terms_1000[zipf_1000.sample(&mut rng) as usize - 1].as_str(),
+                text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
                 score_field => val as u64,
                 score_field_f64 => lg_norm.sample(&mut rng),
                 score_field_i64 => val as i64,
@@ -641,7 +513,7 @@ fn filter_agg_all_query_with_sub_aggs(index: &Index) {
                 "avg_score": { "avg": { "field": "score" } },
                 "stats_score": { "stats": { "field": "score_f64" } },
                 "terms_text": {
-                    "terms": { "field": "text_few_terms_status" }
+                    "terms": { "field": "text_few_terms" }
                 }
             }
         }
@@ -657,7 +529,7 @@ fn filter_agg_term_query_with_sub_aggs(index: &Index) {
                 "avg_score": { "avg": { "field": "score" } },
                 "stats_score": { "stats": { "field": "score_f64" } },
                 "terms_text": {
-                    "terms": { "field": "text_few_terms_status" }
+                    "terms": { "field": "text_few_terms" }
                 }
             }
         }
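The removed setup code leans on two samplers from the `rand` ecosystem: `WeightedIndex` for the hand-tuned log-level proportions and `rand_distr::Zipf` for the skewed 1000-term field. A condensed sketch of the sampling logic, using the same crates and calls as the deleted code:

```rust
use rand::distributions::{Distribution, WeightedIndex};
use rand::rngs::StdRng;
use rand::SeedableRng;

fn main() {
    let mut rng = StdRng::from_seed([1u8; 32]);

    // Weighted sampling: indices are drawn in proportion to their weights.
    let status = [("INFO", 8000), ("ERROR", 300), ("WARN", 1200), ("DEBUG", 500)];
    let weighted = WeightedIndex::new(status.iter().map(|item| item.1)).unwrap();
    let level = status[weighted.sample(&mut rng)].0;

    // Zipf sampling: rank 1 is drawn most often; the exponent controls the skew.
    let zipf = rand_distr::Zipf::new(1000, 1.1f64).unwrap();
    let term_rank = zipf.sample(&mut rng) as usize; // in 1..=1000
    println!("{level} term_{term_rank}");
}
```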

View File

@@ -16,15 +16,14 @@
 // - This bench isolates boolean iteration speed and intersection/union cost.
 // - Use `cargo bench --bench boolean_conjunction` to run.
-use binggan::{black_box, BenchGroup, BenchRunner};
+use binggan::{black_box, BenchRunner};
 use rand::prelude::*;
 use rand::rngs::StdRng;
 use rand::SeedableRng;
-use tantivy::collector::sort_key::SortByStaticFastValue;
-use tantivy::collector::{Collector, Count, TopDocs};
-use tantivy::query::{Query, QueryParser};
-use tantivy::schema::{Schema, FAST, TEXT};
-use tantivy::{doc, Index, Order, ReloadPolicy, Searcher};
+use tantivy::collector::{Count, TopDocs};
+use tantivy::query::QueryParser;
+use tantivy::schema::{Schema, TEXT};
+use tantivy::{doc, Index, ReloadPolicy, Searcher};
 #[derive(Clone)]
 struct BenchIndex {
@@ -34,6 +33,23 @@ struct BenchIndex {
     query_parser: QueryParser,
 }
+impl BenchIndex {
+    #[inline(always)]
+    fn count_query(&self, query_str: &str) -> usize {
+        let query = self.query_parser.parse_query(query_str).unwrap();
+        self.searcher.search(&query, &Count).unwrap()
+    }
+    #[inline(always)]
+    fn topk_len(&self, query_str: &str, k: usize) -> usize {
+        let query = self.query_parser.parse_query(query_str).unwrap();
+        self.searcher
+            .search(&query, &TopDocs::with_limit(k))
+            .unwrap()
+            .len()
+    }
+}
 /// Build a single index containing both fields (title, body) and
 /// return two BenchIndex views:
 /// - single_field: QueryParser defaults to only "body"
@@ -43,8 +59,6 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
     let mut schema_builder = Schema::builder();
     let f_title = schema_builder.add_text_field("title", TEXT);
     let f_body = schema_builder.add_text_field("body", TEXT);
-    let f_score = schema_builder.add_u64_field("score", FAST);
-    let f_score2 = schema_builder.add_u64_field("score2", FAST);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema.clone());
@@ -53,13 +67,11 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
     // Populate: spread each present token 90/10 to body/title
     {
-        let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
+        let mut writer = index.writer(500_000_000).unwrap();
         for _ in 0..num_docs {
             let has_a = rng.gen_bool(p_a as f64);
             let has_b = rng.gen_bool(p_b as f64);
             let has_c = rng.gen_bool(p_c as f64);
-            let score = rng.gen_range(0u64..100u64);
-            let score2 = rng.gen_range(0u64..100_000u64);
             let mut title_tokens: Vec<&str> = Vec::new();
             let mut body_tokens: Vec<&str> = Vec::new();
             if has_a {
@@ -89,9 +101,7 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
             writer
                 .add_document(doc!(
                     f_title=>title_tokens.join(" "),
-                    f_body=>body_tokens.join(" "),
-                    f_score=>score,
-                    f_score2=>score2,
+                    f_body=>body_tokens.join(" ")
                 ))
                 .unwrap();
         }
@@ -143,76 +153,72 @@ fn main() {
         ),
     ];
-    let queries = &["a", "+a +b", "+a +b +c", "a OR b", "a OR b OR c"];
     let mut runner = BenchRunner::new();
     for (label, n, pa, pb, pc) in scenarios {
         let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
-        for (view_name, bench_index) in [("single_field", single_view), ("multi_field", multi_view)]
-        {
-            let mut group = runner.new_group();
-            group.set_name(format!("{}{}", view_name, label));
-            for query_str in queries {
-                add_bench_task(&mut group, &bench_index, query_str, Count, "count");
-                add_bench_task(
-                    &mut group,
-                    &bench_index,
-                    query_str,
-                    TopDocs::with_limit(10).order_by_score(),
-                    "top10",
-                );
-                add_bench_task(
-                    &mut group,
-                    &bench_index,
-                    query_str,
-                    TopDocs::with_limit(10).order_by_fast_field::<u64>("score", Order::Asc),
-                    "top10_by_ff",
-                );
-                add_bench_task(
-                    &mut group,
-                    &bench_index,
-                    query_str,
-                    TopDocs::with_limit(10).order_by((
-                        SortByStaticFastValue::<u64>::for_field("score"),
-                        SortByStaticFastValue::<u64>::for_field("score2"),
-                    )),
-                    "top10_by_2ff",
-                );
-            }
+        // Single-field group: default field is body only
+        {
+            let mut group = runner.new_group();
+            group.set_name(format!("single_field — {}", label));
+            group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b"))
+            });
+            group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b +c"))
+            });
+            group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b", 10))
+            });
+            group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b +c", 10))
+            });
+            // OR queries
+            group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b"))
+            });
+            group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b OR c"))
+            });
+            group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b", 10))
+            });
+            group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b OR c", 10))
+            });
+            group.run();
+        }
+        // Multi-field group: default fields are [title, body]
+        {
+            let mut group = runner.new_group();
+            group.set_name(format!("multi_field — {}", label));
+            group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b"))
+            });
+            group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("+a +b +c"))
+            });
+            group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b", 10))
+            });
+            group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("+a +b +c", 10))
+            });
+            // OR queries
+            group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b"))
+            });
+            group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.count_query("a OR b OR c"))
+            });
+            group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b", 10))
+            });
+            group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
+                black_box(benv.topk_len("a OR b OR c", 10))
+            });
             group.run();
         }
     }
 }
-fn add_bench_task<C: Collector + 'static>(
-    bench_group: &mut BenchGroup,
-    bench_index: &BenchIndex,
-    query_str: &str,
-    collector: C,
-    collector_name: &str,
-) {
-    let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name);
-    let query = bench_index.query_parser.parse_query(query_str).unwrap();
-    let search_task = SearchTask {
-        searcher: bench_index.searcher.clone(),
-        collector,
-        query,
-    };
-    bench_group.register(task_name, move |_| black_box(search_task.run()));
-}
-struct SearchTask<C: Collector> {
-    searcher: Searcher,
-    collector: C,
-    query: Box<dyn Query>,
-}
-impl<C: Collector> SearchTask<C> {
-    #[inline(never)]
-    pub fn run(&self) -> usize {
-        self.searcher.search(&self.query, &self.collector).unwrap();
-        1
-    }
-}
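Both versions of this bench bottom out in tantivy's stock `Count` and `TopDocs` collectors. For reference, a minimal self-contained sketch of one measured operation (the `body` field mirrors the bench schema):

```rust
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(body => "a b"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![body])
        .parse_query("+a +b")
        .unwrap();
    // One pass, two collectors: Count yields a usize,
    // TopDocs yields up to k (score, DocAddress) pairs.
    let (count, top) = searcher.search(&query, &(Count, TopDocs::with_limit(10)))?;
    assert_eq!(count, 1);
    assert_eq!(top.len(), 1);
    Ok(())
}
```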

View File

@@ -1,288 +0,0 @@
use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Collector, Count, DocSetCollector, TopDocs};
use tantivy::query::{Query, QueryParser};
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
query_parser: QueryParser,
}
fn build_shared_indices(num_docs: usize, p_title_a: f32, distribution: &str) -> BenchIndex {
// Unified schema
let mut schema_builder = Schema::builder();
let f_title = schema_builder.add_text_field("title", TEXT);
let f_num_rand = schema_builder.add_u64_field("num_rand", INDEXED);
let f_num_asc = schema_builder.add_u64_field("num_asc", INDEXED);
let f_num_rand_fast = schema_builder.add_u64_field("num_rand_fast", INDEXED | FAST);
let f_num_asc_fast = schema_builder.add_u64_field("num_asc_fast", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
{
let mut writer = index.writer_with_num_threads(1, 4_000_000_000).unwrap();
match distribution {
"dense" => {
for doc_id in 0..num_docs {
// Always add title to avoid empty documents
let title_token = if rng.gen_bool(p_title_a as f64) {
"a"
} else {
"b"
};
let num_rand = rng.gen_range(0u64..1000u64);
let num_asc = (doc_id / 10000) as u64;
writer
.add_document(doc!(
f_title=>title_token,
f_num_rand=>num_rand,
f_num_asc=>num_asc,
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
"sparse" => {
for doc_id in 0..num_docs {
// Always add title to avoid empty documents
let title_token = if rng.gen_bool(p_title_a as f64) {
"a"
} else {
"b"
};
let num_rand = rng.gen_range(0u64..10000000u64);
let num_asc = doc_id as u64;
writer
.add_document(doc!(
f_title=>title_token,
f_num_rand=>num_rand,
f_num_asc=>num_asc,
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
_ => {
panic!("Unsupported distribution type");
}
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
// Build query parser for title field
let qp_title = QueryParser::for_index(&index, vec![f_title]);
BenchIndex {
index,
searcher,
query_parser: qp_title,
}
}
fn main() {
// Prepare corpora with varying scenarios
let scenarios = vec![
(
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense",
0,
9,
),
(
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense",
990,
999,
),
(
"sparse and 99% a".to_string(),
10_000_000,
0.99,
"sparse",
0,
9,
),
(
"sparse and 99% a".to_string(),
10_000_000,
0.99,
"sparse",
9_999_990,
9_999_999,
),
];
let mut runner = BenchRunner::new();
for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios {
// Build index for this scenario
let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution);
// Create benchmark group
let mut group = runner.new_group();
// Now set the name (this moves scenario_id)
group.set_name(scenario_id);
// Define all four field types
let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"];
// Define the three terms we want to test with
let terms = ["a", "b", "z"];
// Generate all combinations of terms and field names
let mut queries = Vec::new();
for &term in &terms {
for &field_name in &field_names {
let query_str = format!(
"{} AND {}:[{} TO {}]",
term, field_name, range_low, range_high
);
queries.push((query_str, field_name.to_string()));
}
}
let query_str = format!(
"{}:[{} TO {}] AND {}:[{} TO {}]",
"num_rand_fast", range_low, range_high, "num_asc_fast", range_low, range_high
);
queries.push((query_str, "num_asc_fast".to_string()));
// Run all benchmark tasks for each query and its corresponding field name
for (query_str, field_name) in queries {
run_benchmark_tasks(&mut group, &bench_index, &query_str, &field_name);
}
group.run();
}
}
/// Run all benchmark tasks for a given query string and field name
fn run_benchmark_tasks(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query_str: &str,
field_name: &str,
) {
// Test count
add_bench_task(bench_group, bench_index, query_str, Count, "count");
// Test all results
add_bench_task(
bench_group,
bench_index,
query_str,
DocSetCollector,
"all results",
);
// Test top 100 by the field (if it's a FAST field)
if field_name.ends_with("_fast") {
// Ascending order
{
let collector_name = format!("top100_by_{}_asc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task(
bench_group,
bench_index,
query_str,
TopDocs::with_limit(100).order_by_fast_field::<u64>(field_name_owned, Order::Asc),
&collector_name,
);
}
// Descending order
{
let collector_name = format!("top100_by_{}_desc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task(
bench_group,
bench_index,
query_str,
TopDocs::with_limit(100).order_by_fast_field::<u64>(field_name_owned, Order::Desc),
&collector_name,
);
}
}
}
fn add_bench_task<C: Collector + 'static>(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query_str: &str,
collector: C,
collector_name: &str,
) {
let task_name = format!("{}_{}", query_str.replace(" ", "_"), collector_name);
let query = bench_index.query_parser.parse_query(query_str).unwrap();
let search_task = SearchTask {
searcher: bench_index.searcher.clone(),
collector,
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
struct SearchTask<C: Collector> {
searcher: Searcher,
collector: C,
query: Box<dyn Query>,
}
impl<C: Collector> SearchTask<C> {
#[inline(never)]
pub fn run(&self) -> usize {
let result = self.searcher.search(&self.query, &self.collector).unwrap();
if let Some(count) = (&result as &dyn std::any::Any).downcast_ref::<usize>() {
*count
} else if let Some(top_docs) = (&result as &dyn std::any::Any)
.downcast_ref::<Vec<(Option<u64>, tantivy::DocAddress)>>()
{
top_docs.len()
} else if let Some(top_docs) =
(&result as &dyn std::any::Any).downcast_ref::<Vec<(u64, tantivy::DocAddress)>>()
{
top_docs.len()
} else if let Some(doc_set) = (&result as &dyn std::any::Any)
.downcast_ref::<std::collections::HashSet<tantivy::DocAddress>>()
{
doc_set.len()
} else {
eprintln!(
"Unknown collector result type: {:?}",
std::any::type_name::<C::Fruit>()
);
0
}
}
}

View File

@@ -1,365 +0,0 @@
use std::ops::Bound;
use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, DocSetCollector, TopDocs};
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, FAST, INDEXED};
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher, Term};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
}
fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
// Schema with fast fields only
let mut schema_builder = Schema::builder();
let f_num_rand_fast = schema_builder.add_u64_field("num_rand_fast", INDEXED | FAST);
let f_num_asc_fast = schema_builder.add_u64_field("num_asc_fast", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
{
let mut writer = index.writer_with_num_threads(1, 4_000_000_000).unwrap();
match distribution {
"dense" => {
for doc_id in 0..num_docs {
let num_rand = rng.gen_range(0u64..1000u64);
let num_asc = (doc_id / 10000) as u64;
writer
.add_document(doc!(
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
"sparse" => {
for doc_id in 0..num_docs {
let num_rand = rng.gen_range(0u64..10000000u64);
let num_asc = doc_id as u64;
writer
.add_document(doc!(
f_num_rand_fast=>num_rand,
f_num_asc_fast=>num_asc,
))
.unwrap();
}
}
_ => {
panic!("Unsupported distribution type");
}
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
BenchIndex { index, searcher }
}
fn main() {
// Prepare corpora with varying scenarios
let scenarios = vec![
// Dense distribution - random values in small range (0-999)
(
"dense_values_search_low_value_range".to_string(),
10_000_000,
"dense",
0,
9,
),
(
"dense_values_search_high_value_range".to_string(),
10_000_000,
"dense",
990,
999,
),
(
"dense_values_search_out_of_range".to_string(),
10_000_000,
"dense",
1000,
1002,
),
(
"sparse_values_search_low_value_range".to_string(),
10_000_000,
"sparse",
0,
9,
),
(
"sparse_values_search_high_value_range".to_string(),
10_000_000,
"sparse",
9_999_990,
9_999_999,
),
(
"sparse_values_search_out_of_range".to_string(),
10_000_000,
"sparse",
10_000_000,
10_000_002,
),
];
let mut runner = BenchRunner::new();
for (scenario_id, n, num_rand_distribution, range_low, range_high) in scenarios {
// Build index for this scenario
let bench_index = build_shared_indices(n, num_rand_distribution);
// Create benchmark group
let mut group = runner.new_group();
// Now set the name (this moves scenario_id)
group.set_name(scenario_id);
// Define fast field types
let field_names = ["num_rand_fast", "num_asc_fast"];
// Generate range queries for fast fields
for &field_name in &field_names {
// Create the range query
let field = bench_index.searcher.schema().get_field(field_name).unwrap();
let lower_term = Term::from_field_u64(field, range_low);
let upper_term = Term::from_field_u64(field, range_high);
let query = RangeQuery::new(Bound::Included(lower_term), Bound::Included(upper_term));
run_benchmark_tasks(
&mut group,
&bench_index,
query,
field_name,
range_low,
range_high,
);
}
group.run();
}
}
/// Run all benchmark tasks for a given range query and field name
fn run_benchmark_tasks(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
field_name: &str,
range_low: u64,
range_high: u64,
) {
// Test count
add_bench_task_count(
bench_group,
bench_index,
query.clone(),
"count",
field_name,
range_low,
range_high,
);
// Test top 100 by the field (ascending order)
{
let collector_name = format!("top100_by_{}_asc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task_top100_asc(
bench_group,
bench_index,
query.clone(),
&collector_name,
field_name,
range_low,
range_high,
field_name_owned,
);
}
// Test top 100 by the field (descending order)
{
let collector_name = format!("top100_by_{}_desc", field_name);
let field_name_owned = field_name.to_string();
add_bench_task_top100_desc(
bench_group,
bench_index,
query,
&collector_name,
field_name,
range_low,
range_high,
field_name_owned,
);
}
}
fn add_bench_task_count(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = CountSearchTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_docset(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = DocSetSearchTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_top100_asc(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
field_name_owned: String,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = Top100AscSearchTask {
searcher: bench_index.searcher.clone(),
query,
field_name: field_name_owned,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_top100_desc(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
collector_name: &str,
field_name: &str,
range_low: u64,
range_high: u64,
field_name_owned: String,
) {
let task_name = format!(
"range_{}_[{} TO {}]_{}",
field_name, range_low, range_high, collector_name
);
let search_task = Top100DescSearchTask {
searcher: bench_index.searcher.clone(),
query,
field_name: field_name_owned,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
struct CountSearchTask {
searcher: Searcher,
query: RangeQuery,
}
impl CountSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
self.searcher.search(&self.query, &Count).unwrap()
}
}
struct DocSetSearchTask {
searcher: Searcher,
query: RangeQuery,
}
impl DocSetSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
let result = self.searcher.search(&self.query, &DocSetCollector).unwrap();
result.len()
}
}
struct Top100AscSearchTask {
searcher: Searcher,
query: RangeQuery,
field_name: String,
}
impl Top100AscSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
let collector =
TopDocs::with_limit(100).order_by_fast_field::<u64>(&self.field_name, Order::Asc);
let result = self.searcher.search(&self.query, &collector).unwrap();
for (_score, doc_address) in &result {
let _doc: tantivy::TantivyDocument = self.searcher.doc(*doc_address).unwrap();
}
result.len()
}
}
struct Top100DescSearchTask {
searcher: Searcher,
query: RangeQuery,
field_name: String,
}
impl Top100DescSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
let collector =
TopDocs::with_limit(100).order_by_fast_field::<u64>(&self.field_name, Order::Desc);
let result = self.searcher.search(&self.query, &collector).unwrap();
for (_score, doc_address) in &result {
let _doc: tantivy::TantivyDocument = self.searcher.doc(*doc_address).unwrap();
}
result.len()
}
}

View File

@@ -1,260 +0,0 @@
use std::fmt::Display;
use std::net::Ipv6Addr;
use std::ops::RangeInclusive;
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use columnar::MonotonicallyMappableToU128;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
bench_range_query();
}
fn bench_range_query() {
let index = get_index_0_to_100();
let mut runner = BenchRunner::new();
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
runner.set_name("range_query on u64");
let field_name_and_descr: Vec<_> = vec![
("id", "Single Valued Range Field"),
("ids", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent()),
("10_percent", get_10_percent()),
("1_percent", get_1_percent()),
];
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
runner.set_name("range_query on ip");
let field_name_and_descr: Vec<_> = vec![
("ip", "Single Valued Range Field"),
("ips", "Multi Valued Range Field"),
];
let range_num_hits = vec![
("90_percent", get_90_percent_ip()),
("10_percent", get_10_percent_ip()),
("1_percent", get_1_percent_ip()),
];
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
}
fn test_range<T: Display>(
runner: &mut BenchRunner,
index: &Index,
field_name_and_descr: &[(&str, &str)],
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
) {
for (field, suffix) in field_name_and_descr {
let term_num_hits = vec![
("", ""),
("1_percent", "veryfew"),
("10_percent", "few"),
("90_percent", "most"),
];
let mut group = runner.new_group();
group.set_name(suffix);
// all intersect combinations
for (range_name, range) in &range_num_hits {
for (term_name, term) in &term_num_hits {
let index = &index;
let test_name = if term_name.is_empty() {
format!("id_range_hit_{}", range_name)
} else {
format!(
"id_range_hit_{}_intersect_with_term_{}",
range_name, term_name
)
};
group.register(test_name, move |_| {
let query = if term_name.is_empty() {
"".to_string()
} else {
format!("AND id_name:{}", term)
};
black_box(execute_query(field, range, &query, index));
});
}
}
group.run();
}
}
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"most".to_string() // 90%
};
Doc {
id_name,
id: rng.gen_range(0..100),
// Multiply by 1000, so that we create most buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
create_index_from_docs(&docs)
}
#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
pub id: u64,
pub ip: Ipv6Addr,
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
let ids_u64_field =
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default().set_fast().set_indexed(),
);
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default().set_fast().set_indexed(),
);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ids_i64_field => doc.id as i64,
ids_i64_field => doc.id as i64,
ids_f64_field => doc.id as f64,
ids_f64_field => doc.id as f64,
ids_u64_field => doc.id,
ids_u64_field => doc.id,
id_u64_field => doc.id,
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
text_field2 => doc.id_name.to_string(),
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn get_90_percent() -> RangeInclusive<u64> {
0..=90
}
fn get_10_percent() -> RangeInclusive<u64> {
0..=10
}
fn get_1_percent() -> RangeInclusive<u64> {
10..=10
}
fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
struct NumHits {
count: usize,
}
impl OutputValue for NumHits {
fn column_title() -> &'static str {
"NumHits"
}
fn format(&self) -> Option<String> {
Some(self.count.to_string())
}
}
fn execute_query<T: Display>(
field: &str,
id_range: &RangeInclusive<T>,
suffix: &str,
index: &Index,
) -> NumHits {
let gen_query_inclusive = |from: &T, to: &T| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(id_range.start(), id_range.end());
execute_query_(&query, index)
}
fn execute_query_(query: &str, index: &Index) -> NumHits {
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let num_hits = searcher
.search(&query, &(TopDocs::with_limit(10).order_by_score(), Count))
.unwrap()
.1;
NumHits { count: num_hits }
}

View File

@@ -258,7 +258,7 @@ mod test {
         bitpacker.write(val, num_bits, &mut data).unwrap();
     }
     bitpacker.close(&mut data).unwrap();
-    assert_eq!(data.len(), ((num_bits as usize) * len).div_ceil(8));
+    assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8);
     let bitunpacker = BitUnpacker::new(num_bits);
     (bitunpacker, vals, data)
 }
@@ -304,7 +304,7 @@ mod test {
         bitpacker.write(val, num_bits, &mut buffer).unwrap();
     }
     bitpacker.flush(&mut buffer).unwrap();
-    assert_eq!(buffer.len(), (vals.len() * num_bits as usize).div_ceil(8));
+    assert_eq!(buffer.len(), (vals.len() * num_bits as usize + 7) / 8);
     let bitunpacker = BitUnpacker::new(num_bits);
     let max_val = if num_bits == 64 {
         u64::MAX
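Both assertions compute the same ceiling division from bits to bytes; `(n + 7) / 8` is simply the pre-`div_ceil` idiom. A quick sketch of the equivalence:

```rust
fn main() {
    for n in 0u64..1000 {
        // Bytes needed to hold n bits, two equivalent spellings.
        assert_eq!(n.div_ceil(8), (n + 7) / 8);
    }
    // One difference: the `+ 7` form can overflow near u64::MAX,
    // while div_ceil cannot.
    assert_eq!(u64::MAX.div_ceil(8), u64::MAX / 8 + 1);
}
```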

View File

@@ -19,7 +19,7 @@ fn u32_to_i32(val: u32) -> i32 {
 #[inline]
 unsafe fn u32_to_i32_avx2(vals_u32x8s: DataType) -> DataType {
     const HIGHEST_BIT_MASK: DataType = from_u32x8([HIGHEST_BIT; NUM_LANES]);
-    unsafe { op_xor(vals_u32x8s, HIGHEST_BIT_MASK) }
+    op_xor(vals_u32x8s, HIGHEST_BIT_MASK)
 }
 pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
@@ -66,19 +66,17 @@ unsafe fn filter_vec_avx2_aux(
     ]);
     const SHIFT: __m256i = from_u32x8([NUM_LANES as u32; NUM_LANES]);
     for _ in 0..num_words {
-        unsafe {
-            let word = load_unaligned(input);
-            let word = u32_to_i32_avx2(word);
-            let keeper_bitset = compute_filter_bitset(word, range_simd.clone());
-            let added_len = keeper_bitset.count_ones();
-            let filtered_doc_ids = compact(ids, keeper_bitset);
-            store_unaligned(output_tail as *mut __m256i, filtered_doc_ids);
-            output_tail = output_tail.offset(added_len as isize);
-            ids = op_add(ids, SHIFT);
-            input = input.offset(1);
-        }
+        let word = load_unaligned(input);
+        let word = u32_to_i32_avx2(word);
+        let keeper_bitset = compute_filter_bitset(word, range_simd.clone());
+        let added_len = keeper_bitset.count_ones();
+        let filtered_doc_ids = compact(ids, keeper_bitset);
+        store_unaligned(output_tail as *mut __m256i, filtered_doc_ids);
+        output_tail = output_tail.offset(added_len as isize);
+        ids = op_add(ids, SHIFT);
+        input = input.offset(1);
     }
-    unsafe { output_tail.offset_from(output) as usize }
+    output_tail.offset_from(output) as usize
 }
 #[inline]
@@ -94,7 +92,8 @@ unsafe fn compute_filter_bitset(val: __m256i, range: std::ops::RangeInclusive<__
     let too_low = op_greater(*range.start(), val);
     let too_high = op_greater(val, *range.end());
     let inside = op_or(too_low, too_high);
-    255 - std::arch::x86_64::_mm256_movemask_ps(_mm256_castsi256_ps(inside)) as u8
+    255 - std::arch::x86_64::_mm256_movemask_ps(std::mem::transmute::<DataType, __m256>(inside))
+        as u8
 }
 union U8x32 {
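The `unsafe {}` wrappers on the left-hand side reflect the `unsafe_op_in_unsafe_fn` lint, which warns by default in the Rust 2024 edition: the body of an `unsafe fn` is no longer one implicit unsafe block, so each unsafe operation needs its own. A minimal sketch of the rule:

```rust
// Opt in explicitly on older editions; the lint warns by default in 2024.
#![deny(unsafe_op_in_unsafe_fn)]

/// Even inside an `unsafe fn`, a raw-pointer dereference now requires
/// its own `unsafe { .. }` block.
unsafe fn first_elem(ptr: *const u32) -> u32 {
    unsafe { *ptr }
}

fn main() {
    let xs = [1u32, 2, 3];
    // Callers still need `unsafe` to invoke an unsafe fn.
    let first = unsafe { first_elem(xs.as_ptr()) };
    assert_eq!(first, 1);
}
```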

View File

@@ -73,7 +73,7 @@ The crate introduces the following concepts.
 `Columnar` is an equivalent of a dataframe.
 It maps `column_key` to `Column`.
-A `Column<T>` associates a `RowId` (u32) to any
+A `Column<T>` asssociates a `RowId` (u32) to any
 number of values.
 This is made possible by wrapping a `ColumnIndex` and a `ColumnValue` object.
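As a rough illustration of the `Column<T>` concept (a sketch built on `values_for_doc`, which appears later in this diff; not the crate's full API):

```rust
// A column maps each RowId (u32) to zero, one, or many values.
// Summing a possibly multi-valued u64 column therefore iterates
// per row rather than assuming one value per row.
fn sum_column(column: &columnar::Column<u64>, num_rows: u32) -> u64 {
    let mut total = 0u64;
    for row_id in 0..num_rows {
        for val in column.values_for_doc(row_id) {
            total += val;
        }
    }
    total
}
```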

View File

@@ -89,6 +89,13 @@ fn main() {
         black_box(sum);
     });
+    group.register("first_block_fetch", |column| {
+        let mut block: Vec<Option<u64>> = vec![None; 64];
+        let fetch_docids = (0..64).collect::<Vec<_>>();
+        column.first_vals(&fetch_docids, &mut block);
+        black_box(block[0]);
+    });
     group.register("first_block_single_calls", |column| {
         let mut block: Vec<Option<u64>> = vec![None; 64];
         let fetch_docids = (0..64).collect::<Vec<_>>();
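The added `first_block_fetch` case measures the batched accessor against the per-document loop of `first_block_single_calls` right below it. Conceptually (a sketch using `first_vals` and the `first` accessor shown later in this diff):

```rust
use columnar::Column;

// Two ways to load the first value of each doc in a block.
fn load_block(column: &Column<u64>, docids: &[u32], block: &mut Vec<Option<u64>>) {
    block.clear();
    block.resize(docids.len(), None);
    // Batched accessor: one call fills the whole block.
    column.first_vals(docids, block);
    // Equivalent per-document loop, one lookup per docid.
    for (slot, &doc_id) in block.iter_mut().zip(docids) {
        *slot = column.first(doc_id);
    }
}
```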

View File

@@ -29,20 +29,12 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
     }
 }
 #[inline]
-pub fn fetch_block_with_missing(
-    &mut self,
-    docs: &[u32],
-    accessor: &Column<T>,
-    missing: Option<T>,
-) {
+pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
     self.fetch_block(docs, accessor);
     // no missing values
     if accessor.index.get_cardinality().is_full() {
         return;
     }
-    let Some(missing) = missing else {
-        return;
-    };
     // We can compare docid_cache length with docs to find missing docs
     // For multi value columns we can't rely on the length and always need to scan

View File

@@ -85,8 +85,8 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
} }
#[inline] #[inline]
pub fn first(&self, doc_id: DocId) -> Option<T> { pub fn first(&self, row_id: RowId) -> Option<T> {
self.values_for_doc(doc_id).next() self.values_for_doc(row_id).next()
} }
/// Load the first value for each docid in the provided slice. /// Load the first value for each docid in the provided slice.
@@ -131,8 +131,6 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids) self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids)
} }
/// Get an iterator over the values for the provided docid.
#[inline]
pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ { pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
self.index self.index
.value_row_ids(doc_id) .value_row_ids(doc_id)
@@ -160,6 +158,15 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.select_batch_in_place(selected_docid_range.start, doc_ids); .select_batch_in_place(selected_docid_range.start, doc_ids);
} }
/// Fills the output vector with the (possibly multiple) values that are associated with
/// `row_id`.
///
/// This method clears the `output` vector.
pub fn fill_vals(&self, row_id: RowId, output: &mut Vec<T>) {
output.clear();
output.extend(self.values_for_doc(row_id));
}
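A usage sketch for the `fill_vals` helper added above; because it clears the output vector on every call, a single buffer can be reused across rows (assumes the `Column` API from this diff):

```rust
use columnar::Column;

fn collect_rows(column: &Column<u64>, rows: &[u32]) -> Vec<Vec<u64>> {
    let mut buf: Vec<u64> = Vec::new();
    rows.iter()
        .map(|&row| {
            // `fill_vals` clears `buf` before extending it, so the buffer's
            // allocation is reused across iterations.
            column.fill_vals(row, &mut buf);
            buf.clone()
        })
        .collect()
}
```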
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> { pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
Arc::new(FirstValueWithDefault { Arc::new(FirstValueWithDefault {
column: self, column: self,

View File

@@ -1,7 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
/// Monotonic maps a value to u128 value space /// Montonic maps a value to u128 value space
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space. /// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync { pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync {
/// Converts a value to u128. /// Converts a value to u128.
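A hedged sketch of what such a mapping looks like for `Ipv6Addr`, whose ordering already matches its big-endian bit pattern (illustrative helpers, not the trait's actual impl):

```rust
use std::net::Ipv6Addr;

// Order-preserving round trip: comparing mapped u128 values agrees with
// comparing the original addresses, so range checks can run on u128 directly.
fn to_u128(ip: Ipv6Addr) -> u128 {
    u128::from_be_bytes(ip.octets())
}

fn from_u128(bits: u128) -> Ipv6Addr {
    Ipv6Addr::from(bits.to_be_bytes())
}

fn main() {
    let a: Ipv6Addr = "::1".parse().unwrap();
    let b: Ipv6Addr = "ff::".parse().unwrap();
    assert_eq!(a < b, to_u128(a) < to_u128(b)); // monotonicity
    assert_eq!(from_u128(to_u128(a)), a);       // losslessness
}
```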

View File

@@ -41,6 +41,12 @@ fn transform_range_before_linear_transformation(
if range.is_empty() { if range.is_empty() {
return None; return None;
} }
if stats.min_value > *range.end() {
return None;
}
if stats.max_value < *range.start() {
return None;
}
let shifted_range = let shifted_range =
range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value); range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value);
let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd); let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd);
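A worked sketch of the transformation, with assumed stats `min_value = 10`, `max_value = 30`, `gcd = 5` (a simplified stand-in for the function above):

```rust
// Stored values are (val - min_value) / gcd, so a query range in value space
// is shifted and divided the same way. `div_ceil` on the start skips values
// below the first reachable multiple of the gcd.
fn transform(
    range: std::ops::RangeInclusive<u64>,
    min_value: u64,
    max_value: u64,
    gcd: u64,
) -> Option<std::ops::RangeInclusive<u64>> {
    if range.is_empty() || min_value > *range.end() || max_value < *range.start() {
        return None; // query range lies entirely outside the stored values
    }
    let start = range.start().saturating_sub(min_value).div_ceil(gcd);
    let end = range.end().saturating_sub(min_value) / gcd;
    Some(start..=end)
}

fn main() {
    // Values 10, 15, 20, 25, 30 are stored as 0..=4. A query for 12..=20 can
    // only match 15 and 20, i.e. stored 1..=2.
    assert_eq!(transform(12..=20, 10, 30, 5), Some(1..=2));
    assert_eq!(transform(40..=50, 10, 30, 5), None);
}
```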

View File

@@ -8,7 +8,7 @@ use crate::column_values::ColumnValues;
const MID_POINT: u64 = (1u64 << 32) - 1u64; const MID_POINT: u64 = (1u64 << 32) - 1u64;
/// `Line` describes a line function `y: ax + b` using integer /// `Line` describes a line function `y: ax + b` using integer
/// arithmetic. /// arithmetics.
/// ///
/// The slope is in fact a decimal split into a 32 bit integer value, /// The slope is in fact a decimal split into a 32 bit integer value,
/// and a 32-bit decimal value. /// and a 32-bit decimal value.
@@ -94,7 +94,7 @@ impl Line {
// `(i, ys[])`. // `(i, ys[])`.
// //
// The best intercept therefore has the form // The best intercept therefore has the form
// `y[i] - line.eval(i)` (using wrapping arithmetic). // `y[i] - line.eval(i)` (using wrapping arithmetics).
// In other words, the best intercept is one of the `y - Line::eval(ys[i])` // In other words, the best intercept is one of the `y - Line::eval(ys[i])`
// and our task is just to pick the one that minimizes our error. // and our task is just to pick the one that minimizes our error.
// //
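A hedged sketch of evaluating such a line with the slope held as 32 integer plus 32 fractional bits, using wrapping arithmetic as the comment describes (illustrative only; the real codec also picks an error-minimizing intercept):

```rust
// `slope_fp` is the slope times 2^32: multiply in 128 bits, then drop the
// 32 fractional bits of the product before adding the intercept.
fn eval(slope_fp: u64, intercept: u64, x: u64) -> u64 {
    let ax = ((slope_fp as u128 * x as u128) >> 32) as u64;
    ax.wrapping_add(intercept)
}

fn main() {
    // slope = 1.5 -> fixed point 1.5 * 2^32.
    let slope_fp = (3u64 << 32) / 2;
    assert_eq!(eval(slope_fp, 10, 4), 16); // 1.5 * 4 + 10
}
```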

View File

@@ -52,7 +52,7 @@ pub trait ColumnCodecEstimator<T = u64>: 'static {
) -> io::Result<()>; ) -> io::Result<()>;
} }
/// A column codec describes a column serialization format. /// A column codec describes a colunm serialization format.
pub trait ColumnCodec<T: PartialOrd = u64> { pub trait ColumnCodec<T: PartialOrd = u64> {
/// Specialized `ColumnValues` type. /// Specialized `ColumnValues` type.
type ColumnValues: ColumnValues<T> + 'static; type ColumnValues: ColumnValues<T> + 'static;

View File

@@ -3,8 +3,7 @@ use std::sync::Arc;
use std::{fmt, io}; use std::{fmt, io};
use common::file_slice::FileSlice; use common::file_slice::FileSlice;
use common::{ByteCount, DateTime, OwnedBytes}; use common::{ByteCount, DateTime, HasLen, OwnedBytes};
use serde::{Deserialize, Serialize};
use crate::column::{BytesColumn, Column, StrColumn}; use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column}; use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
@@ -318,89 +317,10 @@ impl DynamicColumnHandle {
} }
pub fn num_bytes(&self) -> ByteCount { pub fn num_bytes(&self) -> ByteCount {
self.file_slice.num_bytes() self.file_slice.len().into()
}
/// Legacy helper returning the column space usage.
pub fn column_and_dictionary_num_bytes(&self) -> io::Result<ColumnSpaceUsage> {
self.space_usage()
}
/// Return the space usage of the column, optionally broken down by dictionary and column
/// values.
///
/// For dictionary encoded columns (strings and bytes), this splits the total footprint into
/// the dictionary and the remaining column data (including index and values).
/// For all other column types, the dictionary size is `None` and the column size
/// equals the total bytes.
pub fn space_usage(&self) -> io::Result<ColumnSpaceUsage> {
let total_num_bytes = self.num_bytes();
let dynamic_column = self.open()?;
let dictionary_num_bytes = match &dynamic_column {
DynamicColumn::Bytes(bytes_column) => bytes_column.dictionary().num_bytes(),
DynamicColumn::Str(str_column) => str_column.dictionary().num_bytes(),
_ => {
return Ok(ColumnSpaceUsage::new(self.num_bytes(), None));
}
};
assert!(dictionary_num_bytes <= total_num_bytes);
let column_num_bytes =
ByteCount::from(total_num_bytes.get_bytes() - dictionary_num_bytes.get_bytes());
Ok(ColumnSpaceUsage::new(
column_num_bytes,
Some(dictionary_num_bytes),
))
} }
pub fn column_type(&self) -> ColumnType { pub fn column_type(&self) -> ColumnType {
self.column_type self.column_type
} }
} }
/// Represents space usage of a column.
///
/// `column_num_bytes` tracks the column payload (index, values and footer).
/// For dictionary encoded columns, `dictionary_num_bytes` captures the dictionary footprint.
/// [`ColumnSpaceUsage::total_num_bytes`] returns the sum of both parts.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ColumnSpaceUsage {
column_num_bytes: ByteCount,
dictionary_num_bytes: Option<ByteCount>,
}
impl ColumnSpaceUsage {
pub(crate) fn new(
column_num_bytes: ByteCount,
dictionary_num_bytes: Option<ByteCount>,
) -> Self {
ColumnSpaceUsage {
column_num_bytes,
dictionary_num_bytes,
}
}
pub fn column_num_bytes(&self) -> ByteCount {
self.column_num_bytes
}
pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
self.dictionary_num_bytes
}
pub fn total_num_bytes(&self) -> ByteCount {
self.column_num_bytes + self.dictionary_num_bytes.unwrap_or_default()
}
/// Merge two space usage values by summing their components.
pub fn merge(&self, other: &ColumnSpaceUsage) -> ColumnSpaceUsage {
let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) {
(Some(lhs), Some(rhs)) => Some(lhs + rhs),
(Some(val), None) | (None, Some(val)) => Some(val),
(None, None) => None,
};
ColumnSpaceUsage {
column_num_bytes: self.column_num_bytes + other.column_num_bytes,
dictionary_num_bytes,
}
}
}
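A small usage sketch of the merge semantics above: byte counts add component-wise, and a dictionary size survives a merge with a dictionary-less column (stand-in types, not the crate's):

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
struct SpaceUsage {
    column_num_bytes: u64,
    dictionary_num_bytes: Option<u64>,
}

impl SpaceUsage {
    fn total(&self) -> u64 {
        self.column_num_bytes + self.dictionary_num_bytes.unwrap_or(0)
    }
    // Mirrors the merge shown above: sum columns, keep whichever dictionary
    // sizes exist, summing when both sides have one.
    fn merge(&self, other: &SpaceUsage) -> SpaceUsage {
        let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) {
            (Some(lhs), Some(rhs)) => Some(lhs + rhs),
            (Some(v), None) | (None, Some(v)) => Some(v),
            (None, None) => None,
        };
        SpaceUsage {
            column_num_bytes: self.column_num_bytes + other.column_num_bytes,
            dictionary_num_bytes,
        }
    }
}

fn main() {
    let str_col = SpaceUsage { column_num_bytes: 100, dictionary_num_bytes: Some(40) };
    let num_col = SpaceUsage { column_num_bytes: 60, dictionary_num_bytes: None };
    let merged = str_col.merge(&num_col);
    assert_eq!(merged.total(), 200);
    assert_eq!(merged.dictionary_num_bytes, Some(40));
}
```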

View File

@@ -48,7 +48,7 @@ pub use columnar::{
use sstable::VoidSSTable; use sstable::VoidSSTable;
pub use value::{NumericalType, NumericalValue}; pub use value::{NumericalType, NumericalValue};
pub use self::dynamic_column::{ColumnSpaceUsage, DynamicColumn, DynamicColumnHandle}; pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
pub type RowId = u32; pub type RowId = u32;
pub type DocId = u32; pub type DocId = u32;

View File

@@ -60,7 +60,7 @@ fn test_dataframe_writer_bool() {
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!(); panic!();
}; };
let vals: Vec<Option<bool>> = (0..5).map(|doc_id| bool_col.first(doc_id)).collect(); let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]); assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
} }
@@ -108,7 +108,7 @@ fn test_dataframe_writer_ip_addr() {
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!(); panic!();
}; };
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|doc_id| ip_col.first(doc_id)).collect(); let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!( assert_eq!(
&vals, &vals,
&[ &[
@@ -169,7 +169,7 @@ fn test_dictionary_encoded_str() {
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!(); panic!();
}; };
let index: Vec<Option<u64>> = (0..5).map(|doc_id| str_col.ords().first(doc_id)).collect(); let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]); assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5); assert_eq!(str_col.num_rows(), 5);
let mut term_buffer = String::new(); let mut term_buffer = String::new();
@@ -204,7 +204,7 @@ fn test_dictionary_encoded_bytes() {
panic!(); panic!();
}; };
let index: Vec<Option<u64>> = (0..5) let index: Vec<Option<u64>> = (0..5)
.map(|doc_id| bytes_col.ords().first(doc_id)) .map(|row_id| bytes_col.ords().first(row_id))
.collect(); .collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]); assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(bytes_col.num_rows(), 5); assert_eq!(bytes_col.num_rows(), 5);

View File

@@ -181,14 +181,6 @@ pub struct BitSet {
len: u64, len: u64,
max_value: u32, max_value: u32,
} }
impl std::fmt::Debug for BitSet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BitSet")
.field("len", &self.len)
.field("max_value", &self.max_value)
.finish()
}
}
fn num_buckets(max_val: u32) -> u32 { fn num_buckets(max_val: u32) -> u32 {
max_val.div_ceil(64u32) max_val.div_ceil(64u32)

View File

@@ -28,9 +28,7 @@ impl BinarySerializable for VIntU128 {
writer.write_all(&buffer) writer.write_all(&buffer)
} }
#[allow(clippy::unbuffered_bytes)]
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes(); let mut bytes = reader.bytes();
let mut result = 0u128; let mut result = 0u128;
let mut shift = 0u64; let mut shift = 0u64;
@@ -197,9 +195,7 @@ impl BinarySerializable for VInt {
writer.write_all(&buffer[0..num_bytes]) writer.write_all(&buffer[0..num_bytes])
} }
#[allow(clippy::unbuffered_bytes)]
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes(); let mut bytes = reader.bytes();
let mut result = 0u64; let mut result = 0u64;
let mut shift = 0u64; let mut shift = 0u64;
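For reference, a standard LEB128-style varint sketch illustrating the shift-accumulate decode loop above; the crate's exact stop-bit convention may differ:

```rust
// 7 payload bits per byte; the high bit marks "more bytes follow".
fn encode(mut v: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (v & 0x7f) as u8;
        v >>= 7;
        if v == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80);
    }
}

fn decode(bytes: &[u8]) -> u64 {
    let mut result = 0u64;
    let mut shift = 0u32;
    for &b in bytes {
        result |= u64::from(b & 0x7f) << shift;
        if b & 0x80 == 0 {
            break;
        }
        shift += 7;
    }
    result
}

fn main() {
    let mut buf = Vec::new();
    encode(300, &mut buf);
    assert_eq!(buf.len(), 2);
    assert_eq!(decode(&buf), 300);
}
```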

View File

@@ -208,7 +208,7 @@ fn main() -> tantivy::Result<()> {
// is the role of the `TopDocs` collector. // is the role of the `TopDocs` collector.
// We can now perform our query. // We can now perform our query.
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
// The actual documents still need to be // The actual documents still need to be
// retrieved from Tantivy's store. // retrieved from Tantivy's store.
@@ -226,7 +226,7 @@ fn main() -> tantivy::Result<()> {
let query = query_parser.parse_query("title:sea^20 body:whale^70")?; let query = query_parser.parse_query("title:sea^20 body:whale^70")?;
let (_score, doc_address) = searcher let (_score, doc_address) = searcher
.search(&query, &TopDocs::with_limit(1).order_by_score())? .search(&query, &TopDocs::with_limit(1))?
.into_iter() .into_iter()
.next() .next()
.unwrap(); .unwrap();

View File

@@ -100,7 +100,7 @@ fn main() -> tantivy::Result<()> {
// here we want to get a hit on the 'ken' in Frankenstein // here we want to get a hit on the 'ken' in Frankenstein
let query = query_parser.parse_query("ken")?; let query = query_parser.parse_query("ken")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs { for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;

View File

@@ -50,14 +50,14 @@ fn main() -> tantivy::Result<()> {
{ {
// Simple exact search on the date // Simple exact search on the date
let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?; let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 1); assert_eq!(count_docs.len(), 1);
} }
{ {
// Range query on the date field // Range query on the date field
let query = query_parser let query = query_parser
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?; .parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1); assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs { for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?; let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;

View File

@@ -28,7 +28,7 @@ fn extract_doc_given_isbn(
// The second argument is here to tell we don't care about decoding positions, // The second argument is here to tell we don't care about decoding positions,
// or term frequencies. // or term frequencies.
let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic); let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1).order_by_score())?; let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;
if let Some((_score, doc_address)) = top_docs.first() { if let Some((_score, doc_address)) = top_docs.first() {
let doc = searcher.doc(*doc_address)?; let doc = searcher.doc(*doc_address)?;

View File

@@ -145,7 +145,7 @@ fn main() -> tantivy::Result<()> {
let query = FuzzyTermQuery::new(term, 2, true); let query = FuzzyTermQuery::new(term, 2, true);
let (top_docs, count) = searcher let (top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(5).order_by_score(), Count)) .search(&query, &(TopDocs::with_limit(5), Count))
.unwrap(); .unwrap();
assert_eq!(count, 3); assert_eq!(count, 3);
assert_eq!(top_docs.len(), 3); assert_eq!(top_docs.len(), 3);

View File

@@ -69,25 +69,25 @@ fn main() -> tantivy::Result<()> {
{ {
// Inclusive range queries // Inclusive range queries
let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?; let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
assert_eq!(count_docs.len(), 1); assert_eq!(count_docs.len(), 1);
} }
{ {
// Exclusive range queries // Exclusive range queries
let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?; let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 0); assert_eq!(count_docs.len(), 0);
} }
{ {
// Find docs with IP addresses smaller equal 192.168.1.100 // Find docs with IP addresses smaller equal 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?; let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2); assert_eq!(count_docs.len(), 2);
} }
{ {
// Find docs with IP addresses smaller than 192.168.1.100 // Find docs with IP addresses smaller than 192.168.1.100
let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?; let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2); assert_eq!(count_docs.len(), 2);
} }

View File

@@ -59,12 +59,12 @@ fn main() -> tantivy::Result<()> {
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]); let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
{ {
let query = query_parser.parse_query("target:submit-button")?; let query = query_parser.parse_query("target:submit-button")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2); assert_eq!(count_docs.len(), 2);
} }
{ {
let query = query_parser.parse_query("target:submit")?; let query = query_parser.parse_query("target:submit")?;
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(count_docs.len(), 2); assert_eq!(count_docs.len(), 2);
} }
{ {
@@ -74,33 +74,33 @@ fn main() -> tantivy::Result<()> {
} }
{ {
let query = query_parser.parse_query("click AND cart.product_id:133")?; let query = query_parser.parse_query("click AND cart.product_id:133")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 1); assert_eq!(hits.len(), 1);
} }
{ {
// The sub-fields in the json field marked as default field still need to be explicitly // The sub-fields in the json field marked as default field still need to be explicitly
// addressed // addressed
let query = query_parser.parse_query("click AND 133")?; let query = query_parser.parse_query("click AND 133")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 0); assert_eq!(hits.len(), 0);
} }
{ {
// Default json fields are ignored if they collide with the schema // Default json fields are ignored if they collide with the schema
let query = query_parser.parse_query("event_type:holiday-sale")?; let query = query_parser.parse_query("event_type:holiday-sale")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 0); assert_eq!(hits.len(), 0);
} }
// # Query via full attribute path // # Query via full attribute path
{ {
// This only searches in our schema's `event_type` field // This only searches in our schema's `event_type` field
let query = query_parser.parse_query("event_type:click")?; let query = query_parser.parse_query("event_type:click")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 2); assert_eq!(hits.len(), 2);
} }
{ {
// Default json fields can still be accessed by full path // Default json fields can still be accessed by full path
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?; let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2).order_by_score())?; let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 1); assert_eq!(hits.len(), 1);
} }
Ok(()) Ok(())

View File

@@ -63,7 +63,7 @@ fn main() -> Result<()> {
// but not "in the Gulf Stream". // but not "in the Gulf Stream".
let query = query_parser.parse_query("\"in the su\"*")?; let query = query_parser.parse_query("\"in the su\"*")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut titles = top_docs let mut titles = top_docs
.into_iter() .into_iter()
.map(|(_score, doc_address)| { .map(|(_score, doc_address)| {

View File

@@ -107,8 +107,7 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
let (top_docs, count) = let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
searcher.search(&query, &(TopDocs::with_limit(2).order_by_score(), Count))?;
assert_eq!(count, 2); assert_eq!(count, 2);
@@ -129,8 +128,7 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
let (_top_docs, count) = let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
searcher.search(&query, &(TopDocs::with_limit(2).order_by_score(), Count))?;
assert_eq!(count, 0); assert_eq!(count, 0);

View File

@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {
let query_parser = QueryParser::for_index(&index, vec![title, body]); let query_parser = QueryParser::for_index(&index, vec![title, body]);
let query = query_parser.parse_query("sycamore spring")?; let query = query_parser.parse_query("sycamore spring")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

View File

@@ -102,7 +102,7 @@ fn main() -> tantivy::Result<()> {
// stop words are applied on the query as well. // stop words are applied on the query as well.
// The following will be equivalent to `title:frankenstein` // The following will be equivalent to `title:frankenstein`
let query = query_parser.parse_query("title:\"the Frankenstein\"")?; let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs { for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;

View File

@@ -164,7 +164,7 @@ fn main() -> tantivy::Result<()> {
move |doc_id: DocId| Reverse(price[doc_id as usize]) move |doc_id: DocId| Reverse(price[doc_id as usize])
}; };
let most_expensive_first = TopDocs::with_limit(10).order_by(score_by_price); let most_expensive_first = TopDocs::with_limit(10).custom_score(score_by_price);
let hits = searcher.search(&query, &most_expensive_first)?; let hits = searcher.search(&query, &most_expensive_first)?;
assert_eq!( assert_eq!(

View File

@@ -758,17 +758,7 @@ fn negate(expr: UserInputAst) -> UserInputAst {
fn leaf(inp: &str) -> IResult<&str, UserInputAst> { fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
alt(( alt((
delimited(char('('), ast, char(')')), delimited(char('('), ast, char(')')),
map( map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
terminated(
char('*'),
peek(alt((
value((), multispace1),
value((), char(')')),
value((), eof),
))),
),
|_| UserInputAst::from(UserInputLeaf::All),
),
map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate), map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate),
literal, literal,
))(inp) ))(inp)
@@ -789,17 +779,7 @@ fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
), ),
), ),
( (
value( value((), char('*')),
(),
terminated(
char('*'),
peek(alt((
value((), multispace1),
value((), char(')')),
value((), eof),
))),
),
),
map(nothing, |_| { map(nothing, |_| {
(Some(UserInputAst::from(UserInputLeaf::All)), Vec::new()) (Some(UserInputAst::from(UserInputLeaf::All)), Vec::new())
}), }),
@@ -1691,21 +1671,6 @@ mod test {
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)"); test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)");
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\""); test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]"); test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
// Phrase prefixed with *
test_parse_query_to_ast_helper("foo:(*A)", "\"foo\":*A");
test_parse_query_to_ast_helper("*A", "*A");
test_parse_query_to_ast_helper("(*A)", "*A");
test_parse_query_to_ast_helper("foo:(A OR B)", "(?\"foo\":A ?\"foo\":B)");
test_parse_query_to_ast_helper("foo:(A* OR B*)", "(?\"foo\":A* ?\"foo\":B*)");
test_parse_query_to_ast_helper("foo:(*A OR *B)", "(?\"foo\":*A ?\"foo\":*B)");
}
#[test]
fn test_parse_query_all() {
test_parse_query_to_ast_helper("*", "*");
test_parse_query_to_ast_helper("(*)", "*");
test_parse_query_to_ast_helper("(* )", "*");
} }
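The peek-based rule shown on one side of these hunks can be exercised in isolation: a bare `*` parses as the match-all leaf only when followed by whitespace, a closing paren, or end of input, leaving `*A` available as a prefix leaf. A sketch using the same combinators, assuming nom 7:

```rust
use nom::branch::alt;
use nom::character::complete::{char, multispace1};
use nom::combinator::{eof, peek, value};
use nom::sequence::terminated;
use nom::IResult;

// Accept `*` as match-all only when the lookahead is whitespace, `)`, or eof.
fn match_all(inp: &str) -> IResult<&str, char> {
    terminated(
        char('*'),
        peek(alt((
            value((), multispace1),
            value((), char(')')),
            value((), eof),
        ))),
    )(inp)
}

fn main() {
    assert!(match_all("* rest").is_ok());
    assert!(match_all("*)").is_ok());
    assert!(match_all("*").is_ok());
    assert!(match_all("*A").is_err()); // `*A` is not match-all
}
```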
#[test] #[test]

View File

@@ -16,16 +16,15 @@ use crate::index::SegmentReader;
/// That way we can use it the same way as if it would come from the fastfield. /// That way we can use it the same way as if it would come from the fastfield.
pub(crate) fn get_missing_val_as_u64_lenient( pub(crate) fn get_missing_val_as_u64_lenient(
column_type: ColumnType, column_type: ColumnType,
column_max_value: u64,
missing: &Key, missing: &Key,
field_name: &str, field_name: &str,
) -> crate::Result<Option<u64>> { ) -> crate::Result<Option<u64>> {
let missing_val = match missing { let missing_val = match missing {
Key::Str(_) if column_type == ColumnType::Str => Some(column_max_value + 1), Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
// Allow fallback to number on text fields // Allow fallback to number on text fields
Key::F64(_) if column_type == ColumnType::Str => Some(column_max_value + 1), Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::U64(_) if column_type == ColumnType::Str => Some(column_max_value + 1), Key::U64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::I64(_) if column_type == ColumnType::Str => Some(column_max_value + 1), Key::I64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::F64(val) if column_type.numerical_type().is_some() => { Key::F64(val) if column_type.numerical_type().is_some() => {
f64_to_fastfield_u64(*val, &column_type) f64_to_fastfield_u64(*val, &column_type)
} }
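The two sides differ in the sentinel used for missing terms: `u64::MAX` versus `column_max_value + 1`. One plausible motivation for the dense sentinel, consistent with the low-cardinality theme of this change: term ordinals can then index a compact counts table directly (illustrative sketch, assuming ordinals `0..=max`):

```rust
// With missing mapped to max_ordinal + 1, per-term counts fit in one small
// vector indexed by ordinal; a u64::MAX sentinel could not be used this way.
fn count_terms(ordinals_per_doc: &[Option<u64>], max_ordinal: u64) -> Vec<u64> {
    let missing = max_ordinal + 1;
    let mut counts = vec![0u64; (missing + 1) as usize];
    for ord in ordinals_per_doc {
        counts[ord.unwrap_or(missing) as usize] += 1;
    }
    counts
}

fn main() {
    let ords = [Some(0), None, Some(2), Some(0)];
    let counts = count_terms(&ords, 2);
    assert_eq!(counts, vec![2, 0, 1, 1]); // ordinals 0, 1, 2, then missing
}
```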

View File

@@ -1,4 +1,4 @@
use columnar::{Column, ColumnBlockAccessor, ColumnType, StrColumn}; use columnar::{Column, ColumnType, StrColumn};
use common::BitSet; use common::BitSet;
use rustc_hash::FxHashSet; use rustc_hash::FxHashSet;
use serde::Serialize; use serde::Serialize;
@@ -10,16 +10,16 @@ use crate::aggregation::accessor_helpers::{
}; };
use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations}; use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations};
use crate::aggregation::bucket::{ use crate::aggregation::bucket::{
build_segment_filter_collector, build_segment_range_collector, FilterAggReqData, build_segment_aggregation_collector, FilterAggReqData, HistogramAggReqData, HistogramBounds,
HistogramAggReqData, HistogramBounds, IncludeExcludeParam, MissingTermAggReqData, IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData, SegmentFilterCollector,
RangeAggReqData, SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation, SegmentHistogramCollector, SegmentRangeCollector, TermMissingAgg, TermsAggReqData,
TermsAggregationInternal, TermsAggregation, TermsAggregationInternal,
}; };
use crate::aggregation::metric::{ use crate::aggregation::metric::{
build_segment_stats_collector, AverageAggregation, CardinalityAggReqData, AverageAggregation, CardinalityAggReqData, CardinalityAggregationReq, CountAggregation,
CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation, MaxAggregation, ExtendedStatsAggregation, MaxAggregation, MetricAggReqData, MinAggregation,
MetricAggReqData, MinAggregation, SegmentCardinalityCollector, SegmentExtendedStatsCollector, SegmentCardinalityCollector, SegmentExtendedStatsCollector, SegmentPercentilesCollector,
SegmentPercentilesCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData, SegmentStatsCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData,
TopHitsSegmentCollector, TopHitsSegmentCollector,
}; };
use crate::aggregation::segment_agg_result::{ use crate::aggregation::segment_agg_result::{
@@ -35,7 +35,6 @@ pub struct AggregationsSegmentCtx {
/// Request data for each aggregation type. /// Request data for each aggregation type.
pub per_request: PerRequestAggSegCtx, pub per_request: PerRequestAggSegCtx,
pub context: AggContextParams, pub context: AggContextParams,
pub column_block_accessor: ColumnBlockAccessor<u64>,
} }
impl AggregationsSegmentCtx { impl AggregationsSegmentCtx {
@@ -108,14 +107,21 @@ impl AggregationsSegmentCtx {
.as_deref() .as_deref()
.expect("range_req_data slot is empty (taken)") .expect("range_req_data slot is empty (taken)")
} }
#[inline]
pub(crate) fn get_filter_req_data(&self, idx: usize) -> &FilterAggReqData {
self.per_request.filter_req_data[idx]
.as_deref()
.expect("filter_req_data slot is empty (taken)")
}
// ---------- mutable getters ---------- // ---------- mutable getters ----------
#[inline] #[inline]
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData { pub(crate) fn get_term_req_data_mut(&mut self, idx: usize) -> &mut TermsAggReqData {
&mut self.per_request.stats_metric_req_data[idx] self.per_request.term_req_data[idx]
.as_deref_mut()
.expect("term_req_data slot is empty (taken)")
} }
#[inline] #[inline]
pub(crate) fn get_cardinality_req_data_mut( pub(crate) fn get_cardinality_req_data_mut(
&mut self, &mut self,
@@ -123,7 +129,10 @@ impl AggregationsSegmentCtx {
) -> &mut CardinalityAggReqData { ) -> &mut CardinalityAggReqData {
&mut self.per_request.cardinality_req_data[idx] &mut self.per_request.cardinality_req_data[idx]
} }
#[inline]
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
&mut self.per_request.stats_metric_req_data[idx]
}
#[inline] #[inline]
pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData { pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData {
self.per_request.histogram_req_data[idx] self.per_request.histogram_req_data[idx]
@@ -133,6 +142,21 @@ impl AggregationsSegmentCtx {
// ---------- take / put (terms, histogram, range) ---------- // ---------- take / put (terms, histogram, range) ----------
/// Move out the boxed Terms request at `idx`, leaving `None`.
#[inline]
pub(crate) fn take_term_req_data(&mut self, idx: usize) -> Box<TermsAggReqData> {
self.per_request.term_req_data[idx]
.take()
.expect("term_req_data slot is empty (taken)")
}
/// Put back a Terms request into an empty slot at `idx`.
#[inline]
pub(crate) fn put_back_term_req_data(&mut self, idx: usize, value: Box<TermsAggReqData>) {
debug_assert!(self.per_request.term_req_data[idx].is_none());
self.per_request.term_req_data[idx] = Some(value);
}
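The take/put-back accessors above implement a common slot pattern: request data lives in `Option<Box<T>>` slots so a collector can move it out (leaving `None`), mutate it without holding a borrow on the whole context, and return it afterwards. A self-contained sketch with stand-in types:

```rust
struct Slots {
    term_req_data: Vec<Option<Box<String>>>,
}

impl Slots {
    // Move the data out, leaving the slot empty.
    fn take(&mut self, idx: usize) -> Box<String> {
        self.term_req_data[idx].take().expect("slot is empty (taken)")
    }
    // Return the data to its (necessarily empty) slot.
    fn put_back(&mut self, idx: usize, value: Box<String>) {
        debug_assert!(self.term_req_data[idx].is_none());
        self.term_req_data[idx] = Some(value);
    }
}

fn main() {
    let mut slots = Slots { term_req_data: vec![Some(Box::new("terms".to_string()))] };
    let mut req = slots.take(0);
    req.push_str("-agg"); // mutate freely while the slot holds None
    slots.put_back(0, req);
    assert_eq!(slots.term_req_data[0].as_deref().map(String::as_str), Some("terms-agg"));
}
```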
/// Move out the boxed Histogram request at `idx`, leaving `None`. /// Move out the boxed Histogram request at `idx`, leaving `None`.
#[inline] #[inline]
pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> { pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> {
@@ -296,7 +320,6 @@ impl PerRequestAggSegCtx {
/// Convert the aggregation tree into a serializable struct representation. /// Convert the aggregation tree into a serializable struct representation.
/// Each node contains: { name, kind, children }. /// Each node contains: { name, kind, children }.
#[allow(dead_code)]
pub fn get_view_tree(&self) -> Vec<AggTreeViewNode> { pub fn get_view_tree(&self) -> Vec<AggTreeViewNode> {
fn node_to_view(node: &AggRefNode, pr: &PerRequestAggSegCtx) -> AggTreeViewNode { fn node_to_view(node: &AggRefNode, pr: &PerRequestAggSegCtx) -> AggTreeViewNode {
let mut children: Vec<AggTreeViewNode> = let mut children: Vec<AggTreeViewNode> =
@@ -322,19 +345,12 @@ impl PerRequestAggSegCtx {
pub(crate) fn build_segment_agg_collectors_root( pub(crate) fn build_segment_agg_collectors_root(
req: &mut AggregationsSegmentCtx, req: &mut AggregationsSegmentCtx,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> { ) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
build_segment_agg_collectors_generic(req, &req.per_request.agg_tree.clone()) build_segment_agg_collectors(req, &req.per_request.agg_tree.clone())
} }
pub(crate) fn build_segment_agg_collectors( pub(crate) fn build_segment_agg_collectors(
req: &mut AggregationsSegmentCtx, req: &mut AggregationsSegmentCtx,
nodes: &[AggRefNode], nodes: &[AggRefNode],
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
build_segment_agg_collectors_generic(req, nodes)
}
fn build_segment_agg_collectors_generic(
req: &mut AggregationsSegmentCtx,
nodes: &[AggRefNode],
) -> crate::Result<Box<dyn SegmentAggregationCollector>> { ) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
let mut collectors = Vec::new(); let mut collectors = Vec::new();
for node in nodes.iter() { for node in nodes.iter() {
@@ -357,7 +373,7 @@ pub(crate) fn build_segment_agg_collector(
node: &AggRefNode, node: &AggRefNode,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> { ) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
match node.kind { match node.kind {
AggKind::Terms => crate::aggregation::bucket::build_segment_term_collector(req, node), AggKind::Terms => build_segment_aggregation_collector(req, node),
AggKind::MissingTerm => { AggKind::MissingTerm => {
let req_data = &mut req.per_request.missing_term_req_data[node.idx_in_req_data]; let req_data = &mut req.per_request.missing_term_req_data[node.idx_in_req_data];
if req_data.accessors.is_empty() { if req_data.accessors.is_empty() {
@@ -372,8 +388,6 @@ pub(crate) fn build_segment_agg_collector(
Ok(Box::new(SegmentCardinalityCollector::from_req( Ok(Box::new(SegmentCardinalityCollector::from_req(
req_data.column_type, req_data.column_type,
node.idx_in_req_data, node.idx_in_req_data,
req_data.accessor.clone(),
req_data.missing_value_for_accessor,
))) )))
} }
AggKind::StatsKind(stats_type) => { AggKind::StatsKind(stats_type) => {
@@ -384,21 +398,20 @@ pub(crate) fn build_segment_agg_collector(
| StatsType::Count | StatsType::Count
| StatsType::Max | StatsType::Max
| StatsType::Min | StatsType::Min
| StatsType::Stats => build_segment_stats_collector(req_data), | StatsType::Stats => Ok(Box::new(SegmentStatsCollector::from_req(
StatsType::ExtendedStats(sigma) => Ok(Box::new( node.idx_in_req_data,
SegmentExtendedStatsCollector::from_req(req_data, sigma), ))),
)), StatsType::ExtendedStats(sigma) => {
StatsType::Percentiles => { Ok(Box::new(SegmentExtendedStatsCollector::from_req(
let req_data = req.get_metric_req_data_mut(node.idx_in_req_data); req_data.field_type,
Ok(Box::new( sigma,
SegmentPercentilesCollector::from_req_and_validate( node.idx_in_req_data,
req_data.field_type, req_data.missing,
req_data.missing_u64, )))
req_data.accessor.clone(),
node.idx_in_req_data,
),
))
} }
StatsType::Percentiles => Ok(Box::new(
SegmentPercentilesCollector::from_req_and_validate(node.idx_in_req_data)?,
)),
} }
} }
AggKind::TopHits => { AggKind::TopHits => {
@@ -415,8 +428,12 @@ pub(crate) fn build_segment_agg_collector(
AggKind::DateHistogram => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate( AggKind::DateHistogram => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
req, node, req, node,
)?)), )?)),
AggKind::Range => Ok(build_segment_range_collector(req, node)?), AggKind::Range => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
AggKind::Filter => build_segment_filter_collector(req, node), req, node,
)?)),
AggKind::Filter => Ok(Box::new(SegmentFilterCollector::from_req_and_validate(
req, node,
)?)),
} }
} }
@@ -476,11 +493,10 @@ pub(crate) fn build_aggregations_data_from_req(
let mut data = AggregationsSegmentCtx { let mut data = AggregationsSegmentCtx {
per_request: Default::default(), per_request: Default::default(),
context, context,
column_block_accessor: ColumnBlockAccessor::default(),
}; };
for (name, agg) in aggs.iter() { for (name, agg) in aggs.iter() {
let nodes = build_nodes(name, agg, reader, segment_ordinal, &mut data, true)?; let nodes = build_nodes(name, agg, reader, segment_ordinal, &mut data)?;
data.per_request.agg_tree.extend(nodes); data.per_request.agg_tree.extend(nodes);
} }
Ok(data) Ok(data)
@@ -492,7 +508,6 @@ fn build_nodes(
reader: &SegmentReader, reader: &SegmentReader,
segment_ordinal: SegmentOrdinal, segment_ordinal: SegmentOrdinal,
data: &mut AggregationsSegmentCtx, data: &mut AggregationsSegmentCtx,
is_top_level: bool,
) -> crate::Result<Vec<AggRefNode>> { ) -> crate::Result<Vec<AggRefNode>> {
use AggregationVariants::*; use AggregationVariants::*;
match &req.agg { match &req.agg {
@@ -505,9 +520,9 @@ fn build_nodes(
let idx_in_req_data = data.push_range_req_data(RangeAggReqData { let idx_in_req_data = data.push_range_req_data(RangeAggReqData {
accessor, accessor,
field_type, field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(), name: agg_name.to_string(),
req: range_req.clone(), req: range_req.clone(),
is_top_level,
}); });
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?; let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
Ok(vec![AggRefNode { Ok(vec![AggRefNode {
@@ -525,7 +540,9 @@ fn build_nodes(
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData { let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
accessor, accessor,
field_type, field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(), name: agg_name.to_string(),
sub_aggregation_blueprint: None,
req: histo_req.clone(), req: histo_req.clone(),
is_date_histogram: false, is_date_histogram: false,
bounds: HistogramBounds { bounds: HistogramBounds {
@@ -550,7 +567,9 @@ fn build_nodes(
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData { let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
accessor, accessor,
field_type, field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(), name: agg_name.to_string(),
sub_aggregation_blueprint: None,
req: histo_req, req: histo_req,
is_date_histogram: true, is_date_histogram: true,
bounds: HistogramBounds { bounds: HistogramBounds {
@@ -575,7 +594,6 @@ fn build_nodes(
data, data,
&req.sub_aggregation, &req.sub_aggregation,
TermsOrCardinalityRequest::Terms(terms_req.clone()), TermsOrCardinalityRequest::Terms(terms_req.clone()),
is_top_level,
), ),
Cardinality(card_req) => build_terms_or_cardinality_nodes( Cardinality(card_req) => build_terms_or_cardinality_nodes(
agg_name, agg_name,
@@ -586,7 +604,6 @@ fn build_nodes(
data, data,
&req.sub_aggregation, &req.sub_aggregation,
TermsOrCardinalityRequest::Cardinality(card_req.clone()), TermsOrCardinalityRequest::Cardinality(card_req.clone()),
is_top_level,
), ),
Average(AverageAggregation { field, missing, .. }) Average(AverageAggregation { field, missing, .. })
| Max(MaxAggregation { field, missing, .. }) | Max(MaxAggregation { field, missing, .. })
@@ -630,6 +647,7 @@ fn build_nodes(
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData { let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
accessor, accessor,
field_type, field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(), name: agg_name.to_string(),
collecting_for, collecting_for,
missing: *missing, missing: *missing,
@@ -657,6 +675,7 @@ fn build_nodes(
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData { let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
accessor, accessor,
field_type, field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(), name: agg_name.to_string(),
collecting_for: StatsType::Percentiles, collecting_for: StatsType::Percentiles,
missing: percentiles_req.missing, missing: percentiles_req.missing,
@@ -713,7 +732,7 @@ fn build_nodes(
// Build the query and evaluator upfront // Build the query and evaluator upfront
let schema = reader.schema(); let schema = reader.schema();
let tokenizers = &data.context.tokenizers; let tokenizers = &data.context.tokenizers;
let query = filter_req.parse_query(schema, tokenizers)?; let query = filter_req.parse_query(&schema, tokenizers)?;
let evaluator = crate::aggregation::bucket::DocumentQueryEvaluator::new( let evaluator = crate::aggregation::bucket::DocumentQueryEvaluator::new(
query, query,
schema.clone(), schema.clone(),
@@ -731,7 +750,6 @@ fn build_nodes(
segment_reader: reader.clone(), segment_reader: reader.clone(),
evaluator, evaluator,
matching_docs_buffer, matching_docs_buffer,
is_top_level,
}); });
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?; let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
Ok(vec![AggRefNode { Ok(vec![AggRefNode {
@@ -751,14 +769,7 @@ fn build_children(
) -> crate::Result<Vec<AggRefNode>> { ) -> crate::Result<Vec<AggRefNode>> {
let mut children = Vec::new(); let mut children = Vec::new();
for (name, agg) in aggs.iter() { for (name, agg) in aggs.iter() {
children.extend(build_nodes( children.extend(build_nodes(name, agg, reader, segment_ordinal, data)?);
name,
agg,
reader,
segment_ordinal,
data,
false,
)?);
} }
Ok(children) Ok(children)
} }
@@ -822,7 +833,6 @@ fn build_terms_or_cardinality_nodes(
data: &mut AggregationsSegmentCtx, data: &mut AggregationsSegmentCtx,
sub_aggs: &Aggregations, sub_aggs: &Aggregations,
req: TermsOrCardinalityRequest, req: TermsOrCardinalityRequest,
is_top_level: bool,
) -> crate::Result<Vec<AggRefNode>> { ) -> crate::Result<Vec<AggRefNode>> {
let mut nodes = Vec::new(); let mut nodes = Vec::new();
@@ -874,12 +884,12 @@ fn build_terms_or_cardinality_nodes(
}); });
} }
// Add one node per accessor // Add one node per accessor to mirror previous behavior and allow per-type missing handling.
for (accessor, column_type) in column_and_types { for (accessor, column_type) in column_and_types {
let missing_value_for_accessor = if use_special_missing_agg { let missing_value_for_accessor = if use_special_missing_agg {
None None
} else if let Some(m) = missing.as_ref() { } else if let Some(m) = missing.as_ref() {
get_missing_val_as_u64_lenient(column_type, accessor.max_value(), m, field_name)? get_missing_val_as_u64_lenient(column_type, m, field_name)?
} else { } else {
None None
}; };
@@ -905,11 +915,13 @@ fn build_terms_or_cardinality_nodes(
column_type, column_type,
str_dict_column: str_dict_column.clone(), str_dict_column: str_dict_column.clone(),
missing_value_for_accessor, missing_value_for_accessor,
column_block_accessor: Default::default(),
name: agg_name.to_string(), name: agg_name.to_string(),
req: TermsAggregationInternal::from_req(req), req: TermsAggregationInternal::from_req(req),
// Will be filled later when building collectors
sub_aggregation_blueprint: None,
sug_aggregations: sub_aggs.clone(), sug_aggregations: sub_aggs.clone(),
allowed_term_ids, allowed_term_ids,
is_top_level,
}); });
(idx_in_req_data, AggKind::Terms) (idx_in_req_data, AggKind::Terms)
} }
@@ -919,6 +931,7 @@ fn build_terms_or_cardinality_nodes(
column_type, column_type,
str_dict_column: str_dict_column.clone(), str_dict_column: str_dict_column.clone(),
missing_value_for_accessor, missing_value_for_accessor,
column_block_accessor: Default::default(),
name: agg_name.to_string(), name: agg_name.to_string(),
req: req.clone(), req: req.clone(),
}); });

View File

@@ -35,7 +35,6 @@ pub struct AggregationLimitsGuard {
/// Allocated memory with this guard. /// Allocated memory with this guard.
allocated_with_the_guard: u64, allocated_with_the_guard: u64,
} }
impl Clone for AggregationLimitsGuard { impl Clone for AggregationLimitsGuard {
fn clone(&self) -> Self { fn clone(&self) -> Self {
Self { Self {

View File

@@ -16,7 +16,7 @@ use super::{AggregationError, Key};
use crate::TantivyError; use crate::TantivyError;
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)] #[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
/// The final aggregation result. /// The final aggegation result.
pub struct AggregationResults(pub FxHashMap<String, AggregationResult>); pub struct AggregationResults(pub FxHashMap<String, AggregationResult>);
impl AggregationResults { impl AggregationResults {

View File

@@ -2,441 +2,15 @@ use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations}; use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
use crate::aggregation::collector::AggregationCollector; use crate::aggregation::collector::AggregationCollector;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults; use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms}; use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
use crate::aggregation::DistributedAggregationCollector; use crate::aggregation::DistributedAggregationCollector;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST}; use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term}; use crate::{Index, IndexWriter, Term};
// The following tests ensure that each bucket aggregation type correctly functions as a
// sub-aggregation of another bucket aggregation in two scenarios:
// 1) The parent has more buckets than the child sub-aggregation
// 2) The child sub-aggregation has more buckets than the parent
//
// These scenarios exercise the bucket id mapping and sub-aggregation routing logic.
#[test]
fn test_terms_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with 4 buckets
// Child: terms on text -> 2 buckets
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
// Exact expected structure and counts
assert_eq!(
res["parent_range"]["buckets"],
json!([
{
"key": "*-3",
"doc_count": 1,
"to": 3.0,
"child_terms": {
"buckets": [
{"doc_count": 1, "key": "cool"}
],
"sum_other_doc_count": 0
}
},
{
"key": "3-7",
"doc_count": 3,
"from": 3.0,
"to": 7.0,
"child_terms": {
"buckets": [
{"doc_count": 2, "key": "cool"},
{"doc_count": 1, "key": "nohit"}
],
"sum_other_doc_count": 0
}
},
{
"key": "7-20",
"doc_count": 3,
"from": 7.0,
"to": 20.0,
"child_terms": {
"buckets": [
{"doc_count": 3, "key": "cool"}
],
"sum_other_doc_count": 0
}
},
{
"key": "20-*",
"doc_count": 2,
"from": 20.0,
"child_terms": {
"buckets": [
{"doc_count": 1, "key": "cool"},
{"doc_count": 1, "key": "nohit"}
],
"sum_other_doc_count": 0
}
}
])
);
// Case B: child has more buckets than parent
// Parent: histogram on score with large interval -> 1 bucket
// Child: terms on text -> 2 buckets (cool/nohit)
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_hist": {
"histogram": {"field": "score", "interval": 100.0},
"aggs": {
"child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
assert_eq!(
res["parent_hist"],
json!({
"buckets": [
{
"key": 0.0,
"doc_count": 9,
"child_terms": {
"buckets": [
{"doc_count": 7, "key": "cool"},
{"doc_count": 2, "key": "nohit"}
],
"sum_other_doc_count": 0
}
}
]
})
);
Ok(())
}
#[test]
fn test_range_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with 5 buckets
// Child: coarse range with 3 buckets
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 11.0},
{"from": 11.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 20.0}
]
}
}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
assert_eq!(
res["parent_range"]["buckets"],
json!([
{"key": "*-3", "doc_count": 1, "to": 3.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 1, "to": 3.0},
{"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 3, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 1, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 2, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "20-*", "doc_count": 2, "from": 20.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 2, "from": 20.0}
]}
}
])
);
// Case B: child has more buckets than parent
// Parent: terms on text (2 buckets)
// Child: range with 4 buckets
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_terms": {
"terms": {"field": "text"},
"aggs": {
"child_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 20.0}
]
}
}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
assert_eq!(
res["parent_terms"],
json!({
"buckets": [
{
"key": "cool",
"doc_count": 7,
"child_range": {
"buckets": [
{"key": "*-3", "doc_count": 1, "to": 3.0},
{"key": "3-7", "doc_count": 2, "from": 3.0, "to": 7.0},
{"key": "7-20", "doc_count": 3, "from": 7.0, "to": 20.0},
{"key": "20-*", "doc_count": 1, "from": 20.0}
]
}
},
{
"key": "nohit",
"doc_count": 2,
"child_range": {
"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-7", "doc_count": 1, "from": 3.0, "to": 7.0},
{"key": "7-20", "doc_count": 0, "from": 7.0, "to": 20.0},
{"key": "20-*", "doc_count": 1, "from": 20.0}
]
}
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
})
);
Ok(())
}
#[test]
fn test_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with several ranges
// Child: histogram with large interval (single bucket per parent)
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 11.0},
{"from": 11.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_hist": {"histogram": {"field": "score", "interval": 100.0}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
assert_eq!(
res["parent_range"]["buckets"],
json!([
{"key": "*-3", "doc_count": 1, "to": 3.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]}
},
{"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 3} ]}
},
{"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]}
},
{"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]}
},
{"key": "20-*", "doc_count": 2, "from": 20.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]}
}
])
);
// Case B: child has more buckets than parent
// Parent: terms on text -> 2 buckets
// Child: histogram with small interval -> multiple buckets including empties
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_terms": {
"terms": {"field": "text"},
"aggs": {
"child_hist": {"histogram": {"field": "score", "interval": 10.0}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
assert_eq!(
res["parent_terms"],
json!({
"buckets": [
{
"key": "cool",
"doc_count": 7,
"child_hist": {
"buckets": [
{"key": 0.0, "doc_count": 4},
{"key": 10.0, "doc_count": 2},
{"key": 20.0, "doc_count": 0},
{"key": 30.0, "doc_count": 0},
{"key": 40.0, "doc_count": 1}
]
}
},
{
"key": "nohit",
"doc_count": 2,
"child_hist": {
"buckets": [
{"key": 0.0, "doc_count": 1},
{"key": 10.0, "doc_count": 0},
{"key": 20.0, "doc_count": 0},
{"key": 30.0, "doc_count": 0},
{"key": 40.0, "doc_count": 1}
]
}
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
})
);
Ok(())
}
#[test]
fn test_date_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with several buckets
// Child: date_histogram with 30d -> single bucket per parent
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 11.0},
{"from": 11.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "30d"}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
let buckets = res["parent_range"]["buckets"].as_array().unwrap();
// Verify each parent bucket has exactly one child date bucket with matching doc_count
for bucket in buckets {
let parent_count = bucket["doc_count"].as_u64().unwrap();
let child_buckets = bucket["child_date_hist"]["buckets"].as_array().unwrap();
assert_eq!(child_buckets.len(), 1);
assert_eq!(child_buckets[0]["doc_count"], parent_count);
}
// Case B: child has more buckets than parent
// Parent: terms on text (2 buckets)
// Child: date_histogram with 1d -> multiple buckets
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_terms": {
"terms": {"field": "text"},
"aggs": {
"child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "1d"}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
let buckets = res["parent_terms"]["buckets"].as_array().unwrap();
// cool bucket
assert_eq!(buckets[0]["key"], "cool");
let cool_buckets = buckets[0]["child_date_hist"]["buckets"].as_array().unwrap();
assert_eq!(cool_buckets.len(), 3);
assert_eq!(cool_buckets[0]["doc_count"], 1); // day 0
assert_eq!(cool_buckets[1]["doc_count"], 4); // day 1
assert_eq!(cool_buckets[2]["doc_count"], 2); // day 2
// nohit bucket
assert_eq!(buckets[1]["key"], "nohit");
let nohit_buckets = buckets[1]["child_date_hist"]["buckets"].as_array().unwrap();
assert_eq!(nohit_buckets.len(), 2);
assert_eq!(nohit_buckets[0]["doc_count"], 1); // day 1
assert_eq!(nohit_buckets[1]["doc_count"], 1); // day 2
Ok(())
}
fn get_avg_req(field_name: &str) -> Aggregation { fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({ serde_json::from_value(json!({
"avg": { "avg": {
@@ -451,10 +25,6 @@ fn get_collector(agg_req: Aggregations) -> AggregationCollector {
} }
// *** EVERY BUCKET-TYPE SHOULD BE TESTED HERE *** // *** EVERY BUCKET-TYPE SHOULD BE TESTED HERE ***
// Note: The flushing part of these tests is outdated, since the buffering changed after converting
// the collection into one collector per request instead of per bucket.
//
// However, they are still useful as they test complex aggregation requests.
fn test_aggregation_flushing( fn test_aggregation_flushing(
merge_segments: bool, merge_segments: bool,
use_distributed_collector: bool, use_distributed_collector: bool,
@@ -467,9 +37,8 @@ fn test_aggregation_flushing(
let reader = index.reader()?; let reader = index.reader()?;
assert_eq!(COLLECT_BLOCK_BUFFER_LEN, 64); assert_eq!(DOC_BLOCK_SIZE, 64);
// In the tree we cache documents of COLLECT_BLOCK_BUFFER_LEN before passing them down as one // In the tree we cache Documents of DOC_BLOCK_SIZE, before passing them down as one block.
// block.
// //
// Build a request so that on the first level we have one full cache, which is then flushed. // Build a request so that on the first level we have one full cache, which is then flushed.
// The same cache should have some residue docs at the end, which are flushed (Range 0-70) // The same cache should have some residue docs at the end, which are flushed (Range 0-70)


@@ -6,14 +6,10 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::aggregation::agg_data::{ use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx, build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
}; };
use crate::aggregation::cached_sub_aggs::{
CachedSubAggs, HighCardSubAggCache, LowCardSubAggCache, SubAggCache,
};
use crate::aggregation::intermediate_agg_result::{ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
}; };
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector}; use crate::aggregation::segment_agg_result::{CollectorClone, SegmentAggregationCollector};
use crate::aggregation::BucketId;
use crate::docset::DocSet; use crate::docset::DocSet;
use crate::query::{AllQuery, EnableScoring, Query, QueryParser}; use crate::query::{AllQuery, EnableScoring, Query, QueryParser};
use crate::schema::Schema; use crate::schema::Schema;
@@ -36,7 +32,7 @@ use crate::{DocId, SegmentReader, TantivyError};
/// ///
/// # Implementation Requirements /// # Implementation Requirements
/// ///
/// Implementers must: /// Implementors must:
/// 1. Derive `Debug`, `Clone`, `Serialize`, and `Deserialize` /// 1. Derive `Debug`, `Clone`, `Serialize`, and `Deserialize`
/// 2. Use `#[typetag::serde]` attribute on the impl block /// 2. Use `#[typetag::serde]` attribute on the impl block
/// 3. Implement `build_query()` to construct the query from schema/tokenizers /// 3. Implement `build_query()` to construct the query from schema/tokenizers
@@ -408,18 +404,15 @@ pub struct FilterAggReqData {
pub evaluator: DocumentQueryEvaluator, pub evaluator: DocumentQueryEvaluator,
/// Reusable buffer for matching documents to minimize allocations during collection /// Reusable buffer for matching documents to minimize allocations during collection
pub matching_docs_buffer: Vec<DocId>, pub matching_docs_buffer: Vec<DocId>,
/// True if this filter aggregation is at the top level of the aggregation tree (not nested).
pub is_top_level: bool,
} }
impl FilterAggReqData { impl FilterAggReqData {
pub(crate) fn get_memory_consumption(&self) -> usize { pub(crate) fn get_memory_consumption(&self) -> usize {
// Estimate: name + segment reader reference + bitset + buffer capacity // Estimate: name + segment reader reference + bitset + buffer capacity
self.name.len() self.name.len()
+ std::mem::size_of::<SegmentReader>() + std::mem::size_of::<SegmentReader>()
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes) + self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>() + self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
+ std::mem::size_of::<bool>()
} }
} }
@@ -496,24 +489,17 @@ impl Debug for DocumentQueryEvaluator {
} }
} }
#[derive(Debug, Clone, PartialEq, Copy)]
struct DocCount {
doc_count: u64,
bucket_id: BucketId,
}
/// Segment collector for filter aggregation /// Segment collector for filter aggregation
pub struct SegmentFilterCollector<C: SubAggCache> { pub struct SegmentFilterCollector {
/// Document counts per parent bucket /// Document count in this bucket
parent_buckets: Vec<DocCount>, doc_count: u64,
/// Sub-aggregation collectors /// Sub-aggregation collectors
sub_aggregations: Option<CachedSubAggs<C>>, sub_aggregations: Option<Box<dyn SegmentAggregationCollector>>,
bucket_id_provider: BucketIdProvider,
/// Accessor index for this filter aggregation (to access FilterAggReqData) /// Accessor index for this filter aggregation (to access FilterAggReqData)
accessor_idx: usize, accessor_idx: usize,
} }
impl<C: SubAggCache> SegmentFilterCollector<C> { impl SegmentFilterCollector {
/// Create a new filter segment collector following the new agg_data pattern /// Create a new filter segment collector following the new agg_data pattern
pub(crate) fn from_req_and_validate( pub(crate) fn from_req_and_validate(
req: &mut AggregationsSegmentCtx, req: &mut AggregationsSegmentCtx,
@@ -525,75 +511,47 @@ impl<C: SubAggCache> SegmentFilterCollector<C> {
} else { } else {
None None
}; };
let sub_agg_collector = sub_agg_collector.map(CachedSubAggs::new);
Ok(SegmentFilterCollector { Ok(SegmentFilterCollector {
parent_buckets: Vec::new(), doc_count: 0,
sub_aggregations: sub_agg_collector, sub_aggregations: sub_agg_collector,
accessor_idx: node.idx_in_req_data, accessor_idx: node.idx_in_req_data,
bucket_id_provider: BucketIdProvider::default(),
}) })
} }
} }
pub(crate) fn build_segment_filter_collector( impl Debug for SegmentFilterCollector {
req: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
let is_top_level = req.per_request.filter_req_data[node.idx_in_req_data]
.as_ref()
.expect("filter_req_data slot is empty")
.is_top_level;
if is_top_level {
Ok(Box::new(
SegmentFilterCollector::<LowCardSubAggCache>::from_req_and_validate(req, node)?,
))
} else {
Ok(Box::new(
SegmentFilterCollector::<HighCardSubAggCache>::from_req_and_validate(req, node)?,
))
}
}
impl<C: SubAggCache> Debug for SegmentFilterCollector<C> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentFilterCollector") f.debug_struct("SegmentFilterCollector")
.field("buckets", &self.parent_buckets) .field("doc_count", &self.doc_count)
.field("has_sub_aggs", &self.sub_aggregations.is_some()) .field("has_sub_aggs", &self.sub_aggregations.is_some())
.field("accessor_idx", &self.accessor_idx) .field("accessor_idx", &self.accessor_idx)
.finish() .finish()
} }
} }
impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> { impl CollectorClone for SegmentFilterCollector {
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector> {
// For now, panic - this needs proper implementation with weight recreation
panic!("SegmentFilterCollector cloning not yet implemented - requires weight recreation")
}
}
impl SegmentAggregationCollector for SegmentFilterCollector {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut sub_results = IntermediateAggregationResults::default(); let mut sub_results = IntermediateAggregationResults::default();
let bucket_opt = self.parent_buckets.get(parent_bucket_id as usize);
if let Some(sub_aggs) = &mut self.sub_aggregations { if let Some(sub_aggs) = self.sub_aggregations {
sub_aggs sub_aggs.add_intermediate_aggregation_result(agg_data, &mut sub_results)?;
.get_sub_agg_collector()
.add_intermediate_aggregation_result(
agg_data,
&mut sub_results,
// Here we create a new bucket ID for sub-aggregations if the bucket doesn't
// exist, so that sub-aggregations can still produce results (e.g., zero doc
// count)
bucket_opt
.map(|bucket| bucket.bucket_id)
.unwrap_or(self.bucket_id_provider.next_bucket_id()),
)?;
} }
// Create the filter bucket result // Create the filter bucket result
let filter_bucket_result = IntermediateBucketResult::Filter { let filter_bucket_result = IntermediateBucketResult::Filter {
doc_count: bucket_opt.map(|b| b.doc_count).unwrap_or(0), doc_count: self.doc_count,
sub_aggregations: sub_results, sub_aggregations: sub_results,
}; };
@@ -612,17 +570,32 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
Ok(()) Ok(())
} }
fn collect( fn collect(&mut self, doc: DocId, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
// Access the evaluator from FilterAggReqData
let req_data = agg_data.get_filter_req_data(self.accessor_idx);
// O(1) BitSet lookup to check if document matches filter
if req_data.evaluator.matches_document(doc) {
self.doc_count += 1;
// If we have sub-aggregations, collect on them for this filtered document
if let Some(sub_aggs) = &mut self.sub_aggregations {
sub_aggs.collect(doc, agg_data)?;
}
}
Ok(())
}
#[inline]
fn collect_block(
&mut self, &mut self,
parent_bucket_id: BucketId, docs: &[DocId],
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
if docs.is_empty() { if docs.is_empty() {
return Ok(()); return Ok(());
} }
let mut bucket = self.parent_buckets[parent_bucket_id as usize];
// Take the request data to avoid borrow checker issues with sub-aggregations // Take the request data to avoid borrow checker issues with sub-aggregations
let mut req = agg_data.take_filter_req_data(self.accessor_idx); let mut req = agg_data.take_filter_req_data(self.accessor_idx);
@@ -631,24 +604,18 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
req.evaluator req.evaluator
.filter_batch(docs, &mut req.matching_docs_buffer); .filter_batch(docs, &mut req.matching_docs_buffer);
bucket.doc_count += req.matching_docs_buffer.len() as u64; self.doc_count += req.matching_docs_buffer.len() as u64;
// Batch process sub-aggregations if we have matches // Batch process sub-aggregations if we have matches
if !req.matching_docs_buffer.is_empty() { if !req.matching_docs_buffer.is_empty() {
if let Some(sub_aggs) = &mut self.sub_aggregations { if let Some(sub_aggs) = &mut self.sub_aggregations {
for &doc_id in &req.matching_docs_buffer { // Use collect_block for better sub-aggregation performance
sub_aggs.push(bucket.bucket_id, doc_id); sub_aggs.collect_block(&req.matching_docs_buffer, agg_data)?;
}
} }
} }
// Put the request data back // Put the request data back
agg_data.put_back_filter_req_data(self.accessor_idx, req); agg_data.put_back_filter_req_data(self.accessor_idx, req);
if let Some(sub_aggs) = &mut self.sub_aggregations {
sub_aggs.check_flush_local(agg_data)?;
}
// put back bucket
self.parent_buckets[parent_bucket_id as usize] = bucket;
Ok(()) Ok(())
} }
@@ -659,21 +626,6 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
} }
Ok(()) Ok(())
} }
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
while self.parent_buckets.len() <= max_bucket as usize {
let bucket_id = self.bucket_id_provider.next_bucket_id();
self.parent_buckets.push(DocCount {
doc_count: 0,
bucket_id,
});
}
Ok(())
}
} }
/// Intermediate result for filter aggregation /// Intermediate result for filter aggregation
@@ -687,14 +639,16 @@ pub struct IntermediateFilterBucketResult {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::time::Instant;
use serde_json::{json, Value}; use serde_json::{json, Value};
use super::*; use super::*;
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::{AggContextParams, AggregationCollector}; use crate::aggregation::{AggContextParams, AggregationCollector};
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, QueryParser, TermQuery};
use crate::schema::{IndexRecordOption, Schema, Term, FAST, INDEXED, TEXT}; use crate::schema::{IndexRecordOption, Schema, Term, FAST, INDEXED, STORED, TEXT};
use crate::{doc, Index, IndexWriter}; use crate::{doc, Index, IndexWriter};
// Test helper functions // Test helper functions
@@ -775,13 +729,12 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests()?; let mut writer: IndexWriter = index.writer(50_000_000)?;
writer.add_document(doc!( writer.add_document(doc!(
category => "electronics", brand => "apple", category => "electronics", brand => "apple",
price => 999u64, rating => 4.5f64, in_stock => true price => 999u64, rating => 4.5f64, in_stock => true
))?; ))?;
writer.commit()?;
writer.add_document(doc!( writer.add_document(doc!(
category => "electronics", brand => "samsung", category => "electronics", brand => "samsung",
price => 799u64, rating => 4.2f64, in_stock => true price => 799u64, rating => 4.2f64, in_stock => true
@@ -985,7 +938,7 @@ mod tests {
let index = create_standard_test_index()?; let index = create_standard_test_index()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let agg = json!({ let agg = json!({
"premium_electronics": { "premium_electronics": {
"filter": "category:electronics AND price:[800 TO *]", "filter": "category:electronics AND price:[800 TO *]",
@@ -1567,9 +1520,9 @@ mod tests {
let searcher = reader.searcher(); let searcher = reader.searcher();
let agg = json!({ let agg = json!({
"test": { "test": {
"filter": deserialized, "filter": deserialized,
"aggs": { "count": { "value_count": { "field": "brand" } } } "aggs": { "count": { "value_count": { "field": "brand" } } }
} }
}); });


@@ -1,6 +1,6 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use columnar::{Column, ColumnType}; use columnar::{Column, ColumnBlockAccessor, ColumnType};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tantivy_bitpacker::minmax; use tantivy_bitpacker::minmax;
@@ -8,14 +8,14 @@ use tantivy_bitpacker::minmax;
use crate::aggregation::agg_data::{ use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx, build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
}; };
use crate::aggregation::agg_limits::MemoryConsumption;
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::BucketEntry; use crate::aggregation::agg_result::BucketEntry;
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
use crate::aggregation::intermediate_agg_result::{ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateHistogramBucketEntry, IntermediateHistogramBucketEntry,
}; };
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector}; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*; use crate::aggregation::*;
use crate::TantivyError; use crate::TantivyError;
@@ -26,8 +26,13 @@ pub struct HistogramAggReqData {
pub accessor: Column<u64>, pub accessor: Column<u64>,
/// The field type of the fast field. /// The field type of the fast field.
pub field_type: ColumnType, pub field_type: ColumnType,
/// The column block accessor to access the fast field values.
pub column_block_accessor: ColumnBlockAccessor<u64>,
/// The name of the aggregation. /// The name of the aggregation.
pub name: String, pub name: String,
/// The sub aggregation blueprint, used to create sub aggregations for each bucket.
/// Will be filled during initialization of the collector.
pub sub_aggregation_blueprint: Option<Box<dyn SegmentAggregationCollector>>,
/// The histogram aggregation request. /// The histogram aggregation request.
pub req: HistogramAggregation, pub req: HistogramAggregation,
/// True if this is a date_histogram aggregation. /// True if this is a date_histogram aggregation.
@@ -252,24 +257,18 @@ impl HistogramBounds {
pub(crate) struct SegmentHistogramBucketEntry { pub(crate) struct SegmentHistogramBucketEntry {
pub key: f64, pub key: f64,
pub doc_count: u64, pub doc_count: u64,
pub bucket_id: BucketId,
} }
impl SegmentHistogramBucketEntry { impl SegmentHistogramBucketEntry {
pub(crate) fn into_intermediate_bucket_entry( pub(crate) fn into_intermediate_bucket_entry(
self, self,
sub_aggregation: &mut Option<HighCardCachedSubAggs>, sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
) -> crate::Result<IntermediateHistogramBucketEntry> { ) -> crate::Result<IntermediateHistogramBucketEntry> {
let mut sub_aggregation_res = IntermediateAggregationResults::default(); let mut sub_aggregation_res = IntermediateAggregationResults::default();
if let Some(sub_aggregation) = sub_aggregation { if let Some(sub_aggregation) = sub_aggregation {
sub_aggregation sub_aggregation
.get_sub_agg_collector() .add_intermediate_aggregation_result(agg_data, &mut sub_aggregation_res)?;
.add_intermediate_aggregation_result(
agg_data,
&mut sub_aggregation_res,
self.bucket_id,
)?;
} }
Ok(IntermediateHistogramBucketEntry { Ok(IntermediateHistogramBucketEntry {
key: self.key, key: self.key,
@@ -279,38 +278,27 @@ impl SegmentHistogramBucketEntry {
} }
} }
#[derive(Clone, Debug, Default)]
struct HistogramBuckets {
pub buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
}
/// The collector puts values from the fast field into the correct buckets and does a conversion to /// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype. /// the correct datatype.
#[derive(Debug)] #[derive(Clone, Debug)]
pub struct SegmentHistogramCollector { pub struct SegmentHistogramCollector {
/// The buckets containing the aggregation data. /// The buckets containing the aggregation data.
/// One Histogram bucket per parent bucket id. buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
parent_buckets: Vec<HistogramBuckets>, sub_aggregations: FxHashMap<i64, Box<dyn SegmentAggregationCollector>>,
sub_agg: Option<HighCardCachedSubAggs>,
accessor_idx: usize, accessor_idx: usize,
bucket_id_provider: BucketIdProvider,
} }
impl SegmentAggregationCollector for SegmentHistogramCollector { impl SegmentAggregationCollector for SegmentHistogramCollector {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
let name = agg_data let name = agg_data
.get_histogram_req_data(self.accessor_idx) .get_histogram_req_data(self.accessor_idx)
.name .name
.clone(); .clone();
// TODO: avoid prepare_max_bucket here and handle empty buckets. let bucket = self.into_intermediate_bucket_result(agg_data)?;
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let histogram = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
let bucket = self.add_intermediate_bucket_result(agg_data, histogram)?;
results.push(name, IntermediateAggregationResult::Bucket(bucket))?; results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(()) Ok(())
@@ -319,40 +307,44 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
#[inline] #[inline]
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId], docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
let req = agg_data.take_histogram_req_data(self.accessor_idx); let mut req = agg_data.take_histogram_req_data(self.accessor_idx);
let mem_pre = self.get_memory_consumption(); let mem_pre = self.get_memory_consumption();
let buckets = &mut self.parent_buckets[parent_bucket_id as usize].buckets;
let bounds = req.bounds; let bounds = req.bounds;
let interval = req.req.interval; let interval = req.req.interval;
let offset = req.offset; let offset = req.offset;
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64; let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
agg_data req.column_block_accessor.fetch_block(docs, &req.accessor);
.column_block_accessor for (doc, val) in req
.fetch_block(docs, &req.accessor);
for (doc, val) in agg_data
.column_block_accessor .column_block_accessor
.iter_docid_vals(docs, &req.accessor) .iter_docid_vals(docs, &req.accessor)
{ {
let val = f64_from_fastfield_u64(val, req.field_type); let val = f64_from_fastfield_u64(val, &req.field_type);
let bucket_pos = get_bucket_pos(val); let bucket_pos = get_bucket_pos(val);
if bounds.contains(val) { if bounds.contains(val) {
let bucket = buckets.entry(bucket_pos).or_insert_with(|| { let bucket = self.buckets.entry(bucket_pos).or_insert_with(|| {
let key = get_bucket_key_from_pos(bucket_pos as f64, interval, offset); let key = get_bucket_key_from_pos(bucket_pos as f64, interval, offset);
SegmentHistogramBucketEntry { SegmentHistogramBucketEntry { key, doc_count: 0 }
key,
doc_count: 0,
bucket_id: self.bucket_id_provider.next_bucket_id(),
}
}); });
bucket.doc_count += 1; bucket.doc_count += 1;
if let Some(sub_agg) = &mut self.sub_agg { if let Some(sub_aggregation_blueprint) = req.sub_aggregation_blueprint.as_ref() {
sub_agg.push(bucket.bucket_id, doc); self.sub_aggregations
.entry(bucket_pos)
.or_insert_with(|| sub_aggregation_blueprint.clone())
.collect(doc, agg_data)?;
} }
} }
} }
@@ -366,30 +358,14 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
.add_memory_consumed(mem_delta as u64)?; .add_memory_consumed(mem_delta as u64)?;
} }
if let Some(sub_agg) = &mut self.sub_agg {
sub_agg.check_flush_local(agg_data)?;
}
Ok(()) Ok(())
} }
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> { fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
if let Some(sub_aggregation) = &mut self.sub_agg { for sub_aggregation in self.sub_aggregations.values_mut() {
sub_aggregation.flush(agg_data)?; sub_aggregation.flush(agg_data)?;
} }
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
while self.parent_buckets.len() <= max_bucket as usize {
self.parent_buckets.push(HistogramBuckets {
buckets: FxHashMap::default(),
});
}
Ok(()) Ok(())
} }
} }
@@ -397,19 +373,22 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
impl SegmentHistogramCollector { impl SegmentHistogramCollector {
fn get_memory_consumption(&self) -> usize { fn get_memory_consumption(&self) -> usize {
let self_mem = std::mem::size_of::<Self>(); let self_mem = std::mem::size_of::<Self>();
let buckets_mem = self.parent_buckets.len() * std::mem::size_of::<HistogramBuckets>(); let sub_aggs_mem = self.sub_aggregations.memory_consumption();
self_mem + buckets_mem let buckets_mem = self.buckets.memory_consumption();
self_mem + sub_aggs_mem + buckets_mem
} }
/// Converts the collector result into a intermediate bucket result. /// Converts the collector result into a intermediate bucket result.
fn add_intermediate_bucket_result( pub fn into_intermediate_bucket_result(
&mut self, self,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
histogram: HistogramBuckets,
) -> crate::Result<IntermediateBucketResult> { ) -> crate::Result<IntermediateBucketResult> {
let mut buckets = Vec::with_capacity(histogram.buckets.len()); let mut buckets = Vec::with_capacity(self.buckets.len());
for bucket in histogram.buckets.into_values() { for (bucket_pos, bucket) in self.buckets {
let bucket_res = bucket.into_intermediate_bucket_entry(&mut self.sub_agg, agg_data); let bucket_res = bucket.into_intermediate_bucket_entry(
self.sub_aggregations.get(&bucket_pos).cloned(),
agg_data,
);
buckets.push(bucket_res?); buckets.push(bucket_res?);
} }
@@ -429,7 +408,7 @@ impl SegmentHistogramCollector {
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
node: &AggRefNode, node: &AggRefNode,
) -> crate::Result<Self> { ) -> crate::Result<Self> {
let sub_agg = if !node.children.is_empty() { let blueprint = if !node.children.is_empty() {
Some(build_segment_agg_collectors(agg_data, &node.children)?) Some(build_segment_agg_collectors(agg_data, &node.children)?)
} else { } else {
None None
@@ -444,13 +423,13 @@ impl SegmentHistogramCollector {
max: f64::MAX, max: f64::MAX,
}); });
req_data.offset = req_data.req.offset.unwrap_or(0.0); req_data.offset = req_data.req.offset.unwrap_or(0.0);
let sub_agg = sub_agg.map(CachedSubAggs::new);
req_data.sub_aggregation_blueprint = blueprint;
Ok(Self { Ok(Self {
parent_buckets: Default::default(), buckets: Default::default(),
sub_agg, sub_aggregations: Default::default(),
accessor_idx: node.idx_in_req_data, accessor_idx: node.idx_in_req_data,
bucket_id_provider: BucketIdProvider::default(),
}) })
} }
} }


@@ -1,22 +1,18 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::ops::Range; use std::ops::Range;
use columnar::{Column, ColumnType}; use columnar::{Column, ColumnBlockAccessor, ColumnType};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::aggregation::agg_data::{ use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx, build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
}; };
use crate::aggregation::agg_limits::AggregationLimitsGuard;
use crate::aggregation::cached_sub_aggs::{
CachedSubAggs, HighCardSubAggCache, LowCardCachedSubAggs, LowCardSubAggCache, SubAggCache,
};
use crate::aggregation::intermediate_agg_result::{ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateRangeBucketEntry, IntermediateRangeBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
}; };
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector}; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*; use crate::aggregation::*;
use crate::TantivyError; use crate::TantivyError;
@@ -27,12 +23,12 @@ pub struct RangeAggReqData {
pub accessor: Column<u64>, pub accessor: Column<u64>,
/// The type of the fast field. /// The type of the fast field.
pub field_type: ColumnType, pub field_type: ColumnType,
/// The column block accessor to access the fast field values.
pub column_block_accessor: ColumnBlockAccessor<u64>,
/// The range aggregation request. /// The range aggregation request.
pub req: RangeAggregation, pub req: RangeAggregation,
/// The name of the aggregation. /// The name of the aggregation.
pub name: String, pub name: String,
/// Whether this is a top-level aggregation.
pub is_top_level: bool,
} }
impl RangeAggReqData { impl RangeAggReqData {
@@ -155,47 +151,19 @@ pub(crate) struct SegmentRangeAndBucketEntry {
/// The collector puts values from the fast field into the correct buckets and does a conversion to /// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype. /// the correct datatype.
pub struct SegmentRangeCollector<C: SubAggCache> { #[derive(Clone, Debug)]
pub struct SegmentRangeCollector {
/// The buckets containing the aggregation data. /// The buckets containing the aggregation data.
/// One for each ParentBucketId buckets: Vec<SegmentRangeAndBucketEntry>,
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
column_type: ColumnType, column_type: ColumnType,
pub(crate) accessor_idx: usize, pub(crate) accessor_idx: usize,
sub_agg: Option<CachedSubAggs<C>>,
/// Here things get a bit weird. We need to assign unique bucket ids across all
/// parent buckets. So we keep track of the next available bucket id here.
/// This allows a kind of flattening of the bucket ids across all parent buckets.
/// E.g. in nested aggregations:
/// Term Agg -> Range aggregation -> Stats aggregation
/// E.g. the Term Agg creates 3 buckets ["INFO", "ERROR", "WARN"], each of these has a Range
/// aggregation with 4 buckets. The Range aggregation will create buckets with ids:
/// - INFO: 0,1,2,3
/// - ERROR: 4,5,6,7
/// - WARN: 8,9,10,11
///
/// This allows the Stats aggregation to have unique bucket ids to refer to.
bucket_id_provider: BucketIdProvider,
limits: AggregationLimitsGuard,
} }
impl<C: SubAggCache> Debug for SegmentRangeCollector<C> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentRangeCollector")
.field("parent_buckets_len", &self.parent_buckets.len())
.field("column_type", &self.column_type)
.field("accessor_idx", &self.accessor_idx)
.field("has_sub_agg", &self.sub_agg.is_some())
.finish()
}
}
/// TODO: Bad naming, there's also SegmentRangeAndBucketEntry
#[derive(Clone)] #[derive(Clone)]
pub(crate) struct SegmentRangeBucketEntry { pub(crate) struct SegmentRangeBucketEntry {
pub key: Key, pub key: Key,
pub doc_count: u64, pub doc_count: u64,
// pub sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>, pub sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
pub bucket_id: BucketId,
/// The from range of the bucket. Equals `f64::MIN` when `None`. /// The from range of the bucket. Equals `f64::MIN` when `None`.
pub from: Option<f64>, pub from: Option<f64>,
/// The to range of the bucket. Equals `f64::MAX` when `None`. Open interval, `to` is not /// The to range of the bucket. Equals `f64::MAX` when `None`. Open interval, `to` is not
@@ -216,50 +184,48 @@ impl Debug for SegmentRangeBucketEntry {
impl SegmentRangeBucketEntry { impl SegmentRangeBucketEntry {
pub(crate) fn into_intermediate_bucket_entry( pub(crate) fn into_intermediate_bucket_entry(
self, self,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<IntermediateRangeBucketEntry> { ) -> crate::Result<IntermediateRangeBucketEntry> {
let sub_aggregation = IntermediateAggregationResults::default(); let mut sub_aggregation_res = IntermediateAggregationResults::default();
if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation
.add_intermediate_aggregation_result(agg_data, &mut sub_aggregation_res)?
} else {
Default::default()
};
Ok(IntermediateRangeBucketEntry { Ok(IntermediateRangeBucketEntry {
key: self.key.into(), key: self.key.into(),
doc_count: self.doc_count, doc_count: self.doc_count,
sub_aggregation_res: sub_aggregation, sub_aggregation: sub_aggregation_res,
from: self.from, from: self.from,
to: self.to, to: self.to,
}) })
} }
} }
impl<C: SubAggCache> SegmentAggregationCollector for SegmentRangeCollector<C> { impl SegmentAggregationCollector for SegmentRangeCollector {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let field_type = self.column_type; let field_type = self.column_type;
let name = agg_data let name = agg_data
.get_range_req_data(self.accessor_idx) .get_range_req_data(self.accessor_idx)
.name .name
.to_string(); .to_string();
let buckets = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]); let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = buckets
.into_iter() .into_iter()
.map(|range_bucket| { .map(move |range_bucket| {
let bucket_id = range_bucket.bucket.bucket_id; Ok((
let mut agg = range_bucket.bucket.into_intermediate_bucket_entry()?; range_to_string(&range_bucket.range, &field_type)?,
if let Some(sub_aggregation) = &mut self.sub_agg { range_bucket
sub_aggregation .bucket
.get_sub_agg_collector() .into_intermediate_bucket_entry(agg_data)?,
.add_intermediate_aggregation_result( ))
agg_data,
&mut agg.sub_aggregation_res,
bucket_id,
)?;
}
Ok((range_to_string(&range_bucket.range, &field_type)?, agg))
}) })
.collect::<crate::Result<_>>()?; .collect::<crate::Result<_>>()?;
@@ -276,114 +242,73 @@ impl<C: SubAggCache> SegmentAggregationCollector for SegmentRangeCollector<C> {
#[inline] #[inline]
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId], docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
let req = agg_data.take_range_req_data(self.accessor_idx); // Take request data to avoid borrow conflicts during sub-aggregation
let mut req = agg_data.take_range_req_data(self.accessor_idx);
agg_data req.column_block_accessor.fetch_block(docs, &req.accessor);
.column_block_accessor
.fetch_block(docs, &req.accessor);
let buckets = &mut self.parent_buckets[parent_bucket_id as usize]; for (doc, val) in req
for (doc, val) in agg_data
.column_block_accessor .column_block_accessor
.iter_docid_vals(docs, &req.accessor) .iter_docid_vals(docs, &req.accessor)
{ {
let bucket_pos = get_bucket_pos(val, buckets); let bucket_pos = self.get_bucket_pos(val);
let bucket = &mut buckets[bucket_pos]; let bucket = &mut self.buckets[bucket_pos];
bucket.bucket.doc_count += 1; bucket.bucket.doc_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() { if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
sub_agg.push(bucket.bucket.bucket_id, doc); sub_agg.collect(doc, agg_data)?;
} }
} }
agg_data.put_back_range_req_data(self.accessor_idx, req); agg_data.put_back_range_req_data(self.accessor_idx, req);
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.check_flush_local(agg_data)?;
}
Ok(()) Ok(())
} }
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> { fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
if let Some(sub_agg) = self.sub_agg.as_mut() { for bucket in self.buckets.iter_mut() {
sub_agg.flush(agg_data)?; if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
sub_agg.flush(agg_data)?;
}
} }
Ok(()) Ok(())
} }
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
while self.parent_buckets.len() <= max_bucket as usize {
let new_buckets = self.create_new_buckets(agg_data)?;
self.parent_buckets.push(new_buckets);
}
Ok(())
}
}
/// Build a concrete `SegmentRangeCollector` with either a low-cardinality or high-cardinality
/// sub-aggregation cache, depending on whether the range aggregation is top level and on its
/// number of ranges.
pub(crate) fn build_segment_range_collector(
agg_data: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
let accessor_idx = node.idx_in_req_data;
let req_data = agg_data.get_range_req_data(node.idx_in_req_data);
let field_type = req_data.field_type;
// TODO: A better metric than is_top_level would be the expected number of parent buckets.
// E.g. if the range agg is not top level but the parent is a bucket agg with fewer than 10
// buckets, we are still in low-cardinality territory.
let is_low_card = req_data.is_top_level && req_data.req.ranges.len() <= 64;
let sub_agg = if !node.children.is_empty() {
Some(build_segment_agg_collectors(agg_data, &node.children)?)
} else {
None
};
if is_low_card {
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggCache> {
sub_agg: sub_agg.map(LowCardCachedSubAggs::new),
column_type: field_type,
accessor_idx,
parent_buckets: Vec::new(),
bucket_id_provider: BucketIdProvider::default(),
limits: agg_data.context.limits.clone(),
}))
} else {
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggCache> {
sub_agg: sub_agg.map(CachedSubAggs::new),
column_type: field_type,
accessor_idx,
parent_buckets: Vec::new(),
bucket_id_provider: BucketIdProvider::default(),
limits: agg_data.context.limits.clone(),
}))
}
} }
impl<C: SubAggCache> SegmentRangeCollector<C> { impl SegmentRangeCollector {
pub(crate) fn create_new_buckets( pub(crate) fn from_req_and_validate(
&mut self, req_data: &mut AggregationsSegmentCtx,
agg_data: &AggregationsSegmentCtx, node: &AggRefNode,
) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> { ) -> crate::Result<Self> {
let field_type = self.column_type; let accessor_idx = node.idx_in_req_data;
let req_data = agg_data.get_range_req_data(self.accessor_idx); let (field_type, ranges) = {
let req_view = req_data.get_range_req_data(node.idx_in_req_data);
(req_view.field_type, req_view.req.ranges.clone())
};
// The range input on the request is f64. // The range input on the request is f64.
// We need to convert to u64 ranges, because we read the values as u64. // We need to convert to u64 ranges, because we read the values as u64.
// The mapping from the conversion is monotonic so ordering is preserved. // The mapping from the conversion is monotonic so ordering is preserved.
let buckets: Vec<_> = extend_validate_ranges(&req_data.req.ranges, &field_type)? let sub_agg_prototype = if !node.children.is_empty() {
Some(build_segment_agg_collectors(req_data, &node.children)?)
} else {
None
};
let buckets: Vec<_> = extend_validate_ranges(&ranges, &field_type)?
.iter() .iter()
.map(|range| { .map(|range| {
let bucket_id = self.bucket_id_provider.next_bucket_id();
let key = range let key = range
.key .key
.clone() .clone()
@@ -392,20 +317,20 @@ impl<C: SubAggCache> SegmentRangeCollector<C> {
let to = if range.range.end == u64::MAX { let to = if range.range.end == u64::MAX {
None None
} else { } else {
Some(f64_from_fastfield_u64(range.range.end, field_type)) Some(f64_from_fastfield_u64(range.range.end, &field_type))
}; };
let from = if range.range.start == u64::MIN { let from = if range.range.start == u64::MIN {
None None
} else { } else {
Some(f64_from_fastfield_u64(range.range.start, field_type)) Some(f64_from_fastfield_u64(range.range.start, &field_type))
}; };
// let sub_aggregation = sub_agg_prototype.clone(); let sub_aggregation = sub_agg_prototype.clone();
Ok(SegmentRangeAndBucketEntry { Ok(SegmentRangeAndBucketEntry {
range: range.range.clone(), range: range.range.clone(),
bucket: SegmentRangeBucketEntry { bucket: SegmentRangeBucketEntry {
doc_count: 0, doc_count: 0,
bucket_id, sub_aggregation,
key, key,
from, from,
to, to,
@@ -414,19 +339,26 @@ impl<C: SubAggCache> SegmentRangeCollector<C> {
}) })
.collect::<crate::Result<_>>()?; .collect::<crate::Result<_>>()?;
self.limits.add_memory_consumed( req_data.context.limits.add_memory_consumed(
buckets.len() as u64 * std::mem::size_of::<SegmentRangeAndBucketEntry>() as u64, buckets.len() as u64 * std::mem::size_of::<SegmentRangeAndBucketEntry>() as u64,
)?; )?;
Ok(buckets)
Ok(SegmentRangeCollector {
buckets,
column_type: field_type,
accessor_idx,
})
}
#[inline]
fn get_bucket_pos(&self, val: u64) -> usize {
let pos = self
.buckets
.binary_search_by_key(&val, |probe| probe.range.start)
.unwrap_or_else(|pos| pos - 1);
debug_assert!(self.buckets[pos].range.contains(&val));
pos
} }
}
#[inline]
fn get_bucket_pos(val: u64, buckets: &[SegmentRangeAndBucketEntry]) -> usize {
let pos = buckets
.binary_search_by_key(&val, |probe| probe.range.start)
.unwrap_or_else(|pos| pos - 1);
debug_assert!(buckets[pos].range.contains(&val));
pos
} }
/// Converts the user provided f64 range value to fast field value space. /// Converts the user provided f64 range value to fast field value space.
@@ -524,7 +456,7 @@ pub(crate) fn range_to_string(
let val = i64::from_u64(val); let val = i64::from_u64(val);
format_date(val) format_date(val)
} else { } else {
Ok(f64_from_fastfield_u64(val, *field_type).to_string()) Ok(f64_from_fastfield_u64(val, field_type).to_string())
} }
}; };
@@ -554,7 +486,7 @@ mod tests {
pub fn get_collector_from_ranges( pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>, ranges: Vec<RangeAggregationRange>,
field_type: ColumnType, field_type: ColumnType,
) -> SegmentRangeCollector<HighCardSubAggCache> { ) -> SegmentRangeCollector {
let req = RangeAggregation { let req = RangeAggregation {
field: "dummy".to_string(), field: "dummy".to_string(),
ranges, ranges,
@@ -574,33 +506,30 @@ mod tests {
let to = if range.range.end == u64::MAX { let to = if range.range.end == u64::MAX {
None None
} else { } else {
Some(f64_from_fastfield_u64(range.range.end, field_type)) Some(f64_from_fastfield_u64(range.range.end, &field_type))
}; };
let from = if range.range.start == u64::MIN { let from = if range.range.start == u64::MIN {
None None
} else { } else {
Some(f64_from_fastfield_u64(range.range.start, field_type)) Some(f64_from_fastfield_u64(range.range.start, &field_type))
}; };
SegmentRangeAndBucketEntry { SegmentRangeAndBucketEntry {
range: range.range.clone(), range: range.range.clone(),
bucket: SegmentRangeBucketEntry { bucket: SegmentRangeBucketEntry {
doc_count: 0, doc_count: 0,
sub_aggregation: None,
key, key,
from, from,
to, to,
bucket_id: 0,
}, },
} }
}) })
.collect(); .collect();
SegmentRangeCollector { SegmentRangeCollector {
parent_buckets: vec![buckets], buckets,
column_type: field_type, column_type: field_type,
accessor_idx: 0, accessor_idx: 0,
sub_agg: None,
bucket_id_provider: Default::default(),
limits: AggregationLimitsGuard::default(),
} }
} }
@@ -847,7 +776,7 @@ mod tests {
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()]; let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
let collector = get_collector_from_ranges(buckets, ColumnType::F64); let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.parent_buckets[0].clone(); let buckets = collector.buckets;
assert_eq!(buckets[0].range.start, u64::MIN); assert_eq!(buckets[0].range.start, u64::MIN);
assert_eq!(buckets[0].range.end, 10f64.to_u64()); assert_eq!(buckets[0].range.end, 10f64.to_u64());
assert_eq!(buckets[1].range.start, 10f64.to_u64()); assert_eq!(buckets[1].range.start, 10f64.to_u64());
@@ -870,7 +799,7 @@ mod tests {
]; ];
let collector = get_collector_from_ranges(buckets, ColumnType::F64); let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.parent_buckets[0].clone(); let buckets = collector.buckets;
assert_eq!(buckets[0].range.start, u64::MIN); assert_eq!(buckets[0].range.start, u64::MIN);
assert_eq!(buckets[0].range.end, 10f64.to_u64()); assert_eq!(buckets[0].range.end, 10f64.to_u64());
assert_eq!(buckets[1].range.start, 10f64.to_u64()); assert_eq!(buckets[1].range.start, 10f64.to_u64());
@@ -885,7 +814,7 @@ mod tests {
let buckets = vec![(-10f64..-1f64).into()]; let buckets = vec![(-10f64..-1f64).into()];
let collector = get_collector_from_ranges(buckets, ColumnType::F64); let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.parent_buckets[0].clone(); let buckets = collector.buckets;
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10"); assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*"); assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
} }
@@ -894,7 +823,7 @@ mod tests {
let buckets = vec![(0f64..10f64).into()]; let buckets = vec![(0f64..10f64).into()];
let collector = get_collector_from_ranges(buckets, ColumnType::F64); let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.parent_buckets[0].clone(); let buckets = collector.buckets;
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0"); assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*"); assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
} }
@@ -903,7 +832,7 @@ mod tests {
fn range_binary_search_test_u64() { fn range_binary_search_test_u64() {
let check_ranges = |ranges: Vec<RangeAggregationRange>| { let check_ranges = |ranges: Vec<RangeAggregationRange>| {
let collector = get_collector_from_ranges(ranges, ColumnType::U64); let collector = get_collector_from_ranges(ranges, ColumnType::U64);
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]); let search = |val: u64| collector.get_bucket_pos(val);
assert_eq!(search(u64::MIN), 0); assert_eq!(search(u64::MIN), 0);
assert_eq!(search(9), 0); assert_eq!(search(9), 0);
@@ -949,7 +878,7 @@ mod tests {
let ranges = vec![(10.0..100.0).into()]; let ranges = vec![(10.0..100.0).into()];
let collector = get_collector_from_ranges(ranges, ColumnType::F64); let collector = get_collector_from_ranges(ranges, ColumnType::F64);
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]); let search = |val: u64| collector.get_bucket_pos(val);
assert_eq!(search(u64::MIN), 0); assert_eq!(search(u64::MIN), 0);
assert_eq!(search(9f64.to_u64()), 0); assert_eq!(search(9f64.to_u64()), 0);
@@ -961,3 +890,63 @@ mod tests {
// the max value // the max value
} }
} }
#[cfg(all(test, feature = "unstable"))]
mod bench {
use itertools::Itertools;
use rand::seq::SliceRandom;
use rand::thread_rng;
use super::*;
use crate::aggregation::bucket::range::tests::get_collector_from_ranges;
const TOTAL_DOCS: u64 = 1_000_000u64;
const NUM_DOCS: u64 = 50_000u64;
fn get_collector_with_buckets(num_buckets: u64, num_docs: u64) -> SegmentRangeCollector {
let bucket_size = num_docs / num_buckets;
let mut buckets: Vec<RangeAggregationRange> = vec![];
for i in 0..num_buckets {
let bucket_start = (i * bucket_size) as f64;
buckets.push((bucket_start..bucket_start + bucket_size as f64).into())
}
get_collector_from_ranges(buckets, ColumnType::U64)
}
fn get_rand_docs(total_docs: u64, num_docs_returned: u64) -> Vec<u64> {
let mut rng = thread_rng();
let all_docs = (0..total_docs - 1).collect_vec();
let mut vals = all_docs
.as_slice()
.choose_multiple(&mut rng, num_docs_returned as usize)
.cloned()
.collect_vec();
vals.sort();
vals
}
fn bench_range_binary_search(b: &mut test::Bencher, num_buckets: u64) {
let collector = get_collector_with_buckets(num_buckets, TOTAL_DOCS);
let vals = get_rand_docs(TOTAL_DOCS, NUM_DOCS);
b.iter(|| {
let mut bucket_pos = 0;
for val in &vals {
bucket_pos = collector.get_bucket_pos(*val);
}
bucket_pos
})
}
#[bench]
fn bench_range_100_buckets(b: &mut test::Bencher) {
bench_range_binary_search(b, 100)
}
#[bench]
fn bench_range_10_buckets(b: &mut test::Bencher) {
bench_range_binary_search(b, 10)
}
}


@@ -0,0 +1,196 @@
use std::fmt::Debug;
use columnar::ColumnType;
use rustc_hash::FxHashMap;
use super::OrderTarget;
use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
};
use crate::aggregation::agg_limits::MemoryConsumption;
use crate::aggregation::bucket::get_agg_name_and_property;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::TantivyError;
#[derive(Clone, Debug, Default)]
/// Container to store term_ids/or u64 values and their buckets.
struct TermBuckets {
pub(crate) entries: FxHashMap<u64, u32>,
pub(crate) sub_aggs: FxHashMap<u64, Box<dyn SegmentAggregationCollector>>,
}
impl TermBuckets {
fn get_memory_consumption(&self) -> usize {
let sub_aggs_mem = self.sub_aggs.memory_consumption();
let buckets_mem = self.entries.memory_consumption();
sub_aggs_mem + buckets_mem
}
fn force_flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
for sub_aggregations in &mut self.sub_aggs.values_mut() {
sub_aggregations.as_mut().flush(agg_data)?;
}
Ok(())
}
}
/// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype.
#[derive(Clone, Debug)]
pub struct SegmentTermCollector {
/// The buckets containing the aggregation data.
term_buckets: TermBuckets,
accessor_idx: usize,
}
impl SegmentAggregationCollector for SegmentTermCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_data.get_term_req_data(self.accessor_idx).name.clone();
let entries: Vec<(u64, u32)> = self.term_buckets.entries.into_iter().collect();
let bucket = super::into_intermediate_bucket_result(
self.accessor_idx,
entries,
self.term_buckets.sub_aggs,
agg_data,
)?;
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(())
}
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let mut req_data = agg_data.take_term_req_data(self.accessor_idx);
let mem_pre = self.get_memory_consumption();
if let Some(missing) = req_data.missing_value_for_accessor {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
for term_id in req_data.column_block_accessor.iter_vals() {
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
if !allowed_bs.contains(term_id as u32) {
continue;
}
}
let entry = self.term_buckets.entries.entry(term_id).or_default();
*entry += 1;
}
// If there are sub-aggregations, route each matching doc to the per-term sub-aggregation collector.
if let Some(blueprint) = req_data.sub_aggregation_blueprint.as_ref() {
for (doc, term_id) in req_data
.column_block_accessor
.iter_docid_vals(docs, &req_data.accessor)
{
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
if !allowed_bs.contains(term_id as u32) {
continue;
}
}
let sub_aggregations = self
.term_buckets
.sub_aggs
.entry(term_id)
.or_insert_with(|| blueprint.clone());
sub_aggregations.collect(doc, agg_data)?;
}
}
let mem_delta = self.get_memory_consumption() - mem_pre;
if mem_delta > 0 {
agg_data
.context
.limits
.add_memory_consumed(mem_delta as u64)?;
}
agg_data.put_back_term_req_data(self.accessor_idx, req_data);
Ok(())
}
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
self.term_buckets.force_flush(agg_data)?;
Ok(())
}
}
impl SegmentTermCollector {
pub fn from_req_and_validate(
req_data: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Self> {
let terms_req_data = req_data.get_term_req_data(node.idx_in_req_data);
let column_type = terms_req_data.column_type;
let accessor_idx = node.idx_in_req_data;
if column_type == ColumnType::Bytes {
return Err(TantivyError::InvalidArgument(format!(
"terms aggregation is not supported for column type {column_type:?}"
)));
}
let term_buckets = TermBuckets::default();
// Validate sub aggregation exists
if let OrderTarget::SubAggregation(sub_agg_name) = &terms_req_data.req.order.target {
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
node.get_sub_agg(agg_name, &req_data.per_request)
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"could not find aggregation with name {agg_name} in metric \
sub_aggregations"
))
})?;
}
let has_sub_aggregations = !node.children.is_empty();
let blueprint = if has_sub_aggregations {
let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
Some(sub_aggregation)
} else {
None
};
let terms_req_data = req_data.get_term_req_data_mut(node.idx_in_req_data);
terms_req_data.sub_aggregation_blueprint = blueprint;
Ok(SegmentTermCollector {
term_buckets,
accessor_idx,
})
}
fn get_memory_consumption(&self) -> usize {
let self_mem = std::mem::size_of::<Self>();
let term_buckets_mem = self.term_buckets.get_memory_consumption();
self_mem + term_buckets_mem
}
}


@@ -0,0 +1,228 @@
use std::vec;
use rustc_hash::FxHashMap;
use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
};
use crate::aggregation::bucket::{get_agg_name_and_property, OrderTarget};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::{DocId, TantivyError};
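// Maximum number of docs processed per `collect_block` call; larger blocks are split into
// chunks of this size, and each per-bucket doc buffer is pre-allocated with this capacity.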
const MAX_BATCH_SIZE: usize = 1_024;
#[derive(Debug, Clone)]
struct LowCardTermBuckets {
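/// Per-term doc counts, indexed directly by term id (term ids are dense for low-cardinality columns).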
entries: Box<[u32]>,
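/// One sub-aggregation collector per term bucket; empty when the request has no sub-aggregations.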
sub_aggs: Vec<Box<dyn SegmentAggregationCollector>>,
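/// Per-bucket buffers of matching doc ids, drained into the bucket's sub-aggregation after each collected block.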
doc_buffers: Box<[Vec<DocId>]>,
}
impl LowCardTermBuckets {
pub fn with_num_buckets(
num_buckets: usize,
sub_aggs_blueprint_opt: Option<&Box<dyn SegmentAggregationCollector>>,
) -> Self {
let sub_aggs = sub_aggs_blueprint_opt
.as_ref()
.map(|blueprint| {
std::iter::repeat_with(|| blueprint.clone_box())
.take(num_buckets)
.collect::<Vec<_>>()
})
.unwrap_or_default();
Self {
entries: vec![0; num_buckets].into_boxed_slice(),
sub_aggs,
doc_buffers: std::iter::repeat_with(|| Vec::with_capacity(MAX_BATCH_SIZE))
.take(num_buckets)
.collect::<Vec<_>>()
.into_boxed_slice(),
}
}
fn get_memory_consumption(&self) -> usize {
std::mem::size_of::<Self>()
+ self.entries.len() * std::mem::size_of::<u32>()
+ self.doc_buffers.len()
* (std::mem::size_of::<Vec<DocId>>()
+ std::mem::size_of::<DocId>() * MAX_BATCH_SIZE)
}
}
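/// Term collector specialized for low-cardinality columns: buckets and sub-aggregations are
/// stored in dense arrays indexed by term id instead of hash maps.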
#[derive(Debug, Clone)]
pub struct LowCardSegmentTermCollector {
term_buckets: LowCardTermBuckets,
accessor_idx: usize,
}
impl LowCardSegmentTermCollector {
pub fn from_req_and_validate(
req_data: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Self> {
let terms_req_data = req_data.get_term_req_data(node.idx_in_req_data);
let accessor_idx = node.idx_in_req_data;
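// Term ids are dense, so the number of buckets is the largest term id
// (including the synthetic missing value) plus one.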
let cardinality = terms_req_data
.accessor
.max_value()
.max(terms_req_data.missing_value_for_accessor.unwrap_or(0))
+ 1;
assert!(cardinality <= super::LOW_CARDINALITY_THRESHOLD);
// Validate sub aggregation exists
if let OrderTarget::SubAggregation(sub_agg_name) = &terms_req_data.req.order.target {
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
node.get_sub_agg(agg_name, &req_data.per_request)
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"could not find aggregation with name {agg_name} in metric \
sub_aggregations"
))
})?;
}
let has_sub_aggregations = !node.children.is_empty();
let blueprint = if has_sub_aggregations {
let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
Some(sub_aggregation)
} else {
None
};
let terms_req_data = req_data.get_term_req_data_mut(node.idx_in_req_data);
let term_buckets =
LowCardTermBuckets::with_num_buckets(cardinality as usize, blueprint.as_ref());
terms_req_data.sub_aggregation_blueprint = blueprint;
Ok(LowCardSegmentTermCollector {
term_buckets,
accessor_idx,
})
}
fn get_memory_consumption(&self) -> usize {
let self_mem = std::mem::size_of::<Self>();
let term_buckets_mem = self.term_buckets.get_memory_consumption();
self_mem + term_buckets_mem
}
}
impl SegmentAggregationCollector for LowCardSegmentTermCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_data.get_term_req_data(self.accessor_idx).name.clone();
let sub_aggs: FxHashMap<u64, Box<dyn SegmentAggregationCollector>> = self
.term_buckets
.sub_aggs
.into_iter()
.enumerate()
.filter(|(bucket_id, _sub_agg)| self.term_buckets.entries[*bucket_id] > 0)
.map(|(bucket_id, sub_agg)| (bucket_id as u64, sub_agg))
.collect();
let entries: Vec<(u64, u32)> = self
.term_buckets
.entries
.iter()
.enumerate()
.filter(|(_, count)| **count > 0)
.map(|(bucket_id, count)| (bucket_id as u64, *count))
.collect();
let bucket =
super::into_intermediate_bucket_result(self.accessor_idx, entries, sub_aggs, agg_data)?;
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
if docs.len() > MAX_BATCH_SIZE {
for batch in docs.chunks(MAX_BATCH_SIZE) {
self.collect_block(batch, agg_data)?;
}
// All docs were handled batch by batch; don't process them a second time.
return Ok(());
}
let mut req_data = agg_data.take_term_req_data(self.accessor_idx);
let mem_pre = self.get_memory_consumption();
if let Some(missing) = req_data.missing_value_for_accessor {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
// With sub-aggregations: stage doc ids per term bucket, then collect each bucket as a block.
if req_data.sub_aggregation_blueprint.is_some() {
for (doc, term_id) in req_data
.column_block_accessor
.iter_docid_vals(docs, &req_data.accessor)
{
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
if !allowed_bs.contains(term_id as u32) {
continue;
}
}
self.term_buckets.doc_buffers[term_id as usize].push(doc);
}
for (bucket_id, docs) in self.term_buckets.doc_buffers.iter_mut().enumerate() {
self.term_buckets.entries[bucket_id] += docs.len() as u32;
self.term_buckets.sub_aggs[bucket_id].collect_block(&docs[..], agg_data)?;
docs.clear();
}
} else {
for term_id in req_data.column_block_accessor.iter_vals() {
if let Some(allowed_bs) = req_data.allowed_term_ids.as_ref() {
if !allowed_bs.contains(term_id as u32) {
continue;
}
}
self.term_buckets.entries[term_id as usize] += 1;
}
}
let mem_delta = self.get_memory_consumption() - mem_pre;
if mem_delta > 0 {
agg_data
.context
.limits
.add_memory_consumed(mem_delta as u64)?;
}
agg_data.put_back_term_req_data(self.accessor_idx, req_data);
Ok(())
}
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
for sub_aggregation in self.term_buckets.sub_aggs.iter_mut() {
sub_aggregation.flush(agg_data)?;
}
Ok(())
}
}
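// A minimal, self-contained sketch (not tantivy's API) of the idea behind
// `LowCardTermBuckets` above: when term ids are dense and bounded by a small,
// known cardinality, a flat array indexed by term id replaces a hash map, so
// counting a block of docs is one indexed add per value. The names
// `count_terms` and `cardinality` are illustrative assumptions.
fn count_terms(term_ids: &[u32], cardinality: usize) -> Vec<u32> {
    // One counter slot per possible term id; O(cardinality) memory up front.
    let mut entries = vec![0u32; cardinality];
    for &term_id in term_ids {
        // Direct index instead of a hash lookup per value.
        entries[term_id as usize] += 1;
    }
    entries
}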

View File

@@ -5,13 +5,11 @@ use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx, build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
}; };
use crate::aggregation::bucket::term_agg::TermsAggregation; use crate::aggregation::bucket::term_agg::TermsAggregation;
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
use crate::aggregation::intermediate_agg_result::{ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult, IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
}; };
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector}; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::BucketId;
/// Special aggregation to handle missing values for term aggregations. /// Special aggregation to handle missing values for term aggregations.
/// This missing aggregation will check multiple columns for existence. /// This missing aggregation will check multiple columns for existence.
@@ -37,55 +35,41 @@ impl MissingTermAggReqData {
} }
} }
#[derive(Default, Debug, Clone)]
struct MissingCount {
missing_count: u32,
bucket_id: BucketId,
}
/// The specialized missing term aggregation. /// The specialized missing term aggregation.
#[derive(Default, Debug)] #[derive(Default, Debug, Clone)]
pub struct TermMissingAgg { pub struct TermMissingAgg {
missing_count: u32,
accessor_idx: usize, accessor_idx: usize,
sub_agg: Option<HighCardCachedSubAggs>, sub_agg: Option<Box<dyn SegmentAggregationCollector>>,
/// Idx = parent bucket id, Value = missing count for that bucket
missing_count_per_bucket: Vec<MissingCount>,
bucket_id_provider: BucketIdProvider,
} }
impl TermMissingAgg { impl TermMissingAgg {
pub(crate) fn new( pub(crate) fn new(
agg_data: &mut AggregationsSegmentCtx, req_data: &mut AggregationsSegmentCtx,
node: &AggRefNode, node: &AggRefNode,
) -> crate::Result<Self> { ) -> crate::Result<Self> {
let has_sub_aggregations = !node.children.is_empty(); let has_sub_aggregations = !node.children.is_empty();
let accessor_idx = node.idx_in_req_data; let accessor_idx = node.idx_in_req_data;
let sub_agg = if has_sub_aggregations { let sub_agg = if has_sub_aggregations {
let sub_aggregation = build_segment_agg_collectors(agg_data, &node.children)?; let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
Some(sub_aggregation) Some(sub_aggregation)
} else { } else {
None None
}; };
let sub_agg = sub_agg.map(CachedSubAggs::new);
let bucket_id_provider = BucketIdProvider::default();
Ok(Self { Ok(Self {
accessor_idx, accessor_idx,
sub_agg, sub_agg,
missing_count_per_bucket: Vec::new(), ..Default::default()
bucket_id_provider,
}) })
} }
} }
impl SegmentAggregationCollector for TermMissingAgg { impl SegmentAggregationCollector for TermMissingAgg {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx); let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
let term_agg = &req_data.req; let term_agg = &req_data.req;
let missing = term_agg let missing = term_agg
@@ -96,16 +80,13 @@ impl SegmentAggregationCollector for TermMissingAgg {
let mut entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> = let mut entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> =
Default::default(); Default::default();
let missing_count = &self.missing_count_per_bucket[parent_bucket_id as usize];
let mut missing_entry = IntermediateTermBucketEntry { let mut missing_entry = IntermediateTermBucketEntry {
doc_count: missing_count.missing_count, doc_count: self.missing_count,
sub_aggregation: Default::default(), sub_aggregation: Default::default(),
}; };
if let Some(sub_agg) = &mut self.sub_agg { if let Some(sub_agg) = self.sub_agg {
let mut res = IntermediateAggregationResults::default(); let mut res = IntermediateAggregationResults::default();
sub_agg sub_agg.add_intermediate_aggregation_result(agg_data, &mut res)?;
.get_sub_agg_collector()
.add_intermediate_aggregation_result(agg_data, &mut res, missing_count.bucket_id)?;
missing_entry.sub_aggregation = res; missing_entry.sub_aggregation = res;
} }
entries.insert(missing.into(), missing_entry); entries.insert(missing.into(), missing_entry);
@@ -128,52 +109,30 @@ impl SegmentAggregationCollector for TermMissingAgg {
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
let has_value = req_data
.accessors
.iter()
.any(|(acc, _)| acc.index.has_value(doc));
if !has_value {
self.missing_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.collect(doc, agg_data)?;
}
}
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId], docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
let bucket = &mut self.missing_count_per_bucket[parent_bucket_id as usize];
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
for doc in docs { for doc in docs {
let doc = *doc; self.collect(*doc, agg_data)?;
let has_value = req_data
.accessors
.iter()
.any(|(acc, _)| acc.index.has_value(doc));
if !has_value {
bucket.missing_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.push(bucket.bucket_id, doc);
}
}
}
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.check_flush_local(agg_data)?;
}
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
while self.missing_count_per_bucket.len() <= max_bucket as usize {
let bucket_id = self.bucket_id_provider.next_bucket_id();
self.missing_count_per_bucket.push(MissingCount {
missing_count: 0,
bucket_id,
});
}
Ok(())
}
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.flush(agg_data)?;
} }
Ok(()) Ok(())
} }

View File

@@ -0,0 +1,83 @@
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::DocId;
pub(crate) const DOC_BLOCK_SIZE: usize = 64;
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
/// BufAggregationCollector buffers documents before calling collect_block().
#[derive(Clone)]
pub(crate) struct BufAggregationCollector {
pub(crate) collector: Box<dyn SegmentAggregationCollector>,
staged_docs: DocBlock,
num_staged_docs: usize,
}
impl std::fmt::Debug for BufAggregationCollector {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentAggregationResultsCollector")
.field("staged_docs", &&self.staged_docs[..self.num_staged_docs])
.field("num_staged_docs", &self.num_staged_docs)
.finish()
}
}
impl BufAggregationCollector {
pub fn new(collector: Box<dyn SegmentAggregationCollector>) -> Self {
Self {
collector,
num_staged_docs: 0,
staged_docs: [0; DOC_BLOCK_SIZE],
}
}
}
impl SegmentAggregationCollector for BufAggregationCollector {
#[inline]
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
self.collector.add_intermediate_aggregation_result(agg_data, results)
}
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.staged_docs[self.num_staged_docs] = doc;
self.num_staged_docs += 1;
if self.num_staged_docs == self.staged_docs.len() {
self.collector
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_data)?;
self.num_staged_docs = 0;
}
Ok(())
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collector.collect_block(docs, agg_data)?;
Ok(())
}
#[inline]
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
self.collector
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_data)?;
self.num_staged_docs = 0;
self.collector.flush(agg_data)?;
Ok(())
}
}
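// A hedged sketch of the buffering pattern `BufAggregationCollector` uses:
// single docs are staged into a fixed-size array and handed downstream as one
// block once the array fills up, amortizing the per-doc dynamic dispatch cost.
// `Sink` and `Buffered` are illustrative stand-ins, not tantivy types.
trait Sink {
    fn on_block(&mut self, docs: &[u32]);
}

struct Buffered<S: Sink> {
    inner: S,
    staged: [u32; 64],
    len: usize,
}

impl<S: Sink> Buffered<S> {
    fn new(inner: S) -> Self {
        Buffered { inner, staged: [0; 64], len: 0 }
    }
    fn push(&mut self, doc: u32) {
        self.staged[self.len] = doc;
        self.len += 1;
        if self.len == self.staged.len() {
            // Forward a full block in one virtual call.
            self.inner.on_block(&self.staged);
            self.len = 0;
        }
    }
    fn flush(&mut self) {
        // Hand over whatever is still staged, possibly a partial block.
        self.inner.on_block(&self.staged[..self.len]);
        self.len = 0;
    }
}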

View File

@@ -1,245 +0,0 @@
use std::fmt::Debug;
use super::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
use crate::aggregation::BucketId;
use crate::DocId;
/// A cache for sub-aggregations, storing doc ids per bucket id.
/// Depending on the cardinality of the parent aggregation, we use different
/// storage strategies.
///
/// ## Low Cardinality
/// Cardinality here refers to the number of unique flattened buckets that can be created
/// by the parent aggregation.
/// Flattened buckets are the result of combining all buckets per collector
/// into a single list of buckets, where each bucket is identified by its BucketId.
///
/// ## Usage
/// Since this is caching for sub-aggregations, it is only used by bucket
/// aggregations.
///
/// In general, this data structure groups docs by bucket id.
///
/// TODO: consider using a more advanced data structure for high cardinality
/// aggregations.
#[derive(Debug)]
pub(crate) struct CachedSubAggs<C: SubAggCache> {
cache: C,
sub_agg_collector: Box<dyn SegmentAggregationCollector>,
num_docs: usize,
}
pub type LowCardCachedSubAggs = CachedSubAggs<LowCardSubAggCache>;
pub type HighCardCachedSubAggs = CachedSubAggs<HighCardSubAggCache>;
const FLUSH_THRESHOLD: usize = 2048;
/// A trait for caching sub-aggregation doc ids per bucket id.
/// Different implementations can be used depending on the cardinality
/// of the parent aggregation.
pub trait SubAggCache: Debug {
fn new() -> Self;
fn push(&mut self, bucket_id: BucketId, doc_id: DocId);
fn flush_local(
&mut self,
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
agg_data: &mut AggregationsSegmentCtx,
force: bool,
) -> crate::Result<()>;
}
impl<Backend: SubAggCache + Debug> CachedSubAggs<Backend> {
pub fn new(sub_agg: Box<dyn SegmentAggregationCollector>) -> Self {
Self {
cache: Backend::new(),
sub_agg_collector: sub_agg,
num_docs: 0,
}
}
pub fn get_sub_agg_collector(&mut self) -> &mut Box<dyn SegmentAggregationCollector> {
&mut self.sub_agg_collector
}
#[inline]
pub fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
self.cache.push(bucket_id, doc_id);
self.num_docs += 1;
}
/// Check if we need to flush based on the number of documents cached.
/// If so, flushes the cache to the provided aggregation collector.
pub fn check_flush_local(
&mut self,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
if self.num_docs >= FLUSH_THRESHOLD {
self.cache
.flush_local(&mut self.sub_agg_collector, agg_data, false)?;
self.num_docs = 0;
}
Ok(())
}
/// Note: this _does_ flush the sub aggregations.
pub fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
if self.num_docs != 0 {
self.cache
.flush_local(&mut self.sub_agg_collector, agg_data, true)?;
self.num_docs = 0;
}
self.sub_agg_collector.flush(agg_data)?;
Ok(())
}
}
/// Number of partitions for high cardinality sub-aggregation cache.
const NUM_PARTITIONS: usize = 16;
#[derive(Debug)]
pub(crate) struct HighCardSubAggCache {
/// This coarse partitioning does some cheap grouping on the bucket ids.
/// Bucket ids are dense, so even when the cardinality is not detected as low
/// but there are just 16 bucket ids, each bucket id will land in its own partition.
///
/// We want to keep this cheap, because high cardinality aggregations can have a lot of
/// buckets, and there may be nothing to group.
partitions: Box<[PartitionEntry; NUM_PARTITIONS]>,
}
impl HighCardSubAggCache {
#[inline]
fn clear(&mut self) {
for partition in self.partitions.iter_mut() {
partition.clear();
}
}
}
#[derive(Debug, Clone, Default)]
struct PartitionEntry {
bucket_ids: Vec<BucketId>,
docs: Vec<DocId>,
}
impl PartitionEntry {
#[inline]
fn clear(&mut self) {
self.bucket_ids.clear();
self.docs.clear();
}
}
impl SubAggCache for HighCardSubAggCache {
fn new() -> Self {
Self {
partitions: Box::new(core::array::from_fn(|_| PartitionEntry::default())),
}
}
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
let idx = bucket_id % NUM_PARTITIONS as u32;
let slot = &mut self.partitions[idx as usize];
slot.bucket_ids.push(bucket_id);
slot.docs.push(doc_id);
}
fn flush_local(
&mut self,
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
agg_data: &mut AggregationsSegmentCtx,
_force: bool,
) -> crate::Result<()> {
let mut max_bucket = 0u32;
for partition in self.partitions.iter() {
if let Some(&local_max) = partition.bucket_ids.iter().max() {
max_bucket = max_bucket.max(local_max);
}
}
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
for slot in self.partitions.iter() {
if !slot.bucket_ids.is_empty() {
// Reduce dynamic dispatch overhead by collecting a full partition in one call.
sub_agg.collect_multiple(&slot.bucket_ids, &slot.docs, agg_data)?;
}
}
self.clear();
Ok(())
}
}
#[derive(Debug)]
pub(crate) struct LowCardSubAggCache {
/// Cache doc ids per bucket for sub-aggregations.
///
/// The outer Vec is indexed by BucketId.
per_bucket_docs: Vec<Vec<DocId>>,
}
impl LowCardSubAggCache {
#[inline]
fn clear(&mut self) {
for v in &mut self.per_bucket_docs {
v.clear();
}
}
}
impl SubAggCache for LowCardSubAggCache {
fn new() -> Self {
Self {
per_bucket_docs: Vec::new(),
}
}
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
let idx = bucket_id as usize;
if self.per_bucket_docs.len() <= idx {
self.per_bucket_docs.resize_with(idx + 1, Vec::new);
}
self.per_bucket_docs[idx].push(doc_id);
}
fn flush_local(
&mut self,
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
agg_data: &mut AggregationsSegmentCtx,
force: bool,
) -> crate::Result<()> {
// Pre-aggregated: call collect per bucket.
let max_bucket = (self.per_bucket_docs.len() as BucketId).saturating_sub(1);
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
// The threshold above which we flush buckets individually.
// Note: We must make sure that we don't lock ourselves into a situation where
// we keep hitting FLUSH_THRESHOLD but never flush any buckets (except on the
// final flush).
let mut bucket_threshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
const _: () = {
// The MAX_NUM_TERMS_FOR_VEC threshold is used for term aggregations.
// Note: Other aggregations may use other flexible values, but we can use
// this const value here as an upper bound (better than nothing).
let bucket_threshold_limit = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
assert!(
bucket_threshold_limit > 0,
"Bucket threshold must be greater than 0"
);
};
if force {
bucket_threshold = 0;
}
for (bucket_id, docs) in self
.per_bucket_docs
.iter_mut()
.enumerate()
.filter(|(_, docs)| docs.len() > bucket_threshold)
{
sub_agg.collect(bucket_id as BucketId, docs, agg_data)?;
// Clear only the buckets we flushed; smaller buckets stay cached for later.
docs.clear();
}
Ok(())
}
}
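// A simplified sketch (assumed names, not the removed code above) of the
// `HighCardSubAggCache` partitioning idea: a push is an O(1) append into
// partition `bucket_id % NUM_PARTITIONS`, giving cheap, approximate grouping
// without hashing. For the low-cardinality threshold arithmetic above: with
// FLUSH_THRESHOLD = 2048 and, say, 8 cached buckets, only buckets holding
// more than 2048 / (8 * 2) = 128 docs are flushed early.
const PARTS: usize = 16;

struct Partitions {
    bucket_ids: Vec<Vec<u32>>,
    docs: Vec<Vec<u32>>,
}

impl Partitions {
    fn new() -> Self {
        Partitions {
            bucket_ids: vec![Vec::new(); PARTS],
            docs: vec![Vec::new(); PARTS],
        }
    }
    fn push(&mut self, bucket_id: u32, doc: u32) {
        // Dense bucket ids spread evenly across the fixed partitions.
        let p = (bucket_id as usize) % PARTS;
        self.bucket_ids[p].push(bucket_id);
        self.docs[p].push(doc);
    }
}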

View File

@@ -1,9 +1,9 @@
use super::agg_req::Aggregations; use super::agg_req::Aggregations;
use super::agg_result::AggregationResults; use super::agg_result::AggregationResults;
use super::cached_sub_aggs::LowCardCachedSubAggs; use super::buf_collector::BufAggregationCollector;
use super::intermediate_agg_result::IntermediateAggregationResults; use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::SegmentAggregationCollector;
use super::AggContextParams; use super::AggContextParams;
// Group buffering strategy is chosen explicitly by callers; no need to hash-group on the fly.
use crate::aggregation::agg_data::{ use crate::aggregation::agg_data::{
build_aggregations_data_from_req, build_segment_agg_collectors_root, AggregationsSegmentCtx, build_aggregations_data_from_req, build_segment_agg_collectors_root, AggregationsSegmentCtx,
}; };
@@ -136,7 +136,7 @@ fn merge_fruits(
/// `AggregationSegmentCollector` does the aggregation collection on a segment. /// `AggregationSegmentCollector` does the aggregation collection on a segment.
pub struct AggregationSegmentCollector { pub struct AggregationSegmentCollector {
aggs_with_accessor: AggregationsSegmentCtx, aggs_with_accessor: AggregationsSegmentCtx,
agg_collector: LowCardCachedSubAggs, agg_collector: BufAggregationCollector,
error: Option<TantivyError>, error: Option<TantivyError>,
} }
@@ -151,11 +151,8 @@ impl AggregationSegmentCollector {
) -> crate::Result<Self> { ) -> crate::Result<Self> {
let mut agg_data = let mut agg_data =
build_aggregations_data_from_req(agg, reader, segment_ordinal, context.clone())?; build_aggregations_data_from_req(agg, reader, segment_ordinal, context.clone())?;
let mut result = let result =
LowCardCachedSubAggs::new(build_segment_agg_collectors_root(&mut agg_data)?); BufAggregationCollector::new(build_segment_agg_collectors_root(&mut agg_data)?);
result
.get_sub_agg_collector()
.prepare_max_bucket(0, &agg_data)?; // prepare for bucket zero
Ok(AggregationSegmentCollector { Ok(AggregationSegmentCollector {
aggs_with_accessor: agg_data, aggs_with_accessor: agg_data,
@@ -173,31 +170,26 @@ impl SegmentCollector for AggregationSegmentCollector {
if self.error.is_some() { if self.error.is_some() {
return; return;
} }
self.agg_collector.push(0, doc); if let Err(err) = self
match self
.agg_collector .agg_collector
.check_flush_local(&mut self.aggs_with_accessor) .collect(doc, &mut self.aggs_with_accessor)
{ {
Ok(_) => {} self.error = Some(err);
Err(e) => {
self.error = Some(e);
}
} }
} }
/// The query pushes the documents to the collector via this method.
///
/// Only valid for collectors that ignore docs.
fn collect_block(&mut self, docs: &[DocId]) { fn collect_block(&mut self, docs: &[DocId]) {
if self.error.is_some() { if self.error.is_some() {
return; return;
} }
if let Err(err) = self
match self.agg_collector.get_sub_agg_collector().collect( .agg_collector
0, .collect_block(docs, &mut self.aggs_with_accessor)
docs, {
&mut self.aggs_with_accessor, self.error = Some(err);
) {
Ok(_) => {}
Err(e) => {
self.error = Some(e);
}
} }
} }
@@ -208,13 +200,10 @@ impl SegmentCollector for AggregationSegmentCollector {
self.agg_collector.flush(&mut self.aggs_with_accessor)?; self.agg_collector.flush(&mut self.aggs_with_accessor)?;
let mut sub_aggregation_res = IntermediateAggregationResults::default(); let mut sub_aggregation_res = IntermediateAggregationResults::default();
self.agg_collector Box::new(self.agg_collector).add_intermediate_aggregation_result(
.get_sub_agg_collector() &self.aggs_with_accessor,
.add_intermediate_aggregation_result( &mut sub_aggregation_res,
&self.aggs_with_accessor, )?;
&mut sub_aggregation_res,
0,
)?;
Ok(sub_aggregation_res) Ok(sub_aggregation_res)
} }

View File

@@ -792,7 +792,7 @@ pub struct IntermediateRangeBucketEntry {
/// The number of documents in the bucket. /// The number of documents in the bucket.
pub doc_count: u64, pub doc_count: u64,
/// The sub_aggregation in this bucket. /// The sub_aggregation in this bucket.
pub sub_aggregation_res: IntermediateAggregationResults, pub sub_aggregation: IntermediateAggregationResults,
/// The from range of the bucket. Equals `f64::MIN` when `None`. /// The from range of the bucket. Equals `f64::MIN` when `None`.
pub from: Option<f64>, pub from: Option<f64>,
/// The to range of the bucket. Equals `f64::MAX` when `None`. /// The to range of the bucket. Equals `f64::MAX` when `None`.
@@ -811,7 +811,7 @@ impl IntermediateRangeBucketEntry {
key: self.key.into(), key: self.key.into(),
doc_count: self.doc_count, doc_count: self.doc_count,
sub_aggregation: self sub_aggregation: self
.sub_aggregation_res .sub_aggregation
.into_final_result_internal(req, limits)?, .into_final_result_internal(req, limits)?,
to: self.to, to: self.to,
from: self.from, from: self.from,
@@ -857,8 +857,7 @@ impl MergeFruits for IntermediateTermBucketEntry {
impl MergeFruits for IntermediateRangeBucketEntry { impl MergeFruits for IntermediateRangeBucketEntry {
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) -> crate::Result<()> { fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) -> crate::Result<()> {
self.doc_count += other.doc_count; self.doc_count += other.doc_count;
self.sub_aggregation_res self.sub_aggregation.merge_fruits(other.sub_aggregation)?;
.merge_fruits(other.sub_aggregation_res)?;
Ok(()) Ok(())
} }
} }
@@ -888,7 +887,7 @@ mod tests {
IntermediateRangeBucketEntry { IntermediateRangeBucketEntry {
key: IntermediateKey::Str(key.to_string()), key: IntermediateKey::Str(key.to_string()),
doc_count: *doc_count, doc_count: *doc_count,
sub_aggregation_res: Default::default(), sub_aggregation: Default::default(),
from: None, from: None,
to: None, to: None,
}, },
@@ -921,7 +920,7 @@ mod tests {
doc_count: *doc_count, doc_count: *doc_count,
from: None, from: None,
to: None, to: None,
sub_aggregation_res: get_sub_test_tree(&[( sub_aggregation: get_sub_test_tree(&[(
sub_aggregation_key.to_string(), sub_aggregation_key.to_string(),
*sub_aggregation_count, *sub_aggregation_count,
)]), )]),

View File

@@ -52,8 +52,10 @@ pub struct IntermediateAverage {
impl IntermediateAverage { impl IntermediateAverage {
/// Creates a new [`IntermediateAverage`] instance from a [`SegmentStatsCollector`]. /// Creates a new [`IntermediateAverage`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_stats(stats: IntermediateStats) -> Self { pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self { stats } Self {
stats: collector.stats,
}
} }
/// Merges the other intermediate result into self. /// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateAverage) { pub fn merge_fruits(&mut self, other: IntermediateAverage) {

View File

@@ -2,7 +2,7 @@ use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hasher}; use std::hash::{BuildHasher, Hasher};
use columnar::column_values::CompactSpaceU64Accessor; use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{Column, ColumnType, Dictionary, StrColumn}; use columnar::{Column, ColumnBlockAccessor, ColumnType, Dictionary, StrColumn};
use common::f64_to_u64; use common::f64_to_u64;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use rustc_hash::FxHashSet; use rustc_hash::FxHashSet;
@@ -106,6 +106,8 @@ pub struct CardinalityAggReqData {
pub str_dict_column: Option<StrColumn>, pub str_dict_column: Option<StrColumn>,
/// The missing value normalized to the internal u64 representation of the field type. /// The missing value normalized to the internal u64 representation of the field type.
pub missing_value_for_accessor: Option<u64>, pub missing_value_for_accessor: Option<u64>,
/// The column block accessor to access the fast field values.
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
/// The name of the aggregation. /// The name of the aggregation.
pub name: String, pub name: String,
/// The aggregation request. /// The aggregation request.
@@ -133,34 +135,45 @@ impl CardinalityAggregationReq {
} }
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentCardinalityCollector { pub(crate) struct SegmentCardinalityCollector {
buckets: Vec<SegmentCardinalityCollectorBucket>,
accessor_idx: usize,
/// The column accessor to access the fast field values.
accessor: Column<u64>,
/// The column_type of the field.
column_type: ColumnType,
/// The missing value normalized to the internal u64 representation of the field type.
missing_value_for_accessor: Option<u64>,
}
#[derive(Clone, Debug, PartialEq, Default)]
pub(crate) struct SegmentCardinalityCollectorBucket {
cardinality: CardinalityCollector, cardinality: CardinalityCollector,
entries: FxHashSet<u64>, entries: FxHashSet<u64>,
accessor_idx: usize,
} }
impl SegmentCardinalityCollectorBucket {
pub fn new(column_type: ColumnType) -> Self { impl SegmentCardinalityCollector {
pub fn from_req(column_type: ColumnType, accessor_idx: usize) -> Self {
Self { Self {
cardinality: CardinalityCollector::new(column_type as u8), cardinality: CardinalityCollector::new(column_type as u8),
entries: FxHashSet::default(), entries: Default::default(),
accessor_idx,
} }
} }
fn fetch_block_with_field(
&mut self,
docs: &[crate::DocId],
agg_data: &mut CardinalityAggReqData,
) {
if let Some(missing) = agg_data.missing_value_for_accessor {
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&agg_data.accessor,
missing,
);
} else {
agg_data
.column_block_accessor
.fetch_block(docs, &agg_data.accessor);
}
}
fn into_intermediate_metric_result( fn into_intermediate_metric_result(
mut self, mut self,
req_data: &CardinalityAggReqData, agg_data: &AggregationsSegmentCtx,
) -> crate::Result<IntermediateMetricResult> { ) -> crate::Result<IntermediateMetricResult> {
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
if req_data.column_type == ColumnType::Str { if req_data.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty(); let fallback_dict = Dictionary::empty();
let dict = req_data let dict = req_data
@@ -181,7 +194,6 @@ impl SegmentCardinalityCollectorBucket {
term_ids.push(term_ord as u32); term_ids.push(term_ord as u32);
} }
} }
term_ids.sort_unstable(); term_ids.sort_unstable();
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| { dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
self.cardinality.sketch.insert_any(&term); self.cardinality.sketch.insert_any(&term);
@@ -215,49 +227,16 @@ impl SegmentCardinalityCollectorBucket {
} }
} }
impl SegmentCardinalityCollector {
pub fn from_req(
column_type: ColumnType,
accessor_idx: usize,
accessor: Column<u64>,
missing_value_for_accessor: Option<u64>,
) -> Self {
Self {
buckets: vec![SegmentCardinalityCollectorBucket::new(column_type); 1],
column_type,
accessor_idx,
accessor,
missing_value_for_accessor,
}
}
fn fetch_block_with_field(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) {
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&self.accessor,
self.missing_value_for_accessor,
);
}
}
impl SegmentAggregationCollector for SegmentCardinalityCollector { impl SegmentAggregationCollector for SegmentCardinalityCollector {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx); let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
let name = req_data.name.to_string(); let name = req_data.name.to_string();
// Take the bucket out of the buckets Vec and replace it with a new empty one.
let bucket = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
let intermediate_result = bucket.into_intermediate_metric_result(req_data)?; let intermediate_result = self.into_intermediate_metric_result(agg_data)?;
results.push( results.push(
name, name,
IntermediateAggregationResult::Metric(intermediate_result), IntermediateAggregationResult::Metric(intermediate_result),
@@ -268,20 +247,27 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
fn collect_block(
&mut self,
docs: &[crate::DocId], docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
self.fetch_block_with_field(docs, agg_data); let req_data = agg_data.get_cardinality_req_data_mut(self.accessor_idx);
let bucket = &mut self.buckets[parent_bucket_id as usize]; self.fetch_block_with_field(docs, req_data);
let col_block_accessor = &agg_data.column_block_accessor; let col_block_accessor = &req_data.column_block_accessor;
if self.column_type == ColumnType::Str { if req_data.column_type == ColumnType::Str {
for term_ord in col_block_accessor.iter_vals() { for term_ord in col_block_accessor.iter_vals() {
bucket.entries.insert(term_ord); self.entries.insert(term_ord);
} }
} else if self.column_type == ColumnType::IpAddr { } else if req_data.column_type == ColumnType::IpAddr {
let compact_space_accessor = self let compact_space_accessor = req_data
.accessor .accessor
.values .values
.clone() .clone()
@@ -296,29 +282,16 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
})?; })?;
for val in col_block_accessor.iter_vals() { for val in col_block_accessor.iter_vals() {
let val: u128 = compact_space_accessor.compact_to_u128(val as u32); let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
bucket.cardinality.sketch.insert_any(&val); self.cardinality.sketch.insert_any(&val);
} }
} else { } else {
for val in col_block_accessor.iter_vals() { for val in col_block_accessor.iter_vals() {
bucket.cardinality.sketch.insert_any(&val); self.cardinality.sketch.insert_any(&val);
} }
} }
Ok(()) Ok(())
} }
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
if max_bucket as usize >= self.buckets.len() {
self.buckets.resize_with(max_bucket as usize + 1, || {
SegmentCardinalityCollectorBucket::new(self.column_type)
});
}
Ok(())
}
} }
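// A hedged, std-only sketch of the two-phase strategy used above for string
// cardinality: during collection only u64 term ordinals are recorded (cheap),
// and the actual terms are resolved once at the end, in sorted ordinal order,
// matching a single sequential pass over a sorted dictionary. `dict` is a
// plain sorted Vec<String> standing in for the fast-field dictionary, and a
// HashSet stands in for the HyperLogLog sketch.
use std::collections::HashSet;

fn distinct_terms(ordinals_seen: &HashSet<u64>, dict: &[String]) -> usize {
    let mut ords: Vec<u64> = ordinals_seen.iter().copied().collect();
    // Sorted access enables one forward pass over the dictionary.
    ords.sort_unstable();
    let mut distinct: HashSet<&str> = HashSet::new();
    for ord in ords {
        distinct.insert(dict[ord as usize].as_str());
    }
    distinct.len()
}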
#[derive(Clone, Debug, Serialize, Deserialize)] #[derive(Clone, Debug, Serialize, Deserialize)]

View File

@@ -52,8 +52,10 @@ pub struct IntermediateCount {
impl IntermediateCount { impl IntermediateCount {
/// Creates a new [`IntermediateCount`] instance from a [`SegmentStatsCollector`]. /// Creates a new [`IntermediateCount`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_stats(stats: IntermediateStats) -> Self { pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self { stats } Self {
stats: collector.stats,
}
} }
/// Merges the other intermediate result into self. /// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateCount) { pub fn merge_fruits(&mut self, other: IntermediateCount) {

View File

@@ -8,9 +8,10 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::intermediate_agg_result::{ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
}; };
use crate::aggregation::metric::MetricAggReqData;
use crate::aggregation::segment_agg_result::SegmentAggregationCollector; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*; use crate::aggregation::*;
use crate::TantivyError; use crate::{DocId, TantivyError};
/// A multi-value metric aggregation that computes a collection of extended statistics /// A multi-value metric aggregation that computes a collection of extended statistics
/// on numeric values that are extracted /// on numeric values that are extracted
@@ -61,7 +62,7 @@ impl ExtendedStatsAggregation {
/// Extended stats contains a collection of statistics /// Extended stats contains a collection of statistics
/// it extends stats by adding variance, standard deviation, /// it extends stats by adding variance, standard deviation,
/// and bound information /// and bound information
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ExtendedStats { pub struct ExtendedStats {
/// The number of documents. /// The number of documents.
@@ -317,28 +318,51 @@ impl IntermediateExtendedStats {
} }
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentExtendedStatsCollector { pub(crate) struct SegmentExtendedStatsCollector {
name: String,
missing: Option<u64>, missing: Option<u64>,
field_type: ColumnType, field_type: ColumnType,
accessor: columnar::Column<u64>, pub(crate) extended_stats: IntermediateExtendedStats,
buckets: Vec<IntermediateExtendedStats>, pub(crate) accessor_idx: usize,
sigma: Option<f64>, val_cache: Vec<u64>,
} }
impl SegmentExtendedStatsCollector { impl SegmentExtendedStatsCollector {
pub fn from_req(req: &MetricAggReqData, sigma: Option<f64>) -> Self { pub fn from_req(
let missing = req field_type: ColumnType,
.missing sigma: Option<f64>,
.and_then(|val| f64_to_fastfield_u64(val, &req.field_type)); accessor_idx: usize,
missing: Option<f64>,
) -> Self {
let missing = missing.and_then(|val| f64_to_fastfield_u64(val, &field_type));
Self { Self {
name: req.name.clone(), field_type,
field_type: req.field_type, extended_stats: IntermediateExtendedStats::with_sigma(sigma),
accessor: req.accessor.clone(), accessor_idx,
missing, missing,
buckets: vec![IntermediateExtendedStats::with_sigma(sigma); 16], val_cache: Default::default(),
sigma, }
}
#[inline]
pub(crate) fn collect_block_with_field(
&mut self,
docs: &[DocId],
req_data: &mut MetricAggReqData,
) {
if let Some(missing) = self.missing.as_ref() {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
*missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
for val in req_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.extended_stats.collect(val1);
} }
} }
} }
@@ -346,18 +370,15 @@ impl SegmentExtendedStatsCollector {
impl SegmentAggregationCollector for SegmentExtendedStatsCollector { impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
#[inline] #[inline]
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
let name = self.name.clone(); let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone();
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let extended_stats = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
results.push( results.push(
name, name,
IntermediateAggregationResult::Metric(IntermediateMetricResult::ExtendedStats( IntermediateAggregationResult::Metric(IntermediateMetricResult::ExtendedStats(
extended_stats, self.extended_stats,
)), )),
)?; )?;
@@ -367,36 +388,39 @@ impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
#[inline] #[inline]
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut extended_stats = self.buckets[parent_bucket_id as usize].clone(); let req_data = agg_data.get_metric_req_data(self.accessor_idx);
if let Some(missing) = self.missing {
agg_data let mut has_val = false;
.column_block_accessor for val in req_data.accessor.values_for_doc(doc) {
.fetch_block_with_missing(docs, &self.accessor, self.missing); let val1 = f64_from_fastfield_u64(val, &self.field_type);
for val in agg_data.column_block_accessor.iter_vals() { self.extended_stats.collect(val1);
let val1 = f64_from_fastfield_u64(val, self.field_type); has_val = true;
extended_stats.collect(val1); }
if !has_val {
self.extended_stats
.collect(f64_from_fastfield_u64(missing, &self.field_type));
}
} else {
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.extended_stats.collect(val1);
}
} }
// store back
self.buckets[parent_bucket_id as usize] = extended_stats;
Ok(()) Ok(())
} }
fn prepare_max_bucket( #[inline]
fn collect_block(
&mut self, &mut self,
max_bucket: BucketId, docs: &[crate::DocId],
_agg_data: &AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
if self.buckets.len() <= max_bucket as usize { let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
self.buckets.resize_with(max_bucket as usize + 1, || { self.collect_block_with_field(docs, req_data);
IntermediateExtendedStats::with_sigma(self.sigma)
});
}
Ok(()) Ok(())
} }
} }
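// A minimal sketch (assumed types, not tantivy's API) of the `missing`
// fallback in the per-doc collect path above: when a document yields no
// stored values, the configured missing value is collected exactly once
// in their place.
fn collect_doc_values(values: &[f64], missing: Option<f64>, out: &mut Vec<f64>) {
    let mut has_val = false;
    for &val in values {
        out.push(val);
        has_val = true;
    }
    if !has_val {
        if let Some(missing_val) = missing {
            // No stored value for this doc: account for it with the substitute.
            out.push(missing_val);
        }
    }
}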

View File

@@ -52,8 +52,10 @@ pub struct IntermediateMax {
impl IntermediateMax { impl IntermediateMax {
/// Creates a new [`IntermediateMax`] instance from a [`SegmentStatsCollector`]. /// Creates a new [`IntermediateMax`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_stats(stats: IntermediateStats) -> Self { pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self { stats } Self {
stats: collector.stats,
}
} }
/// Merges the other intermediate result into self. /// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateMax) { pub fn merge_fruits(&mut self, other: IntermediateMax) {

View File

@@ -52,8 +52,10 @@ pub struct IntermediateMin {
impl IntermediateMin { impl IntermediateMin {
/// Creates a new [`IntermediateMin`] instance from a [`SegmentStatsCollector`]. /// Creates a new [`IntermediateMin`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_stats(stats: IntermediateStats) -> Self { pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self { stats } Self {
stats: collector.stats,
}
} }
/// Merges the other intermediate result into self. /// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateMin) { pub fn merge_fruits(&mut self, other: IntermediateMin) {

View File

@@ -31,7 +31,7 @@ use std::collections::HashMap;
pub use average::*; pub use average::*;
pub use cardinality::*; pub use cardinality::*;
use columnar::{Column, ColumnType}; use columnar::{Column, ColumnBlockAccessor, ColumnType};
pub use count::*; pub use count::*;
pub use extended_stats::*; pub use extended_stats::*;
pub use max::*; pub use max::*;
@@ -55,6 +55,8 @@ pub struct MetricAggReqData {
pub field_type: ColumnType, pub field_type: ColumnType,
/// The missing value normalized to the internal u64 representation of the field type. /// The missing value normalized to the internal u64 representation of the field type.
pub missing_u64: Option<u64>, pub missing_u64: Option<u64>,
/// The column block accessor to access the fast field values.
pub column_block_accessor: ColumnBlockAccessor<u64>,
/// The column accessor to access the fast field values. /// The column accessor to access the fast field values.
pub accessor: Column<u64>, pub accessor: Column<u64>,
/// Used when converting to intermediate result /// Used when converting to intermediate result

View File

@@ -7,9 +7,10 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::intermediate_agg_result::{ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
}; };
use crate::aggregation::metric::MetricAggReqData;
use crate::aggregation::segment_agg_result::SegmentAggregationCollector; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*; use crate::aggregation::*;
use crate::TantivyError; use crate::{DocId, TantivyError};
/// # Percentiles /// # Percentiles
/// ///
@@ -130,16 +131,10 @@ impl PercentilesAggregationReq {
} }
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentPercentilesCollector { pub(crate) struct SegmentPercentilesCollector {
pub(crate) buckets: Vec<PercentilesCollector>, pub(crate) percentiles: PercentilesCollector,
pub(crate) accessor_idx: usize, pub(crate) accessor_idx: usize,
/// The type of the field.
pub field_type: ColumnType,
/// The missing value normalized to the internal u64 representation of the field type.
pub missing_u64: Option<u64>,
/// The column accessor to access the fast field values.
pub accessor: Column<u64>,
} }
#[derive(Clone, Serialize, Deserialize)] #[derive(Clone, Serialize, Deserialize)]
@@ -234,18 +229,33 @@ impl PercentilesCollector {
} }
impl SegmentPercentilesCollector { impl SegmentPercentilesCollector {
pub fn from_req_and_validate( pub fn from_req_and_validate(accessor_idx: usize) -> crate::Result<Self> {
field_type: ColumnType, Ok(Self {
missing_u64: Option<u64>, percentiles: PercentilesCollector::new(),
accessor: Column<u64>,
accessor_idx: usize,
) -> Self {
Self {
buckets: Vec::with_capacity(64),
field_type,
missing_u64,
accessor,
accessor_idx, accessor_idx,
})
}
#[inline]
pub(crate) fn collect_block_with_field(
&mut self,
docs: &[DocId],
req_data: &mut MetricAggReqData,
) {
if let Some(missing) = req_data.missing_u64.as_ref() {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
*missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
for val in req_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.percentiles.collect(val1);
} }
} }
} }
@@ -253,18 +263,12 @@ impl SegmentPercentilesCollector {
impl SegmentAggregationCollector for SegmentPercentilesCollector { impl SegmentAggregationCollector for SegmentPercentilesCollector {
#[inline] #[inline]
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone(); let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone();
self.prepare_max_bucket(parent_bucket_id, agg_data)?; let intermediate_metric_result = IntermediateMetricResult::Percentiles(self.percentiles);
// Swap collector with an empty one to avoid cloning
let percentiles_collector = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
let intermediate_metric_result =
IntermediateMetricResult::Percentiles(percentiles_collector);
results.push( results.push(
name, name,
@@ -277,33 +281,40 @@ impl SegmentAggregationCollector for SegmentPercentilesCollector {
#[inline] #[inline]
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
let percentiles = &mut self.buckets[parent_bucket_id as usize]; let req_data = agg_data.get_metric_req_data(self.accessor_idx);
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&self.accessor,
self.missing_u64,
);
for val in agg_data.column_block_accessor.iter_vals() { if let Some(missing) = req_data.missing_u64 {
let val1 = f64_from_fastfield_u64(val, self.field_type); let mut has_val = false;
percentiles.collect(val1); for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.percentiles.collect(val1);
has_val = true;
}
if !has_val {
self.percentiles
.collect(f64_from_fastfield_u64(missing, &req_data.field_type));
}
} else {
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.percentiles.collect(val1);
}
} }
Ok(()) Ok(())
} }
fn prepare_max_bucket( #[inline]
fn collect_block(
&mut self, &mut self,
max_bucket: BucketId, docs: &[crate::DocId],
_agg_data: &AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
while self.buckets.len() <= max_bucket as usize { let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
self.buckets.push(PercentilesCollector::new()); self.collect_block_with_field(docs, req_data);
}
Ok(()) Ok(())
} }
} }

View File

@@ -1,6 +1,5 @@
use std::fmt::Debug; use std::fmt::Debug;
use columnar::{Column, ColumnType};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::*;
@@ -8,9 +7,10 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::intermediate_agg_result::{ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
}; };
use crate::aggregation::metric::MetricAggReqData;
use crate::aggregation::segment_agg_result::SegmentAggregationCollector; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*; use crate::aggregation::*;
use crate::TantivyError; use crate::{DocId, TantivyError};
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that /// A multi-value metric aggregation that computes a collection of statistics on numeric values that
/// are extracted from the aggregated documents. /// are extracted from the aggregated documents.
@@ -83,7 +83,7 @@ impl Stats {
/// Intermediate result of the stats aggregation that can be combined with other intermediate /// Intermediate result of the stats aggregation that can be combined with other intermediate
/// results. /// results.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats { pub struct IntermediateStats {
/// The number of extracted values. /// The number of extracted values.
pub(crate) count: u64, pub(crate) count: u64,
@@ -187,75 +187,75 @@ pub enum StatsType {
Percentiles, Percentiles,
} }
fn create_collector<const TYPE_ID: u8>( #[derive(Clone, Debug)]
req: &MetricAggReqData, pub(crate) struct SegmentStatsCollector {
) -> Box<dyn SegmentAggregationCollector> { pub(crate) stats: IntermediateStats,
Box::new(SegmentStatsCollector::<TYPE_ID> { pub(crate) accessor_idx: usize,
name: req.name.clone(),
collecting_for: req.collecting_for,
is_number_or_date_type: req.is_number_or_date_type,
missing_u64: req.missing_u64,
accessor: req.accessor.clone(),
buckets: vec![IntermediateStats::default()],
})
} }
/// Build a concrete `SegmentStatsCollector` depending on the column type. impl SegmentStatsCollector {
pub(crate) fn build_segment_stats_collector( pub fn from_req(accessor_idx: usize) -> Self {
req: &MetricAggReqData, Self {
) -> crate::Result<Box<dyn SegmentAggregationCollector>> { stats: IntermediateStats::default(),
match req.field_type { accessor_idx,
ColumnType::I64 => Ok(create_collector::<{ ColumnType::I64 as u8 }>(req)), }
ColumnType::U64 => Ok(create_collector::<{ ColumnType::U64 as u8 }>(req)), }
ColumnType::F64 => Ok(create_collector::<{ ColumnType::F64 as u8 }>(req)), #[inline]
ColumnType::Bool => Ok(create_collector::<{ ColumnType::Bool as u8 }>(req)), pub(crate) fn collect_block_with_field(
ColumnType::DateTime => Ok(create_collector::<{ ColumnType::DateTime as u8 }>(req)), &mut self,
ColumnType::Bytes => Ok(create_collector::<{ ColumnType::Bytes as u8 }>(req)), docs: &[DocId],
ColumnType::Str => Ok(create_collector::<{ ColumnType::Str as u8 }>(req)), req_data: &mut MetricAggReqData,
ColumnType::IpAddr => Ok(create_collector::<{ ColumnType::IpAddr as u8 }>(req)), ) {
if let Some(missing) = req_data.missing_u64.as_ref() {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
*missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
if req_data.is_number_or_date_type {
for val in req_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.stats.collect(val1);
}
} else {
for _val in req_data.column_block_accessor.iter_vals() {
// we ignore the value and simply record that we got something
self.stats.collect(0.0);
}
}
} }
} }
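// A hedged sketch of the const-generic dispatch pattern used by
// `build_segment_stats_collector` above: each column type id gets its own
// monomorphized collector, so the per-value conversion in the hot loop is
// fixed at compile time instead of being branched on per value. `Collect`,
// `Typed` and `make` are illustrative names.
trait Collect {
    fn collect(&mut self, val: u64);
}

struct Typed<const TYPE_ID: u8> {
    sum: f64,
}

impl<const TYPE_ID: u8> Collect for Typed<TYPE_ID> {
    fn collect(&mut self, val: u64) {
        // TYPE_ID is a compile-time constant: this branch folds away in each
        // instantiation, leaving a straight-line conversion.
        let as_f64 = if TYPE_ID == 0 {
            val as i64 as f64 // e.g. an i64 column
        } else {
            f64::from_bits(val) // e.g. an f64 column
        };
        self.sum += as_f64;
    }
}

fn make(type_id: u8) -> Box<dyn Collect> {
    match type_id {
        0 => Box::new(Typed::<0> { sum: 0.0 }),
        _ => Box::new(Typed::<1> { sum: 0.0 }),
    }
}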
#[repr(C)] impl SegmentAggregationCollector for SegmentStatsCollector {
#[derive(Clone, Debug)]
pub(crate) struct SegmentStatsCollector<const COLUMN_TYPE_ID: u8> {
pub(crate) missing_u64: Option<u64>,
pub(crate) accessor: Column<u64>,
pub(crate) is_number_or_date_type: bool,
pub(crate) buckets: Vec<IntermediateStats>,
pub(crate) name: String,
pub(crate) collecting_for: StatsType,
}
impl<const COLUMN_TYPE_ID: u8> SegmentAggregationCollector
for SegmentStatsCollector<COLUMN_TYPE_ID>
{
#[inline] #[inline]
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
let name = self.name.clone(); let req = agg_data.get_metric_req_data(self.accessor_idx);
let name = req.name.clone();
self.prepare_max_bucket(parent_bucket_id, agg_data)?; let intermediate_metric_result = match req.collecting_for {
let stats = self.buckets[parent_bucket_id as usize];
let intermediate_metric_result = match self.collecting_for {
StatsType::Average => { StatsType::Average => {
IntermediateMetricResult::Average(IntermediateAverage::from_stats(stats)) IntermediateMetricResult::Average(IntermediateAverage::from_collector(*self))
} }
StatsType::Count => { StatsType::Count => {
IntermediateMetricResult::Count(IntermediateCount::from_stats(stats)) IntermediateMetricResult::Count(IntermediateCount::from_collector(*self))
} }
StatsType::Max => IntermediateMetricResult::Max(IntermediateMax::from_stats(stats)), StatsType::Max => IntermediateMetricResult::Max(IntermediateMax::from_collector(*self)),
StatsType::Min => IntermediateMetricResult::Min(IntermediateMin::from_stats(stats)), StatsType::Min => IntermediateMetricResult::Min(IntermediateMin::from_collector(*self)),
StatsType::Stats => IntermediateMetricResult::Stats(stats), StatsType::Stats => IntermediateMetricResult::Stats(self.stats),
StatsType::Sum => IntermediateMetricResult::Sum(IntermediateSum::from_stats(stats)), StatsType::Sum => IntermediateMetricResult::Sum(IntermediateSum::from_collector(*self)),
_ => { _ => {
return Err(TantivyError::InvalidArgument(format!( return Err(TantivyError::InvalidArgument(format!(
"Unsupported stats type for stats aggregation: {:?}", "Unsupported stats type for stats aggregation: {:?}",
self.collecting_for req.collecting_for
))) )))
} }
}; };
@@ -271,67 +271,41 @@ impl<const COLUMN_TYPE_ID: u8> SegmentAggregationCollector
#[inline] #[inline]
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_metric_req_data(self.accessor_idx);
if let Some(missing) = req_data.missing_u64 {
let mut has_val = false;
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.stats.collect(val1);
has_val = true;
}
if !has_val {
self.stats
.collect(f64_from_fastfield_u64(missing, &req_data.field_type));
}
} else {
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.stats.collect(val1);
}
}
Ok(())
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId], docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
// TODO: remove once we fetch all values for all bucket ids in one go let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
if docs.len() == 1 && self.missing_u64.is_none() { self.collect_block_with_field(docs, req_data);
collect_stats::<COLUMN_TYPE_ID>(
&mut self.buckets[parent_bucket_id as usize],
self.accessor.values_for_doc(docs[0]),
self.is_number_or_date_type,
)?;
return Ok(());
}
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&self.accessor,
self.missing_u64,
);
collect_stats::<COLUMN_TYPE_ID>(
&mut self.buckets[parent_bucket_id as usize],
agg_data.column_block_accessor.iter_vals(),
self.is_number_or_date_type,
)?;
Ok(()) Ok(())
} }
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
let required_buckets = (max_bucket as usize) + 1;
if self.buckets.len() < required_buckets {
self.buckets
.resize_with(required_buckets, IntermediateStats::default);
}
Ok(())
}
}
#[inline]
fn collect_stats<const COLUMN_TYPE_ID: u8>(
stats: &mut IntermediateStats,
vals: impl Iterator<Item = u64>,
is_number_or_date_type: bool,
) -> crate::Result<()> {
if is_number_or_date_type {
for val in vals {
let val1 = convert_to_f64::<COLUMN_TYPE_ID>(val);
stats.collect(val1);
}
} else {
for _val in vals {
// we ignore the value and simply record that we got something
stats.collect(0.0);
}
}
Ok(())
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -52,8 +52,10 @@ pub struct IntermediateSum {
impl IntermediateSum { impl IntermediateSum {
/// Creates a new [`IntermediateSum`] instance from an [`IntermediateStats`]. /// Creates a new [`IntermediateSum`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_stats(stats: IntermediateStats) -> Self { pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self { stats } Self {
stats: collector.stats,
}
} }
/// Merges the other intermediate result into self. /// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateSum) { pub fn merge_fruits(&mut self, other: IntermediateSum) {

View File

@@ -15,11 +15,11 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateMetricResult, IntermediateAggregationResult, IntermediateMetricResult,
}; };
use crate::aggregation::segment_agg_result::SegmentAggregationCollector; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::{AggregationError, BucketId}; use crate::aggregation::AggregationError;
use crate::collector::sort_key::ReverseComparator;
use crate::collector::TopNComputer; use crate::collector::TopNComputer;
use crate::schema::OwnedValue; use crate::schema::OwnedValue;
use crate::{DocAddress, DocId, SegmentOrdinal}; use crate::{DocAddress, DocId, SegmentOrdinal};
/// Contains all information required by the TopHitsSegmentCollector to perform the /// Contains all information required by the TopHitsSegmentCollector to perform the
/// top_hits aggregation on a segment. /// top_hits aggregation on a segment.
@@ -458,7 +458,7 @@ impl Eq for DocSortValuesAndFields {}
#[derive(Clone, Serialize, Deserialize, Debug)] #[derive(Clone, Serialize, Deserialize, Debug)]
pub struct TopHitsTopNComputer { pub struct TopHitsTopNComputer {
req: TopHitsAggregationReq, req: TopHitsAggregationReq,
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, ReverseComparator>, top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
} }
impl std::cmp::PartialEq for TopHitsTopNComputer { impl std::cmp::PartialEq for TopHitsTopNComputer {
@@ -471,10 +471,7 @@ impl TopHitsTopNComputer {
/// Create a new TopHitsCollector /// Create a new TopHitsCollector
pub fn new(req: &TopHitsAggregationReq) -> Self { pub fn new(req: &TopHitsAggregationReq) -> Self {
Self { Self {
top_n: TopNComputer::new_with_comparator( top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req.size + req.from.unwrap_or(0),
ReverseComparator,
),
req: req.clone(), req: req.clone(),
} }
} }
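One side of this hunk passes an explicit comparator value (`ReverseComparator`) instead of a const-bool parameter. A toy sketch of that comparator-parameterized shape; this is not the tantivy implementation (a real top-n amortizes truncation rather than sorting on every push):

```rust
use std::cmp::Ordering;

struct TopN<T, C: Fn(&T, &T) -> Ordering> {
    limit: usize,
    cmp: C,
    items: Vec<T>,
}

impl<T, C: Fn(&T, &T) -> Ordering> TopN<T, C> {
    fn new_with_comparator(limit: usize, cmp: C) -> Self {
        Self { limit, cmp, items: Vec::new() }
    }

    fn push(&mut self, item: T) {
        self.items.push(item);
        // A real top-n amortizes this; sorting per push keeps the sketch short.
        self.items.sort_by(&self.cmp);
        self.items.truncate(self.limit);
    }

    fn into_sorted_vec(self) -> Vec<T> {
        self.items
    }
}

fn main() {
    // Reverse comparator: keeps the largest values, like ReverseComparator.
    let mut top = TopN::new_with_comparator(2, |a: &u64, b: &u64| b.cmp(a));
    for v in [5_u64, 1, 9, 3] {
        top.push(v);
    }
    assert_eq!(top.into_sorted_vec(), vec![9, 5]);
}
```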
@@ -485,7 +482,7 @@ impl TopHitsTopNComputer {
pub(crate) fn merge_fruits(&mut self, other_fruit: Self) -> crate::Result<()> { pub(crate) fn merge_fruits(&mut self, other_fruit: Self) -> crate::Result<()> {
for doc in other_fruit.top_n.into_vec() { for doc in other_fruit.top_n.into_vec() {
self.collect(doc.sort_key, doc.doc); self.collect(doc.feature, doc.doc);
} }
Ok(()) Ok(())
} }
@@ -497,9 +494,9 @@ impl TopHitsTopNComputer {
.into_sorted_vec() .into_sorted_vec()
.into_iter() .into_iter()
.map(|doc| TopHitsVecEntry { .map(|doc| TopHitsVecEntry {
sort: doc.sort_key.sorts.iter().map(|f| f.value).collect(), sort: doc.feature.sorts.iter().map(|f| f.value).collect(),
doc_value_fields: doc doc_value_fields: doc
.sort_key .feature
.doc_value_fields .doc_value_fields
.into_iter() .into_iter()
.map(|(k, v)| (k, v.into())) .map(|(k, v)| (k, v.into()))
@@ -520,8 +517,7 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector { pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal, segment_ordinal: SegmentOrdinal,
accessor_idx: usize, accessor_idx: usize,
buckets: Vec<TopNComputer<Vec<DocValueAndOrder>, DocAddress, ReverseComparator>>, top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
num_hits: usize,
} }
impl TopHitsSegmentCollector { impl TopHitsSegmentCollector {
@@ -530,35 +526,25 @@ impl TopHitsSegmentCollector {
accessor_idx: usize, accessor_idx: usize,
segment_ordinal: SegmentOrdinal, segment_ordinal: SegmentOrdinal,
) -> Self { ) -> Self {
let num_hits = req.size + req.from.unwrap_or(0);
Self { Self {
num_hits, top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
segment_ordinal, segment_ordinal,
accessor_idx, accessor_idx,
buckets: vec![TopNComputer::new_with_comparator(num_hits, ReverseComparator); 1],
} }
} }
fn get_top_hits_computer( fn into_top_hits_collector(
&mut self, self,
parent_bucket_id: BucketId,
value_accessors: &HashMap<String, Vec<DynamicColumn>>, value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregationReq, req: &TopHitsAggregationReq,
) -> TopHitsTopNComputer { ) -> TopHitsTopNComputer {
if parent_bucket_id as usize >= self.buckets.len() {
return TopHitsTopNComputer::new(req);
}
let top_n = std::mem::replace(
&mut self.buckets[parent_bucket_id as usize],
TopNComputer::new(0),
);
let mut top_hits_computer = TopHitsTopNComputer::new(req); let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = top_n.into_vec(); let top_results = self.top_n.into_vec();
for res in top_results { for res in top_results {
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id); let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect( top_hits_computer.collect(
DocSortValuesAndFields { DocSortValuesAndFields {
sorts: res.sort_key, sorts: res.feature,
doc_value_fields, doc_value_fields,
}, },
res.doc, res.doc,
@@ -567,24 +553,54 @@ impl TopHitsSegmentCollector {
top_hits_computer top_hits_computer
} }
/// TODO add a specialized variant for a single sort field
fn collect_with(
&mut self,
doc_id: crate::DocId,
req: &TopHitsAggregationReq,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
.map(|(idx, KeyOrder { order, .. })| {
let order = *order;
let value = accessors
.get(idx)
.expect("could not find field in accessors")
.0
.values_for_doc(doc_id)
.next();
DocValueAndOrder { value, order }
})
.collect();
self.top_n.push(
sorts,
DocAddress {
segment_ord: self.segment_ordinal,
doc_id,
},
);
Ok(())
}
} }
impl SegmentAggregationCollector for TopHitsSegmentCollector { impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults, results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx); let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
let value_accessors = &req_data.value_accessors; let value_accessors = &req_data.value_accessors;
let intermediate_result = IntermediateMetricResult::TopHits(self.get_top_hits_computer( let intermediate_result = IntermediateMetricResult::TopHits(
parent_bucket_id, self.into_top_hits_collector(value_accessors, &req_data.req),
value_accessors, );
&req_data.req,
));
results.push( results.push(
req_data.name.to_string(), req_data.name.to_string(),
IntermediateAggregationResult::Metric(intermediate_result), IntermediateAggregationResult::Metric(intermediate_result),
@@ -594,54 +610,24 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
/// TODO: Consider a caching layer to reduce the call overhead /// TODO: Consider a caching layer to reduce the call overhead
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc_id: crate::DocId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
let top_n = &mut self.buckets[parent_bucket_id as usize];
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx); let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
let req = &req_data.req; self.collect_with(doc_id, &req_data.req, &req_data.accessors)?;
let accessors = &req_data.accessors;
for &doc_id in docs {
// TODO: this is terrible, a new vec is allocated for every doc
// We can fetch blocks instead
// We don't need to store the order for every value
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
.map(|(idx, KeyOrder { order, .. })| {
let order = *order;
let value = accessors
.get(idx)
.expect("could not find field in accessors")
.0
.values_for_doc(doc_id)
.next();
DocValueAndOrder { value, order }
})
.collect();
top_n.push(
sorts,
DocAddress {
segment_ord: self.segment_ordinal,
doc_id,
},
);
}
Ok(()) Ok(())
} }
fn prepare_max_bucket( fn collect_block(
&mut self, &mut self,
max_bucket: BucketId, docs: &[crate::DocId],
_agg_data: &AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
self.buckets.resize( let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
(max_bucket as usize) + 1, // TODO: Consider getting fields with the column block accessor.
TopNComputer::new_with_comparator(self.num_hits, ReverseComparator), for doc in docs {
); self.collect_with(*doc, &req_data.req, &req_data.accessors)?;
}
Ok(()) Ok(())
} }
} }
@@ -659,7 +645,6 @@ mod tests {
use crate::aggregation::bucket::tests::get_test_index_from_docs; use crate::aggregation::bucket::tests::get_test_index_from_docs;
use crate::aggregation::tests::get_test_index_from_values; use crate::aggregation::tests::get_test_index_from_values;
use crate::aggregation::AggregationCollector; use crate::aggregation::AggregationCollector;
use crate::collector::sort_key::ReverseComparator;
use crate::collector::ComparableDoc; use crate::collector::ComparableDoc;
use crate::query::AllQuery; use crate::query::AllQuery;
use crate::schema::OwnedValue; use crate::schema::OwnedValue;
@@ -675,7 +660,7 @@ mod tests {
fn collector_with_capacity(capacity: usize) -> super::TopHitsTopNComputer { fn collector_with_capacity(capacity: usize) -> super::TopHitsTopNComputer {
super::TopHitsTopNComputer { super::TopHitsTopNComputer {
top_n: super::TopNComputer::new_with_comparator(capacity, ReverseComparator), top_n: super::TopNComputer::new(capacity),
req: Default::default(), req: Default::default(),
} }
} }
@@ -759,7 +744,7 @@ mod tests {
], ],
"from": 0, "from": 0,
} }
} }
})) }))
.unwrap(); .unwrap();
@@ -789,12 +774,12 @@ mod tests {
#[test] #[test]
fn test_top_hits_collector_single_feature() -> crate::Result<()> { fn test_top_hits_collector_single_feature() -> crate::Result<()> {
let docs = vec![ let docs = vec![
ComparableDoc::<_, _> { ComparableDoc::<_, _, false> {
doc: crate::DocAddress { doc: crate::DocAddress {
segment_ord: 0, segment_ord: 0,
doc_id: 0, doc_id: 0,
}, },
sort_key: DocSortValuesAndFields { feature: DocSortValuesAndFields {
sorts: vec![DocValueAndOrder { sorts: vec![DocValueAndOrder {
value: Some(1), value: Some(1),
order: Order::Asc, order: Order::Asc,
@@ -807,7 +792,7 @@ mod tests {
segment_ord: 0, segment_ord: 0,
doc_id: 2, doc_id: 2,
}, },
sort_key: DocSortValuesAndFields { feature: DocSortValuesAndFields {
sorts: vec![DocValueAndOrder { sorts: vec![DocValueAndOrder {
value: Some(3), value: Some(3),
order: Order::Asc, order: Order::Asc,
@@ -820,7 +805,7 @@ mod tests {
segment_ord: 0, segment_ord: 0,
doc_id: 1, doc_id: 1,
}, },
sort_key: DocSortValuesAndFields { feature: DocSortValuesAndFields {
sorts: vec![DocValueAndOrder { sorts: vec![DocValueAndOrder {
value: Some(5), value: Some(5),
order: Order::Asc, order: Order::Asc,
@@ -832,7 +817,7 @@ mod tests {
let mut collector = collector_with_capacity(3); let mut collector = collector_with_capacity(3);
for doc in docs.clone() { for doc in docs.clone() {
collector.collect(doc.sort_key, doc.doc); collector.collect(doc.feature, doc.doc);
} }
let res = collector.into_final_result(); let res = collector.into_final_result();
@@ -842,15 +827,15 @@ mod tests {
super::TopHitsMetricResult { super::TopHitsMetricResult {
hits: vec![ hits: vec![
super::TopHitsVecEntry { super::TopHitsVecEntry {
sort: vec![docs[0].sort_key.sorts[0].value], sort: vec![docs[0].feature.sorts[0].value],
doc_value_fields: Default::default(), doc_value_fields: Default::default(),
}, },
super::TopHitsVecEntry { super::TopHitsVecEntry {
sort: vec![docs[1].sort_key.sorts[0].value], sort: vec![docs[1].feature.sorts[0].value],
doc_value_fields: Default::default(), doc_value_fields: Default::default(),
}, },
super::TopHitsVecEntry { super::TopHitsVecEntry {
sort: vec![docs[2].sort_key.sorts[0].value], sort: vec![docs[2].feature.sorts[0].value],
doc_value_fields: Default::default(), doc_value_fields: Default::default(),
}, },
] ]
@@ -888,7 +873,7 @@ mod tests {
"mixed.*", "mixed.*",
], ],
} }
} }
}))?; }))?;
let collector = AggregationCollector::from_aggs(d, Default::default()); let collector = AggregationCollector::from_aggs(d, Default::default());

View File

@@ -133,7 +133,7 @@ mod agg_limits;
pub mod agg_req; pub mod agg_req;
pub mod agg_result; pub mod agg_result;
pub mod bucket; pub mod bucket;
pub(crate) mod cached_sub_aggs; mod buf_collector;
mod collector; mod collector;
mod date; mod date;
mod error; mod error;
@@ -162,19 +162,6 @@ use serde::{Deserialize, Deserializer, Serialize};
use crate::tokenizer::TokenizerManager; use crate::tokenizer::TokenizerManager;
/// A bucket id is a dense identifier for a bucket within an aggregation.
/// It is used to index into a Vec that holds per-bucket data.
///
/// For example, in a terms aggregation, each unique term will be assigned an incremental BucketId.
/// This BucketId will be forwarded to sub-aggregations to identify the parent bucket.
///
/// This makes it possible to have a single AggregationCollector instance per aggregation
/// that can handle multiple buckets efficiently.
///
/// The API to call sub-aggregations therefore takes a `&[(BucketId, &[DocId])]`.
/// For that we need a buffer: one Vec per bucket aggregation.
pub type BucketId = u32;
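A minimal sketch of the scheme this doc comment describes, using a toy per-bucket counter with simplified types; `prepare_max_bucket` mirrors the trait method of the same name:

```rust
type BucketId = u32;

#[derive(Default)]
struct PerBucketCounts {
    buckets: Vec<u64>, // one entry per BucketId
}

impl PerBucketCounts {
    /// Pre-size the Vec so `collect` can index without growing it.
    fn prepare_max_bucket(&mut self, max_bucket: BucketId) {
        let required = max_bucket as usize + 1;
        if self.buckets.len() < required {
            self.buckets.resize(required, 0);
        }
    }

    fn collect(&mut self, parent_bucket_id: BucketId, docs: &[u32]) {
        self.buckets[parent_bucket_id as usize] += docs.len() as u64;
    }
}

fn main() {
    let mut counts = PerBucketCounts::default();
    counts.prepare_max_bucket(2);
    counts.collect(0, &[1, 2, 3]);
    counts.collect(2, &[7]);
    assert_eq!(counts.buckets, vec![3, 0, 1]);
}
```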
/// Context parameters for aggregation execution /// Context parameters for aggregation execution
/// ///
/// This struct holds shared resources needed during aggregation execution: /// This struct holds shared resources needed during aggregation execution:
@@ -348,37 +335,19 @@ impl Display for Key {
} }
} }
pub(crate) fn convert_to_f64<const COLUMN_TYPE_ID: u8>(val: u64) -> f64 {
if COLUMN_TYPE_ID == ColumnType::U64 as u8 {
val as f64
} else if COLUMN_TYPE_ID == ColumnType::I64 as u8
|| COLUMN_TYPE_ID == ColumnType::DateTime as u8
{
i64::from_u64(val) as f64
} else if COLUMN_TYPE_ID == ColumnType::F64 as u8 {
f64::from_u64(val)
} else if COLUMN_TYPE_ID == ColumnType::Bool as u8 {
val as f64
} else {
panic!(
"ColumnType ID {} cannot be converted to f64 metric",
COLUMN_TYPE_ID
)
}
}
/// Inverse of `to_fastfield_u64`. Used to convert to `f64` for metrics. /// Inverse of `to_fastfield_u64`. Used to convert to `f64` for metrics.
/// ///
/// # Panics /// # Panics
/// Only `u64`, `f64`, `date`, and `i64` are supported. /// Only `u64`, `f64`, `date`, and `i64` are supported.
pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: ColumnType) -> f64 { pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &ColumnType) -> f64 {
match field_type { match field_type {
ColumnType::U64 => convert_to_f64::<{ ColumnType::U64 as u8 }>(val), ColumnType::U64 => val as f64,
ColumnType::I64 => convert_to_f64::<{ ColumnType::I64 as u8 }>(val), ColumnType::I64 | ColumnType::DateTime => i64::from_u64(val) as f64,
ColumnType::F64 => convert_to_f64::<{ ColumnType::F64 as u8 }>(val), ColumnType::F64 => f64::from_u64(val),
ColumnType::Bool => convert_to_f64::<{ ColumnType::Bool as u8 }>(val), ColumnType::Bool => val as f64,
ColumnType::DateTime => convert_to_f64::<{ ColumnType::DateTime as u8 }>(val), _ => {
_ => panic!("unexpected type {field_type:?}. This should not happen"), panic!("unexpected type {field_type:?}. This should not happen")
}
} }
} }
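`f64::from_u64`/`to_u64` above rely on a monotonic mapping between `f64` and `u64`, so ordering is preserved and aggregations can compare and store raw `u64` fast-field values. The actual mapping lives in the `columnar`/`common` crates (`MonotonicallyMappableToU64`); the classic sign-flip construction below illustrates the idea:

```rust
// Map f64 to u64 so that u64 ordering matches f64 ordering (NaN excluded).
fn f64_to_u64(val: f64) -> u64 {
    let bits = val.to_bits();
    if bits >> 63 == 0 {
        bits | (1u64 << 63) // positive: set the sign bit
    } else {
        !bits // negative: flip all bits
    }
}

fn u64_to_f64(val: u64) -> f64 {
    if val >> 63 == 1 {
        f64::from_bits(val & !(1u64 << 63))
    } else {
        f64::from_bits(!val)
    }
}

fn main() {
    let xs = [3.5_f64, -1.0, 0.0, -7.25, 42.0];
    let mut sorted_f64 = xs.to_vec();
    sorted_f64.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let mut mapped: Vec<u64> = xs.iter().map(|&x| f64_to_u64(x)).collect();
    mapped.sort();
    let via_u64: Vec<f64> = mapped.into_iter().map(u64_to_f64).collect();
    // Sorting in u64 space yields the same order as sorting the f64s.
    assert_eq!(sorted_f64, via_u64);
}
```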

View File

@@ -8,67 +8,28 @@ use std::fmt::Debug;
pub(crate) use super::agg_limits::AggregationLimitsGuard; pub(crate) use super::agg_limits::AggregationLimitsGuard;
use super::intermediate_agg_result::IntermediateAggregationResults; use super::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::agg_data::AggregationsSegmentCtx; use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::BucketId;
/// Monotonically increasing provider of BucketIds.
#[derive(Debug, Clone, Default)]
pub struct BucketIdProvider(u32);
impl BucketIdProvider {
/// Get the next BucketId.
pub fn next_bucket_id(&mut self) -> BucketId {
let bucket_id = self.0;
self.0 += 1;
bucket_id
}
}
/// A SegmentAggregationCollector is used to collect aggregation results. /// A SegmentAggregationCollector is used to collect aggregation results.
pub trait SegmentAggregationCollector: Debug { pub trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()>; ) -> crate::Result<()>;
/// Note: The caller needs to call `prepare_max_bucket` before calling `collect`. #[inline]
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()>;
/// Collect docs for multiple buckets in one call.
/// Minimizes dynamic dispatch overhead when collecting many buckets.
///
/// Note: The caller needs to call `prepare_max_bucket` before calling `collect_multiple`.
fn collect_multiple(
&mut self,
bucket_ids: &[BucketId],
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
debug_assert_eq!(bucket_ids.len(), docs.len()); self.collect_block(&[doc], agg_data)
let mut start = 0;
while start < bucket_ids.len() {
let bucket_id = bucket_ids[start];
let mut end = start + 1;
while end < bucket_ids.len() && bucket_ids[end] == bucket_id {
end += 1;
}
self.collect(bucket_id, &docs[start..end], agg_data)?;
start = end;
}
Ok(())
} }
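The default `collect_multiple` batches runs of consecutive equal bucket ids into a single `collect` call, so the per-call dynamic-dispatch cost is paid once per run rather than once per doc. The same run-grouping, extracted as a standalone function:

```rust
// Group parallel (bucket_id, doc) slices into runs of equal bucket ids.
fn group_runs(bucket_ids: &[u32], docs: &[u32], mut collect: impl FnMut(u32, &[u32])) {
    debug_assert_eq!(bucket_ids.len(), docs.len());
    let mut start = 0;
    while start < bucket_ids.len() {
        let bucket_id = bucket_ids[start];
        let mut end = start + 1;
        while end < bucket_ids.len() && bucket_ids[end] == bucket_id {
            end += 1;
        }
        collect(bucket_id, &docs[start..end]);
        start = end;
    }
}

fn main() {
    let mut calls = Vec::new();
    group_runs(&[1, 1, 2, 2, 2, 1], &[10, 11, 12, 13, 14, 15], |b, d| {
        calls.push((b, d.to_vec()));
    });
    assert_eq!(
        calls,
        vec![(1, vec![10, 11]), (2, vec![12, 13, 14]), (1, vec![15])]
    );
}
```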
/// Prepare the collector for collecting up to BucketId `max_bucket`. fn collect_block(
/// This is useful so allocation can be done ahead of time, before collecting.
fn prepare_max_bucket(
&mut self, &mut self,
max_bucket: BucketId, docs: &[crate::DocId],
agg_data: &AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()>; ) -> crate::Result<()>;
/// Finalize method. Some aggregators collect blocks of docs before calling `collect_block`. /// Finalize method. Some aggregators collect blocks of docs before calling `collect_block`.
@@ -78,7 +39,26 @@ pub trait SegmentAggregationCollector: Debug {
} }
} }
#[derive(Default)] /// A helper trait to enable cloning of Box<dyn SegmentAggregationCollector>
pub trait CollectorClone {
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector>;
}
impl<T> CollectorClone for T
where T: 'static + SegmentAggregationCollector + Clone
{
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector> {
Box::new(self.clone())
}
}
impl Clone for Box<dyn SegmentAggregationCollector> {
fn clone(&self) -> Box<dyn SegmentAggregationCollector> {
self.clone_box()
}
}
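`CollectorClone` is the standard `clone_box` idiom for making `Box<dyn Trait>` cloneable. The pattern in a self-contained form, with a toy trait in place of `SegmentAggregationCollector`:

```rust
trait Animal: AnimalClone {
    fn name(&self) -> String;
}

// Helper supertrait: every 'static + Clone implementor can clone itself
// behind a Box<dyn Animal>.
trait AnimalClone {
    fn clone_box(&self) -> Box<dyn Animal>;
}

impl<T> AnimalClone for T
where T: 'static + Animal + Clone
{
    fn clone_box(&self) -> Box<dyn Animal> {
        Box::new(self.clone())
    }
}

impl Clone for Box<dyn Animal> {
    fn clone(&self) -> Box<dyn Animal> {
        self.clone_box()
    }
}

#[derive(Clone)]
struct Dog;

impl Animal for Dog {
    fn name(&self) -> String {
        "dog".to_string()
    }
}

fn main() {
    let a: Box<dyn Animal> = Box::new(Dog);
    let b = a.clone(); // works thanks to the blanket clone_box impl
    assert_eq!(b.name(), "dog");
}
```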
#[derive(Clone, Default)]
/// The GenericSegmentAggregationResultsCollector is the generic version of the collector, which /// The GenericSegmentAggregationResultsCollector is the generic version of the collector, which
/// can handle arbitrary complexity of sub-aggregations. Ideally we never have to pick this one /// can handle arbitrary complexity of sub-aggregations. Ideally we never have to pick this one
/// and can provide specialized versions instead, that remove some of its overhead. /// and can provide specialized versions instead, that remove some of its overhead.
@@ -96,13 +76,12 @@ impl Debug for GenericSegmentAggregationResultsCollector {
impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector { impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
fn add_intermediate_aggregation_result( fn add_intermediate_aggregation_result(
&mut self, self: Box<Self>,
agg_data: &AggregationsSegmentCtx, agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults, results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> { ) -> crate::Result<()> {
for agg in &mut self.aggs { for agg in self.aggs {
agg.add_intermediate_aggregation_result(agg_data, results, parent_bucket_id)?; agg.add_intermediate_aggregation_result(agg_data, results)?;
} }
Ok(()) Ok(())
@@ -110,13 +89,23 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
fn collect( fn collect(
&mut self, &mut self,
parent_bucket_id: BucketId, doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)?;
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId], docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx, agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> { ) -> crate::Result<()> {
for collector in &mut self.aggs { for collector in &mut self.aggs {
collector.collect(parent_bucket_id, docs, agg_data)?; collector.collect_block(docs, agg_data)?;
} }
Ok(()) Ok(())
} }
@@ -126,15 +115,4 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
} }
Ok(()) Ok(())
} }
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
for collector in &mut self.aggs {
collector.prepare_max_bucket(max_bucket, agg_data)?;
}
Ok(())
}
} }

View File

@@ -0,0 +1,121 @@
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
use crate::collector::{Collector, SegmentCollector};
use crate::{DocAddress, DocId, Score, SegmentReader};
pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> {
custom_scorer: TCustomScorer,
collector: TopCollector<TScore>,
}
impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
where TScore: Clone + PartialOrd
{
pub(crate) fn new(
custom_scorer: TCustomScorer,
collector: TopCollector<TScore>,
) -> CustomScoreTopCollector<TCustomScorer, TScore> {
CustomScoreTopCollector {
custom_scorer,
collector,
}
}
}
/// A custom segment scorer makes it possible to define any kind of score
/// for a given document belonging to a specific segment.
///
/// It is the segment-local version of the [`CustomScorer`].
pub trait CustomSegmentScorer<TScore>: 'static {
/// Computes the score of a specific `doc`.
fn score(&mut self, doc: DocId) -> TScore;
}
/// `CustomScorer` makes it possible to define any kind of score.
///
/// The `CustomScorer` itself does not perform much of the computation.
/// Instead, it helps construct `Self::Child` instances that will compute
/// the score at a segment scale.
pub trait CustomScorer<TScore>: Sync {
/// Type of the associated [`CustomSegmentScorer`].
type Child: CustomSegmentScorer<TScore>;
/// Builds a child scorer for a specific segment. The child scorer is associated with
/// a specific segment.
fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result<Self::Child>;
}
impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
where
TCustomScorer: CustomScorer<TScore> + Send + Sync,
TScore: 'static + PartialOrd + Clone + Send + Sync,
{
type Fruit = Vec<(TScore, DocAddress)>;
type Child = CustomScoreTopSegmentCollector<TCustomScorer::Child, TScore>;
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let segment_collector = self.collector.for_segment(segment_local_id, segment_reader);
let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?;
Ok(CustomScoreTopSegmentCollector {
segment_collector,
segment_scorer,
})
}
fn requires_scoring(&self) -> bool {
false
}
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> crate::Result<Self::Fruit> {
self.collector.merge_fruits(segment_fruits)
}
}
pub struct CustomScoreTopSegmentCollector<T, TScore>
where
TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
T: CustomSegmentScorer<TScore>,
{
segment_collector: TopSegmentCollector<TScore>,
segment_scorer: T,
}
impl<T, TScore> SegmentCollector for CustomScoreTopSegmentCollector<T, TScore>
where
TScore: 'static + PartialOrd + Clone + Send + Sync,
T: 'static + CustomSegmentScorer<TScore>,
{
type Fruit = Vec<(TScore, DocAddress)>;
fn collect(&mut self, doc: DocId, _score: Score) {
let score = self.segment_scorer.score(doc);
self.segment_collector.collect(doc, score);
}
fn harvest(self) -> Vec<(TScore, DocAddress)> {
self.segment_collector.harvest()
}
}
impl<F, TScore, T> CustomScorer<TScore> for F
where
F: 'static + Send + Sync + Fn(&SegmentReader) -> T,
T: CustomSegmentScorer<TScore>,
{
type Child = T;
fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result<Self::Child> {
Ok((self)(segment_reader))
}
}
impl<F, TScore> CustomSegmentScorer<TScore> for F
where F: 'static + FnMut(DocId) -> TScore
{
fn score(&mut self, doc: DocId) -> TScore {
(self)(doc)
}
}
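The two blanket impls above are what let plain closures act as scorers: any `Fn(&SegmentReader) -> T` is a `CustomScorer`, and any `FnMut(DocId) -> TScore` is a `CustomSegmentScorer`. A hedged usage sketch; the `popularity` fast field and the `TopDocs::custom_score` entry point are illustrative assumptions mirroring tantivy's released API, not something guaranteed by this diff:

```rust
use tantivy::collector::{Collector, TopDocs};
use tantivy::{DocAddress, DocId, SegmentReader};

fn popularity_collector(limit: usize) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
    TopDocs::with_limit(limit).custom_score(|segment_reader: &SegmentReader| {
        // Runs once per segment: fetch the fast-field column up front.
        let popularity = segment_reader
            .fast_fields()
            .u64("popularity")
            .expect("popularity fast field");
        // Runs once per matching doc: a cheap column lookup.
        move |doc: DocId| popularity.first(doc).unwrap_or(0)
    })
}
```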

View File

@@ -12,7 +12,6 @@ use std::marker::PhantomData;
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType}; use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector}; use crate::collector::{Collector, SegmentCollector};
use crate::schema::Schema;
use crate::{DocId, Score, SegmentReader}; use crate::{DocId, Score, SegmentReader};
/// The `FilterCollector` filters docs using a fast field value and a predicate. /// The `FilterCollector` filters docs using a fast field value and a predicate.
@@ -50,13 +49,13 @@ use crate::{DocId, Score, SegmentReader};
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new("price".to_string(), |value: u64| value > 20_120u64, TopDocs::with_limit(2).order_by_score()); /// let no_filter_collector = FilterCollector::new("price".to_string(), |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?; /// let top_docs = searcher.search(&query, &no_filter_collector)?;
/// ///
/// assert_eq!(top_docs.len(), 1); /// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// ///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new("price".to_string(), |value| value < 5u64, TopDocs::with_limit(2).order_by_score()); /// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new("price".to_string(), |value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?; /// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
/// ///
/// assert_eq!(filtered_top_docs.len(), 0); /// assert_eq!(filtered_top_docs.len(), 0);
@@ -105,11 +104,6 @@ where
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>; type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.collector.check_schema(schema)?;
Ok(())
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: u32, segment_local_id: u32,
@@ -126,7 +120,6 @@ where
segment_collector, segment_collector,
predicate: self.predicate.clone(), predicate: self.predicate.clone(),
t_predicate_value: PhantomData, t_predicate_value: PhantomData,
filtered_docs: Vec::with_capacity(crate::COLLECT_BLOCK_BUFFER_LEN),
}) })
} }
@@ -147,7 +140,6 @@ pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue
segment_collector: TSegmentCollector, segment_collector: TSegmentCollector,
predicate: TPredicate, predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>, t_predicate_value: PhantomData<TPredicateValue>,
filtered_docs: Vec<DocId>,
} }
impl<TSegmentCollector, TPredicate, TPredicateValue> impl<TSegmentCollector, TPredicate, TPredicateValue>
@@ -184,20 +176,6 @@ where
} }
} }
fn collect_block(&mut self, docs: &[DocId]) {
self.filtered_docs.clear();
for &doc in docs {
// TODO: `accept_document` could be further optimized to do batch lookups of column
// values for single-valued columns.
if self.accept_document(doc) {
self.filtered_docs.push(doc);
}
}
if !self.filtered_docs.is_empty() {
self.segment_collector.collect_block(&self.filtered_docs);
}
}
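This removed `collect_block` shows a common buffering pattern: filter into one reusable `Vec`, then forward the survivors to the wrapped collector as a single block. A standalone sketch, with a plain `Vec` standing in for the inner segment collector:

```rust
struct FilteringCollector<F: Fn(u32) -> bool> {
    predicate: F,
    filtered_docs: Vec<u32>, // reused across blocks to avoid reallocation
    collected: Vec<u32>,     // stands in for the wrapped segment collector
}

impl<F: Fn(u32) -> bool> FilteringCollector<F> {
    fn collect_block(&mut self, docs: &[u32]) {
        self.filtered_docs.clear();
        for &doc in docs {
            if (self.predicate)(doc) {
                self.filtered_docs.push(doc);
            }
        }
        if !self.filtered_docs.is_empty() {
            // The real code forwards the block to the inner collector.
            self.collected.extend_from_slice(&self.filtered_docs);
        }
    }
}

fn main() {
    let mut c = FilteringCollector {
        predicate: |doc| doc % 2 == 0,
        filtered_docs: Vec::new(),
        collected: Vec::new(),
    };
    c.collect_block(&[1, 2, 3, 4]);
    assert_eq!(c.collected, vec![2, 4]);
}
```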
fn harvest(self) -> TSegmentCollector::Fruit { fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest() self.segment_collector.harvest()
} }
@@ -240,7 +218,7 @@ where
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new("barcode".to_string(), |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2).order_by_score()); /// let filter_collector = BytesFilterCollector::new("barcode".to_string(), |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?; /// let top_docs = searcher.search(&query, &filter_collector)?;
/// ///
/// assert_eq!(top_docs.len(), 1); /// assert_eq!(top_docs.len(), 1);
@@ -280,10 +258,6 @@ where
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>; type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.collector.check_schema(schema)
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: u32, segment_local_id: u32,
@@ -300,7 +274,6 @@ where
segment_collector, segment_collector,
predicate: self.predicate.clone(), predicate: self.predicate.clone(),
buffer: Vec::new(), buffer: Vec::new(),
filtered_docs: Vec::with_capacity(crate::COLLECT_BLOCK_BUFFER_LEN),
}) })
} }
@@ -323,7 +296,6 @@ where TPredicate: 'static
segment_collector: TSegmentCollector, segment_collector: TSegmentCollector,
predicate: TPredicate, predicate: TPredicate,
buffer: Vec<u8>, buffer: Vec<u8>,
filtered_docs: Vec<DocId>,
} }
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate> impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
@@ -362,20 +334,6 @@ where
} }
} }
fn collect_block(&mut self, docs: &[DocId]) {
self.filtered_docs.clear();
for &doc in docs {
// TODO: `accept_document` could be further optimized to do batch lookups of column
// values for single-valued columns.
if self.accept_document(doc) {
self.filtered_docs.push(doc);
}
}
if !self.filtered_docs.is_empty() {
self.segment_collector.collect_block(&self.filtered_docs);
}
}
fn harvest(self) -> TSegmentCollector::Fruit { fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest() self.segment_collector.harvest()
} }

View File

@@ -57,7 +57,7 @@
//! # let query_parser = QueryParser::for_index(&index, vec![title]); //! # let query_parser = QueryParser::for_index(&index, vec![title]);
//! # let query = query_parser.parse_query("diary")?; //! # let query = query_parser.parse_query("diary")?;
//! let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) = //! let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
//! searcher.search(&query, &(Count, TopDocs::with_limit(2).order_by_score()))?; //! searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
//! # Ok(()) //! # Ok(())
//! # } //! # }
//! ``` //! ```
@@ -83,15 +83,11 @@
use downcast_rs::impl_downcast; use downcast_rs::impl_downcast;
use crate::schema::Schema;
use crate::{DocId, Score, SegmentOrdinal, SegmentReader}; use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
mod count_collector; mod count_collector;
pub use self::count_collector::Count; pub use self::count_collector::Count;
/// Sort keys
pub mod sort_key;
mod histogram_collector; mod histogram_collector;
pub use histogram_collector::HistogramCollector; pub use histogram_collector::HistogramCollector;
@@ -99,13 +95,16 @@ mod multi_collector;
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit}; pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
mod top_collector; mod top_collector;
pub use self::top_collector::ComparableDoc;
mod top_score_collector; mod top_score_collector;
pub use self::top_collector::ComparableDoc;
pub use self::top_score_collector::{TopDocs, TopNComputer}; pub use self::top_score_collector::{TopDocs, TopNComputer};
mod sort_key_top_collector; mod custom_score_top_collector;
pub use self::sort_key::{SegmentSortKeyComputer, SortKeyComputer}; pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
mod tweak_score_top_collector;
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector; mod facet_collector;
pub use self::facet_collector::{FacetCollector, FacetCounts}; pub use self::facet_collector::{FacetCollector, FacetCounts};
use crate::query::Weight; use crate::query::Weight;
@@ -146,11 +145,6 @@ pub trait Collector: Sync + Send {
/// Type of the `SegmentCollector` associated with this collector. /// Type of the `SegmentCollector` associated with this collector.
type Child: SegmentCollector; type Child: SegmentCollector;
/// Returns an error if the schema is not compatible with the collector.
fn check_schema(&self, _schema: &Schema) -> crate::Result<()> {
Ok(())
}
/// `set_segment` is called before beginning to enumerate /// `set_segment` is called before beginning to enumerate
/// on this segment. /// on this segment.
fn for_segment( fn for_segment(
@@ -176,50 +170,41 @@ pub trait Collector: Sync + Send {
segment_ord: u32, segment_ord: u32,
reader: &SegmentReader, reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> { ) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let with_scoring = self.requires_scoring();
let mut segment_collector = self.for_segment(segment_ord, reader)?; let mut segment_collector = self.for_segment(segment_ord, reader)?;
default_collect_segment_impl(&mut segment_collector, weight, reader, with_scoring)?;
match (reader.alive_bitset(), self.requires_scoring()) {
(Some(alive_bitset), true) => {
weight.for_each(reader, &mut |doc, score| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;
}
(Some(alive_bitset), false) => {
weight.for_each_no_score(reader, &mut |docs| {
for doc in docs.iter().cloned() {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, 0.0);
}
}
})?;
}
(None, true) => {
weight.for_each(reader, &mut |doc, score| {
segment_collector.collect(doc, score);
})?;
}
(None, false) => {
weight.for_each_no_score(reader, &mut |docs| {
segment_collector.collect_block(docs);
})?;
}
}
Ok(segment_collector.harvest()) Ok(segment_collector.harvest())
} }
} }
pub(crate) fn default_collect_segment_impl<TSegmentCollector: SegmentCollector>(
segment_collector: &mut TSegmentCollector,
weight: &dyn Weight,
reader: &SegmentReader,
with_scoring: bool,
) -> crate::Result<()> {
match (reader.alive_bitset(), with_scoring) {
(Some(alive_bitset), true) => {
weight.for_each(reader, &mut |doc, score| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;
}
(Some(alive_bitset), false) => {
weight.for_each_no_score(reader, &mut |docs| {
for doc in docs.iter().cloned() {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, 0.0);
}
}
})?;
}
(None, true) => {
weight.for_each(reader, &mut |doc, score| {
segment_collector.collect(doc, score);
})?;
}
(None, false) => {
weight.for_each_no_score(reader, &mut |docs| {
segment_collector.collect_block(docs);
})?;
}
}
Ok(())
}
impl<TSegmentCollector: SegmentCollector> SegmentCollector for Option<TSegmentCollector> { impl<TSegmentCollector: SegmentCollector> SegmentCollector for Option<TSegmentCollector> {
type Fruit = Option<TSegmentCollector::Fruit>; type Fruit = Option<TSegmentCollector::Fruit>;
@@ -229,12 +214,6 @@ impl<TSegmentCollector: SegmentCollector> SegmentCollector for Option<TSegmentCo
} }
} }
fn collect_block(&mut self, docs: &[DocId]) {
if let Some(segment_collector) = self {
segment_collector.collect_block(docs);
}
}
fn harvest(self) -> Self::Fruit { fn harvest(self) -> Self::Fruit {
self.map(|segment_collector| segment_collector.harvest()) self.map(|segment_collector| segment_collector.harvest())
} }
@@ -245,13 +224,6 @@ impl<TCollector: Collector> Collector for Option<TCollector> {
type Child = Option<<TCollector as Collector>::Child>; type Child = Option<<TCollector as Collector>::Child>;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
if let Some(underlying_collector) = self {
underlying_collector.check_schema(schema)?;
}
Ok(())
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: SegmentOrdinal, segment_local_id: SegmentOrdinal,
@@ -327,12 +299,6 @@ where
type Fruit = (Left::Fruit, Right::Fruit); type Fruit = (Left::Fruit, Right::Fruit);
type Child = (Left::Child, Right::Child); type Child = (Left::Child, Right::Child);
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)?;
self.1.check_schema(schema)?;
Ok(())
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: u32, segment_local_id: u32,
@@ -376,11 +342,6 @@ where
self.1.collect(doc, score); self.1.collect(doc, score);
} }
fn collect_block(&mut self, docs: &[DocId]) {
self.0.collect_block(docs);
self.1.collect_block(docs);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit { fn harvest(self) -> <Self as SegmentCollector>::Fruit {
(self.0.harvest(), self.1.harvest()) (self.0.harvest(), self.1.harvest())
} }
@@ -397,13 +358,6 @@ where
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit); type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
type Child = (One::Child, Two::Child, Three::Child); type Child = (One::Child, Two::Child, Three::Child);
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)?;
self.1.check_schema(schema)?;
self.2.check_schema(schema)?;
Ok(())
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: u32, segment_local_id: u32,
@@ -453,12 +407,6 @@ where
self.2.collect(doc, score); self.2.collect(doc, score);
} }
fn collect_block(&mut self, docs: &[DocId]) {
self.0.collect_block(docs);
self.1.collect_block(docs);
self.2.collect_block(docs);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit { fn harvest(self) -> <Self as SegmentCollector>::Fruit {
(self.0.harvest(), self.1.harvest(), self.2.harvest()) (self.0.harvest(), self.1.harvest(), self.2.harvest())
} }
@@ -476,14 +424,6 @@ where
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit); type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
type Child = (One::Child, Two::Child, Three::Child, Four::Child); type Child = (One::Child, Two::Child, Three::Child, Four::Child);
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)?;
self.1.check_schema(schema)?;
self.2.check_schema(schema)?;
self.3.check_schema(schema)?;
Ok(())
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: u32, segment_local_id: u32,
@@ -542,13 +482,6 @@ where
self.3.collect(doc, score); self.3.collect(doc, score);
} }
fn collect_block(&mut self, docs: &[DocId]) {
self.0.collect_block(docs);
self.1.collect_block(docs);
self.2.collect_block(docs);
self.3.collect_block(docs);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit { fn harvest(self) -> <Self as SegmentCollector>::Fruit {
( (
self.0.harvest(), self.0.harvest(),

View File

@@ -3,7 +3,6 @@ use std::ops::Deref;
use super::{Collector, SegmentCollector}; use super::{Collector, SegmentCollector};
use crate::collector::Fruit; use crate::collector::Fruit;
use crate::schema::Schema;
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
/// MultiFruit keeps Fruits from every nested Collector /// MultiFruit keeps Fruits from every nested Collector
@@ -17,10 +16,6 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
type Fruit = Box<dyn Fruit>; type Fruit = Box<dyn Fruit>;
type Child = Box<dyn BoxableSegmentCollector>; type Child = Box<dyn BoxableSegmentCollector>;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: u32, segment_local_id: u32,
@@ -152,7 +147,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// let searcher = reader.searcher(); /// let searcher = reader.searcher();
/// ///
/// let mut collectors = MultiCollector::new(); /// let mut collectors = MultiCollector::new();
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2).order_by_score()); /// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
/// let count_handle = collectors.add_collector(Count); /// let count_handle = collectors.add_collector(Count);
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap(); /// let query = query_parser.parse_query("diary").unwrap();
@@ -199,13 +194,6 @@ impl Collector for MultiCollector<'_> {
type Fruit = MultiFruit; type Fruit = MultiFruit;
type Child = MultiCollectorChild; type Child = MultiCollectorChild;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
for collector in &self.collector_wrappers {
collector.check_schema(schema)?;
}
Ok(())
}
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: SegmentOrdinal, segment_local_id: SegmentOrdinal,
@@ -262,12 +250,6 @@ impl SegmentCollector for MultiCollectorChild {
} }
} }
fn collect_block(&mut self, docs: &[DocId]) {
for child in &mut self.children {
child.collect_block(docs);
}
}
fn harvest(self) -> MultiFruit { fn harvest(self) -> MultiFruit {
MultiFruit { MultiFruit {
sub_fruits: self sub_fruits: self
@@ -311,7 +293,7 @@ mod tests {
let query = TermQuery::new(term, IndexRecordOption::Basic); let query = TermQuery::new(term, IndexRecordOption::Basic);
let mut collectors = MultiCollector::new(); let mut collectors = MultiCollector::new();
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2).order_by_score()); let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
let count_handler = collectors.add_collector(Count); let count_handler = collectors.add_collector(Count);
let mut multifruits = searcher.search(&query, &collectors).unwrap(); let mut multifruits = searcher.search(&query, &collectors).unwrap();

View File

@@ -1,454 +0,0 @@
mod order;
mod sort_by_erased_type;
mod sort_by_score;
mod sort_by_static_fast_value;
mod sort_by_string;
mod sort_key_computer;
pub use order::*;
pub use sort_by_erased_type::SortByErasedType;
pub use sort_by_score::SortBySimilarityScore;
pub use sort_by_static_fast_value::SortByStaticFastValue;
pub use sort_by_string::SortByString;
pub use sort_key_computer::{SegmentSortKeyComputer, SortKeyComputer};
#[cfg(test)]
pub(crate) mod tests {
// By spec, regardless of whether ascending or descending order was requested, in the presence of a
// tie, we sort by ascending doc id/doc address.
pub(crate) fn sort_hits<TSortKey: Ord, D: Ord>(
hits: &mut [ComparableDoc<TSortKey, D>],
order: Order,
) {
if order.is_asc() {
hits.sort_by(|l, r| l.sort_key.cmp(&r.sort_key).then(l.doc.cmp(&r.doc)));
} else {
hits.sort_by(|l, r| {
l.sort_key
.cmp(&r.sort_key)
.reverse() // This is descending
.then(l.doc.cmp(&r.doc))
});
}
}
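A tiny standalone check of that tie-break spec, using plain `(sort_key, doc_id)` tuples instead of `ComparableDoc`:

```rust
fn main() {
    // (sort_key, doc_id)
    let mut hits = vec![(5_u64, 2_u32), (5, 0), (7, 1)];
    // Descending by sort key, but ascending by doc id on ties.
    hits.sort_by(|l, r| l.0.cmp(&r.0).reverse().then(l.1.cmp(&r.1)));
    assert_eq!(hits, vec![(7, 1), (5, 0), (5, 2)]);
}
```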
use std::collections::HashMap;
use std::ops::Range;
use crate::collector::sort_key::{
SortByErasedType, SortBySimilarityScore, SortByStaticFastValue, SortByString,
};
use crate::collector::{ComparableDoc, DocSetCollector, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, QueryParser};
use crate::schema::{OwnedValue, Schema, FAST, TEXT};
use crate::{DocAddress, Document, Index, Order, Score, Searcher};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let id = schema_builder.add_u64_field("id", FAST);
let city = schema_builder.add_text_field("city", TEXT | FAST);
let catchphrase = schema_builder.add_text_field("catchphrase", TEXT);
let altitude = schema_builder.add_f64_field("altitude", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
fn create_segment(index: &Index, docs: Vec<impl Document>) -> crate::Result<()> {
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc in docs {
index_writer.add_document(doc)?;
}
index_writer.commit()?;
Ok(())
}
create_segment(
&index,
vec![
doc!(
id => 0_u64,
city => "austin",
catchphrase => "Hills, Barbeque, Glow",
altitude => 149.0,
),
doc!(
id => 1_u64,
city => "greenville",
catchphrase => "Grow, Glow, Glow",
altitude => 27.0,
),
],
)?;
create_segment(
&index,
vec![doc!(
id => 2_u64,
city => "tokyo",
catchphrase => "Glow, Glow, Glow",
altitude => 40.0,
)],
)?;
create_segment(
&index,
vec![doc!(
id => 3_u64,
catchphrase => "No, No, No",
altitude => 0.0,
)],
)?;
Ok(index)
}
// NOTE: You cannot determine the SegmentIds that will be generated for Segments
// ahead of time, so DocAddresses must be mapped back to a unique id for each Searcher.
fn id_mapping(searcher: &Searcher) -> HashMap<DocAddress, u64> {
searcher
.search(&AllQuery, &DocSetCollector)
.unwrap()
.into_iter()
.map(|doc_address| {
let column = searcher.segment_readers()[doc_address.segment_ord as usize]
.fast_fields()
.u64("id")
.unwrap();
(doc_address, column.first(doc_address.doc_id).unwrap())
})
.collect()
}
#[test]
fn test_order_by_string() -> crate::Result<()> {
let index = make_index()?;
#[track_caller]
fn assert_query(
index: &Index,
order: Order,
doc_range: Range<usize>,
expected: Vec<(Option<String>, u64)>,
) -> crate::Result<()> {
let searcher = index.reader()?.searcher();
let ids = id_mapping(&searcher);
// Try as primitive.
let top_collector = TopDocs::for_doc_range(doc_range)
.order_by((SortByString::for_field("city"), order));
let actual = searcher
.search(&AllQuery, &top_collector)?
.into_iter()
.map(|(sort_key_opt, doc)| (sort_key_opt, ids[&doc]))
.collect::<Vec<_>>();
assert_eq!(actual, expected);
Ok(())
}
assert_query(
&index,
Order::Asc,
0..4,
vec![
(Some("austin".to_owned()), 0),
(Some("greenville".to_owned()), 1),
(Some("tokyo".to_owned()), 2),
(None, 3),
],
)?;
assert_query(
&index,
Order::Asc,
0..3,
vec![
(Some("austin".to_owned()), 0),
(Some("greenville".to_owned()), 1),
(Some("tokyo".to_owned()), 2),
],
)?;
assert_query(
&index,
Order::Asc,
0..2,
vec![
(Some("austin".to_owned()), 0),
(Some("greenville".to_owned()), 1),
],
)?;
assert_query(
&index,
Order::Asc,
0..1,
vec![(Some("austin".to_string()), 0)],
)?;
assert_query(
&index,
Order::Asc,
1..3,
vec![
(Some("greenville".to_owned()), 1),
(Some("tokyo".to_owned()), 2),
],
)?;
assert_query(
&index,
Order::Desc,
0..4,
vec![
(Some("tokyo".to_owned()), 2),
(Some("greenville".to_owned()), 1),
(Some("austin".to_owned()), 0),
(None, 3),
],
)?;
assert_query(
&index,
Order::Desc,
1..3,
vec![
(Some("greenville".to_owned()), 1),
(Some("austin".to_owned()), 0),
],
)?;
assert_query(
&index,
Order::Desc,
0..1,
vec![(Some("tokyo".to_owned()), 2)],
)?;
Ok(())
}
#[test]
fn test_order_by_f64() -> crate::Result<()> {
let index = make_index()?;
fn assert_query(
index: &Index,
order: Order,
expected: Vec<(Option<f64>, u64)>,
) -> crate::Result<()> {
let searcher = index.reader()?.searcher();
let ids = id_mapping(&searcher);
// Try as primitive.
let top_collector = TopDocs::with_limit(3)
.order_by((SortByStaticFastValue::<f64>::for_field("altitude"), order));
let actual = searcher
.search(&AllQuery, &top_collector)?
.into_iter()
.map(|(altitude_opt, doc)| (altitude_opt, ids[&doc]))
.collect::<Vec<_>>();
assert_eq!(actual, expected);
Ok(())
}
assert_query(
&index,
Order::Asc,
vec![(Some(0.0), 3), (Some(27.0), 1), (Some(40.0), 2)],
)?;
assert_query(
&index,
Order::Desc,
vec![(Some(149.0), 0), (Some(40.0), 2), (Some(27.0), 1)],
)?;
Ok(())
}
#[test]
fn test_order_by_score() -> crate::Result<()> {
let index = make_index()?;
fn query(index: &Index, order: Order) -> crate::Result<Vec<(Score, u64)>> {
let searcher = index.reader()?.searcher();
let ids = id_mapping(&searcher);
let top_collector = TopDocs::with_limit(4).order_by((SortBySimilarityScore, order));
let field = index.schema().get_field("catchphrase").unwrap();
let query_parser = QueryParser::for_index(index, vec![field]);
let text_query = query_parser.parse_query("glow")?;
Ok(searcher
.search(&text_query, &top_collector)?
.into_iter()
.map(|(score, doc)| (score, ids[&doc]))
.collect())
}
assert_eq!(
&query(&index, Order::Desc)?,
&[(0.5604893, 2), (0.4904281, 1), (0.35667497, 0),]
);
assert_eq!(
&query(&index, Order::Asc)?,
&[(0.35667497, 0), (0.4904281, 1), (0.5604893, 2),]
);
Ok(())
}
#[test]
fn test_order_by_score_then_string() -> crate::Result<()> {
let index = make_index()?;
type SortKey = (Score, Option<String>);
fn query(
index: &Index,
score_order: Order,
city_order: Order,
) -> crate::Result<Vec<(SortKey, u64)>> {
let searcher = index.reader()?.searcher();
let ids = id_mapping(&searcher);
let top_collector = TopDocs::with_limit(4).order_by((
(SortBySimilarityScore, score_order),
(SortByString::for_field("city"), city_order),
));
let results: Vec<((Score, Option<String>), DocAddress)> =
searcher.search(&AllQuery, &top_collector)?;
Ok(results.into_iter().map(|(f, doc)| (f, ids[&doc])).collect())
}
assert_eq!(
&query(&index, Order::Asc, Order::Asc)?,
&[
((1.0, Some("austin".to_owned())), 0),
((1.0, Some("greenville".to_owned())), 1),
((1.0, Some("tokyo".to_owned())), 2),
((1.0, None), 3),
]
);
assert_eq!(
&query(&index, Order::Asc, Order::Desc)?,
&[
((1.0, Some("tokyo".to_owned())), 2),
((1.0, Some("greenville".to_owned())), 1),
((1.0, Some("austin".to_owned())), 0),
((1.0, None), 3),
]
);
Ok(())
}
#[test]
fn test_order_by_score_then_owned_value() -> crate::Result<()> {
let index = make_index()?;
type SortKey = (Score, OwnedValue);
fn query(
index: &Index,
score_order: Order,
city_order: Order,
) -> crate::Result<Vec<(SortKey, u64)>> {
let searcher = index.reader()?.searcher();
let ids = id_mapping(&searcher);
let top_collector = TopDocs::with_limit(4).order_by::<(Score, OwnedValue)>((
(SortBySimilarityScore, score_order),
(SortByErasedType::for_field("city"), city_order),
));
let results: Vec<((Score, OwnedValue), DocAddress)> =
searcher.search(&AllQuery, &top_collector)?;
Ok(results.into_iter().map(|(f, doc)| (f, ids[&doc])).collect())
}
assert_eq!(
&query(&index, Order::Asc, Order::Asc)?,
&[
((1.0, OwnedValue::Str("austin".to_owned())), 0),
((1.0, OwnedValue::Str("greenville".to_owned())), 1),
((1.0, OwnedValue::Str("tokyo".to_owned())), 2),
((1.0, OwnedValue::Null), 3),
]
);
assert_eq!(
&query(&index, Order::Asc, Order::Desc)?,
&[
((1.0, OwnedValue::Str("tokyo".to_owned())), 2),
((1.0, OwnedValue::Str("greenville".to_owned())), 1),
((1.0, OwnedValue::Str("austin".to_owned())), 0),
((1.0, OwnedValue::Null), 3),
]
);
Ok(())
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_order_by_string_prop(
order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
limit in 1..64_usize,
offset in 0..64_usize,
segments_terms in
proptest::collection::vec(
proptest::collection::vec(0..32_u8, 1..32_usize),
0..8_usize,
)
) {
let mut schema_builder = Schema::builder();
let city = schema_builder.add_text_field("city", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
// A Vec<Vec<u8>>, where the outer Vec represents segments, and the inner Vec
// represents terms.
for segment_terms in segments_terms.into_iter() {
for term in segment_terms.into_iter() {
let term = format!("{term:0>3}");
index_writer.add_document(doc!(
city => term,
))?;
}
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let top_n_results = searcher.search(&AllQuery, &TopDocs::with_limit(limit)
.and_offset(offset)
.order_by_string_fast_field("city", order))?;
let all_results = searcher.search(&AllQuery, &DocSetCollector)?.into_iter().map(|doc_address| {
// Get the term for this address.
let column = searcher.segment_readers()[doc_address.segment_ord as usize].fast_fields().str("city").unwrap().unwrap();
let value = column.term_ords(doc_address.doc_id).next().map(|term_ord| {
let mut city = Vec::new();
column.dictionary().ord_to_term(term_ord, &mut city).unwrap();
String::from_utf8(city).unwrap()
});
(value, doc_address)
});
// Using the TopDocs collector should always be equivalent to sorting, skipping the
// offset, and then taking the limit.
let sorted_docs: Vec<_> = {
let mut comparable_docs: Vec<ComparableDoc<_, _>> =
all_results.into_iter().map(|(sort_key, doc)| ComparableDoc { sort_key, doc}).collect();
sort_hits(&mut comparable_docs, order);
comparable_docs.into_iter().map(|cd| (cd.sort_key, cd.doc)).collect()
};
let expected_docs = sorted_docs.into_iter().skip(offset).take(limit).collect::<Vec<_>>();
prop_assert_eq!(
expected_docs,
top_n_results
);
}
}
}

View File

@@ -1,567 +0,0 @@
use std::cmp::Ordering;
use columnar::MonotonicallyMappableToU64;
use serde::{Deserialize, Serialize};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::schema::{OwnedValue, Schema};
use crate::{DocId, Order, Score};
fn compare_owned_value<const NULLS_FIRST: bool>(lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
match (lhs, rhs) {
(OwnedValue::Null, OwnedValue::Null) => Ordering::Equal,
(OwnedValue::Null, _) => {
if NULLS_FIRST {
Ordering::Less
} else {
Ordering::Greater
}
}
(_, OwnedValue::Null) => {
if NULLS_FIRST {
Ordering::Greater
} else {
Ordering::Less
}
}
(OwnedValue::Str(a), OwnedValue::Str(b)) => a.cmp(b),
(OwnedValue::PreTokStr(a), OwnedValue::PreTokStr(b)) => a.cmp(b),
(OwnedValue::U64(a), OwnedValue::U64(b)) => a.cmp(b),
(OwnedValue::I64(a), OwnedValue::I64(b)) => a.cmp(b),
(OwnedValue::F64(a), OwnedValue::F64(b)) => a.to_u64().cmp(&b.to_u64()),
(OwnedValue::Bool(a), OwnedValue::Bool(b)) => a.cmp(b),
(OwnedValue::Date(a), OwnedValue::Date(b)) => a.cmp(b),
(OwnedValue::Facet(a), OwnedValue::Facet(b)) => a.cmp(b),
(OwnedValue::Bytes(a), OwnedValue::Bytes(b)) => a.cmp(b),
(OwnedValue::IpAddr(a), OwnedValue::IpAddr(b)) => a.cmp(b),
(OwnedValue::U64(a), OwnedValue::I64(b)) => {
if *b < 0 {
Ordering::Greater
} else {
a.cmp(&(*b as u64))
}
}
(OwnedValue::I64(a), OwnedValue::U64(b)) => {
if *a < 0 {
Ordering::Less
} else {
(*a as u64).cmp(b)
}
}
(OwnedValue::U64(a), OwnedValue::F64(b)) => (*a as f64).to_u64().cmp(&b.to_u64()),
(OwnedValue::F64(a), OwnedValue::U64(b)) => a.to_u64().cmp(&(*b as f64).to_u64()),
(OwnedValue::I64(a), OwnedValue::F64(b)) => (*a as f64).to_u64().cmp(&b.to_u64()),
(OwnedValue::F64(a), OwnedValue::I64(b)) => a.to_u64().cmp(&(*b as f64).to_u64()),
(a, b) => {
let ord = a.discriminant_value().cmp(&b.discriminant_value());
// If the discriminant is equal, it's because a new type was added, but hasn't been
// included in this `match` statement.
assert!(
ord != Ordering::Equal,
"Unimplemented comparison for type of {a:?}, {b:?}"
);
ord
}
}
}
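The mixed `U64`/`I64` arms implement sign-aware integer comparison: a negative `i64` sorts below any `u64`, otherwise both values are compared in `u64` space. Extracted and tested in isolation:

```rust
use std::cmp::Ordering;

// Compare a u64 against an i64 without overflow: negatives lose outright.
fn cmp_u64_i64(a: u64, b: i64) -> Ordering {
    if b < 0 {
        Ordering::Greater
    } else {
        a.cmp(&(b as u64))
    }
}

fn main() {
    assert_eq!(cmp_u64_i64(0, -1), Ordering::Greater);
    assert_eq!(cmp_u64_i64(3, 3), Ordering::Equal);
    assert_eq!(cmp_u64_i64(2, 5), Ordering::Less);
}
```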
/// Comparator trait defining the order in which documents should be ordered.
pub trait Comparator<T>: Send + Sync + std::fmt::Debug + Default {
/// Return the order between two values.
fn compare(&self, lhs: &T, rhs: &T) -> Ordering;
}
/// Compare values naturally (e.g. 1 < 2).
///
/// When used with `TopDocs`, which reverses the order, this results in a
/// "Descending" sort (Greatest values first).
///
/// `None` (or Null for `OwnedValue`) values are considered to be smaller than any other value,
/// and will therefore appear last in a descending sort (e.g. `[Some(20), Some(10), None]`).
#[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
pub struct NaturalComparator;
impl<T: PartialOrd> Comparator<T> for NaturalComparator {
#[inline(always)]
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
lhs.partial_cmp(rhs).unwrap_or(Ordering::Equal)
}
}
/// A (partial) implementation of comparison for OwnedValue.
///
/// Intended for use within columns of homogeneous types, and so will panic for OwnedValues with
/// mismatched types. The one exception is Null, for which we do define all comparisons.
impl Comparator<OwnedValue> for NaturalComparator {
#[inline(always)]
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ true>(lhs, rhs)
}
}
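To make the mixed-type rules above concrete, here is a minimal sketch (a hypothetical demo function; it relies only on the impls shown in this file):
use std::cmp::Ordering;
fn demo_mixed_numeric_compare() {
    let nc = NaturalComparator;
    // A negative i64 sorts below any u64 (see the (I64, U64) arm above).
    assert_eq!(nc.compare(&OwnedValue::I64(-1), &OwnedValue::U64(0)), Ordering::Less);
    // U64 and F64 are compared through the monotonic u64 mapping of f64.
    assert_eq!(nc.compare(&OwnedValue::U64(11), &OwnedValue::F64(10.5)), Ordering::Greater);
    // Null sorts below any value under the natural order (NULLS_FIRST = true).
    assert_eq!(nc.compare(&OwnedValue::Null, &OwnedValue::U64(0)), Ordering::Less);
}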
/// Compare values in reverse (e.g. 2 < 1).
///
/// When used with `TopDocs`, which reverses the order, this results in an
/// "Ascending" sort (Smallest values first).
///
/// `None` is considered smaller than `Some` in the underlying comparator, but because the
/// comparison is reversed, `None` is effectively treated as the lowest value in the resulting
/// Ascending sort (e.g. `[None, Some(10), Some(20)]`).
///
/// The ReverseComparator does not necessarily imply that the sort order is reversed compared
/// to the NaturalComparator. In presence of a tie on the sort key, documents will always be
/// sorted by ascending `DocId`/`DocAddress` in TopN results, regardless of the sort key's order.
#[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
pub struct ReverseComparator;
impl<T> Comparator<T> for ReverseComparator
where NaturalComparator: Comparator<T>
{
#[inline(always)]
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
NaturalComparator.compare(rhs, lhs)
}
}
/// Compare values in reverse, but treating `None` as lower than `Some`.
///
/// When used with `TopDocs`, which reverses the order, this results in an
/// "Ascending" sort (Smallest values first), but with `None` values appearing last
/// (e.g. `[Some(10), Some(20), None]`).
///
/// This is usually what is wanted when sorting by a field in an ascending order.
/// For instance, in an e-commerce website, if sorting by price ascending,
/// the cheapest items would appear first, and items without a price would appear last.
#[derive(Debug, Copy, Clone, Default)]
pub struct ReverseNoneIsLowerComparator;
impl<T> Comparator<Option<T>> for ReverseNoneIsLowerComparator
where ReverseComparator: Comparator<T>
{
#[inline(always)]
fn compare(&self, lhs_opt: &Option<T>, rhs_opt: &Option<T>) -> Ordering {
match (lhs_opt, rhs_opt) {
(None, None) => Ordering::Equal,
(None, Some(_)) => Ordering::Less,
(Some(_), None) => Ordering::Greater,
(Some(lhs), Some(rhs)) => ReverseComparator.compare(lhs, rhs),
}
}
}
impl Comparator<u32> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
}
impl Comparator<u64> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
}
impl Comparator<f64> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
}
impl Comparator<f32> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
}
impl Comparator<i64> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
}
impl Comparator<String> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
ReverseComparator.compare(lhs, rhs)
}
}
impl Comparator<OwnedValue> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ false>(rhs, lhs)
}
}
/// Compare values naturally, but treating `None` as higher than `Some`.
///
/// When used with `TopDocs`, which reverses the order, this results in a
/// "Descending" sort (Greatest values first), but with `None` values appearing first
/// (e.g. `[None, Some(20), Some(10)]`).
#[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)]
pub struct NaturalNoneIsHigherComparator;
impl<T> Comparator<Option<T>> for NaturalNoneIsHigherComparator
where NaturalComparator: Comparator<T>
{
#[inline(always)]
fn compare(&self, lhs_opt: &Option<T>, rhs_opt: &Option<T>) -> Ordering {
match (lhs_opt, rhs_opt) {
(None, None) => Ordering::Equal,
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(Some(lhs), Some(rhs)) => NaturalComparator.compare(lhs, rhs),
}
}
}
impl Comparator<u32> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &u32, rhs: &u32) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
}
impl Comparator<u64> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &u64, rhs: &u64) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
}
impl Comparator<f64> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &f64, rhs: &f64) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
}
impl Comparator<f32> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &f32, rhs: &f32) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
}
impl Comparator<i64> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &i64, rhs: &i64) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
}
impl Comparator<String> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &String, rhs: &String) -> Ordering {
NaturalComparator.compare(lhs, rhs)
}
}
impl Comparator<OwnedValue> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ false>(lhs, rhs)
}
}
/// An enum representing the different sort orders.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Default)]
pub enum ComparatorEnum {
/// Natural order (See [NaturalComparator])
#[default]
Natural,
/// Reverse order (See [ReverseComparator])
Reverse,
    /// Reverse order by treating None as the lowest value. (See [ReverseNoneIsLowerComparator])
ReverseNoneLower,
/// Natural order but treating None as the highest value. (See [NaturalNoneIsHigherComparator])
NaturalNoneHigher,
}
impl From<Order> for ComparatorEnum {
fn from(order: Order) -> Self {
match order {
Order::Asc => ComparatorEnum::ReverseNoneLower,
Order::Desc => ComparatorEnum::Natural,
}
}
}
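As a quick sanity check of this mapping (a sketch; `ComparatorEnum` derives `PartialEq` above, so direct comparison works):
fn demo_order_mapping() {
    // Asc picks the comparator that, once TopDocs reverses it, yields an
    // ascending sort with missing values last.
    assert_eq!(ComparatorEnum::from(Order::Asc), ComparatorEnum::ReverseNoneLower);
    // Desc keeps the natural order, which TopDocs reverses into a descending sort.
    assert_eq!(ComparatorEnum::from(Order::Desc), ComparatorEnum::Natural);
}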
impl<T> Comparator<T> for ComparatorEnum
where
ReverseNoneIsLowerComparator: Comparator<T>,
NaturalComparator: Comparator<T>,
ReverseComparator: Comparator<T>,
NaturalNoneIsHigherComparator: Comparator<T>,
{
#[inline(always)]
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
match self {
ComparatorEnum::Natural => NaturalComparator.compare(lhs, rhs),
ComparatorEnum::Reverse => ReverseComparator.compare(lhs, rhs),
ComparatorEnum::ReverseNoneLower => ReverseNoneIsLowerComparator.compare(lhs, rhs),
ComparatorEnum::NaturalNoneHigher => NaturalNoneIsHigherComparator.compare(lhs, rhs),
}
}
}
impl<Head, Tail, LeftComparator, RightComparator> Comparator<(Head, Tail)>
for (LeftComparator, RightComparator)
where
LeftComparator: Comparator<Head>,
RightComparator: Comparator<Tail>,
{
#[inline(always)]
fn compare(&self, lhs: &(Head, Tail), rhs: &(Head, Tail)) -> Ordering {
self.0
.compare(&lhs.0, &rhs.0)
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
}
}
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, (Type2, Type3))>
for (Comparator1, Comparator2, Comparator3)
where
Comparator1: Comparator<Type1>,
Comparator2: Comparator<Type2>,
Comparator3: Comparator<Type3>,
{
#[inline(always)]
fn compare(&self, lhs: &(Type1, (Type2, Type3)), rhs: &(Type1, (Type2, Type3))) -> Ordering {
self.0
.compare(&lhs.0, &rhs.0)
.then_with(|| self.1.compare(&lhs.1 .0, &rhs.1 .0))
.then_with(|| self.2.compare(&lhs.1 .1, &rhs.1 .1))
}
}
impl<Type1, Type2, Type3, Comparator1, Comparator2, Comparator3> Comparator<(Type1, Type2, Type3)>
for (Comparator1, Comparator2, Comparator3)
where
Comparator1: Comparator<Type1>,
Comparator2: Comparator<Type2>,
Comparator3: Comparator<Type3>,
{
#[inline(always)]
fn compare(&self, lhs: &(Type1, Type2, Type3), rhs: &(Type1, Type2, Type3)) -> Ordering {
self.0
.compare(&lhs.0, &rhs.0)
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
}
}
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
Comparator<(Type1, (Type2, (Type3, Type4)))>
for (Comparator1, Comparator2, Comparator3, Comparator4)
where
Comparator1: Comparator<Type1>,
Comparator2: Comparator<Type2>,
Comparator3: Comparator<Type3>,
Comparator4: Comparator<Type4>,
{
#[inline(always)]
fn compare(
&self,
lhs: &(Type1, (Type2, (Type3, Type4))),
rhs: &(Type1, (Type2, (Type3, Type4))),
) -> Ordering {
self.0
.compare(&lhs.0, &rhs.0)
.then_with(|| self.1.compare(&lhs.1 .0, &rhs.1 .0))
.then_with(|| self.2.compare(&lhs.1 .1 .0, &rhs.1 .1 .0))
.then_with(|| self.3.compare(&lhs.1 .1 .1, &rhs.1 .1 .1))
}
}
impl<Type1, Type2, Type3, Type4, Comparator1, Comparator2, Comparator3, Comparator4>
Comparator<(Type1, Type2, Type3, Type4)>
for (Comparator1, Comparator2, Comparator3, Comparator4)
where
Comparator1: Comparator<Type1>,
Comparator2: Comparator<Type2>,
Comparator3: Comparator<Type3>,
Comparator4: Comparator<Type4>,
{
#[inline(always)]
fn compare(
&self,
lhs: &(Type1, Type2, Type3, Type4),
rhs: &(Type1, Type2, Type3, Type4),
) -> Ordering {
self.0
.compare(&lhs.0, &rhs.0)
.then_with(|| self.1.compare(&lhs.1, &rhs.1))
.then_with(|| self.2.compare(&lhs.2, &rhs.2))
.then_with(|| self.3.compare(&lhs.3, &rhs.3))
}
}
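A short sketch of how these tuple impls compose comparators lexicographically (hypothetical values, using only the pair impl above):
fn demo_lexicographic_pair() {
    let cmp = (NaturalComparator, ReverseComparator);
    // A difference on the first component short-circuits the second.
    assert_eq!(cmp.compare(&(0u64, 10u64), &(1u64, 0u64)), Ordering::Less);
    // On a first-component tie, the reversed second component decides.
    assert_eq!(cmp.compare(&(1u64, 10u64), &(1u64, 20u64)), Ordering::Greater);
}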
impl<TSortKeyComputer> SortKeyComputer for (TSortKeyComputer, ComparatorEnum)
where
TSortKeyComputer: SortKeyComputer,
ComparatorEnum: Comparator<TSortKeyComputer::SortKey>,
ComparatorEnum: Comparator<
<<TSortKeyComputer as SortKeyComputer>::Child as SegmentSortKeyComputer>::SegmentSortKey,
>,
{
type SortKey = TSortKeyComputer::SortKey;
type Child = SegmentSortKeyComputerWithComparator<TSortKeyComputer::Child, Self::Comparator>;
type Comparator = ComparatorEnum;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
}
fn comparator(&self) -> Self::Comparator {
self.1
}
fn segment_sort_key_computer(
&self,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let child = self.0.segment_sort_key_computer(segment_reader)?;
Ok(SegmentSortKeyComputerWithComparator {
segment_sort_key_computer: child,
comparator: self.comparator(),
})
}
}
impl<TSortKeyComputer> SortKeyComputer for (TSortKeyComputer, Order)
where
TSortKeyComputer: SortKeyComputer,
ComparatorEnum: Comparator<TSortKeyComputer::SortKey>,
ComparatorEnum: Comparator<
<<TSortKeyComputer as SortKeyComputer>::Child as SegmentSortKeyComputer>::SegmentSortKey,
>,
{
type SortKey = TSortKeyComputer::SortKey;
type Child = SegmentSortKeyComputerWithComparator<TSortKeyComputer::Child, Self::Comparator>;
type Comparator = ComparatorEnum;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
}
fn comparator(&self) -> Self::Comparator {
self.1.into()
}
fn segment_sort_key_computer(
&self,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let child = self.0.segment_sort_key_computer(segment_reader)?;
Ok(SegmentSortKeyComputerWithComparator {
segment_sort_key_computer: child,
comparator: self.comparator(),
})
}
}
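In practice, the `(SortKeyComputer, Order)` impl just shown lets callers pair any sort key computer with a plain `Order`. A hedged sketch (the `price` field name and import paths are assumptions; `TopDocs::order_by` is used the same way in the tests elsewhere in this change):
// Ascending by the `price` u64 fast field; documents without a price come
// last, because Order::Asc maps to ComparatorEnum::ReverseNoneLower.
let collector = TopDocs::with_limit(10)
    .order_by((SortByStaticFastValue::<u64>::for_field("price"), Order::Asc));
let hits: Vec<(Option<u64>, DocAddress)> = searcher.search(&query, &collector)?;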
/// A segment sort key computer with a custom ordering.
pub struct SegmentSortKeyComputerWithComparator<TSegmentSortKeyComputer, TComparator> {
segment_sort_key_computer: TSegmentSortKeyComputer,
comparator: TComparator,
}
impl<TSegmentSortKeyComputer, TSegmentSortKey, TComparator> SegmentSortKeyComputer
for SegmentSortKeyComputerWithComparator<TSegmentSortKeyComputer, TComparator>
where
TSegmentSortKeyComputer: SegmentSortKeyComputer<SegmentSortKey = TSegmentSortKey>,
TSegmentSortKey: Clone + 'static + Sync + Send,
TComparator: Comparator<TSegmentSortKey> + 'static + Sync + Send,
{
type SortKey = TSegmentSortKeyComputer::SortKey;
type SegmentSortKey = TSegmentSortKey;
type SegmentComparator = TComparator;
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
self.segment_sort_key_computer.segment_sort_key(doc, score)
}
#[inline(always)]
fn compare_segment_sort_key(
&self,
left: &Self::SegmentSortKey,
right: &Self::SegmentSortKey,
) -> Ordering {
self.comparator.compare(left, right)
}
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
self.segment_sort_key_computer
.convert_segment_sort_key(sort_key)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::OwnedValue;
#[test]
fn test_natural_none_is_higher() {
let comp = NaturalNoneIsHigherComparator;
let null = None;
let v1 = Some(1_u64);
let v2 = Some(2_u64);
        // NaturalNoneIsHigherComparator logic:
// 1. Delegates to NaturalComparator for non-nulls.
// NaturalComparator compare(2, 1) -> 2.cmp(1) -> Greater.
assert_eq!(comp.compare(&v2, &v1), Ordering::Greater);
// 2. Treats None (Null) as Greater than any value.
// compare(None, Some(2)) should be Greater.
assert_eq!(comp.compare(&null, &v2), Ordering::Greater);
// compare(Some(1), None) should be Less.
assert_eq!(comp.compare(&v1, &null), Ordering::Less);
// compare(None, None) should be Equal.
assert_eq!(comp.compare(&null, &null), Ordering::Equal);
}
#[test]
fn test_mixed_ownedvalue_compare() {
let u = OwnedValue::U64(10);
let i = OwnedValue::I64(10);
let f = OwnedValue::F64(10.0);
let nc = NaturalComparator;
assert_eq!(nc.compare(&u, &i), Ordering::Equal);
assert_eq!(nc.compare(&u, &f), Ordering::Equal);
assert_eq!(nc.compare(&i, &f), Ordering::Equal);
let u2 = OwnedValue::U64(11);
assert_eq!(nc.compare(&u2, &f), Ordering::Greater);
let s = OwnedValue::Str("a".to_string());
// Str < U64
assert_eq!(nc.compare(&s, &u), Ordering::Less);
// Str < I64
assert_eq!(nc.compare(&s, &i), Ordering::Less);
// Str < F64
assert_eq!(nc.compare(&s, &f), Ordering::Less);
}
}

View File

@@ -1,361 +0,0 @@
use columnar::{ColumnType, MonotonicallyMappableToU64};
use crate::collector::sort_key::{
NaturalComparator, SortBySimilarityScore, SortByStaticFastValue, SortByString,
};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::OwnedValue;
use crate::{DateTime, DocId, Score};
/// Sort by the boxed / OwnedValue representation of either a fast field, or of the score.
///
/// Using the OwnedValue representation allows for type erasure, and can be useful when sort orders
/// are not known until runtime. But it comes with a performance cost: wherever possible, prefer to
/// use a SortKeyComputer implementation whose type is known at compile time.
#[derive(Debug, Clone)]
pub enum SortByErasedType {
/// Sort by a fast field
Field(String),
/// Sort by score
Score,
}
impl SortByErasedType {
/// Creates a new sort key computer which will sort by the given fast field column, with type
/// erasure.
pub fn for_field(column_name: impl ToString) -> Self {
Self::Field(column_name.to_string())
}
/// Creates a new sort key computer which will sort by score, with type erasure.
pub fn for_score() -> Self {
Self::Score
}
}
trait ErasedSegmentSortKeyComputer: Send + Sync {
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64>;
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue;
}
struct ErasedSegmentSortKeyComputerWrapper<C, F> {
inner: C,
converter: F,
}
impl<C, F> ErasedSegmentSortKeyComputer for ErasedSegmentSortKeyComputerWrapper<C, F>
where
C: SegmentSortKeyComputer<SegmentSortKey = Option<u64>> + Send + Sync,
F: Fn(C::SortKey) -> OwnedValue + Send + Sync + 'static,
{
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
self.inner.segment_sort_key(doc, score)
}
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
let val = self.inner.convert_segment_sort_key(sort_key);
(self.converter)(val)
}
}
struct ScoreSegmentSortKeyComputer {
segment_computer: SortBySimilarityScore,
}
impl ErasedSegmentSortKeyComputer for ScoreSegmentSortKeyComputer {
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
let score_value: f64 = self.segment_computer.segment_sort_key(doc, score).into();
Some(score_value.to_u64())
}
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
let score_value: u64 = sort_key.expect("This implementation always produces a score.");
OwnedValue::F64(f64::from_u64(score_value))
}
}
impl SortKeyComputer for SortByErasedType {
type SortKey = OwnedValue;
type Child = ErasedColumnSegmentSortKeyComputer;
type Comparator = NaturalComparator;
fn requires_scoring(&self) -> bool {
matches!(self, Self::Score)
}
fn segment_sort_key_computer(
&self,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let inner: Box<dyn ErasedSegmentSortKeyComputer> = match self {
Self::Field(column_name) => {
let fast_fields = segment_reader.fast_fields();
// TODO: We currently double-open the column to avoid relying on the implementation
// details of `SortByString` or `SortByStaticFastValue`. Once
// https://github.com/quickwit-oss/tantivy/issues/2776 is resolved, we should
// consider directly constructing the appropriate `SegmentSortKeyComputer` type for
// the column that we open here.
let (_column, column_type) =
fast_fields.u64_lenient(column_name)?.ok_or_else(|| {
FastFieldNotAvailableError {
field_name: column_name.to_owned(),
}
})?;
match column_type {
ColumnType::Str => {
let computer = SortByString::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<String>| {
val.map(OwnedValue::Str).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::U64 => {
let computer = SortByStaticFastValue::<u64>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<u64>| {
val.map(OwnedValue::U64).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::I64 => {
let computer = SortByStaticFastValue::<i64>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<i64>| {
val.map(OwnedValue::I64).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::F64 => {
let computer = SortByStaticFastValue::<f64>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<f64>| {
val.map(OwnedValue::F64).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::Bool => {
let computer = SortByStaticFastValue::<bool>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<bool>| {
val.map(OwnedValue::Bool).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::DateTime => {
let computer = SortByStaticFastValue::<DateTime>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<DateTime>| {
val.map(OwnedValue::Date).unwrap_or(OwnedValue::Null)
},
})
}
column_type => {
return Err(crate::TantivyError::SchemaError(format!(
"Field `{}` is of type {column_type:?}, which is not supported for \
sorting by owned value yet.",
column_name
)))
}
}
}
Self::Score => Box::new(ScoreSegmentSortKeyComputer {
segment_computer: SortBySimilarityScore,
}),
};
Ok(ErasedColumnSegmentSortKeyComputer { inner })
}
}
pub struct ErasedColumnSegmentSortKeyComputer {
inner: Box<dyn ErasedSegmentSortKeyComputer>,
}
impl SegmentSortKeyComputer for ErasedColumnSegmentSortKeyComputer {
type SortKey = OwnedValue;
type SegmentSortKey = Option<u64>;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
self.inner.segment_sort_key(doc, score)
}
fn convert_segment_sort_key(&self, segment_sort_key: Self::SegmentSortKey) -> OwnedValue {
self.inner.convert_segment_sort_key(segment_sort_key)
}
}
#[cfg(test)]
mod tests {
use crate::collector::sort_key::{ComparatorEnum, SortByErasedType};
use crate::collector::TopDocs;
use crate::query::AllQuery;
use crate::schema::{OwnedValue, Schema, FAST, TEXT};
use crate::Index;
#[test]
fn test_sort_by_owned_u64() {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(id_field => 10u64)).unwrap();
writer.add_document(doc!(id_field => 2u64)).unwrap();
writer.add_document(doc!()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let collector = TopDocs::with_limit(10)
.order_by((SortByErasedType::for_field("id"), ComparatorEnum::Natural));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![OwnedValue::U64(10), OwnedValue::U64(2), OwnedValue::Null]
);
let collector = TopDocs::with_limit(10).order_by((
SortByErasedType::for_field("id"),
ComparatorEnum::ReverseNoneLower,
));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![OwnedValue::U64(2), OwnedValue::U64(10), OwnedValue::Null]
);
}
#[test]
fn test_sort_by_owned_string() {
let mut schema_builder = Schema::builder();
let city_field = schema_builder.add_text_field("city", FAST | TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(city_field => "tokyo")).unwrap();
writer.add_document(doc!(city_field => "austin")).unwrap();
writer.add_document(doc!()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let collector = TopDocs::with_limit(10).order_by((
SortByErasedType::for_field("city"),
ComparatorEnum::ReverseNoneLower,
));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![
OwnedValue::Str("austin".to_string()),
OwnedValue::Str("tokyo".to_string()),
OwnedValue::Null
]
);
}
#[test]
fn test_sort_by_owned_reverse() {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(id_field => 10u64)).unwrap();
writer.add_document(doc!(id_field => 2u64)).unwrap();
writer.add_document(doc!()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let collector = TopDocs::with_limit(10)
.order_by((SortByErasedType::for_field("id"), ComparatorEnum::Reverse));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![OwnedValue::Null, OwnedValue::U64(2), OwnedValue::U64(10)]
);
}
#[test]
fn test_sort_by_owned_score() {
let mut schema_builder = Schema::builder();
let body_field = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(body_field => "a a")).unwrap();
writer.add_document(doc!(body_field => "a")).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = crate::query::QueryParser::for_index(&index, vec![body_field]);
let query = query_parser.parse_query("a").unwrap();
// Sort by score descending (Natural)
let collector = TopDocs::with_limit(10)
.order_by((SortByErasedType::for_score(), ComparatorEnum::Natural));
let top_docs = searcher.search(&query, &collector).unwrap();
let values: Vec<f64> = top_docs
.into_iter()
.map(|(key, _)| match key {
OwnedValue::F64(val) => val,
_ => panic!("Wrong type {key:?}"),
})
.collect();
assert_eq!(values.len(), 2);
assert!(values[0] > values[1]);
// Sort by score ascending (ReverseNoneLower)
let collector = TopDocs::with_limit(10).order_by((
SortByErasedType::for_score(),
ComparatorEnum::ReverseNoneLower,
));
let top_docs = searcher.search(&query, &collector).unwrap();
let values: Vec<f64> = top_docs
.into_iter()
.map(|(key, _)| match key {
OwnedValue::F64(val) => val,
_ => panic!("Wrong type {key:?}"),
})
.collect();
assert_eq!(values.len(), 2);
assert!(values[0] < values[1]);
}
}

View File

@@ -1,77 +0,0 @@
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer, TopNComputer};
use crate::{DocAddress, DocId, Score};
/// Sort by similarity score.
#[derive(Clone, Debug, Copy)]
pub struct SortBySimilarityScore;
impl SortKeyComputer for SortBySimilarityScore {
type SortKey = Score;
type Child = SortBySimilarityScore;
type Comparator = NaturalComparator;
fn requires_scoring(&self) -> bool {
true
}
fn segment_sort_key_computer(
&self,
_segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
Ok(SortBySimilarityScore)
}
// Sorting by score is special in that it allows for the Block-Wand optimization.
fn collect_segment_top_k(
&self,
k: usize,
weight: &dyn crate::query::Weight,
reader: &crate::SegmentReader,
segment_ord: u32,
) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
let mut top_n: TopNComputer<Score, DocId, Self::Comparator> =
TopNComputer::new_with_comparator(k, self.comparator());
if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN;
top_n.threshold = Some(threshold);
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
if alive_bitset.is_deleted(doc) {
return threshold;
}
top_n.push(score, doc);
threshold = top_n.threshold.unwrap_or(Score::MIN);
threshold
})?;
} else {
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
top_n.push(score, doc);
top_n.threshold.unwrap_or(Score::MIN)
})?;
}
Ok(top_n
.into_vec()
.into_iter()
.map(|cid| (cid.sort_key, DocAddress::new(segment_ord, cid.doc)))
.collect())
}
}
impl SegmentSortKeyComputer for SortBySimilarityScore {
type SortKey = Score;
type SegmentSortKey = Score;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, _doc: DocId, score: Score) -> Score {
score
}
fn convert_segment_sort_key(&self, score: Score) -> Score {
score
}
}
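For reference, a sketch of opting into this computer explicitly (assuming `TopDocs::order_by` accepts any `SortKeyComputer`, as in the tests elsewhere in this change):
// Equivalent to the default score-sorted TopDocs, but routed through the
// SortKeyComputer machinery; collect_segment_top_k above then enables the
// Block-WAND pruning path.
let collector = TopDocs::with_limit(10).order_by(SortBySimilarityScore);
let hits: Vec<(Score, DocAddress)> = searcher.search(&query, &collector)?;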

View File

@@ -1,96 +0,0 @@
use std::marker::PhantomData;
use columnar::Column;
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::{DocId, Score, SegmentReader};
/// Sorts by a fast value (u64, i64, f64, bool).
///
/// The field must appear explicitly in the schema, with the right type, and be declared as
/// a fast field.
///
/// If the field is multivalued, only the first value is considered.
///
/// Documents that do not have this value are still considered.
/// Their sort key will simply be `None`.
#[derive(Debug, Clone)]
pub struct SortByStaticFastValue<T: FastValue> {
field: String,
typ: PhantomData<T>,
}
impl<T: FastValue> SortByStaticFastValue<T> {
/// Creates a new `SortByStaticFastValue` instance for the given field.
pub fn for_field(column_name: impl ToString) -> SortByStaticFastValue<T> {
Self {
field: column_name.to_string(),
typ: PhantomData,
}
}
}
impl<T: FastValue> SortKeyComputer for SortByStaticFastValue<T> {
type Child = SortByFastValueSegmentSortKeyComputer<T>;
type SortKey = Option<T>;
type Comparator = NaturalComparator;
fn check_schema(&self, schema: &crate::schema::Schema) -> crate::Result<()> {
// At the segment sort key computer level, we rely on the u64 representation.
// The mapping is monotonic, so it is sufficient to compute our top-K docs.
let field = schema.get_field(&self.field)?;
let field_entry = schema.get_field_entry(field);
if !field_entry.is_fast() {
return Err(crate::TantivyError::SchemaError(format!(
"Field `{}` is not a fast field.",
self.field,
)));
}
let schema_type = field_entry.field_type().value_type();
if schema_type != T::to_type() {
return Err(crate::TantivyError::SchemaError(format!(
"Field `{}` is of type {schema_type:?}, not of the type {:?}.",
&self.field,
T::to_type()
)));
}
Ok(())
}
fn segment_sort_key_computer(
&self,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
let (sort_column, _sort_column_type) =
sort_column_opt.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?;
Ok(SortByFastValueSegmentSortKeyComputer {
sort_column,
typ: PhantomData,
})
}
}
pub struct SortByFastValueSegmentSortKeyComputer<T> {
sort_column: Column<u64>,
typ: PhantomData<T>,
}
impl<T: FastValue> SegmentSortKeyComputer for SortByFastValueSegmentSortKeyComputer<T> {
type SortKey = Option<T>;
type SegmentSortKey = Option<u64>;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Self::SegmentSortKey {
self.sort_column.first(doc)
}
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
sort_key.map(T::from_u64)
}
}
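A minimal usage sketch (the `timestamp` field name is an assumption; the field must be declared as an i64 fast field for `check_schema` to pass):
// Highest timestamp first: the NaturalComparator orders Option<i64> with
// None smallest, and TopDocs reverses that into a descending sort with
// missing values last.
let collector = TopDocs::with_limit(5)
    .order_by(SortByStaticFastValue::<i64>::for_field("timestamp"));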

View File

@@ -1,72 +0,0 @@
use columnar::StrColumn;
use crate::collector::sort_key::NaturalComparator;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::termdict::TermOrdinal;
use crate::{DocId, Score};
/// Sort by the first value of a string column.
///
/// The string can be dynamic (coming from a JSON field)
/// or static (being specifically defined in the configuration).
///
/// If the field is multivalued, only the first value is considered.
///
/// Documents that do not have this value are still considered.
/// Their sort key will simply be `None`.
#[derive(Debug, Clone)]
pub struct SortByString {
column_name: String,
}
impl SortByString {
/// Creates a new sort by string sort key computer.
pub fn for_field(column_name: impl ToString) -> Self {
SortByString {
column_name: column_name.to_string(),
}
}
}
impl SortKeyComputer for SortByString {
type SortKey = Option<String>;
type Child = ByStringColumnSegmentSortKeyComputer;
type Comparator = NaturalComparator;
fn segment_sort_key_computer(
&self,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let str_column_opt = segment_reader.fast_fields().str(&self.column_name)?;
Ok(ByStringColumnSegmentSortKeyComputer { str_column_opt })
}
}
pub struct ByStringColumnSegmentSortKeyComputer {
str_column_opt: Option<StrColumn>,
}
impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
type SortKey = Option<String>;
type SegmentSortKey = Option<TermOrdinal>;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Option<TermOrdinal> {
let str_column = self.str_column_opt.as_ref()?;
str_column.ords().first(doc)
}
fn convert_segment_sort_key(&self, term_ord_opt: Option<TermOrdinal>) -> Option<String> {
// TODO: Individual lookups to the dictionary like this are very likely to repeatedly
// decompress the same blocks. See https://github.com/quickwit-oss/tantivy/issues/2776
let term_ord = term_ord_opt?;
let str_column = self.str_column_opt.as_ref()?;
let mut bytes = Vec::new();
str_column
.dictionary()
.ord_to_term(term_ord, &mut bytes)
.ok()?;
String::try_from(bytes).ok()
}
}
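A usage sketch (the `city` field name is an assumption; it must be a string fast field). Note that the segment-level key is a cheap `TermOrdinal`; the `Option<String>` is only materialized for the final top-K:
// Ascending by city name, documents without a city last (Order::Asc).
let collector = TopDocs::with_limit(10)
    .order_by((SortByString::for_field("city"), Order::Asc));
let hits: Vec<(Option<String>, DocAddress)> = searcher.search(&query, &collector)?;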

View File

@@ -1,643 +0,0 @@
use std::cmp::Ordering;
use crate::collector::sort_key::{Comparator, NaturalComparator};
use crate::collector::sort_key_top_collector::TopBySortKeySegmentCollector;
use crate::collector::{default_collect_segment_impl, SegmentCollector as _, TopNComputer};
use crate::schema::Schema;
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
/// A `SegmentSortKeyComputer` makes it possible to modify the default score
/// for a given document belonging to a specific segment.
///
/// It is the segment-local version of the [`SortKeyComputer`].
pub trait SegmentSortKeyComputer: 'static {
    /// The final sort key being emitted.
type SortKey: 'static + Send + Sync + Clone;
    /// Sort key used at the segment level by the `SegmentSortKeyComputer`.
    ///
    /// It is typically small (e.g. a `u64`), and is meant to be converted
    /// to the final sort key at the end of the segment's collection.
    type SegmentSortKey: 'static + Clone + Send + Sync;
/// Comparator type.
type SegmentComparator: Comparator<Self::SegmentSortKey> + 'static;
/// Returns the segment sort key comparator.
fn segment_comparator(&self) -> Self::SegmentComparator {
Self::SegmentComparator::default()
}
/// Computes the sort key for the given document and score.
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey;
/// Computes the sort key and pushes the document in a TopN Computer.
///
/// When using a tuple as the sorting key, the sort key is evaluated in a lazy manner.
#[inline(always)]
fn compute_sort_key_and_collect<C: Comparator<Self::SegmentSortKey>>(
&mut self,
doc: DocId,
score: Score,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
) {
let sort_key = self.segment_sort_key(doc, score);
top_n_computer.push(sort_key, doc);
}
/// A SegmentSortKeyComputer maps to a SegmentSortKey, but it can also decide on
/// its ordering.
///
/// This method must be consistent with the `SortKey` ordering.
#[inline(always)]
fn compare_segment_sort_key(
&self,
left: &Self::SegmentSortKey,
right: &Self::SegmentSortKey,
) -> Ordering {
self.segment_comparator().compare(left, right)
}
/// Implementing this method makes it possible to avoid computing
/// a sort_key entirely if we can assess that it won't pass a threshold
/// with a partial computation.
///
/// This is currently used for lexicographic sorting.
fn accept_sort_key_lazy(
&mut self,
doc_id: DocId,
score: Score,
threshold: &Self::SegmentSortKey,
) -> Option<(Ordering, Self::SegmentSortKey)> {
let sort_key = self.segment_sort_key(doc_id, score);
let cmp = self.compare_segment_sort_key(&sort_key, threshold);
if cmp == Ordering::Less {
None
} else {
Some((cmp, sort_key))
}
}
/// Convert a segment level sort key into the global sort key.
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey;
}
/// `SortKeyComputer` defines the sort key to be used by a TopK Collector.
///
/// The `SortKeyComputer` itself does not perform much of the computation.
/// Instead, it helps construct `Self::Child` instances that will compute
/// the sort key at a segment scale.
pub trait SortKeyComputer: Sync {
/// The sort key type.
type SortKey: 'static + Send + Sync + Clone + std::fmt::Debug;
/// Type of the associated [`SegmentSortKeyComputer`].
type Child: SegmentSortKeyComputer<SortKey = Self::SortKey>;
/// Comparator type.
type Comparator: Comparator<Self::SortKey>
+ Comparator<<Self::Child as SegmentSortKeyComputer>::SegmentSortKey>
+ 'static;
/// Checks whether the schema is compatible with the sort key computer.
fn check_schema(&self, _schema: &Schema) -> crate::Result<()> {
Ok(())
}
/// Returns the sort key comparator.
fn comparator(&self) -> Self::Comparator {
Self::Comparator::default()
}
/// Indicates whether the sort key actually uses the similarity score (by default BM25).
    /// If set to false, the similarity score might not be computed (as an optimization),
    /// and the score fed into the segment sort key computer could take any value.
fn requires_scoring(&self) -> bool {
false
}
    /// Sorting by score has an overriding implementation for BM25 scores, using Block-WAND.
fn collect_segment_top_k(
&self,
k: usize,
weight: &dyn crate::query::Weight,
reader: &crate::SegmentReader,
segment_ord: u32,
) -> crate::Result<Vec<(Self::SortKey, DocAddress)>> {
let with_scoring = self.requires_scoring();
let segment_sort_key_computer = self.segment_sort_key_computer(reader)?;
let topn_computer = TopNComputer::new_with_comparator(k, self.comparator());
let mut segment_top_key_collector = TopBySortKeySegmentCollector {
topn_computer,
segment_ord,
segment_sort_key_computer,
};
default_collect_segment_impl(&mut segment_top_key_collector, weight, reader, with_scoring)?;
Ok(segment_top_key_collector.harvest())
}
/// Builds a child sort key computer for a specific segment.
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
}
impl<HeadSortKeyComputer, TailSortKeyComputer> SortKeyComputer
for (HeadSortKeyComputer, TailSortKeyComputer)
where
HeadSortKeyComputer: SortKeyComputer,
TailSortKeyComputer: SortKeyComputer,
{
type SortKey = (HeadSortKeyComputer::SortKey, TailSortKeyComputer::SortKey);
type Child = (HeadSortKeyComputer::Child, TailSortKeyComputer::Child);
type Comparator = (
HeadSortKeyComputer::Comparator,
TailSortKeyComputer::Comparator,
);
fn comparator(&self) -> Self::Comparator {
(self.0.comparator(), self.1.comparator())
}
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
Ok((
self.0.segment_sort_key_computer(segment_reader)?,
self.1.segment_sort_key_computer(segment_reader)?,
))
}
/// Checks whether the schema is compatible with the sort key computer.
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)?;
self.1.check_schema(schema)?;
Ok(())
}
/// Indicates whether the sort key actually uses the similarity score (by default BM25).
    /// If set to false, the similarity score might not be computed (as an optimization),
    /// and the score fed into the segment sort key computer could take any value.
fn requires_scoring(&self) -> bool {
self.0.requires_scoring() || self.1.requires_scoring()
}
}
impl<HeadSegmentSortKeyComputer, TailSegmentSortKeyComputer> SegmentSortKeyComputer
for (HeadSegmentSortKeyComputer, TailSegmentSortKeyComputer)
where
HeadSegmentSortKeyComputer: SegmentSortKeyComputer,
TailSegmentSortKeyComputer: SegmentSortKeyComputer,
{
type SortKey = (
HeadSegmentSortKeyComputer::SortKey,
TailSegmentSortKeyComputer::SortKey,
);
type SegmentSortKey = (
HeadSegmentSortKeyComputer::SegmentSortKey,
TailSegmentSortKeyComputer::SegmentSortKey,
);
type SegmentComparator = (
HeadSegmentSortKeyComputer::SegmentComparator,
TailSegmentSortKeyComputer::SegmentComparator,
);
/// A SegmentSortKeyComputer maps to a SegmentSortKey, but it can also decide on
/// its ordering.
///
/// By default, it uses the natural ordering.
#[inline]
fn compare_segment_sort_key(
&self,
left: &Self::SegmentSortKey,
right: &Self::SegmentSortKey,
) -> Ordering {
self.0
.compare_segment_sort_key(&left.0, &right.0)
.then_with(|| self.1.compare_segment_sort_key(&left.1, &right.1))
}
#[inline(always)]
fn compute_sort_key_and_collect<C: Comparator<Self::SegmentSortKey>>(
&mut self,
doc: DocId,
score: Score,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
) {
let sort_key: Self::SegmentSortKey;
if let Some(threshold) = &top_n_computer.threshold {
if let Some((_cmp, lazy_sort_key)) = self.accept_sort_key_lazy(doc, score, threshold) {
sort_key = lazy_sort_key;
} else {
return;
}
} else {
sort_key = self.segment_sort_key(doc, score);
};
top_n_computer.append_doc(doc, sort_key);
}
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
let head_sort_key = self.0.segment_sort_key(doc, score);
let tail_sort_key = self.1.segment_sort_key(doc, score);
(head_sort_key, tail_sort_key)
}
fn accept_sort_key_lazy(
&mut self,
doc_id: DocId,
score: Score,
threshold: &Self::SegmentSortKey,
) -> Option<(Ordering, Self::SegmentSortKey)> {
let (head_threshold, tail_threshold) = threshold;
let (head_cmp, head_sort_key) =
self.0.accept_sort_key_lazy(doc_id, score, head_threshold)?;
if head_cmp == Ordering::Equal {
let (tail_cmp, tail_sort_key) =
self.1.accept_sort_key_lazy(doc_id, score, tail_threshold)?;
Some((tail_cmp, (head_sort_key, tail_sort_key)))
} else {
let tail_sort_key = self.1.segment_sort_key(doc_id, score);
Some((head_cmp, (head_sort_key, tail_sort_key)))
}
}
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
let (head_sort_key, tail_sort_key) = sort_key;
(
self.0.convert_segment_sort_key(head_sort_key),
self.1.convert_segment_sort_key(tail_sort_key),
)
}
}
/// This struct is used as an adapter to take a sort key computer and map its sort key to a
/// new sort key.
pub struct MappedSegmentSortKeyComputer<T, PreviousSortKey, NewSortKey> {
sort_key_computer: T,
map: fn(PreviousSortKey) -> NewSortKey,
}
impl<T, PreviousScore, NewScore> SegmentSortKeyComputer
for MappedSegmentSortKeyComputer<T, PreviousScore, NewScore>
where
T: SegmentSortKeyComputer<SortKey = PreviousScore>,
PreviousScore: 'static + Clone + Send + Sync,
NewScore: 'static + Clone + Send + Sync,
{
type SortKey = NewScore;
type SegmentSortKey = T::SegmentSortKey;
type SegmentComparator = T::SegmentComparator;
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
self.sort_key_computer.segment_sort_key(doc, score)
}
fn accept_sort_key_lazy(
&mut self,
doc_id: DocId,
score: Score,
threshold: &Self::SegmentSortKey,
) -> Option<(Ordering, Self::SegmentSortKey)> {
self.sort_key_computer
.accept_sort_key_lazy(doc_id, score, threshold)
}
#[inline(always)]
fn compute_sort_key_and_collect<C: Comparator<Self::SegmentSortKey>>(
&mut self,
doc: DocId,
score: Score,
top_n_computer: &mut TopNComputer<Self::SegmentSortKey, DocId, C>,
) {
self.sort_key_computer
.compute_sort_key_and_collect(doc, score, top_n_computer);
}
fn convert_segment_sort_key(&self, segment_sort_key: Self::SegmentSortKey) -> Self::SortKey {
(self.map)(
self.sort_key_computer
.convert_segment_sort_key(segment_sort_key),
)
}
}
// We then re-use our (head, tail) implementation and our mapper by viewing any tuple (a, b, c,
// ...) as the chain (a, (b, (c, ...))).
impl<SortKeyComputer1, SortKeyComputer2, SortKeyComputer3> SortKeyComputer
for (SortKeyComputer1, SortKeyComputer2, SortKeyComputer3)
where
SortKeyComputer1: SortKeyComputer,
SortKeyComputer2: SortKeyComputer,
SortKeyComputer3: SortKeyComputer,
{
type SortKey = (
SortKeyComputer1::SortKey,
SortKeyComputer2::SortKey,
SortKeyComputer3::SortKey,
);
type Child = MappedSegmentSortKeyComputer<
<(SortKeyComputer1, (SortKeyComputer2, SortKeyComputer3)) as SortKeyComputer>::Child,
(
SortKeyComputer1::SortKey,
(SortKeyComputer2::SortKey, SortKeyComputer3::SortKey),
),
Self::SortKey,
>;
type Comparator = (
SortKeyComputer1::Comparator,
SortKeyComputer2::Comparator,
SortKeyComputer3::Comparator,
);
fn comparator(&self) -> Self::Comparator {
(
self.0.comparator(),
self.1.comparator(),
self.2.comparator(),
)
}
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
let map = |(sort_key1, (sort_key2, sort_key3))| (sort_key1, sort_key2, sort_key3);
Ok(MappedSegmentSortKeyComputer {
sort_key_computer: (sort_key_computer1, (sort_key_computer2, sort_key_computer3)),
map,
})
}
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)?;
self.1.check_schema(schema)?;
self.2.check_schema(schema)?;
Ok(())
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
}
}
impl<SortKeyComputer1, SortKeyComputer2, SortKeyComputer3, SortKeyComputer4> SortKeyComputer
for (
SortKeyComputer1,
SortKeyComputer2,
SortKeyComputer3,
SortKeyComputer4,
)
where
SortKeyComputer1: SortKeyComputer,
SortKeyComputer2: SortKeyComputer,
SortKeyComputer3: SortKeyComputer,
SortKeyComputer4: SortKeyComputer,
{
type Child = MappedSegmentSortKeyComputer<
<(
SortKeyComputer1,
(SortKeyComputer2, (SortKeyComputer3, SortKeyComputer4)),
) as SortKeyComputer>::Child,
(
SortKeyComputer1::SortKey,
(
SortKeyComputer2::SortKey,
(SortKeyComputer3::SortKey, SortKeyComputer4::SortKey),
),
),
Self::SortKey,
>;
type SortKey = (
SortKeyComputer1::SortKey,
SortKeyComputer2::SortKey,
SortKeyComputer3::SortKey,
SortKeyComputer4::SortKey,
);
type Comparator = (
SortKeyComputer1::Comparator,
SortKeyComputer2::Comparator,
SortKeyComputer3::Comparator,
SortKeyComputer4::Comparator,
);
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
let sort_key_computer1 = self.0.segment_sort_key_computer(segment_reader)?;
let sort_key_computer2 = self.1.segment_sort_key_computer(segment_reader)?;
let sort_key_computer3 = self.2.segment_sort_key_computer(segment_reader)?;
let sort_key_computer4 = self.3.segment_sort_key_computer(segment_reader)?;
Ok(MappedSegmentSortKeyComputer {
sort_key_computer: (
sort_key_computer1,
(sort_key_computer2, (sort_key_computer3, sort_key_computer4)),
),
map: |(sort_key1, (sort_key2, (sort_key3, sort_key4)))| {
(sort_key1, sort_key2, sort_key3, sort_key4)
},
})
}
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.0.check_schema(schema)?;
self.1.check_schema(schema)?;
self.2.check_schema(schema)?;
self.3.check_schema(schema)?;
Ok(())
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
|| self.1.requires_scoring()
|| self.2.requires_scoring()
|| self.3.requires_scoring()
}
}
impl<F, SegmentF, TSortKey> SortKeyComputer for F
where
F: 'static + Send + Sync + Fn(&SegmentReader) -> SegmentF,
SegmentF: 'static + FnMut(DocId) -> TSortKey,
TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug,
{
type SortKey = TSortKey;
type Child = SegmentF;
type Comparator = NaturalComparator;
fn segment_sort_key_computer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
Ok((self)(segment_reader))
}
}
impl<F, TSortKey> SegmentSortKeyComputer for F
where
F: 'static + FnMut(DocId) -> TSortKey,
TSortKey: 'static + PartialOrd + Clone + Send + Sync,
{
type SortKey = TSortKey;
type SegmentSortKey = TSortKey;
type SegmentComparator = NaturalComparator;
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> TSortKey {
(self)(doc)
}
/// Convert a segment level score into the global level score.
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey {
sort_key
}
}
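These blanket impls mean a pair of closures is already a full `SortKeyComputer`. A hedged sketch (the `popularity` u64 fast field and the `first_or_default_col` accessor are assumptions based on tantivy's fast field API):
let popularity_sorter = |segment_reader: &SegmentReader| {
    // Open the column once per segment...
    let popularity = segment_reader
        .fast_fields()
        .u64("popularity")
        .expect("popularity must be a u64 fast field")
        .first_or_default_col(0);
    // ...then compute a u64 sort key per document.
    move |doc: DocId| popularity.get_val(doc)
};
let collector = TopDocs::with_limit(10).order_by(popularity_sorter);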
#[cfg(test)]
mod tests {
use std::cmp::Ordering;
use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
use std::sync::Arc;
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::schema::Schema;
use crate::{DocId, Index, Order, SegmentReader};
fn build_test_index() -> Index {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(crate::TantivyDocument::default())
.unwrap();
index_writer.commit().unwrap();
index
}
#[test]
fn test_lazy_score_computer() {
let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
let call_count = Arc::new(AtomicUsize::new(0));
let call_count_clone = call_count.clone();
let score_computer_secondary = move |_segment_reader: &SegmentReader| {
let call_count_new_clone = call_count_clone.clone();
move |_doc: DocId| {
call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);
"b"
}
};
let lazy_score_computer = (score_computer_primary, score_computer_secondary);
let index = build_test_index();
let searcher = index.reader().unwrap().searcher();
let mut segment_sort_key_computer = lazy_score_computer
.segment_sort_key_computer(searcher.segment_reader(0))
.unwrap();
let expected_sort_key = (200, "b");
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(100u32, "a"));
assert_eq!(sort_key_opt, Some((Ordering::Greater, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 1);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(100u32, "c"));
assert_eq!(sort_key_opt, Some((Ordering::Greater, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 2);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(200u32, "a"));
assert_eq!(sort_key_opt, Some((Ordering::Greater, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 3);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(200u32, "c"));
assert!(sort_key_opt.is_none());
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 4);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(300u32, "a"));
assert_eq!(sort_key_opt, None);
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 4);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(300u32, "c"));
assert_eq!(sort_key_opt, None);
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 4);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &expected_sort_key);
assert_eq!(sort_key_opt, Some((Ordering::Equal, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 5);
}
}
#[test]
fn test_lazy_score_computer_dynamic_ordering() {
let score_computer_primary = |_segment_reader: &SegmentReader| |_doc: DocId| 200u32;
let call_count = Arc::new(AtomicUsize::new(0));
let call_count_clone = call_count.clone();
let score_computer_secondary = move |_segment_reader: &SegmentReader| {
let call_count_new_clone = call_count_clone.clone();
move |_doc: DocId| {
call_count_new_clone.fetch_add(1, AtomicOrdering::SeqCst);
2u32
}
};
let lazy_score_computer = (
(score_computer_primary, Order::Desc),
(score_computer_secondary, Order::Asc),
);
let index = build_test_index();
let searcher = index.reader().unwrap().searcher();
let mut segment_sort_key_computer = lazy_score_computer
.segment_sort_key_computer(searcher.segment_reader(0))
.unwrap();
let expected_sort_key = (200, 2u32);
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(100u32, 1u32));
assert_eq!(sort_key_opt, Some((Ordering::Greater, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 1);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(100u32, 3u32));
assert_eq!(sort_key_opt, Some((Ordering::Greater, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 2);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(200u32, 1u32));
assert!(sort_key_opt.is_none());
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 3);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(200u32, 3u32));
assert_eq!(sort_key_opt, Some((Ordering::Greater, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 4);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(300u32, 1u32));
assert_eq!(sort_key_opt, None);
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 4);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &(300u32, 3u32));
assert_eq!(sort_key_opt, None);
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 4);
}
{
let sort_key_opt =
segment_sort_key_computer.accept_sort_key_lazy(0u32, 1f32, &expected_sort_key);
assert_eq!(sort_key_opt, Some((Ordering::Equal, expected_sort_key)));
assert_eq!(call_count.load(AtomicOrdering::SeqCst), 5);
}
assert_eq!(
segment_sort_key_computer.convert_segment_sort_key(expected_sort_key),
(200u32, 2u32)
);
}
}

View File

@@ -1,193 +0,0 @@
use std::ops::Range;
use crate::collector::sort_key::{Comparator, SegmentSortKeyComputer, SortKeyComputer};
use crate::collector::{Collector, SegmentCollector, TopNComputer};
use crate::query::Weight;
use crate::schema::Schema;
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
pub(crate) struct TopBySortKeyCollector<TSortKeyComputer> {
sort_key_computer: TSortKeyComputer,
doc_range: Range<usize>,
}
impl<TSortKeyComputer> TopBySortKeyCollector<TSortKeyComputer> {
pub fn new(sort_key_computer: TSortKeyComputer, doc_range: Range<usize>) -> Self {
TopBySortKeyCollector {
sort_key_computer,
doc_range,
}
}
}
impl<TSortKeyComputer> Collector for TopBySortKeyCollector<TSortKeyComputer>
where TSortKeyComputer: SortKeyComputer + Send + Sync + 'static
{
type Fruit = Vec<(TSortKeyComputer::SortKey, DocAddress)>;
type Child =
TopBySortKeySegmentCollector<TSortKeyComputer::Child, TSortKeyComputer::Comparator>;
fn check_schema(&self, schema: &Schema) -> crate::Result<()> {
self.sort_key_computer.check_schema(schema)
}
fn for_segment(&self, segment_ord: u32, segment_reader: &SegmentReader) -> Result<Self::Child> {
let segment_sort_key_computer = self
.sort_key_computer
.segment_sort_key_computer(segment_reader)?;
let topn_computer = TopNComputer::new_with_comparator(
self.doc_range.end,
self.sort_key_computer.comparator(),
);
Ok(TopBySortKeySegmentCollector {
topn_computer,
segment_ord,
segment_sort_key_computer,
})
}
fn requires_scoring(&self) -> bool {
self.sort_key_computer.requires_scoring()
}
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
Ok(merge_top_k(
segment_fruits.into_iter().flatten(),
self.doc_range.clone(),
self.sort_key_computer.comparator(),
))
}
fn collect_segment(
&self,
weight: &dyn Weight,
segment_ord: u32,
reader: &SegmentReader,
) -> crate::Result<Vec<(TSortKeyComputer::SortKey, DocAddress)>> {
let k = self.doc_range.end;
let docs = self
.sort_key_computer
.collect_segment_top_k(k, weight, reader, segment_ord)?;
Ok(docs)
}
}
fn merge_top_k<D: Ord, TSortKey: Clone + std::fmt::Debug, C: Comparator<TSortKey>>(
sort_key_docs: impl Iterator<Item = (TSortKey, D)>,
doc_range: Range<usize>,
comparator: C,
) -> Vec<(TSortKey, D)> {
if doc_range.is_empty() {
return Vec::new();
}
let mut top_collector: TopNComputer<TSortKey, D, C> =
TopNComputer::new_with_comparator(doc_range.end, comparator);
for (sort_key, doc) in sort_key_docs {
top_collector.push(sort_key, doc);
}
top_collector
.into_sorted_vec()
.into_iter()
.skip(doc_range.start)
.map(|cdoc| (cdoc.sort_key, cdoc.doc))
.collect()
}
pub struct TopBySortKeySegmentCollector<TSegmentSortKeyComputer, C>
where
TSegmentSortKeyComputer: SegmentSortKeyComputer,
C: Comparator<TSegmentSortKeyComputer::SegmentSortKey>,
{
pub(crate) topn_computer: TopNComputer<TSegmentSortKeyComputer::SegmentSortKey, DocId, C>,
pub(crate) segment_ord: u32,
pub(crate) segment_sort_key_computer: TSegmentSortKeyComputer,
}
impl<TSegmentSortKeyComputer, C> SegmentCollector
for TopBySortKeySegmentCollector<TSegmentSortKeyComputer, C>
where
TSegmentSortKeyComputer: 'static + SegmentSortKeyComputer,
C: Comparator<TSegmentSortKeyComputer::SegmentSortKey> + 'static,
{
type Fruit = Vec<(TSegmentSortKeyComputer::SortKey, DocAddress)>;
fn collect(&mut self, doc: DocId, score: Score) {
self.segment_sort_key_computer.compute_sort_key_and_collect(
doc,
score,
&mut self.topn_computer,
);
}
fn harvest(self) -> Self::Fruit {
let segment_ord = self.segment_ord;
let segment_hits: Vec<(TSegmentSortKeyComputer::SortKey, DocAddress)> = self
.topn_computer
.into_vec()
.into_iter()
.map(|comparable_doc| {
let sort_key = self
.segment_sort_key_computer
.convert_segment_sort_key(comparable_doc.sort_key);
(
sort_key,
DocAddress {
segment_ord,
doc_id: comparable_doc.doc,
},
)
})
.collect();
segment_hits
}
}
#[cfg(test)]
mod tests {
use std::ops::Range;
use rand;
use rand::seq::SliceRandom as _;
use super::merge_top_k;
use crate::collector::sort_key::ComparatorEnum;
use crate::Order;
fn test_merge_top_k_aux(
order: Order,
doc_range: Range<usize>,
expected: &[(crate::Score, usize)],
) {
let mut vals: Vec<(crate::Score, usize)> = (0..10).map(|val| (val as f32, val)).collect();
vals.shuffle(&mut rand::thread_rng());
let vals_merged = merge_top_k(vals.into_iter(), doc_range, ComparatorEnum::from(order));
assert_eq!(&vals_merged, expected);
}
#[test]
fn test_merge_top_k() {
test_merge_top_k_aux(Order::Asc, 0..0, &[]);
test_merge_top_k_aux(Order::Asc, 3..3, &[]);
test_merge_top_k_aux(Order::Asc, 0..3, &[(0.0f32, 0), (1.0f32, 1), (2.0f32, 2)]);
test_merge_top_k_aux(
Order::Asc,
0..11,
&[
(0.0f32, 0),
(1.0f32, 1),
(2.0f32, 2),
(3.0f32, 3),
(4.0f32, 4),
(5.0f32, 5),
(6.0f32, 6),
(7.0f32, 7),
(8.0f32, 8),
(9.0f32, 9),
],
);
test_merge_top_k_aux(Order::Asc, 1..3, &[(1.0f32, 1), (2.0f32, 2)]);
test_merge_top_k_aux(Order::Desc, 0..2, &[(9.0f32, 9), (8.0f32, 8)]);
test_merge_top_k_aux(Order::Desc, 2..4, &[(7.0f32, 7), (6.0f32, 6)]);
}
}

View File

@@ -40,7 +40,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
     let filter_some_collector = FilterCollector::new(
         "price".to_string(),
         &|value: u64| value > 20_120u64,
-        TopDocs::with_limit(2).order_by_score(),
+        TopDocs::with_limit(2),
     );
     let top_docs = searcher.search(&query, &filter_some_collector)?;
@@ -50,7 +50,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
     let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(
         "price".to_string(),
         &|value| value < 5u64,
-        TopDocs::with_limit(2).order_by_score(),
+        TopDocs::with_limit(2),
     );
     let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
@@ -62,11 +62,8 @@ pub fn test_filter_collector() -> crate::Result<()> {
             > 0
     }
-    let filter_dates_collector = FilterCollector::new(
-        "date".to_string(),
-        &date_filter,
-        TopDocs::with_limit(5).order_by_score(),
-    );
+    let filter_dates_collector =
+        FilterCollector::new("date".to_string(), &date_filter, TopDocs::with_limit(5));
     let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
     assert_eq!(filtered_date_docs.len(), 2);

View File

@@ -1,22 +1,374 @@
+use std::cmp::Ordering;
+use std::marker::PhantomData;
 use serde::{Deserialize, Serialize};
+use super::top_score_collector::TopNComputer;
+use crate::index::SegmentReader;
+use crate::{DocAddress, DocId, SegmentOrdinal};
 /// Contains a feature (field, score, etc.) of a document along with the document address.
 ///
-/// Used only by TopNComputer, which implements the actual comparison via a `Comparator`.
-#[derive(Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
-pub struct ComparableDoc<T, D> {
+/// It guarantees stable sorting: in case of a tie on the feature, the document
+/// address is used.
+///
+/// The REVERSE_ORDER generic parameter controls whether the by-feature order
+/// should be reversed, which is useful for achieving for example largest-first
+/// semantics without having to wrap the feature in a `Reverse`.
+#[derive(Clone, Default, Serialize, Deserialize)]
+pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
     /// The feature of the document. In practice, this is
-    /// is a type which can be compared with a `Comparator<T>`.
-    pub sort_key: T,
-    /// The document address. In practice, this is either a `DocId` or `DocAddress`.
+    /// is any type that implements `PartialOrd`.
+    pub feature: T,
+    /// The document address. In practice, this is any
+    /// type that implements `PartialOrd`, and is guaranteed
+    /// to be unique for each document.
     pub doc: D,
 }
-impl<T: std::fmt::Debug, D: std::fmt::Debug> std::fmt::Debug for ComparableDoc<T, D> {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("ComparableDoc")
-            .field("feature", &self.sort_key)
+impl<T: std::fmt::Debug, D: std::fmt::Debug, const R: bool> std::fmt::Debug
+    for ComparableDoc<T, D, R>
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct(format!("ComparableDoc<_, _ {R}").as_str())
+            .field("feature", &self.feature)
             .field("doc", &self.doc)
             .finish()
     }
 }
+impl<T: PartialOrd, D: PartialOrd, const R: bool> PartialOrd for ComparableDoc<T, D, R> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+impl<T: PartialOrd, D: PartialOrd, const R: bool> Ord for ComparableDoc<T, D, R> {
+    #[inline]
+    fn cmp(&self, other: &Self) -> Ordering {
+        let by_feature = self
+            .feature
+            .partial_cmp(&other.feature)
+            .map(|ord| if R { ord.reverse() } else { ord })
+            .unwrap_or(Ordering::Equal);
+        let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
+        // In case of a tie on the feature, we sort by ascending
+        // `DocAddress` in order to ensure a stable sorting of the
+        // documents.
+        by_feature.then_with(lazy_by_doc_address)
+    }
+}
+impl<T: PartialOrd, D: PartialOrd, const R: bool> PartialEq for ComparableDoc<T, D, R> {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+impl<T: PartialOrd, D: PartialOrd, const R: bool> Eq for ComparableDoc<T, D, R> {}
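The `Ord` rule added above (the `+` lines) can be stated in isolation. A minimal standalone sketch of the same tie-breaking logic, with illustrative names rather than the crate's types:

use std::cmp::Ordering;

// Same rule as `ComparableDoc::cmp`: compare by feature (reversed when R is
// true), then break ties by ascending doc so that sorting stays stable.
fn cmp_docs<T: PartialOrd, D: PartialOrd, const R: bool>(a: &(T, D), b: &(T, D)) -> Ordering {
    let by_feature = a
        .0
        .partial_cmp(&b.0)
        .map(|ord| if R { ord.reverse() } else { ord })
        .unwrap_or(Ordering::Equal);
    by_feature.then_with(|| a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal))
}

fn main() {
    // Tie on the feature: the smaller doc wins regardless of R.
    assert_eq!(cmp_docs::<_, _, false>(&(1.0f32, 4u32), &(1.0, 7)), Ordering::Less);
    assert_eq!(cmp_docs::<_, _, true>(&(1.0f32, 4u32), &(1.0, 7)), Ordering::Less);
    // Distinct features: R flips the by-feature order.
    assert_eq!(cmp_docs::<_, _, false>(&(1.0f32, 4u32), &(2.0, 7)), Ordering::Less);
    assert_eq!(cmp_docs::<_, _, true>(&(1.0f32, 4u32), &(2.0, 7)), Ordering::Greater);
}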
pub(crate) struct TopCollector<T> {
pub limit: usize,
pub offset: usize,
_marker: PhantomData<T>,
}
impl<T> TopCollector<T>
where T: PartialOrd + Clone
{
/// Creates a top collector, with a number of documents equal to "limit".
///
/// # Panics
/// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopCollector<T> {
assert!(limit >= 1, "Limit must be strictly greater than 0.");
Self {
limit,
offset: 0,
_marker: PhantomData,
}
}
/// Skip the first "offset" documents when collecting.
///
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
/// Lucene's TopDocsCollector.
pub fn and_offset(mut self, offset: usize) -> TopCollector<T> {
self.offset = offset;
self
}
pub fn merge_fruits(
&self,
children: Vec<Vec<(T, DocAddress)>>,
) -> crate::Result<Vec<(T, DocAddress)>> {
if self.limit == 0 {
return Ok(Vec::new());
}
let mut top_collector: TopNComputer<_, _> = TopNComputer::new(self.limit + self.offset);
for child_fruit in children {
for (feature, doc) in child_fruit {
top_collector.push(feature, doc);
}
}
Ok(top_collector
.into_sorted_vec()
.into_iter()
.skip(self.offset)
.map(|cdoc| (cdoc.feature, cdoc.doc))
.collect())
}
pub(crate) fn for_segment<F: PartialOrd + Clone>(
&self,
segment_id: SegmentOrdinal,
_: &SegmentReader,
) -> TopSegmentCollector<F> {
TopSegmentCollector::new(segment_id, self.limit + self.offset)
}
/// Create a new TopCollector with the same limit and offset.
///
/// Ideally we would use Into but the blanket implementation seems to cause the Scorer traits
/// to fail.
#[doc(hidden)]
pub(crate) fn into_tscore<TScore: PartialOrd + Clone>(self) -> TopCollector<TScore> {
TopCollector {
limit: self.limit,
offset: self.offset,
_marker: PhantomData,
}
}
}
/// The Top Collector keeps track of the K documents
/// sorted by type `T`.
///
/// The implementation is based on repeatedly truncating at the median once `K * 2`
/// documents have been collected. The theoretical complexity for collecting the
/// top `K` out of `n` documents is `O(n + K)`.
pub(crate) struct TopSegmentCollector<T> {
/// We reverse the order of the feature in order to
/// have top-semantics instead of bottom semantics.
topn_computer: TopNComputer<T, DocId>,
segment_ord: u32,
}
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
TopSegmentCollector {
topn_computer: TopNComputer::new(limit),
segment_ord,
}
}
}
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
pub fn harvest(self) -> Vec<(T, DocAddress)> {
let segment_ord = self.segment_ord;
self.topn_computer
.into_sorted_vec()
.into_iter()
.map(|comparable_doc| {
(
comparable_doc.feature,
DocAddress {
segment_ord,
doc_id: comparable_doc.doc,
},
)
})
.collect()
}
/// Collects a document scored by the given feature
///
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
/// will compare the lowest scoring item with the given one and keep whichever is greater.
#[inline]
pub fn collect(&mut self, doc: DocId, feature: T) {
self.topn_computer.push(feature, doc);
}
}
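The `O(n + K)` bound claimed above comes from buffering up to `2K` entries and, whenever the buffer fills, partitioning around the K-th best element and discarding the worse half: each truncation costs `O(K)` and happens at most once every `K` pushes. A standalone sketch of that strategy (illustrative, not `TopNComputer`'s actual code):

use std::cmp::Ordering;

// Amortized-O(n + K) top-k: buffer 2K entries, truncate at the median.
struct TopK<T> {
    k: usize,
    buf: Vec<(T, u32)>, // (feature, doc)
}

impl<T: PartialOrd> TopK<T> {
    fn new(k: usize) -> Self {
        assert!(k >= 1, "k must be strictly greater than 0");
        TopK { k, buf: Vec::with_capacity(2 * k) }
    }

    fn push(&mut self, feature: T, doc: u32) {
        self.buf.push((feature, doc));
        if self.buf.len() == 2 * self.k {
            // Move the k largest features to the front (descending comparator),
            // then drop the worse half: O(k) work at most every k pushes.
            self.buf.select_nth_unstable_by(self.k - 1, |a, b| {
                b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal)
            });
            self.buf.truncate(self.k);
        }
    }

    fn into_sorted_vec(mut self) -> Vec<(T, u32)> {
        self.buf
            .sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal));
        self.buf.truncate(self.k);
        self.buf
    }
}

fn main() {
    let mut top = TopK::new(2);
    for (doc, score) in [(1u32, 0.8f32), (3, 0.2), (5, 0.3), (7, 0.9), (9, -0.2)] {
        top.push(score, doc);
    }
    assert_eq!(top.into_sorted_vec(), vec![(0.9, 7), (0.8, 1)]);
}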
#[cfg(test)]
mod tests {
use super::{TopCollector, TopSegmentCollector};
use crate::DocAddress;
#[test]
fn test_top_collector_not_at_capacity() {
let mut top_collector = TopSegmentCollector::new(0, 4);
top_collector.collect(1, 0.8);
top_collector.collect(3, 0.2);
top_collector.collect(5, 0.3);
assert_eq!(
top_collector.harvest(),
vec![
(0.8, DocAddress::new(0, 1)),
(0.3, DocAddress::new(0, 5)),
(0.2, DocAddress::new(0, 3))
]
);
}
#[test]
fn test_top_collector_at_capacity() {
let mut top_collector = TopSegmentCollector::new(0, 4);
top_collector.collect(1, 0.8);
top_collector.collect(3, 0.2);
top_collector.collect(5, 0.3);
top_collector.collect(7, 0.9);
top_collector.collect(9, -0.2);
assert_eq!(
top_collector.harvest(),
vec![
(0.9, DocAddress::new(0, 7)),
(0.8, DocAddress::new(0, 1)),
(0.3, DocAddress::new(0, 5)),
(0.2, DocAddress::new(0, 3))
]
);
}
#[test]
fn test_top_segment_collector_stable_ordering_for_equal_feature() {
// given that the documents are collected in ascending doc id order,
// when harvesting we have to guarantee stable sorting in case of a tie
// on the score
let doc_ids_collection = [4, 5, 6];
let score = 3.3f32;
let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
for id in &doc_ids_collection {
top_collector_limit_2.collect(*id, score);
}
let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
for id in &doc_ids_collection {
top_collector_limit_3.collect(*id, score);
}
assert_eq!(
top_collector_limit_2.harvest(),
top_collector_limit_3.harvest()[..2].to_vec(),
);
}
#[test]
fn test_top_collector_with_limit_and_offset() {
let collector = TopCollector::with_limit(2).and_offset(1);
let results = collector
.merge_fruits(vec![vec![
(0.9, DocAddress::new(0, 1)),
(0.8, DocAddress::new(0, 2)),
(0.7, DocAddress::new(0, 3)),
(0.6, DocAddress::new(0, 4)),
(0.5, DocAddress::new(0, 5)),
]])
.unwrap();
assert_eq!(
results,
vec![(0.8, DocAddress::new(0, 2)), (0.7, DocAddress::new(0, 3)),]
);
}
#[test]
fn test_top_collector_with_limit_larger_than_set_and_offset() {
let collector = TopCollector::with_limit(2).and_offset(1);
let results = collector
.merge_fruits(vec![vec![
(0.9, DocAddress::new(0, 1)),
(0.8, DocAddress::new(0, 2)),
]])
.unwrap();
assert_eq!(results, vec![(0.8, DocAddress::new(0, 2)),]);
}
#[test]
fn test_top_collector_with_limit_and_offset_larger_than_set() {
let collector = TopCollector::with_limit(2).and_offset(20);
let results = collector
.merge_fruits(vec![vec![
(0.9, DocAddress::new(0, 1)),
(0.8, DocAddress::new(0, 2)),
]])
.unwrap();
assert_eq!(results, vec![]);
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use test::Bencher;
use super::TopSegmentCollector;
#[bench]
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 400);
b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}
#[bench]
fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 100);
for i in 0..100 {
top_collector.collect(i, 0.8);
}
b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}
#[bench]
fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);
for i in 0..100 {
top_collector.collect(i, 0.8);
}
// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}
#[bench]
fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);
let mut score = 1.0;
for i in 0..100 {
score += 1.0;
top_collector.collect(i, score);
}
// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}
}

File diff suppressed because it is too large.

View File

@@ -0,0 +1,124 @@
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
use crate::collector::{Collector, SegmentCollector};
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
pub(crate) struct TweakedScoreTopCollector<TScoreTweaker, TScore = Score> {
score_tweaker: TScoreTweaker,
collector: TopCollector<TScore>,
}
impl<TScoreTweaker, TScore> TweakedScoreTopCollector<TScoreTweaker, TScore>
where TScore: Clone + PartialOrd
{
pub fn new(
score_tweaker: TScoreTweaker,
collector: TopCollector<TScore>,
) -> TweakedScoreTopCollector<TScoreTweaker, TScore> {
TweakedScoreTopCollector {
score_tweaker,
collector,
}
}
}
/// A `ScoreSegmentTweaker` makes it possible to modify the default score
/// for a given document belonging to a specific segment.
///
/// It is the segment local version of the [`ScoreTweaker`].
pub trait ScoreSegmentTweaker<TScore>: 'static {
/// Tweak the given `score` for the document `doc`.
fn score(&mut self, doc: DocId, score: Score) -> TScore;
}
/// `ScoreTweaker` makes it possible to tweak the score
/// emitted by the scorer into another one.
///
/// The `ScoreTweaker` itself does not do much of the computation.
/// Instead, it helps construct `Self::Child` instances that will compute
/// the score at a segment scale.
pub trait ScoreTweaker<TScore>: Sync {
/// Type of the associated [`ScoreSegmentTweaker`].
type Child: ScoreSegmentTweaker<TScore>;
/// Builds a child tweaker for a specific segment. The child scorer is associated with
/// a specific segment.
fn segment_tweaker(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
}
impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore>
where
TScoreTweaker: ScoreTweaker<TScore> + Send + Sync,
TScore: 'static + PartialOrd + Clone + Send + Sync,
{
type Fruit = Vec<(TScore, DocAddress)>;
type Child = TopTweakedScoreSegmentCollector<TScoreTweaker::Child, TScore>;
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> Result<Self::Child> {
let segment_scorer = self.score_tweaker.segment_tweaker(segment_reader)?;
let segment_collector = self.collector.for_segment(segment_local_id, segment_reader);
Ok(TopTweakedScoreSegmentCollector {
segment_collector,
segment_scorer,
})
}
fn requires_scoring(&self) -> bool {
true
}
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
self.collector.merge_fruits(segment_fruits)
}
}
pub struct TopTweakedScoreSegmentCollector<TSegmentScoreTweaker, TScore>
where
TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
TSegmentScoreTweaker: ScoreSegmentTweaker<TScore>,
{
segment_collector: TopSegmentCollector<TScore>,
segment_scorer: TSegmentScoreTweaker,
}
impl<TSegmentScoreTweaker, TScore> SegmentCollector
for TopTweakedScoreSegmentCollector<TSegmentScoreTweaker, TScore>
where
TScore: 'static + PartialOrd + Clone + Send + Sync,
TSegmentScoreTweaker: 'static + ScoreSegmentTweaker<TScore>,
{
type Fruit = Vec<(TScore, DocAddress)>;
fn collect(&mut self, doc: DocId, score: Score) {
let score = self.segment_scorer.score(doc, score);
self.segment_collector.collect(doc, score);
}
fn harvest(self) -> Vec<(TScore, DocAddress)> {
self.segment_collector.harvest()
}
}
impl<F, TScore, TSegmentScoreTweaker> ScoreTweaker<TScore> for F
where
F: 'static + Send + Sync + Fn(&SegmentReader) -> TSegmentScoreTweaker,
TSegmentScoreTweaker: ScoreSegmentTweaker<TScore>,
{
type Child = TSegmentScoreTweaker;
fn segment_tweaker(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
Ok((self)(segment_reader))
}
}
impl<F, TScore> ScoreSegmentTweaker<TScore> for F
where F: 'static + FnMut(DocId, Score) -> TScore
{
fn score(&mut self, doc: DocId, score: Score) -> TScore {
(self)(doc, score)
}
}
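Thanks to the two blanket impls at the end of this file, a tweaker can be supplied as a pair of closures. A hedged sketch against tantivy's public `TopDocs::tweak_score` entry point (the `popularity` fast field and the exact fast-field accessor are assumptions; the accessor shown matches recent tantivy versions, not necessarily the state of this diff):

use tantivy::collector::{Collector, TopDocs};
use tantivy::{DocAddress, DocId, Score, SegmentReader};

// Boost BM25 scores by the logarithm of a per-document popularity value.
fn popularity_boosted(limit: usize) -> impl Collector<Fruit = Vec<(Score, DocAddress)>> {
    TopDocs::with_limit(limit).tweak_score(move |segment_reader: &SegmentReader| {
        // Outer closure: the `ScoreTweaker`, run once per segment.
        let popularity = segment_reader
            .fast_fields()
            .u64("popularity")
            .expect("popularity must be a u64 fast field");
        // Inner closure: the `ScoreSegmentTweaker`, run once per document.
        move |doc: DocId, original_score: Score| {
            let pop = popularity.first(doc).unwrap_or(0);
            original_score * (1.0 + pop as Score).ln()
        }
    })
}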

View File

@@ -69,7 +69,7 @@ fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecis
         .parse_query("dateformat")
         .expect("Failed to parse query");
     let top_docs = searcher
-        .search(&query, &TopDocs::with_limit(1).order_by_score())
+        .search(&query, &TopDocs::with_limit(1))
         .expect("Search failed");
     assert_eq!(top_docs.len(), 1, "Expected 1 search result");

View File

@@ -48,15 +48,7 @@ impl Executor {
     F: Sized + Sync + Fn(A) -> crate::Result<R>,
 {
     match self {
-        Executor::SingleThread => {
-            // Avoid `collect`, since the stacktrace is blown up by it, which makes profiling
-            // harder.
-            let mut result = Vec::with_capacity(args.size_hint().0);
-            for arg in args {
-                result.push(f(arg)?);
-            }
-            Ok(result)
-        }
+        Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
         Executor::ThreadPool(pool) => {
             let args: Vec<A> = args.collect();
             let num_fruits = args.len();
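The removed comment explains the manual loop on the left-hand side: `Iterator::collect` over `Result`s inlines deeply and clutters stack traces during profiling, while a plain `for` loop keeps the frames readable. Functionally the two forms are equivalent, as this standalone sketch checks:

// Map a fallible function over an iterator, stopping at the first error.
// `map_loop` mirrors the left-hand side, `map_collect` the right-hand side.
fn map_loop<A, R>(
    args: impl Iterator<Item = A>,
    f: impl Fn(A) -> Result<R, String>,
) -> Result<Vec<R>, String> {
    let mut result = Vec::with_capacity(args.size_hint().0);
    for arg in args {
        result.push(f(arg)?);
    }
    Ok(result)
}

fn map_collect<A, R>(
    args: impl Iterator<Item = A>,
    f: impl Fn(A) -> Result<R, String>,
) -> Result<Vec<R>, String> {
    args.map(f).collect()
}

fn main() {
    let double = |x: u32| x.checked_mul(2).ok_or_else(|| "overflow".to_string());
    assert_eq!(map_loop(1..4, double), map_collect(1..4, double));
    assert_eq!(map_collect(1..4, double), Ok(vec![2, 4, 6]));
}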

View File

@@ -3,7 +3,6 @@ use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
 use common::{replace_in_place, JsonPathWriter};
 use rustc_hash::FxHashMap;
-use crate::indexer::indexing_term::IndexingTerm;
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
 use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
 use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
@@ -78,7 +77,7 @@ fn index_json_object<'a, V: Value<'a>>(
     doc: DocId,
     json_visitor: V::ObjectIter,
     text_analyzer: &mut TextAnalyzer,
-    term_buffer: &mut IndexingTerm,
+    term_buffer: &mut Term,
     json_path_writer: &mut JsonPathWriter,
     postings_writer: &mut dyn PostingsWriter,
     ctx: &mut IndexingContext,
@@ -108,17 +107,17 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
     doc: DocId,
     json_value: V,
     text_analyzer: &mut TextAnalyzer,
-    term_buffer: &mut IndexingTerm,
+    term_buffer: &mut Term,
     json_path_writer: &mut JsonPathWriter,
     postings_writer: &mut dyn PostingsWriter,
     ctx: &mut IndexingContext,
     positions_per_path: &mut IndexingPositionsPerPath,
 ) {
-    let set_path_id = |term_buffer: &mut IndexingTerm, unordered_id: u32| {
+    let set_path_id = |term_buffer: &mut Term, unordered_id: u32| {
         term_buffer.truncate_value_bytes(0);
         term_buffer.append_bytes(&unordered_id.to_be_bytes());
     };
-    let set_type = |term_buffer: &mut IndexingTerm, typ: Type| {
+    let set_type = |term_buffer: &mut Term, typ: Type| {
         term_buffer.append_bytes(&[typ.to_code()]);
     };
@@ -406,7 +405,7 @@ mod tests {
     let mut term = Term::from_field_json_path(field, "color", false);
     term.append_type_and_str("red");
-    assert_eq!(term.serialized_value_bytes(), b"color\x00sred".to_vec())
+    assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred")
 }
 #[test]
@@ -416,8 +415,8 @@ mod tests {
     term.append_type_and_fast_value(-4i64);
     assert_eq!(
-        term.serialized_value_bytes(),
-        b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc".to_vec()
+        term.serialized_term(),
+        b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
     )
 }
@@ -428,8 +427,8 @@ mod tests {
     term.append_type_and_fast_value(4u64);
     assert_eq!(
-        term.serialized_value_bytes(),
-        b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04".to_vec()
+        term.serialized_term(),
+        b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
     )
 }
@@ -439,8 +438,8 @@ mod tests {
     let mut term = Term::from_field_json_path(field, "color", false);
     term.append_type_and_fast_value(4.0f64);
     assert_eq!(
-        term.serialized_value_bytes(),
-        b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00".to_vec()
+        term.serialized_term(),
+        b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
     )
 }
@@ -450,8 +449,8 @@ mod tests {
     let mut term = Term::from_field_json_path(field, "color", false);
     term.append_type_and_fast_value(true);
     assert_eq!(
-        term.serialized_value_bytes(),
-        b"color\x00o\x00\x00\x00\x00\x00\x00\x00\x01".to_vec()
+        term.serialized_term(),
+        b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
     )
 }
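The right-hand expectations make the full serialized-term layout visible: a big-endian `u32` field id (`\x00\x00\x00\x01`), the field-type code (`j` for JSON), the JSON path, a `\x00` end-of-path marker, a value-type code (`s` string, `i` i64, `u` u64, `f` f64, `o` bool), and finally the value bytes. A sketch assembling the string case by hand, matching the first assertion above:

// Hand-assembled version of the expected bytes in the first test above.
// Layout, as implied by the assertions: field id | field type | path | 0x00 |
// value type | value bytes.
fn json_str_term(field_id: u32, path: &str, value: &str) -> Vec<u8> {
    let mut term = Vec::new();
    term.extend_from_slice(&field_id.to_be_bytes()); // b"\x00\x00\x00\x01"
    term.push(b'j'); // field type code: JSON
    term.extend_from_slice(path.as_bytes());
    term.push(0x00); // end-of-path marker
    term.push(b's'); // value type code: string
    term.extend_from_slice(value.as_bytes());
    term
}

fn main() {
    assert_eq!(
        json_str_term(1, "color", "red"),
        b"\x00\x00\x00\x01jcolor\x00sred".to_vec()
    );
}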

View File

@@ -225,7 +225,6 @@ impl Searcher {
         enabled_scoring: EnableScoring,
     ) -> crate::Result<C::Fruit> {
         let weight = query.weight(enabled_scoring)?;
-        collector.check_schema(self.schema())?;
         let segment_readers = self.segment_readers();
         let fruits = executor.map(
             |(segment_ord, segment_reader)| {

View File

@@ -5,7 +5,7 @@ use std::ops::Range;
 use common::{BinarySerializable, CountingWriter, HasLen, VInt};
 use crate::directory::{FileSlice, TerminatingWrite, WritePtr};
-use crate::schema::{Field, Schema};
+use crate::schema::Field;
 use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
 #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
@@ -167,11 +167,10 @@ impl CompositeFile {
         .map(|byte_range| self.data.slice(byte_range.clone()))
 }
-pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
+pub fn space_usage(&self) -> PerFieldSpaceUsage {
     let mut fields = Vec::new();
     for (&field_addr, byte_range) in &self.offsets_index {
-        let field_name = schema.get_field_name(field_addr.field).to_string();
-        let mut field_usage = FieldUsage::empty(field_name);
+        let mut field_usage = FieldUsage::empty(field_addr.field);
         field_usage.add_field_idx(field_addr.idx, byte_range.len().into());
         fields.push(field_usage);
     }

View File

@@ -108,7 +108,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
     /// Opens a file and returns a boxed `FileHandle`.
     ///
     /// Users of `Directory` should typically call `Directory::open_read(...)`,
-    /// while `Directory` implementer should implement `get_file_handle()`.
+    /// while `Directory` implementor should implement `get_file_handle()`.
     fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError>;
     /// Once a virtual file is open, its data may not

View File

@@ -1,5 +1,3 @@
-mod file_watcher;
 use std::collections::HashMap;
 use std::fmt;
 use std::fs::{self, File, OpenOptions};
@@ -9,7 +7,6 @@ use std::path::{Path, PathBuf};
 use std::sync::{Arc, RwLock, Weak};
 use common::StableDeref;
-use file_watcher::FileWatcher;
 use fs4::fs_std::FileExt;
 #[cfg(all(feature = "mmap", unix))]
 pub use memmap2::Advice;
@@ -21,6 +18,7 @@ use crate::core::META_FILEPATH;
 use crate::directory::error::{
     DeleteError, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
 };
+use crate::directory::file_watcher::FileWatcher;
 use crate::directory::{
     AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
     WatchCallback, WatchHandle, WritePtr,

View File

@@ -5,6 +5,7 @@ mod mmap_directory;
 mod directory;
 mod directory_lock;
+mod file_watcher;
 pub mod footer;
 mod managed_directory;
 mod ram_directory;

View File

@@ -40,8 +40,6 @@ pub trait DocSet: Send {
     /// of `DocSet` should support it.
     ///
     /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
-    ///
-    /// `target` has to be larger or equal to `.doc()` when calling `seek`.
     fn seek(&mut self, target: DocId) -> DocId {
         let mut doc = self.doc();
         debug_assert!(doc <= target);
@@ -51,33 +49,6 @@ pub trait DocSet: Send {
         doc
     }
-    /// Seeks to the target if possible and returns true if the target is in the DocSet.
-    ///
-    /// DocSets that already have an efficient `seek` method don't need to implement
-    /// `seek_into_the_danger_zone`. All wrapper DocSets should forward
-    /// `seek_into_the_danger_zone` to the underlying DocSet.
-    ///
-    /// ## API Behaviour
-    /// If `seek_into_the_danger_zone` is returning true, a call to `doc()` has to return target.
-    /// If `seek_into_the_danger_zone` is returning false, a call to `doc()` may return any doc
-    /// between the last doc that matched and target or a doc that is a valid next hit after
-    /// target. The DocSet is considered to be in an invalid state until
-    /// `seek_into_the_danger_zone` returns true again.
-    ///
-    /// `target` needs to be equal or larger than `doc` when in a valid state.
-    ///
-    /// Consecutive calls are not allowed to have decreasing `target` values.
-    ///
-    /// # Warning
-    /// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
-    fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
-        let current_doc = self.doc();
-        if current_doc < target {
-            self.seek(target);
-        }
-        self.doc() == target
-    }
     /// Fills a given mutable buffer with the next doc ids from the
     /// `DocSet`
     ///
@@ -123,15 +94,6 @@ pub trait DocSet: Send {
     /// which would be the number of documents in the DocSet.
     ///
     /// By default this returns `size_hint()`.
-    ///
-    /// DocSets may have vastly different cost depending on their type,
-    /// e.g. an intersection with 10 hits is much cheaper than
-    /// a phrase search with 10 hits, since it needs to load positions.
-    ///
-    /// ### Future Work
-    /// We may want to differentiate `DocSet` costs more more granular, e.g.
-    /// creation_cost, advance_cost, seek_cost on to get a good estimation
-    /// what query types to choose.
     fn cost(&self) -> u64 {
         self.size_hint() as u64
     }
@@ -175,10 +137,6 @@ impl DocSet for &mut dyn DocSet {
         (**self).seek(target)
     }
-    fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
-        (**self).seek_into_the_danger_zone(target)
-    }
     fn doc(&self) -> u32 {
         (**self).doc()
     }
@@ -211,11 +169,6 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
         unboxed.seek(target)
     }
-    fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
-        let unboxed: &mut TDocSet = self.borrow_mut();
-        unboxed.seek_into_the_danger_zone(target)
-    }
     fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
         let unboxed: &mut TDocSet = self.borrow_mut();
         unboxed.fill_buffer(buffer)
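For reference, the default `seek` kept at the top of this file is a linear scan via `advance`. Here it is in self-contained form against a toy doc set (the trait and the `TERMINATED` sentinel below are illustrative, not tantivy's actual definitions):

const TERMINATED: u32 = u32::MAX;

// Standalone sketch of the default `DocSet::seek`: advance linearly until
// reaching the first doc >= target.
trait DocSetSketch {
    fn advance(&mut self) -> u32;
    fn doc(&self) -> u32;
    fn seek(&mut self, target: u32) -> u32 {
        let mut doc = self.doc();
        debug_assert!(doc <= target);
        while doc < target {
            doc = self.advance();
        }
        doc
    }
}

struct VecDocSet {
    docs: Vec<u32>,
    cursor: usize,
}

impl DocSetSketch for VecDocSet {
    fn advance(&mut self) -> u32 {
        self.cursor += 1;
        self.doc()
    }
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
}

fn main() {
    let mut set = VecDocSet { docs: vec![1, 4, 9, 12], cursor: 0 };
    assert_eq!(set.seek(5), 9); // first doc >= 5
    assert_eq!(set.seek(12), 12); // exact hit
    assert_eq!(set.seek(13), TERMINATED); // exhausted
}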

View File

@@ -104,7 +104,7 @@ pub enum TantivyError {
     #[error("{0:?}")]
     IncompatibleIndex(Incompatibility),
     /// An internal error occurred. This is are internal states that should not be reached.
-    /// e.g. a datastructure is incorrectly initialized.
+    /// e.g. a datastructure is incorrectly inititalized.
     #[error("Internal error: '{0}'")]
     InternalError(String),
     #[error("Deserialize error: {0}")]

View File

@@ -726,22 +726,22 @@ mod tests {
             .column_opt::<DateTime>("multi_date")
             .unwrap()
             .unwrap();
+        let mut dates = Vec::new();
         {
             assert_eq!(date_fast_field.get_val(0).into_timestamp_nanos(), 1i64);
-            let dates: Vec<DateTime> = dates_fast_field.values_for_doc(0u32).collect();
+            dates_fast_field.fill_vals(0u32, &mut dates);
             assert_eq!(dates.len(), 2);
             assert_eq!(dates[0].into_timestamp_nanos(), 2i64);
             assert_eq!(dates[1].into_timestamp_nanos(), 3i64);
         }
         {
             assert_eq!(date_fast_field.get_val(1).into_timestamp_nanos(), 4i64);
-            let dates: Vec<DateTime> = dates_fast_field.values_for_doc(1u32).collect();
+            dates_fast_field.fill_vals(1u32, &mut dates);
             assert!(dates.is_empty());
         }
         {
             assert_eq!(date_fast_field.get_val(2).into_timestamp_nanos(), 0i64);
-            let dates: Vec<DateTime> = dates_fast_field.values_for_doc(2u32).collect();
+            dates_fast_field.fill_vals(2u32, &mut dates);
             assert_eq!(dates.len(), 2);
             assert_eq!(dates[0].into_timestamp_nanos(), 5i64);
             assert_eq!(dates[1].into_timestamp_nanos(), 6i64);

View File

@@ -8,7 +8,7 @@ use columnar::{
 };
 use common::ByteCount;
-use crate::core::json_utils::{encode_column_name, json_path_sep_to_dot};
+use crate::core::json_utils::encode_column_name;
 use crate::directory::FileSlice;
 use crate::schema::{Field, FieldEntry, FieldType, Schema};
 use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
@@ -39,15 +39,19 @@ impl FastFieldReaders {
         self.resolve_column_name_given_default_field(column_name, default_field_opt)
     }
-    pub(crate) fn space_usage(&self) -> io::Result<PerFieldSpaceUsage> {
+    pub(crate) fn space_usage(&self, schema: &Schema) -> io::Result<PerFieldSpaceUsage> {
         let mut per_field_usages: Vec<FieldUsage> = Default::default();
-        for (mut field_name, column_handle) in self.columnar.iter_columns()? {
-            json_path_sep_to_dot(&mut field_name);
-            let space_usage = column_handle.space_usage()?;
-            let mut field_usage = FieldUsage::empty(field_name);
-            field_usage.set_column_usage(space_usage);
+        for (field, field_entry) in schema.fields() {
+            let column_handles = self.columnar.read_columns(field_entry.name())?;
+            let num_bytes: ByteCount = column_handles
+                .iter()
+                .map(|column_handle| column_handle.num_bytes())
+                .sum();
+            let mut field_usage = FieldUsage::empty(field);
+            field_usage.add_field_idx(0, num_bytes);
             per_field_usages.push(field_usage);
         }
+        // TODO fix space usage for JSON fields.
         Ok(PerFieldSpaceUsage::new(per_field_usages))
     }
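The right-hand loop above produces one `FieldUsage` per schema field by summing the sizes of every column belonging to that field, which loses per-JSON-path detail (hence the TODO). The shape of that aggregation, in isolation:

use std::collections::BTreeMap;

// Standalone sketch of the per-field aggregation: sum the sizes of all
// columns that map to the same field.
fn space_per_field(columns: &[(&str, u64)]) -> BTreeMap<String, u64> {
    let mut usage: BTreeMap<String, u64> = BTreeMap::new();
    for (field, num_bytes) in columns {
        *usage.entry((*field).to_string()).or_insert(0) += num_bytes;
    }
    usage
}

fn main() {
    // A JSON field may expand into several columns; they are folded together.
    let columns = [("price", 1024u64), ("attrs", 512), ("attrs", 256)];
    let usage = space_per_field(&columns);
    assert_eq!(usage["price"], 1024);
    assert_eq!(usage["attrs"], 768);
}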

View File

@@ -2,7 +2,7 @@ use std::sync::Arc;
 use super::{fieldnorm_to_id, id_to_fieldnorm};
 use crate::directory::{CompositeFile, FileSlice, OwnedBytes};
-use crate::schema::{Field, Schema};
+use crate::schema::Field;
 use crate::space_usage::PerFieldSpaceUsage;
 use crate::DocId;
@@ -37,8 +37,8 @@ impl FieldNormReaders {
 }
 /// Return a break down of the space usage per field.
-pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
-    self.data.space_usage(schema)
+pub fn space_usage(&self) -> PerFieldSpaceUsage {
+    self.data.space_usage()
 }
 /// Returns a handle to inner file

View File

@@ -13,9 +13,9 @@ use crate::store::Compressor;
 use crate::{Inventory, Opstamp, TrackedObject};
 #[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct DeleteMeta {
+struct DeleteMeta {
     num_deleted_docs: u32,
-    pub opstamp: Opstamp,
+    opstamp: Opstamp,
 }
 #[derive(Clone, Default)]
@@ -213,7 +213,7 @@
 struct InnerSegmentMeta {
     segment_id: SegmentId,
     max_doc: u32,
-    pub deletes: Option<DeleteMeta>,
+    deletes: Option<DeleteMeta>,
     /// If you want to avoid the SegmentComponent::TempStore file to be covered by
     /// garbage collection and deleted, set this to true. This is used during merge.
     #[serde(skip)]
@@ -276,14 +276,13 @@
 }
 /// The order to sort by
-#[derive(Clone, Copy, Debug, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub enum Order {
     /// Ascending Order
     Asc,
     /// Descending Order
     Desc,
 }
 impl Order {
     /// return if the Order is ascending
     pub fn is_asc(&self) -> bool {
@@ -404,10 +403,7 @@ mod tests {
         schema_builder.build()
     };
     let index_metas = IndexMeta {
-        index_settings: IndexSettings {
-            docstore_compression: Compressor::None,
-            ..Default::default()
-        },
+        index_settings: IndexSettings::default(),
         segments: Vec::new(),
         schema,
         opstamp: 0u64,
@@ -416,7 +412,7 @@ mod tests {
     let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
     assert_eq!(
         json,
-        r#"{"index_settings":{"docstore_compression":"none","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
+        r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
     );
     let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
@@ -497,8 +493,6 @@ mod tests {
     #[test]
     #[cfg(feature = "lz4-compression")]
     fn test_index_settings_default() {
-        use crate::store::Compressor;
         let mut index_settings = IndexSettings::default();
         assert_eq!(
             index_settings,

View File

@@ -46,7 +46,7 @@ impl Segment {
     ///
     /// This method is only used when updating `max_doc` from 0
     /// as we finalize a fresh new segment.
-    pub fn with_max_doc(self, max_doc: u32) -> Segment {
+    pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
         Segment {
             index: self.index,
             meta: self.meta.with_max_doc(max_doc),

Some files were not shown because too many files have changed in this diff.