Compare commits

..

15 Commits

Author SHA1 Message Date
Paul Masurel
7af79305fa introducing tantivy codec 2026-01-21 16:01:16 +01:00
Paul Masurel
1d5fe6bc7c blop 2026-01-13 19:06:34 +01:00
Paul Masurel
d768b2a491 blop 2026-01-13 17:03:12 +01:00
Paul Masurel
7453df8db3 postings writer enum 2026-01-13 15:24:03 +01:00
Paul Masurel
ba6abba20a First stab at introducing codecs 2026-01-12 19:41:06 +01:00
Paul Masurel
d128e5c2a2 first stab at codec 2026-01-12 15:59:43 +01:00
Paul Masurel
e6d062bf2d Minor refactoring in PostingsSerializer
Removes the Write generics argument in PostingsSerializer.
This removes useless generic.
Prepares the path for codecs.
Removes one useless CountingWrite layer.
etc.
2026-01-12 11:06:45 +01:00
Paul Masurel
d904630e6a Bumped bitpacking version (#2797)
Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
2026-01-08 15:50:22 +01:00
PSeitz-dd
65b5a1a306 one collector per agg request instead per bucket (#2759)
* improve bench

* add more tests for new collection type

* one collector per agg request instead per bucket

In this refactoring a collector knows in which bucket of the parent
their data is in. This allows to convert the previous approach of one
collector per bucket to one collector per request.

low card bucket optimization

* reduce dynamic dispatch, faster term agg

* use radix map, fix prepare_max_bucket

use paged term map in term agg
use special no sub agg term map impl

* specialize columntype in stats

* remove stacktrace bloat, use &mut helper

increase cache to 2048

* cleanup

remove clone
move data in term req, single doc opt for stats

* add comment

* share column block accessor

* simplify fetch block in column_block_accessor

* split subaggcache into two trait impls

* move partitions to heap

* fix name, add comment

---------

Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
2026-01-06 11:50:55 +01:00
ChangRui-Ryan
db2ecc6057 fix Column.first method parameter type (#2792) 2026-01-05 10:03:01 +01:00
Paul Masurel
77505c3d03 Making stemming optional. (#2791)
Fixed code and CI to run on no default features.

Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
2026-01-02 12:40:42 +01:00
PSeitz
735c588f4f fix union performance regression (#2790)
* add inlines

* fix union performance regression

Remove unwrap from hotpath generates better assembly.

closes #2788
2026-01-02 12:06:51 +01:00
PSeitz
242a1531bf fix flaky test (#2784)
Signed-off-by: Pascal Seitz <pascal.seitz@gmail.com>
2026-01-02 11:30:51 +01:00
trinity-1686a
6443b63177 document 1bit hole and some queries supporting running with just fastfield (#2779)
* add small doc on some queries using fast field when not indexed

* document 1 unused bit in skiplist
2026-01-02 10:32:37 +01:00
Stu Hood
4987495ee4 Add an erased SortKeyComputer to sort on types which are not known until runtime (#2770)
* Remove PartialOrd bound on compared values.

* Fix declared `SortKey` type of `impl<..> SortKeyComputer for (HeadSortKeyComputer, TailSortKeyComputer)`

* Add a SortByOwnedValue implementation to provide a type-erased column.

* Add support for comparing mismatched `OwnedValue` types.

* Support JSON columns.

* Refer to https://github.com/quickwit-oss/tantivy/issues/2776

* Rename to `SortByErasedType`.

* Comment on transitivity.

Co-authored-by: Paul Masurel <paul@quickwit.io>

* Fix clippy warnings in new code.

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2026-01-02 10:28:47 +01:00
125 changed files with 5433 additions and 3613 deletions

View File

@@ -39,11 +39,11 @@ jobs:
- name: Check Formatting
run: cargo +nightly fmt --all -- --check
- name: Check Stable Compilation
run: cargo build --all-features
- name: Check Bench Compilation
run: cargo +nightly bench --no-run --profile=dev --all-features
@@ -59,10 +59,10 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]
features:
- { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints,stemmer" }
- { label: "quickwit", flags: "mmap,quickwit,failpoints" }
- { label: "none", flags: "" }
name: test-${{ matrix.features.label}}
@@ -80,7 +80,21 @@ jobs:
- uses: Swatinem/rust-cache@v2
- name: Run tests
run: cargo +stable nextest run --features ${{ matrix.features.flags }} --verbose --workspace
run: |
# if matrix.feature.flags is empty then run on --lib to avoid compiling examples
# (as most of them rely on mmap) otherwise run all
if [ -z "${{ matrix.features.flags }}" ]; then
cargo +stable nextest run --lib --no-default-features --verbose --workspace
else
cargo +stable nextest run --features ${{ matrix.features.flags }} --no-default-features --verbose --workspace
fi
- name: Run doctests
run: cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
run: |
# if matrix.feature.flags is empty then run on --lib to avoid compiling examples
# (as most of them rely on mmap) otherwise run all
if [ -z "${{ matrix.features.flags }}" ]; then
echo "no doctest for no feature flag"
else
cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
fi

View File

@@ -37,9 +37,9 @@ fs4 = { version = "0.13.1", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
rust-stemmers = { version = "1.2.0", optional = true }
downcast-rs = "2.0.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
bitpacking = { version = "0.9.3", default-features = false, features = [
"bitpacker4x",
] }
census = "0.4.2"
@@ -113,7 +113,8 @@ debug-assertions = true
overflow-checks = true
[features]
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression"]
default = ["mmap", "stopwords", "lz4-compression", "columnar-zstd-compression", "stemmer"]
stemmer = ["rust-stemmers"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []

View File

@@ -54,33 +54,33 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, stats_f64);
register!(group, extendedstats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_7);
register!(group, terms_all_unique);
register!(group, terms_many);
register!(group, terms_150_000);
register!(group, terms_many_top_1000);
register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits);
register!(group, terms_all_unique_with_avg_sub_agg);
register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_few_with_avg_sub_agg);
register!(group, terms_status_with_avg_sub_agg);
register!(group, terms_status);
register!(group, terms_few_with_histogram);
register!(group, terms_status_with_histogram);
register!(group, terms_zipf_1000);
register!(group, terms_zipf_1000_with_histogram);
register!(group, terms_zipf_1000_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
register!(group, cardinality_agg);
register!(group, terms_few_with_cardinality_agg);
register!(group, terms_status_with_cardinality_agg);
register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_few);
register!(group, range_agg_with_term_agg_status);
register!(group, range_agg_with_term_agg_many);
register!(group, histogram);
register!(group, histogram_hard_bounds);
register!(group, histogram_with_avg_sub_agg);
register!(group, histogram_with_term_agg_few);
register!(group, histogram_with_term_agg_status);
register!(group, avg_and_range_with_avg_sub_agg);
// Filter aggregation benchmarks
@@ -159,10 +159,10 @@ fn cardinality_agg(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_few_with_cardinality_agg(index: &Index) {
fn terms_status_with_cardinality_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms" },
"terms": { "field": "text_few_terms_status" },
"aggs": {
"cardinality": {
"cardinality": {
@@ -175,13 +175,7 @@ fn terms_few_with_cardinality_agg(index: &Index) {
execute_agg(index, agg_req);
}
fn terms_few(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_status(index: &Index) {
fn terms_7(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms_status" } },
});
@@ -194,7 +188,7 @@ fn terms_all_unique(index: &Index) {
execute_agg(index, agg_req);
}
fn terms_many(index: &Index) {
fn terms_150_000(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
});
@@ -253,17 +247,6 @@ fn terms_all_unique_with_avg_sub_agg(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_few_with_histogram(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms" },
"aggs": {
"histo": {"histogram": { "field": "score_f64", "interval": 10 }}
}
}
});
execute_agg(index, agg_req);
}
fn terms_status_with_histogram(index: &Index) {
let agg_req = json!({
"my_texts": {
@@ -276,17 +259,18 @@ fn terms_status_with_histogram(index: &Index) {
execute_agg(index, agg_req);
}
fn terms_few_with_avg_sub_agg(index: &Index) {
fn terms_zipf_1000_with_histogram(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms" },
"terms": { "field": "text_1000_terms_zipf" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
"histo": {"histogram": { "field": "score_f64", "interval": 10 }}
}
},
}
});
execute_agg(index, agg_req);
}
fn terms_status_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
@@ -299,6 +283,25 @@ fn terms_status_with_avg_sub_agg(index: &Index) {
execute_agg(index, agg_req);
}
fn terms_zipf_1000_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_1000_terms_zipf" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn terms_zipf_1000(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_1000_terms_zipf" } },
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
@@ -354,7 +357,7 @@ fn range_agg_with_avg_sub_agg(index: &Index) {
execute_agg(index, agg_req);
}
fn range_agg_with_term_agg_few(index: &Index) {
fn range_agg_with_term_agg_status(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
@@ -369,7 +372,7 @@ fn range_agg_with_term_agg_few(index: &Index) {
]
},
"aggs": {
"my_texts": { "terms": { "field": "text_few_terms" } },
"my_texts": { "terms": { "field": "text_few_terms_status" } },
}
},
});
@@ -425,12 +428,12 @@ fn histogram_with_avg_sub_agg(index: &Index) {
});
execute_agg(index, agg_req);
}
fn histogram_with_term_agg_few(index: &Index) {
fn histogram_with_term_agg_status(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 10 },
"aggs": {
"my_texts": { "terms": { "field": "text_few_terms" } }
"my_texts": { "terms": { "field": "text_few_terms_status" } }
}
}
});
@@ -475,6 +478,13 @@ fn get_collector(agg_req: Aggregations) -> AggregationCollector {
}
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
// Flag to use existing index
let reuse_index = std::env::var("REUSE_AGG_BENCH_INDEX").is_ok();
if reuse_index && std::path::Path::new("agg_bench").exists() {
return Index::open_in_dir("agg_bench");
}
// crreate dir
std::fs::create_dir_all("agg_bench")?;
let mut schema_builder = Schema::builder();
let text_fieldtype = tantivy::schema::TextOptions::default()
.set_indexing_options(
@@ -486,24 +496,44 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let text_field_all_unique_terms =
schema_builder.add_text_field("text_all_unique_terms", STRING | FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let text_field_few_terms_status =
schema_builder.add_text_field("text_few_terms_status", STRING | FAST);
let text_field_1000_terms_zipf =
schema_builder.add_text_field("text_1000_terms_zipf", STRING | FAST);
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
// Approximate production log proportions: INFO dominant, WARN and DEBUG occasional, ERROR rare.
let log_level_distribution = WeightedIndex::new([80u32, 3, 12, 5]).unwrap();
// use tmp dir
let index = if reuse_index {
Index::create_in_dir("agg_bench", schema_builder.build())?
} else {
Index::create_from_tempdir(schema_builder.build())?
};
// Approximate log proportions
let status_field_data = [
("INFO", 8000),
("ERROR", 300),
("WARN", 1200),
("DEBUG", 500),
("OK", 500),
("CRITICAL", 20),
("EMERGENCY", 1),
];
let log_level_distribution =
WeightedIndex::new(status_field_data.iter().map(|item| item.1)).unwrap();
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{num}"))
.collect::<Vec<_>>();
// Prepare 1000 unique terms sampled using a Zipf distribution.
// Exponent ~1.1 approximates top-20 terms covering around ~20%.
let terms_1000: Vec<String> = (1..=1000).map(|i| format!("term_{i}")).collect();
let zipf_1000 = rand_distr::Zipf::new(1000, 1.1f64).unwrap();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
@@ -513,8 +543,12 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
let log_level_sample_a = few_terms_data[log_level_distribution.sample(&mut rng)];
let log_level_sample_b = few_terms_data[log_level_distribution.sample(&mut rng)];
let log_level_sample_a = status_field_data[log_level_distribution.sample(&mut rng)].0;
let log_level_sample_b = status_field_data[log_level_distribution.sample(&mut rng)].0;
let idx_a = zipf_1000.sample(&mut rng) as usize - 1;
let idx_b = zipf_1000.sample(&mut rng) as usize - 1;
let term_1000_a = &terms_1000[idx_a];
let term_1000_b = &terms_1000[idx_b];
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
@@ -524,10 +558,10 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
text_field_all_unique_terms => "coolo",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms_status => log_level_sample_a,
text_field_few_terms_status => log_level_sample_b,
text_field_1000_terms_zipf => term_1000_a.as_str(),
text_field_1000_terms_zipf => term_1000_b.as_str(),
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
@@ -554,8 +588,8 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
json_field => json,
text_field_all_unique_terms => format!("unique_term_{}", rng.gen::<u64>()),
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms_status => few_terms_data[log_level_distribution.sample(&mut rng)],
text_field_few_terms_status => status_field_data[log_level_distribution.sample(&mut rng)].0,
text_field_1000_terms_zipf => terms_1000[zipf_1000.sample(&mut rng) as usize - 1].as_str(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
@@ -607,7 +641,7 @@ fn filter_agg_all_query_with_sub_aggs(index: &Index) {
"avg_score": { "avg": { "field": "score" } },
"stats_score": { "stats": { "field": "score_f64" } },
"terms_text": {
"terms": { "field": "text_few_terms" }
"terms": { "field": "text_few_terms_status" }
}
}
}
@@ -623,7 +657,7 @@ fn filter_agg_term_query_with_sub_aggs(index: &Index) {
"avg_score": { "avg": { "field": "score" } },
"stats_score": { "stats": { "field": "score_f64" } },
"terms_text": {
"terms": { "field": "text_few_terms" }
"terms": { "field": "text_few_terms_status" }
}
}
}

View File

@@ -29,12 +29,20 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
}
}
#[inline]
pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
pub fn fetch_block_with_missing(
&mut self,
docs: &[u32],
accessor: &Column<T>,
missing: Option<T>,
) {
self.fetch_block(docs, accessor);
// no missing values
if accessor.index.get_cardinality().is_full() {
return;
}
let Some(missing) = missing else {
return;
};
// We can compare docid_cache length with docs to find missing docs
// For multi value columns we can't rely on the length and always need to scan

View File

@@ -85,8 +85,8 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
}
#[inline]
pub fn first(&self, row_id: RowId) -> Option<T> {
self.values_for_doc(row_id).next()
pub fn first(&self, doc_id: DocId) -> Option<T> {
self.values_for_doc(doc_id).next()
}
/// Load the first value for each docid in the provided slice.

View File

@@ -60,7 +60,7 @@ fn test_dataframe_writer_bool() {
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!();
};
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
let vals: Vec<Option<bool>> = (0..5).map(|doc_id| bool_col.first(doc_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}
@@ -108,7 +108,7 @@ fn test_dataframe_writer_ip_addr() {
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!();
};
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|doc_id| ip_col.first(doc_id)).collect();
assert_eq!(
&vals,
&[
@@ -169,7 +169,7 @@ fn test_dictionary_encoded_str() {
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!();
};
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
let index: Vec<Option<u64>> = (0..5).map(|doc_id| str_col.ords().first(doc_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5);
let mut term_buffer = String::new();
@@ -204,7 +204,7 @@ fn test_dictionary_encoded_bytes() {
panic!();
};
let index: Vec<Option<u64>> = (0..5)
.map(|row_id| bytes_col.ords().first(row_id))
.map(|doc_id| bytes_col.ords().first(doc_id))
.collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(bytes_col.num_rows(), 5);

View File

@@ -178,9 +178,15 @@ impl TinySet {
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: u64,
max_value: u32,
}
impl std::fmt::Debug for BitSet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BitSet")
.field("max_value", &self.max_value)
.finish()
}
}
fn num_buckets(max_val: u32) -> u32 {
max_val.div_ceil(64u32)
@@ -204,7 +210,6 @@ impl BitSet {
let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet {
tinysets: tinybitsets,
len: 0,
max_value,
}
}
@@ -222,7 +227,6 @@ impl BitSet {
}
BitSet {
tinysets: tinybitsets,
len: max_value as u64,
max_value,
}
}
@@ -241,17 +245,19 @@ impl BitSet {
/// Intersect with tinysets
fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
self.len = 0;
for (left, right) in self.tinysets.iter_mut().zip(other) {
*left = left.intersect(right);
self.len += left.len() as u64;
}
}
/// Returns the number of elements in the `BitSet`.
#[inline]
pub fn len(&self) -> usize {
self.len as usize
self.tinysets
.iter()
.copied()
.map(|tinyset| tinyset.len())
.sum::<u32>() as usize
}
/// Inserts an element in the `BitSet`
@@ -260,7 +266,7 @@ impl BitSet {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += u64::from(self.tinysets[higher as usize].insert_mut(lower));
self.tinysets[higher as usize].insert_mut(lower);
}
/// Inserts an element in the `BitSet`
@@ -269,7 +275,7 @@ impl BitSet {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len -= u64::from(self.tinysets[higher as usize].remove_mut(lower));
self.tinysets[higher as usize].remove_mut(lower);
}
/// Returns true iff the elements is in the `BitSet`.

View File

@@ -91,46 +91,10 @@ fn main() -> tantivy::Result<()> {
}
}
// A `Term` is a text token associated with a field.
// Let's go through all docs containing the term `title:the` and access their position
let term_the = Term::from_field_text(title, "the");
// Some other powerful operations (especially `.skip_to`) may be useful to consume these
// Some other powerful operations (especially `.seek`) may be useful to consume these
// posting lists rapidly.
// You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
// and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait
// Also, for some VERY specific high performance use case like an OLAP analysis of logs,
// you can get better performance by accessing directly the blocks of doc ids.
for segment_reader in searcher.segment_readers() {
// A segment contains different data structure.
// Inverted index stands for the combination of
// - the term dictionary
// - the inverted lists associated with each terms and their positions
let inverted_index = segment_reader.inverted_index(title)?;
// This segment posting object is like a cursor over the documents matching the term.
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
// frequencies and positions.
//
// If you don't need all this information, you may get better performance by decompressing
// less information.
if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
{
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
// Once again these docs MAY contains deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {docs:?}");
block_segment_postings.advance();
}
}
}
Ok(())
}

View File

@@ -1,4 +1,4 @@
use columnar::{Column, ColumnType, StrColumn};
use columnar::{Column, ColumnBlockAccessor, ColumnType, StrColumn};
use common::BitSet;
use rustc_hash::FxHashSet;
use serde::Serialize;
@@ -10,16 +10,16 @@ use crate::aggregation::accessor_helpers::{
};
use crate::aggregation::agg_req::{Aggregation, AggregationVariants, Aggregations};
use crate::aggregation::bucket::{
FilterAggReqData, HistogramAggReqData, HistogramBounds, IncludeExcludeParam,
MissingTermAggReqData, RangeAggReqData, SegmentFilterCollector, SegmentHistogramCollector,
SegmentRangeCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
build_segment_filter_collector, build_segment_range_collector, FilterAggReqData,
HistogramAggReqData, HistogramBounds, IncludeExcludeParam, MissingTermAggReqData,
RangeAggReqData, SegmentHistogramCollector, TermMissingAgg, TermsAggReqData, TermsAggregation,
TermsAggregationInternal,
};
use crate::aggregation::metric::{
AverageAggregation, CardinalityAggReqData, CardinalityAggregationReq, CountAggregation,
ExtendedStatsAggregation, MaxAggregation, MetricAggReqData, MinAggregation,
SegmentCardinalityCollector, SegmentExtendedStatsCollector, SegmentPercentilesCollector,
SegmentStatsCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData,
build_segment_stats_collector, AverageAggregation, CardinalityAggReqData,
CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation, MaxAggregation,
MetricAggReqData, MinAggregation, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
SegmentPercentilesCollector, StatsAggregation, StatsType, SumAggregation, TopHitsAggReqData,
TopHitsSegmentCollector,
};
use crate::aggregation::segment_agg_result::{
@@ -35,6 +35,7 @@ pub struct AggregationsSegmentCtx {
/// Request data for each aggregation type.
pub per_request: PerRequestAggSegCtx,
pub context: AggContextParams,
pub column_block_accessor: ColumnBlockAccessor<u64>,
}
impl AggregationsSegmentCtx {
@@ -107,21 +108,14 @@ impl AggregationsSegmentCtx {
.as_deref()
.expect("range_req_data slot is empty (taken)")
}
#[inline]
pub(crate) fn get_filter_req_data(&self, idx: usize) -> &FilterAggReqData {
self.per_request.filter_req_data[idx]
.as_deref()
.expect("filter_req_data slot is empty (taken)")
}
// ---------- mutable getters ----------
#[inline]
pub(crate) fn get_term_req_data_mut(&mut self, idx: usize) -> &mut TermsAggReqData {
self.per_request.term_req_data[idx]
.as_deref_mut()
.expect("term_req_data slot is empty (taken)")
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
&mut self.per_request.stats_metric_req_data[idx]
}
#[inline]
pub(crate) fn get_cardinality_req_data_mut(
&mut self,
@@ -129,10 +123,7 @@ impl AggregationsSegmentCtx {
) -> &mut CardinalityAggReqData {
&mut self.per_request.cardinality_req_data[idx]
}
#[inline]
pub(crate) fn get_metric_req_data_mut(&mut self, idx: usize) -> &mut MetricAggReqData {
&mut self.per_request.stats_metric_req_data[idx]
}
#[inline]
pub(crate) fn get_histogram_req_data_mut(&mut self, idx: usize) -> &mut HistogramAggReqData {
self.per_request.histogram_req_data[idx]
@@ -142,21 +133,6 @@ impl AggregationsSegmentCtx {
// ---------- take / put (terms, histogram, range) ----------
/// Move out the boxed Terms request at `idx`, leaving `None`.
#[inline]
pub(crate) fn take_term_req_data(&mut self, idx: usize) -> Box<TermsAggReqData> {
self.per_request.term_req_data[idx]
.take()
.expect("term_req_data slot is empty (taken)")
}
/// Put back a Terms request into an empty slot at `idx`.
#[inline]
pub(crate) fn put_back_term_req_data(&mut self, idx: usize, value: Box<TermsAggReqData>) {
debug_assert!(self.per_request.term_req_data[idx].is_none());
self.per_request.term_req_data[idx] = Some(value);
}
/// Move out the boxed Histogram request at `idx`, leaving `None`.
#[inline]
pub(crate) fn take_histogram_req_data(&mut self, idx: usize) -> Box<HistogramAggReqData> {
@@ -320,6 +296,7 @@ impl PerRequestAggSegCtx {
/// Convert the aggregation tree into a serializable struct representation.
/// Each node contains: { name, kind, children }.
#[allow(dead_code)]
pub fn get_view_tree(&self) -> Vec<AggTreeViewNode> {
fn node_to_view(node: &AggRefNode, pr: &PerRequestAggSegCtx) -> AggTreeViewNode {
let mut children: Vec<AggTreeViewNode> =
@@ -345,12 +322,19 @@ impl PerRequestAggSegCtx {
pub(crate) fn build_segment_agg_collectors_root(
req: &mut AggregationsSegmentCtx,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
build_segment_agg_collectors(req, &req.per_request.agg_tree.clone())
build_segment_agg_collectors_generic(req, &req.per_request.agg_tree.clone())
}
pub(crate) fn build_segment_agg_collectors(
req: &mut AggregationsSegmentCtx,
nodes: &[AggRefNode],
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
build_segment_agg_collectors_generic(req, nodes)
}
fn build_segment_agg_collectors_generic(
req: &mut AggregationsSegmentCtx,
nodes: &[AggRefNode],
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
let mut collectors = Vec::new();
for node in nodes.iter() {
@@ -388,6 +372,8 @@ pub(crate) fn build_segment_agg_collector(
Ok(Box::new(SegmentCardinalityCollector::from_req(
req_data.column_type,
node.idx_in_req_data,
req_data.accessor.clone(),
req_data.missing_value_for_accessor,
)))
}
AggKind::StatsKind(stats_type) => {
@@ -398,20 +384,21 @@ pub(crate) fn build_segment_agg_collector(
| StatsType::Count
| StatsType::Max
| StatsType::Min
| StatsType::Stats => Ok(Box::new(SegmentStatsCollector::from_req(
node.idx_in_req_data,
))),
StatsType::ExtendedStats(sigma) => {
Ok(Box::new(SegmentExtendedStatsCollector::from_req(
req_data.field_type,
sigma,
node.idx_in_req_data,
req_data.missing,
)))
}
StatsType::Percentiles => Ok(Box::new(
SegmentPercentilesCollector::from_req_and_validate(node.idx_in_req_data)?,
| StatsType::Stats => build_segment_stats_collector(req_data),
StatsType::ExtendedStats(sigma) => Ok(Box::new(
SegmentExtendedStatsCollector::from_req(req_data, sigma),
)),
StatsType::Percentiles => {
let req_data = req.get_metric_req_data_mut(node.idx_in_req_data);
Ok(Box::new(
SegmentPercentilesCollector::from_req_and_validate(
req_data.field_type,
req_data.missing_u64,
req_data.accessor.clone(),
node.idx_in_req_data,
),
))
}
}
}
AggKind::TopHits => {
@@ -428,12 +415,8 @@ pub(crate) fn build_segment_agg_collector(
AggKind::DateHistogram => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
req, node,
)?)),
AggKind::Range => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
req, node,
)?)),
AggKind::Filter => Ok(Box::new(SegmentFilterCollector::from_req_and_validate(
req, node,
)?)),
AggKind::Range => Ok(build_segment_range_collector(req, node)?),
AggKind::Filter => build_segment_filter_collector(req, node),
}
}
@@ -493,6 +476,7 @@ pub(crate) fn build_aggregations_data_from_req(
let mut data = AggregationsSegmentCtx {
per_request: Default::default(),
context,
column_block_accessor: ColumnBlockAccessor::default(),
};
for (name, agg) in aggs.iter() {
@@ -521,9 +505,9 @@ fn build_nodes(
let idx_in_req_data = data.push_range_req_data(RangeAggReqData {
accessor,
field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(),
req: range_req.clone(),
is_top_level,
});
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
Ok(vec![AggRefNode {
@@ -541,9 +525,7 @@ fn build_nodes(
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
accessor,
field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(),
sub_aggregation_blueprint: None,
req: histo_req.clone(),
is_date_histogram: false,
bounds: HistogramBounds {
@@ -568,9 +550,7 @@ fn build_nodes(
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
accessor,
field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(),
sub_aggregation_blueprint: None,
req: histo_req,
is_date_histogram: true,
bounds: HistogramBounds {
@@ -650,7 +630,6 @@ fn build_nodes(
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
accessor,
field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(),
collecting_for,
missing: *missing,
@@ -678,7 +657,6 @@ fn build_nodes(
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
accessor,
field_type,
column_block_accessor: Default::default(),
name: agg_name.to_string(),
collecting_for: StatsType::Percentiles,
missing: percentiles_req.missing,
@@ -753,6 +731,7 @@ fn build_nodes(
segment_reader: reader.clone(),
evaluator,
matching_docs_buffer,
is_top_level,
});
let children = build_children(&req.sub_aggregation, reader, segment_ordinal, data)?;
Ok(vec![AggRefNode {
@@ -895,7 +874,7 @@ fn build_terms_or_cardinality_nodes(
});
}
// Add one node per accessor to mirror previous behavior and allow per-type missing handling.
// Add one node per accessor
for (accessor, column_type) in column_and_types {
let missing_value_for_accessor = if use_special_missing_agg {
None
@@ -926,11 +905,8 @@ fn build_terms_or_cardinality_nodes(
column_type,
str_dict_column: str_dict_column.clone(),
missing_value_for_accessor,
column_block_accessor: Default::default(),
name: agg_name.to_string(),
req: TermsAggregationInternal::from_req(req),
// Will be filled later when building collectors
sub_aggregation_blueprint: None,
sug_aggregations: sub_aggs.clone(),
allowed_term_ids,
is_top_level,
@@ -943,7 +919,6 @@ fn build_terms_or_cardinality_nodes(
column_type,
str_dict_column: str_dict_column.clone(),
missing_value_for_accessor,
column_block_accessor: Default::default(),
name: agg_name.to_string(),
req: req.clone(),
});

View File

@@ -2,15 +2,441 @@ use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
use crate::aggregation::collector::AggregationCollector;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
use crate::aggregation::DistributedAggregationCollector;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term};
// The following tests ensure that each bucket aggregation type correctly functions as a
// sub-aggregation of another bucket aggregation in two scenarios:
// 1) The parent has more buckets than the child sub-aggregation
// 2) The child sub-aggregation has more buckets than the parent
//
// These scenarios exercise the bucket id mapping and sub-aggregation routing logic.
#[test]
fn test_terms_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with 4 buckets
// Child: terms on text -> 2 buckets
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
// Exact expected structure and counts
assert_eq!(
res["parent_range"]["buckets"],
json!([
{
"key": "*-3",
"doc_count": 1,
"to": 3.0,
"child_terms": {
"buckets": [
{"doc_count": 1, "key": "cool"}
],
"sum_other_doc_count": 0
}
},
{
"key": "3-7",
"doc_count": 3,
"from": 3.0,
"to": 7.0,
"child_terms": {
"buckets": [
{"doc_count": 2, "key": "cool"},
{"doc_count": 1, "key": "nohit"}
],
"sum_other_doc_count": 0
}
},
{
"key": "7-20",
"doc_count": 3,
"from": 7.0,
"to": 20.0,
"child_terms": {
"buckets": [
{"doc_count": 3, "key": "cool"}
],
"sum_other_doc_count": 0
}
},
{
"key": "20-*",
"doc_count": 2,
"from": 20.0,
"child_terms": {
"buckets": [
{"doc_count": 1, "key": "cool"},
{"doc_count": 1, "key": "nohit"}
],
"sum_other_doc_count": 0
}
}
])
);
// Case B: child has more buckets than parent
// Parent: histogram on score with large interval -> 1 bucket
// Child: terms on text -> 2 buckets (cool/nohit)
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_hist": {
"histogram": {"field": "score", "interval": 100.0},
"aggs": {
"child_terms": {"terms": {"field": "text", "order": {"_key": "asc"}}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
assert_eq!(
res["parent_hist"],
json!({
"buckets": [
{
"key": 0.0,
"doc_count": 9,
"child_terms": {
"buckets": [
{"doc_count": 7, "key": "cool"},
{"doc_count": 2, "key": "nohit"}
],
"sum_other_doc_count": 0
}
}
]
})
);
Ok(())
}
#[test]
fn test_range_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with 5 buckets
// Child: coarse range with 3 buckets
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 11.0},
{"from": 11.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 20.0}
]
}
}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
assert_eq!(
res["parent_range"]["buckets"],
json!([
{"key": "*-3", "doc_count": 1, "to": 3.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 1, "to": 3.0},
{"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 3, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 1, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 2, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 0, "from": 20.0}
]}
},
{"key": "20-*", "doc_count": 2, "from": 20.0,
"child_range": {"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-20", "doc_count": 0, "from": 3.0, "to": 20.0},
{"key": "20-*", "doc_count": 2, "from": 20.0}
]}
}
])
);
// Case B: child has more buckets than parent
// Parent: terms on text (2 buckets)
// Child: range with 4 buckets
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_terms": {
"terms": {"field": "text"},
"aggs": {
"child_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 20.0}
]
}
}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
assert_eq!(
res["parent_terms"],
json!({
"buckets": [
{
"key": "cool",
"doc_count": 7,
"child_range": {
"buckets": [
{"key": "*-3", "doc_count": 1, "to": 3.0},
{"key": "3-7", "doc_count": 2, "from": 3.0, "to": 7.0},
{"key": "7-20", "doc_count": 3, "from": 7.0, "to": 20.0},
{"key": "20-*", "doc_count": 1, "from": 20.0}
]
}
},
{
"key": "nohit",
"doc_count": 2,
"child_range": {
"buckets": [
{"key": "*-3", "doc_count": 0, "to": 3.0},
{"key": "3-7", "doc_count": 1, "from": 3.0, "to": 7.0},
{"key": "7-20", "doc_count": 0, "from": 7.0, "to": 20.0},
{"key": "20-*", "doc_count": 1, "from": 20.0}
]
}
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
})
);
Ok(())
}
#[test]
fn test_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with several ranges
// Child: histogram with large interval (single bucket per parent)
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 11.0},
{"from": 11.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_hist": {"histogram": {"field": "score", "interval": 100.0}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
assert_eq!(
res["parent_range"]["buckets"],
json!([
{"key": "*-3", "doc_count": 1, "to": 3.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]}
},
{"key": "3-7", "doc_count": 3, "from": 3.0, "to": 7.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 3} ]}
},
{"key": "7-11", "doc_count": 1, "from": 7.0, "to": 11.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 1} ]}
},
{"key": "11-20", "doc_count": 2, "from": 11.0, "to": 20.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]}
},
{"key": "20-*", "doc_count": 2, "from": 20.0,
"child_hist": {"buckets": [ {"key": 0.0, "doc_count": 2} ]}
}
])
);
// Case B: child has more buckets than parent
// Parent: terms on text -> 2 buckets
// Child: histogram with small interval -> multiple buckets including empties
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_terms": {
"terms": {"field": "text"},
"aggs": {
"child_hist": {"histogram": {"field": "score", "interval": 10.0}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
assert_eq!(
res["parent_terms"],
json!({
"buckets": [
{
"key": "cool",
"doc_count": 7,
"child_hist": {
"buckets": [
{"key": 0.0, "doc_count": 4},
{"key": 10.0, "doc_count": 2},
{"key": 20.0, "doc_count": 0},
{"key": 30.0, "doc_count": 0},
{"key": 40.0, "doc_count": 1}
]
}
},
{
"key": "nohit",
"doc_count": 2,
"child_hist": {
"buckets": [
{"key": 0.0, "doc_count": 1},
{"key": 10.0, "doc_count": 0},
{"key": 20.0, "doc_count": 0},
{"key": 30.0, "doc_count": 0},
{"key": 40.0, "doc_count": 1}
]
}
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
})
);
Ok(())
}
#[test]
fn test_date_histogram_as_subagg_parent_more_vs_child_more() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;
// Case A: parent has more buckets than child
// Parent: range with several buckets
// Child: date_histogram with 30d -> single bucket per parent
let agg_parent_more: Aggregations = serde_json::from_value(json!({
"parent_range": {
"range": {
"field": "score",
"ranges": [
{"to": 3.0},
{"from": 3.0, "to": 7.0},
{"from": 7.0, "to": 11.0},
{"from": 11.0, "to": 20.0},
{"from": 20.0}
]
},
"aggs": {
"child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "30d"}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_parent_more, &index)?;
let buckets = res["parent_range"]["buckets"].as_array().unwrap();
// Verify each parent bucket has exactly one child date bucket with matching doc_count
for bucket in buckets {
let parent_count = bucket["doc_count"].as_u64().unwrap();
let child_buckets = bucket["child_date_hist"]["buckets"].as_array().unwrap();
assert_eq!(child_buckets.len(), 1);
assert_eq!(child_buckets[0]["doc_count"], parent_count);
}
// Case B: child has more buckets than parent
// Parent: terms on text (2 buckets)
// Child: date_histogram with 1d -> multiple buckets
let agg_child_more: Aggregations = serde_json::from_value(json!({
"parent_terms": {
"terms": {"field": "text"},
"aggs": {
"child_date_hist": {"date_histogram": {"field": "date", "fixed_interval": "1d"}}
}
}
}))
.unwrap();
let res = crate::aggregation::tests::exec_request(agg_child_more, &index)?;
let buckets = res["parent_terms"]["buckets"].as_array().unwrap();
// cool bucket
assert_eq!(buckets[0]["key"], "cool");
let cool_buckets = buckets[0]["child_date_hist"]["buckets"].as_array().unwrap();
assert_eq!(cool_buckets.len(), 3);
assert_eq!(cool_buckets[0]["doc_count"], 1); // day 0
assert_eq!(cool_buckets[1]["doc_count"], 4); // day 1
assert_eq!(cool_buckets[2]["doc_count"], 2); // day 2
// nohit bucket
assert_eq!(buckets[1]["key"], "nohit");
let nohit_buckets = buckets[1]["child_date_hist"]["buckets"].as_array().unwrap();
assert_eq!(nohit_buckets.len(), 2);
assert_eq!(nohit_buckets[0]["doc_count"], 1); // day 1
assert_eq!(nohit_buckets[1]["doc_count"], 1); // day 2
Ok(())
}
fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({
"avg": {
@@ -25,6 +451,10 @@ fn get_collector(agg_req: Aggregations) -> AggregationCollector {
}
// *** EVERY BUCKET-TYPE SHOULD BE TESTED HERE ***
// Note: The flushng part of these tests are outdated, since the buffering change after converting
// the collection into one collector per request instead of per bucket.
//
// However they are useful as they test a complex aggregation requests.
fn test_aggregation_flushing(
merge_segments: bool,
use_distributed_collector: bool,
@@ -37,8 +467,9 @@ fn test_aggregation_flushing(
let reader = index.reader()?;
assert_eq!(DOC_BLOCK_SIZE, 64);
// In the tree we cache Documents of DOC_BLOCK_SIZE, before passing them down as one block.
assert_eq!(COLLECT_BLOCK_BUFFER_LEN, 64);
// In the tree we cache documents of COLLECT_BLOCK_BUFFER_LEN before passing them down as one
// block.
//
// Build a request so that on the first level we have one full cache, which is then flushed.
// The same cache should have some residue docs at the end, which are flushed (Range 0-70)

View File

@@ -6,10 +6,14 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
};
use crate::aggregation::cached_sub_aggs::{
CachedSubAggs, HighCardSubAggCache, LowCardSubAggCache, SubAggCache,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
};
use crate::aggregation::segment_agg_result::{CollectorClone, SegmentAggregationCollector};
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
use crate::aggregation::BucketId;
use crate::docset::DocSet;
use crate::query::{AllQuery, EnableScoring, Query, QueryParser};
use crate::schema::Schema;
@@ -404,15 +408,18 @@ pub struct FilterAggReqData {
pub evaluator: DocumentQueryEvaluator,
/// Reusable buffer for matching documents to minimize allocations during collection
pub matching_docs_buffer: Vec<DocId>,
/// True if this filter aggregation is at the top level of the aggregation tree (not nested).
pub is_top_level: bool,
}
impl FilterAggReqData {
pub(crate) fn get_memory_consumption(&self) -> usize {
// Estimate: name + segment reader reference + bitset + buffer capacity
self.name.len()
+ std::mem::size_of::<SegmentReader>()
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
+ std::mem::size_of::<SegmentReader>()
+ self.evaluator.bitset.len() / 8 // BitSet memory (bits to bytes)
+ self.matching_docs_buffer.capacity() * std::mem::size_of::<DocId>()
+ std::mem::size_of::<bool>()
}
}
@@ -446,7 +453,7 @@ impl DocumentQueryEvaluator {
let weight = query.weight(EnableScoring::disabled_from_schema(&schema))?;
// Get a scorer that iterates over matching documents
let mut scorer = weight.scorer(segment_reader, 1.0, 0)?;
let mut scorer = weight.scorer(segment_reader, 1.0)?;
// Create a BitSet to hold all matching documents
let mut bitset = BitSet::with_max_value(max_doc);
@@ -489,17 +496,24 @@ impl Debug for DocumentQueryEvaluator {
}
}
/// Segment collector for filter aggregation
pub struct SegmentFilterCollector {
/// Document count in this bucket
#[derive(Debug, Clone, PartialEq, Copy)]
struct DocCount {
doc_count: u64,
bucket_id: BucketId,
}
/// Segment collector for filter aggregation
pub struct SegmentFilterCollector<C: SubAggCache> {
/// Document counts per parent bucket
parent_buckets: Vec<DocCount>,
/// Sub-aggregation collectors
sub_aggregations: Option<Box<dyn SegmentAggregationCollector>>,
sub_aggregations: Option<CachedSubAggs<C>>,
bucket_id_provider: BucketIdProvider,
/// Accessor index for this filter aggregation (to access FilterAggReqData)
accessor_idx: usize,
}
impl SegmentFilterCollector {
impl<C: SubAggCache> SegmentFilterCollector<C> {
/// Create a new filter segment collector following the new agg_data pattern
pub(crate) fn from_req_and_validate(
req: &mut AggregationsSegmentCtx,
@@ -511,47 +525,75 @@ impl SegmentFilterCollector {
} else {
None
};
let sub_agg_collector = sub_agg_collector.map(CachedSubAggs::new);
Ok(SegmentFilterCollector {
doc_count: 0,
parent_buckets: Vec::new(),
sub_aggregations: sub_agg_collector,
accessor_idx: node.idx_in_req_data,
bucket_id_provider: BucketIdProvider::default(),
})
}
}
impl Debug for SegmentFilterCollector {
pub(crate) fn build_segment_filter_collector(
req: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
let is_top_level = req.per_request.filter_req_data[node.idx_in_req_data]
.as_ref()
.expect("filter_req_data slot is empty")
.is_top_level;
if is_top_level {
Ok(Box::new(
SegmentFilterCollector::<LowCardSubAggCache>::from_req_and_validate(req, node)?,
))
} else {
Ok(Box::new(
SegmentFilterCollector::<HighCardSubAggCache>::from_req_and_validate(req, node)?,
))
}
}
impl<C: SubAggCache> Debug for SegmentFilterCollector<C> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentFilterCollector")
.field("doc_count", &self.doc_count)
.field("buckets", &self.parent_buckets)
.field("has_sub_aggs", &self.sub_aggregations.is_some())
.field("accessor_idx", &self.accessor_idx)
.finish()
}
}
impl CollectorClone for SegmentFilterCollector {
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector> {
// For now, panic - this needs proper implementation with weight recreation
panic!("SegmentFilterCollector cloning not yet implemented - requires weight recreation")
}
}
impl SegmentAggregationCollector for SegmentFilterCollector {
impl<C: SubAggCache> SegmentAggregationCollector for SegmentFilterCollector<C> {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
let mut sub_results = IntermediateAggregationResults::default();
let bucket_opt = self.parent_buckets.get(parent_bucket_id as usize);
if let Some(sub_aggs) = self.sub_aggregations {
sub_aggs.add_intermediate_aggregation_result(agg_data, &mut sub_results)?;
if let Some(sub_aggs) = &mut self.sub_aggregations {
sub_aggs
.get_sub_agg_collector()
.add_intermediate_aggregation_result(
agg_data,
&mut sub_results,
// Here we create a new bucket ID for sub-aggregations if the bucket doesn't
// exist, so that sub-aggregations can still produce results (e.g., zero doc
// count)
bucket_opt
.map(|bucket| bucket.bucket_id)
.unwrap_or(self.bucket_id_provider.next_bucket_id()),
)?;
}
// Create the filter bucket result
let filter_bucket_result = IntermediateBucketResult::Filter {
doc_count: self.doc_count,
doc_count: bucket_opt.map(|b| b.doc_count).unwrap_or(0),
sub_aggregations: sub_results,
};
@@ -570,32 +612,17 @@ impl SegmentAggregationCollector for SegmentFilterCollector {
Ok(())
}
fn collect(&mut self, doc: DocId, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
// Access the evaluator from FilterAggReqData
let req_data = agg_data.get_filter_req_data(self.accessor_idx);
// O(1) BitSet lookup to check if document matches filter
if req_data.evaluator.matches_document(doc) {
self.doc_count += 1;
// If we have sub-aggregations, collect on them for this filtered document
if let Some(sub_aggs) = &mut self.sub_aggregations {
sub_aggs.collect(doc, agg_data)?;
}
}
Ok(())
}
#[inline]
fn collect_block(
fn collect(
&mut self,
docs: &[DocId],
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
if docs.is_empty() {
return Ok(());
}
let mut bucket = self.parent_buckets[parent_bucket_id as usize];
// Take the request data to avoid borrow checker issues with sub-aggregations
let mut req = agg_data.take_filter_req_data(self.accessor_idx);
@@ -604,18 +631,24 @@ impl SegmentAggregationCollector for SegmentFilterCollector {
req.evaluator
.filter_batch(docs, &mut req.matching_docs_buffer);
self.doc_count += req.matching_docs_buffer.len() as u64;
bucket.doc_count += req.matching_docs_buffer.len() as u64;
// Batch process sub-aggregations if we have matches
if !req.matching_docs_buffer.is_empty() {
if let Some(sub_aggs) = &mut self.sub_aggregations {
// Use collect_block for better sub-aggregation performance
sub_aggs.collect_block(&req.matching_docs_buffer, agg_data)?;
for &doc_id in &req.matching_docs_buffer {
sub_aggs.push(bucket.bucket_id, doc_id);
}
}
}
// Put the request data back
agg_data.put_back_filter_req_data(self.accessor_idx, req);
if let Some(sub_aggs) = &mut self.sub_aggregations {
sub_aggs.check_flush_local(agg_data)?;
}
// put back bucket
self.parent_buckets[parent_bucket_id as usize] = bucket;
Ok(())
}
@@ -626,6 +659,21 @@ impl SegmentAggregationCollector for SegmentFilterCollector {
}
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
while self.parent_buckets.len() <= max_bucket as usize {
let bucket_id = self.bucket_id_provider.next_bucket_id();
self.parent_buckets.push(DocCount {
doc_count: 0,
bucket_id,
});
}
Ok(())
}
}
/// Intermediate result for filter aggregation
@@ -1519,9 +1567,9 @@ mod tests {
let searcher = reader.searcher();
let agg = json!({
"test": {
"filter": deserialized,
"aggs": { "count": { "value_count": { "field": "brand" } } }
"test": {
"filter": deserialized,
"aggs": { "count": { "value_count": { "field": "brand" } } }
}
});

View File

@@ -1,6 +1,6 @@
use std::cmp::Ordering;
use columnar::{Column, ColumnBlockAccessor, ColumnType};
use columnar::{Column, ColumnType};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use tantivy_bitpacker::minmax;
@@ -8,14 +8,14 @@ use tantivy_bitpacker::minmax;
use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
};
use crate::aggregation::agg_limits::MemoryConsumption;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::BucketEntry;
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateHistogramBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
use crate::aggregation::*;
use crate::TantivyError;
@@ -26,13 +26,8 @@ pub struct HistogramAggReqData {
pub accessor: Column<u64>,
/// The field type of the fast field.
pub field_type: ColumnType,
/// The column block accessor to access the fast field values.
pub column_block_accessor: ColumnBlockAccessor<u64>,
/// The name of the aggregation.
pub name: String,
/// The sub aggregation blueprint, used to create sub aggregations for each bucket.
/// Will be filled during initialization of the collector.
pub sub_aggregation_blueprint: Option<Box<dyn SegmentAggregationCollector>>,
/// The histogram aggregation request.
pub req: HistogramAggregation,
/// True if this is a date_histogram aggregation.
@@ -257,18 +252,24 @@ impl HistogramBounds {
pub(crate) struct SegmentHistogramBucketEntry {
pub key: f64,
pub doc_count: u64,
pub bucket_id: BucketId,
}
impl SegmentHistogramBucketEntry {
pub(crate) fn into_intermediate_bucket_entry(
self,
sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
sub_aggregation: &mut Option<HighCardCachedSubAggs>,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<IntermediateHistogramBucketEntry> {
let mut sub_aggregation_res = IntermediateAggregationResults::default();
if let Some(sub_aggregation) = sub_aggregation {
sub_aggregation
.add_intermediate_aggregation_result(agg_data, &mut sub_aggregation_res)?;
.get_sub_agg_collector()
.add_intermediate_aggregation_result(
agg_data,
&mut sub_aggregation_res,
self.bucket_id,
)?;
}
Ok(IntermediateHistogramBucketEntry {
key: self.key,
@@ -278,27 +279,38 @@ impl SegmentHistogramBucketEntry {
}
}
#[derive(Clone, Debug, Default)]
struct HistogramBuckets {
pub buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
}
/// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype.
#[derive(Clone, Debug)]
#[derive(Debug)]
pub struct SegmentHistogramCollector {
/// The buckets containing the aggregation data.
buckets: FxHashMap<i64, SegmentHistogramBucketEntry>,
sub_aggregations: FxHashMap<i64, Box<dyn SegmentAggregationCollector>>,
/// One Histogram bucket per parent bucket id.
parent_buckets: Vec<HistogramBuckets>,
sub_agg: Option<HighCardCachedSubAggs>,
accessor_idx: usize,
bucket_id_provider: BucketIdProvider,
}
impl SegmentAggregationCollector for SegmentHistogramCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
let name = agg_data
.get_histogram_req_data(self.accessor_idx)
.name
.clone();
let bucket = self.into_intermediate_bucket_result(agg_data)?;
// TODO: avoid prepare_max_bucket here and handle empty buckets.
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let histogram = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
let bucket = self.add_intermediate_bucket_result(agg_data, histogram)?;
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(())
@@ -307,44 +319,40 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
#[inline]
fn collect_block(
&mut self,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let mut req = agg_data.take_histogram_req_data(self.accessor_idx);
let req = agg_data.take_histogram_req_data(self.accessor_idx);
let mem_pre = self.get_memory_consumption();
let buckets = &mut self.parent_buckets[parent_bucket_id as usize].buckets;
let bounds = req.bounds;
let interval = req.req.interval;
let offset = req.offset;
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
req.column_block_accessor.fetch_block(docs, &req.accessor);
for (doc, val) in req
agg_data
.column_block_accessor
.fetch_block(docs, &req.accessor);
for (doc, val) in agg_data
.column_block_accessor
.iter_docid_vals(docs, &req.accessor)
{
let val = f64_from_fastfield_u64(val, &req.field_type);
let val = f64_from_fastfield_u64(val, req.field_type);
let bucket_pos = get_bucket_pos(val);
if bounds.contains(val) {
let bucket = self.buckets.entry(bucket_pos).or_insert_with(|| {
let bucket = buckets.entry(bucket_pos).or_insert_with(|| {
let key = get_bucket_key_from_pos(bucket_pos as f64, interval, offset);
SegmentHistogramBucketEntry { key, doc_count: 0 }
SegmentHistogramBucketEntry {
key,
doc_count: 0,
bucket_id: self.bucket_id_provider.next_bucket_id(),
}
});
bucket.doc_count += 1;
if let Some(sub_aggregation_blueprint) = req.sub_aggregation_blueprint.as_ref() {
self.sub_aggregations
.entry(bucket_pos)
.or_insert_with(|| sub_aggregation_blueprint.clone())
.collect(doc, agg_data)?;
if let Some(sub_agg) = &mut self.sub_agg {
sub_agg.push(bucket.bucket_id, doc);
}
}
}
@@ -358,14 +366,30 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
.add_memory_consumed(mem_delta as u64)?;
}
if let Some(sub_agg) = &mut self.sub_agg {
sub_agg.check_flush_local(agg_data)?;
}
Ok(())
}
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
for sub_aggregation in self.sub_aggregations.values_mut() {
if let Some(sub_aggregation) = &mut self.sub_agg {
sub_aggregation.flush(agg_data)?;
}
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
while self.parent_buckets.len() <= max_bucket as usize {
self.parent_buckets.push(HistogramBuckets {
buckets: FxHashMap::default(),
});
}
Ok(())
}
}
@@ -373,22 +397,19 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
impl SegmentHistogramCollector {
fn get_memory_consumption(&self) -> usize {
let self_mem = std::mem::size_of::<Self>();
let sub_aggs_mem = self.sub_aggregations.memory_consumption();
let buckets_mem = self.buckets.memory_consumption();
self_mem + sub_aggs_mem + buckets_mem
let buckets_mem = self.parent_buckets.len() * std::mem::size_of::<HistogramBuckets>();
self_mem + buckets_mem
}
/// Converts the collector result into a intermediate bucket result.
pub fn into_intermediate_bucket_result(
self,
fn add_intermediate_bucket_result(
&mut self,
agg_data: &AggregationsSegmentCtx,
histogram: HistogramBuckets,
) -> crate::Result<IntermediateBucketResult> {
let mut buckets = Vec::with_capacity(self.buckets.len());
let mut buckets = Vec::with_capacity(histogram.buckets.len());
for (bucket_pos, bucket) in self.buckets {
let bucket_res = bucket.into_intermediate_bucket_entry(
self.sub_aggregations.get(&bucket_pos).cloned(),
agg_data,
);
for bucket in histogram.buckets.into_values() {
let bucket_res = bucket.into_intermediate_bucket_entry(&mut self.sub_agg, agg_data);
buckets.push(bucket_res?);
}
@@ -408,7 +429,7 @@ impl SegmentHistogramCollector {
agg_data: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Self> {
let blueprint = if !node.children.is_empty() {
let sub_agg = if !node.children.is_empty() {
Some(build_segment_agg_collectors(agg_data, &node.children)?)
} else {
None
@@ -423,13 +444,13 @@ impl SegmentHistogramCollector {
max: f64::MAX,
});
req_data.offset = req_data.req.offset.unwrap_or(0.0);
req_data.sub_aggregation_blueprint = blueprint;
let sub_agg = sub_agg.map(CachedSubAggs::new);
Ok(Self {
buckets: Default::default(),
sub_aggregations: Default::default(),
parent_buckets: Default::default(),
sub_agg,
accessor_idx: node.idx_in_req_data,
bucket_id_provider: BucketIdProvider::default(),
})
}
}

View File

@@ -1,18 +1,22 @@
use std::fmt::Debug;
use std::ops::Range;
use columnar::{Column, ColumnBlockAccessor, ColumnType};
use columnar::{Column, ColumnType};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
};
use crate::aggregation::agg_limits::AggregationLimitsGuard;
use crate::aggregation::cached_sub_aggs::{
CachedSubAggs, HighCardSubAggCache, LowCardCachedSubAggs, LowCardSubAggCache, SubAggCache,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
use crate::aggregation::*;
use crate::TantivyError;
@@ -23,12 +27,12 @@ pub struct RangeAggReqData {
pub accessor: Column<u64>,
/// The type of the fast field.
pub field_type: ColumnType,
/// The column block accessor to access the fast field values.
pub column_block_accessor: ColumnBlockAccessor<u64>,
/// The range aggregation request.
pub req: RangeAggregation,
/// The name of the aggregation.
pub name: String,
/// Whether this is a top-level aggregation.
pub is_top_level: bool,
}
impl RangeAggReqData {
@@ -151,19 +155,47 @@ pub(crate) struct SegmentRangeAndBucketEntry {
/// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype.
#[derive(Clone, Debug)]
pub struct SegmentRangeCollector {
pub struct SegmentRangeCollector<C: SubAggCache> {
/// The buckets containing the aggregation data.
buckets: Vec<SegmentRangeAndBucketEntry>,
/// One for each ParentBucketId
parent_buckets: Vec<Vec<SegmentRangeAndBucketEntry>>,
column_type: ColumnType,
pub(crate) accessor_idx: usize,
sub_agg: Option<CachedSubAggs<C>>,
/// Here things get a bit weird. We need to assign unique bucket ids across all
/// parent buckets. So we keep track of the next available bucket id here.
/// This allows a kind of flattening of the bucket ids across all parent buckets.
/// E.g. in nested aggregations:
/// Term Agg -> Range aggregation -> Stats aggregation
/// E.g. the Term Agg creates 3 buckets ["INFO", "ERROR", "WARN"], each of these has a Range
/// aggregation with 4 buckets. The Range aggregation will create buckets with ids:
/// - INFO: 0,1,2,3
/// - ERROR: 4,5,6,7
/// - WARN: 8,9,10,11
///
/// This allows the Stats aggregation to have unique bucket ids to refer to.
bucket_id_provider: BucketIdProvider,
limits: AggregationLimitsGuard,
}
impl<C: SubAggCache> Debug for SegmentRangeCollector<C> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentRangeCollector")
.field("parent_buckets_len", &self.parent_buckets.len())
.field("column_type", &self.column_type)
.field("accessor_idx", &self.accessor_idx)
.field("has_sub_agg", &self.sub_agg.is_some())
.finish()
}
}
/// TODO: Bad naming, there's also SegmentRangeAndBucketEntry
#[derive(Clone)]
pub(crate) struct SegmentRangeBucketEntry {
pub key: Key,
pub doc_count: u64,
pub sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
// pub sub_aggregation: Option<Box<dyn SegmentAggregationCollector>>,
pub bucket_id: BucketId,
/// The from range of the bucket. Equals `f64::MIN` when `None`.
pub from: Option<f64>,
/// The to range of the bucket. Equals `f64::MAX` when `None`. Open interval, `to` is not
@@ -184,48 +216,50 @@ impl Debug for SegmentRangeBucketEntry {
impl SegmentRangeBucketEntry {
pub(crate) fn into_intermediate_bucket_entry(
self,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<IntermediateRangeBucketEntry> {
let mut sub_aggregation_res = IntermediateAggregationResults::default();
if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation
.add_intermediate_aggregation_result(agg_data, &mut sub_aggregation_res)?
} else {
Default::default()
};
let sub_aggregation = IntermediateAggregationResults::default();
Ok(IntermediateRangeBucketEntry {
key: self.key.into(),
doc_count: self.doc_count,
sub_aggregation: sub_aggregation_res,
sub_aggregation_res: sub_aggregation,
from: self.from,
to: self.to,
})
}
}
impl SegmentAggregationCollector for SegmentRangeCollector {
impl<C: SubAggCache> SegmentAggregationCollector for SegmentRangeCollector<C> {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let field_type = self.column_type;
let name = agg_data
.get_range_req_data(self.accessor_idx)
.name
.to_string();
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets
let buckets = std::mem::take(&mut self.parent_buckets[parent_bucket_id as usize]);
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = buckets
.into_iter()
.map(move |range_bucket| {
Ok((
range_to_string(&range_bucket.range, &field_type)?,
range_bucket
.bucket
.into_intermediate_bucket_entry(agg_data)?,
))
.map(|range_bucket| {
let bucket_id = range_bucket.bucket.bucket_id;
let mut agg = range_bucket.bucket.into_intermediate_bucket_entry()?;
if let Some(sub_aggregation) = &mut self.sub_agg {
sub_aggregation
.get_sub_agg_collector()
.add_intermediate_aggregation_result(
agg_data,
&mut agg.sub_aggregation_res,
bucket_id,
)?;
}
Ok((range_to_string(&range_bucket.range, &field_type)?, agg))
})
.collect::<crate::Result<_>>()?;
@@ -242,73 +276,114 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
#[inline]
fn collect_block(
&mut self,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
// Take request data to avoid borrow conflicts during sub-aggregation
let mut req = agg_data.take_range_req_data(self.accessor_idx);
let req = agg_data.take_range_req_data(self.accessor_idx);
req.column_block_accessor.fetch_block(docs, &req.accessor);
agg_data
.column_block_accessor
.fetch_block(docs, &req.accessor);
for (doc, val) in req
let buckets = &mut self.parent_buckets[parent_bucket_id as usize];
for (doc, val) in agg_data
.column_block_accessor
.iter_docid_vals(docs, &req.accessor)
{
let bucket_pos = self.get_bucket_pos(val);
let bucket = &mut self.buckets[bucket_pos];
let bucket_pos = get_bucket_pos(val, buckets);
let bucket = &mut buckets[bucket_pos];
bucket.bucket.doc_count += 1;
if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
sub_agg.collect(doc, agg_data)?;
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.push(bucket.bucket.bucket_id, doc);
}
}
agg_data.put_back_range_req_data(self.accessor_idx, req);
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.check_flush_local(agg_data)?;
}
Ok(())
}
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
for bucket in self.buckets.iter_mut() {
if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
sub_agg.flush(agg_data)?;
}
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.flush(agg_data)?;
}
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
while self.parent_buckets.len() <= max_bucket as usize {
let new_buckets = self.create_new_buckets(agg_data)?;
self.parent_buckets.push(new_buckets);
}
Ok(())
}
}
/// Build a concrete `SegmentRangeCollector` with either a Vec- or HashMap-backed
/// bucket storage, depending on the column type and aggregation level.
pub(crate) fn build_segment_range_collector(
agg_data: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
let accessor_idx = node.idx_in_req_data;
let req_data = agg_data.get_range_req_data(node.idx_in_req_data);
let field_type = req_data.field_type;
// TODO: A better metric instead of is_top_level would be the number of buckets expected.
// E.g. If range agg is not top level, but the parent is a bucket agg with less than 10 buckets,
// we can are still in low cardinality territory.
let is_low_card = req_data.is_top_level && req_data.req.ranges.len() <= 64;
let sub_agg = if !node.children.is_empty() {
Some(build_segment_agg_collectors(agg_data, &node.children)?)
} else {
None
};
if is_low_card {
Ok(Box::new(SegmentRangeCollector::<LowCardSubAggCache> {
sub_agg: sub_agg.map(LowCardCachedSubAggs::new),
column_type: field_type,
accessor_idx,
parent_buckets: Vec::new(),
bucket_id_provider: BucketIdProvider::default(),
limits: agg_data.context.limits.clone(),
}))
} else {
Ok(Box::new(SegmentRangeCollector::<HighCardSubAggCache> {
sub_agg: sub_agg.map(CachedSubAggs::new),
column_type: field_type,
accessor_idx,
parent_buckets: Vec::new(),
bucket_id_provider: BucketIdProvider::default(),
limits: agg_data.context.limits.clone(),
}))
}
}
impl SegmentRangeCollector {
pub(crate) fn from_req_and_validate(
req_data: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Self> {
let accessor_idx = node.idx_in_req_data;
let (field_type, ranges) = {
let req_view = req_data.get_range_req_data(node.idx_in_req_data);
(req_view.field_type, req_view.req.ranges.clone())
};
impl<C: SubAggCache> SegmentRangeCollector<C> {
pub(crate) fn create_new_buckets(
&mut self,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<Vec<SegmentRangeAndBucketEntry>> {
let field_type = self.column_type;
let req_data = agg_data.get_range_req_data(self.accessor_idx);
// The range input on the request is f64.
// We need to convert to u64 ranges, because we read the values as u64.
// The mapping from the conversion is monotonic so ordering is preserved.
let sub_agg_prototype = if !node.children.is_empty() {
Some(build_segment_agg_collectors(req_data, &node.children)?)
} else {
None
};
let buckets: Vec<_> = extend_validate_ranges(&ranges, &field_type)?
let buckets: Vec<_> = extend_validate_ranges(&req_data.req.ranges, &field_type)?
.iter()
.map(|range| {
let bucket_id = self.bucket_id_provider.next_bucket_id();
let key = range
.key
.clone()
@@ -317,20 +392,20 @@ impl SegmentRangeCollector {
let to = if range.range.end == u64::MAX {
None
} else {
Some(f64_from_fastfield_u64(range.range.end, &field_type))
Some(f64_from_fastfield_u64(range.range.end, field_type))
};
let from = if range.range.start == u64::MIN {
None
} else {
Some(f64_from_fastfield_u64(range.range.start, &field_type))
Some(f64_from_fastfield_u64(range.range.start, field_type))
};
let sub_aggregation = sub_agg_prototype.clone();
// let sub_aggregation = sub_agg_prototype.clone();
Ok(SegmentRangeAndBucketEntry {
range: range.range.clone(),
bucket: SegmentRangeBucketEntry {
doc_count: 0,
sub_aggregation,
bucket_id,
key,
from,
to,
@@ -339,27 +414,20 @@ impl SegmentRangeCollector {
})
.collect::<crate::Result<_>>()?;
req_data.context.limits.add_memory_consumed(
self.limits.add_memory_consumed(
buckets.len() as u64 * std::mem::size_of::<SegmentRangeAndBucketEntry>() as u64,
)?;
Ok(SegmentRangeCollector {
buckets,
column_type: field_type,
accessor_idx,
})
}
#[inline]
fn get_bucket_pos(&self, val: u64) -> usize {
let pos = self
.buckets
.binary_search_by_key(&val, |probe| probe.range.start)
.unwrap_or_else(|pos| pos - 1);
debug_assert!(self.buckets[pos].range.contains(&val));
pos
Ok(buckets)
}
}
#[inline]
fn get_bucket_pos(val: u64, buckets: &[SegmentRangeAndBucketEntry]) -> usize {
let pos = buckets
.binary_search_by_key(&val, |probe| probe.range.start)
.unwrap_or_else(|pos| pos - 1);
debug_assert!(buckets[pos].range.contains(&val));
pos
}
/// Converts the user provided f64 range value to fast field value space.
///
@@ -456,7 +524,7 @@ pub(crate) fn range_to_string(
let val = i64::from_u64(val);
format_date(val)
} else {
Ok(f64_from_fastfield_u64(val, field_type).to_string())
Ok(f64_from_fastfield_u64(val, *field_type).to_string())
}
};
@@ -486,7 +554,7 @@ mod tests {
pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>,
field_type: ColumnType,
) -> SegmentRangeCollector {
) -> SegmentRangeCollector<HighCardSubAggCache> {
let req = RangeAggregation {
field: "dummy".to_string(),
ranges,
@@ -506,30 +574,33 @@ mod tests {
let to = if range.range.end == u64::MAX {
None
} else {
Some(f64_from_fastfield_u64(range.range.end, &field_type))
Some(f64_from_fastfield_u64(range.range.end, field_type))
};
let from = if range.range.start == u64::MIN {
None
} else {
Some(f64_from_fastfield_u64(range.range.start, &field_type))
Some(f64_from_fastfield_u64(range.range.start, field_type))
};
SegmentRangeAndBucketEntry {
range: range.range.clone(),
bucket: SegmentRangeBucketEntry {
doc_count: 0,
sub_aggregation: None,
key,
from,
to,
bucket_id: 0,
},
}
})
.collect();
SegmentRangeCollector {
buckets,
parent_buckets: vec![buckets],
column_type: field_type,
accessor_idx: 0,
sub_agg: None,
bucket_id_provider: Default::default(),
limits: AggregationLimitsGuard::default(),
}
}
@@ -776,7 +847,7 @@ mod tests {
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.buckets;
let buckets = collector.parent_buckets[0].clone();
assert_eq!(buckets[0].range.start, u64::MIN);
assert_eq!(buckets[0].range.end, 10f64.to_u64());
assert_eq!(buckets[1].range.start, 10f64.to_u64());
@@ -799,7 +870,7 @@ mod tests {
];
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.buckets;
let buckets = collector.parent_buckets[0].clone();
assert_eq!(buckets[0].range.start, u64::MIN);
assert_eq!(buckets[0].range.end, 10f64.to_u64());
assert_eq!(buckets[1].range.start, 10f64.to_u64());
@@ -814,7 +885,7 @@ mod tests {
let buckets = vec![(-10f64..-1f64).into()];
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.buckets;
let buckets = collector.parent_buckets[0].clone();
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
}
@@ -823,7 +894,7 @@ mod tests {
let buckets = vec![(0f64..10f64).into()];
let collector = get_collector_from_ranges(buckets, ColumnType::F64);
let buckets = collector.buckets;
let buckets = collector.parent_buckets[0].clone();
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
}
@@ -832,7 +903,7 @@ mod tests {
fn range_binary_search_test_u64() {
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
let collector = get_collector_from_ranges(ranges, ColumnType::U64);
let search = |val: u64| collector.get_bucket_pos(val);
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
assert_eq!(search(u64::MIN), 0);
assert_eq!(search(9), 0);
@@ -878,7 +949,7 @@ mod tests {
let ranges = vec![(10.0..100.0).into()];
let collector = get_collector_from_ranges(ranges, ColumnType::F64);
let search = |val: u64| collector.get_bucket_pos(val);
let search = |val: u64| get_bucket_pos(val, &collector.parent_buckets[0]);
assert_eq!(search(u64::MIN), 0);
assert_eq!(search(9f64.to_u64()), 0);
@@ -890,63 +961,3 @@ mod tests {
// the max value
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use itertools::Itertools;
use rand::seq::SliceRandom;
use rand::thread_rng;
use super::*;
use crate::aggregation::bucket::range::tests::get_collector_from_ranges;
const TOTAL_DOCS: u64 = 1_000_000u64;
const NUM_DOCS: u64 = 50_000u64;
fn get_collector_with_buckets(num_buckets: u64, num_docs: u64) -> SegmentRangeCollector {
let bucket_size = num_docs / num_buckets;
let mut buckets: Vec<RangeAggregationRange> = vec![];
for i in 0..num_buckets {
let bucket_start = (i * bucket_size) as f64;
buckets.push((bucket_start..bucket_start + bucket_size as f64).into())
}
get_collector_from_ranges(buckets, ColumnType::U64)
}
fn get_rand_docs(total_docs: u64, num_docs_returned: u64) -> Vec<u64> {
let mut rng = thread_rng();
let all_docs = (0..total_docs - 1).collect_vec();
let mut vals = all_docs
.as_slice()
.choose_multiple(&mut rng, num_docs_returned as usize)
.cloned()
.collect_vec();
vals.sort();
vals
}
fn bench_range_binary_search(b: &mut test::Bencher, num_buckets: u64) {
let collector = get_collector_with_buckets(num_buckets, TOTAL_DOCS);
let vals = get_rand_docs(TOTAL_DOCS, NUM_DOCS);
b.iter(|| {
let mut bucket_pos = 0;
for val in &vals {
bucket_pos = collector.get_bucket_pos(*val);
}
bucket_pos
})
}
#[bench]
fn bench_range_100_buckets(b: &mut test::Bencher) {
bench_range_binary_search(b, 100)
}
#[bench]
fn bench_range_10_buckets(b: &mut test::Bencher) {
bench_range_binary_search(b, 10)
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -5,11 +5,13 @@ use crate::aggregation::agg_data::{
build_segment_agg_collectors, AggRefNode, AggregationsSegmentCtx,
};
use crate::aggregation::bucket::term_agg::TermsAggregation;
use crate::aggregation::cached_sub_aggs::{CachedSubAggs, HighCardCachedSubAggs};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::segment_agg_result::{BucketIdProvider, SegmentAggregationCollector};
use crate::aggregation::BucketId;
/// Special aggregation to handle missing values for term aggregations.
/// This missing aggregation will check multiple columns for existence.
@@ -35,41 +37,55 @@ impl MissingTermAggReqData {
}
}
/// The specialized missing term aggregation.
#[derive(Default, Debug, Clone)]
pub struct TermMissingAgg {
struct MissingCount {
missing_count: u32,
bucket_id: BucketId,
}
/// The specialized missing term aggregation.
#[derive(Default, Debug)]
pub struct TermMissingAgg {
accessor_idx: usize,
sub_agg: Option<Box<dyn SegmentAggregationCollector>>,
sub_agg: Option<HighCardCachedSubAggs>,
/// Idx = parent bucket id, Value = missing count for that bucket
missing_count_per_bucket: Vec<MissingCount>,
bucket_id_provider: BucketIdProvider,
}
impl TermMissingAgg {
pub(crate) fn new(
req_data: &mut AggregationsSegmentCtx,
agg_data: &mut AggregationsSegmentCtx,
node: &AggRefNode,
) -> crate::Result<Self> {
let has_sub_aggregations = !node.children.is_empty();
let accessor_idx = node.idx_in_req_data;
let sub_agg = if has_sub_aggregations {
let sub_aggregation = build_segment_agg_collectors(req_data, &node.children)?;
let sub_aggregation = build_segment_agg_collectors(agg_data, &node.children)?;
Some(sub_aggregation)
} else {
None
};
let sub_agg = sub_agg.map(CachedSubAggs::new);
let bucket_id_provider = BucketIdProvider::default();
Ok(Self {
accessor_idx,
sub_agg,
..Default::default()
missing_count_per_bucket: Vec::new(),
bucket_id_provider,
})
}
}
impl SegmentAggregationCollector for TermMissingAgg {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
let term_agg = &req_data.req;
let missing = term_agg
@@ -80,13 +96,16 @@ impl SegmentAggregationCollector for TermMissingAgg {
let mut entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> =
Default::default();
let missing_count = &self.missing_count_per_bucket[parent_bucket_id as usize];
let mut missing_entry = IntermediateTermBucketEntry {
doc_count: self.missing_count,
doc_count: missing_count.missing_count,
sub_aggregation: Default::default(),
};
if let Some(sub_agg) = self.sub_agg {
if let Some(sub_agg) = &mut self.sub_agg {
let mut res = IntermediateAggregationResults::default();
sub_agg.add_intermediate_aggregation_result(agg_data, &mut res)?;
sub_agg
.get_sub_agg_collector()
.add_intermediate_aggregation_result(agg_data, &mut res, missing_count.bucket_id)?;
missing_entry.sub_aggregation = res;
}
entries.insert(missing.into(), missing_entry);
@@ -109,30 +128,52 @@ impl SegmentAggregationCollector for TermMissingAgg {
fn collect(
&mut self,
doc: crate::DocId,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let bucket = &mut self.missing_count_per_bucket[parent_bucket_id as usize];
let req_data = agg_data.get_missing_term_req_data(self.accessor_idx);
let has_value = req_data
.accessors
.iter()
.any(|(acc, _)| acc.index.has_value(doc));
if !has_value {
self.missing_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.collect(doc, agg_data)?;
for doc in docs {
let doc = *doc;
let has_value = req_data
.accessors
.iter()
.any(|(acc, _)| acc.index.has_value(doc));
if !has_value {
bucket.missing_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.push(bucket.bucket_id, doc);
}
}
}
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.check_flush_local(agg_data)?;
}
Ok(())
}
fn collect_block(
fn prepare_max_bucket(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
for doc in docs {
self.collect(*doc, agg_data)?;
while self.missing_count_per_bucket.len() <= max_bucket as usize {
let bucket_id = self.bucket_id_provider.next_bucket_id();
self.missing_count_per_bucket.push(MissingCount {
missing_count: 0,
bucket_id,
});
}
Ok(())
}
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.flush(agg_data)?;
}
Ok(())
}

View File

@@ -1,87 +0,0 @@
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::DocId;
#[cfg(test)]
pub(crate) const DOC_BLOCK_SIZE: usize = 64;
#[cfg(not(test))]
pub(crate) const DOC_BLOCK_SIZE: usize = 256;
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
/// BufAggregationCollector buffers documents before calling collect_block().
#[derive(Clone)]
pub(crate) struct BufAggregationCollector {
pub(crate) collector: Box<dyn SegmentAggregationCollector>,
staged_docs: DocBlock,
num_staged_docs: usize,
}
impl std::fmt::Debug for BufAggregationCollector {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("SegmentAggregationResultsCollector")
.field("staged_docs", &&self.staged_docs[..self.num_staged_docs])
.field("num_staged_docs", &self.num_staged_docs)
.finish()
}
}
impl BufAggregationCollector {
pub fn new(collector: Box<dyn SegmentAggregationCollector>) -> Self {
Self {
collector,
num_staged_docs: 0,
staged_docs: [0; DOC_BLOCK_SIZE],
}
}
}
impl SegmentAggregationCollector for BufAggregationCollector {
#[inline]
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
Box::new(self.collector).add_intermediate_aggregation_result(agg_data, results)
}
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.staged_docs[self.num_staged_docs] = doc;
self.num_staged_docs += 1;
if self.num_staged_docs == self.staged_docs.len() {
self.collector
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_data)?;
self.num_staged_docs = 0;
}
Ok(())
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collector.collect_block(docs, agg_data)?;
Ok(())
}
#[inline]
fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
self.collector
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_data)?;
self.num_staged_docs = 0;
self.collector.flush(agg_data)?;
Ok(())
}
}

View File

@@ -0,0 +1,245 @@
use std::fmt::Debug;
use super::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::bucket::MAX_NUM_TERMS_FOR_VEC;
use crate::aggregation::BucketId;
use crate::DocId;
/// A cache for sub-aggregations, storing doc ids per bucket id.
/// Depending on the cardinality of the parent aggregation, we use different
/// storage strategies.
///
/// ## Low Cardinality
/// Cardinality here refers to the number of unique flattened buckets that can be created
/// by the parent aggregation.
/// Flattened buckets are the result of combining all buckets per collector
/// into a single list of buckets, where each bucket is identified by its BucketId.
///
/// ## Usage
/// Since this is caching for sub-aggregations, it is only used by bucket
/// aggregations.
///
/// TODO: consider using a more advanced data structure for high cardinality
/// aggregations.
/// What this datastructure does in general is to group docs by bucket id.
#[derive(Debug)]
pub(crate) struct CachedSubAggs<C: SubAggCache> {
cache: C,
sub_agg_collector: Box<dyn SegmentAggregationCollector>,
num_docs: usize,
}
pub type LowCardCachedSubAggs = CachedSubAggs<LowCardSubAggCache>;
pub type HighCardCachedSubAggs = CachedSubAggs<HighCardSubAggCache>;
const FLUSH_THRESHOLD: usize = 2048;
/// A trait for caching sub-aggregation doc ids per bucket id.
/// Different implementations can be used depending on the cardinality
/// of the parent aggregation.
pub trait SubAggCache: Debug {
fn new() -> Self;
fn push(&mut self, bucket_id: BucketId, doc_id: DocId);
fn flush_local(
&mut self,
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
agg_data: &mut AggregationsSegmentCtx,
force: bool,
) -> crate::Result<()>;
}
impl<Backend: SubAggCache + Debug> CachedSubAggs<Backend> {
pub fn new(sub_agg: Box<dyn SegmentAggregationCollector>) -> Self {
Self {
cache: Backend::new(),
sub_agg_collector: sub_agg,
num_docs: 0,
}
}
pub fn get_sub_agg_collector(&mut self) -> &mut Box<dyn SegmentAggregationCollector> {
&mut self.sub_agg_collector
}
#[inline]
pub fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
self.cache.push(bucket_id, doc_id);
self.num_docs += 1;
}
/// Check if we need to flush based on the number of documents cached.
/// If so, flushes the cache to the provided aggregation collector.
pub fn check_flush_local(
&mut self,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
if self.num_docs >= FLUSH_THRESHOLD {
self.cache
.flush_local(&mut self.sub_agg_collector, agg_data, false)?;
self.num_docs = 0;
}
Ok(())
}
/// Note: this _does_ flush the sub aggregations.
pub fn flush(&mut self, agg_data: &mut AggregationsSegmentCtx) -> crate::Result<()> {
if self.num_docs != 0 {
self.cache
.flush_local(&mut self.sub_agg_collector, agg_data, true)?;
self.num_docs = 0;
}
self.sub_agg_collector.flush(agg_data)?;
Ok(())
}
}
/// Number of partitions for high cardinality sub-aggregation cache.
const NUM_PARTITIONS: usize = 16;
#[derive(Debug)]
pub(crate) struct HighCardSubAggCache {
/// This weird partitioning is used to do some cheap grouping on the bucket ids.
/// bucket ids are dense, e.g. when we don't detect the cardinality as low cardinality,
/// but there are just 16 bucket ids, each bucket id will go to its own partition.
///
/// We want to keep this cheap, because high cardinality aggregations can have a lot of
/// buckets, and there may be nothing to group.
partitions: Box<[PartitionEntry; NUM_PARTITIONS]>,
}
impl HighCardSubAggCache {
#[inline]
fn clear(&mut self) {
for partition in self.partitions.iter_mut() {
partition.clear();
}
}
}
#[derive(Debug, Clone, Default)]
struct PartitionEntry {
bucket_ids: Vec<BucketId>,
docs: Vec<DocId>,
}
impl PartitionEntry {
#[inline]
fn clear(&mut self) {
self.bucket_ids.clear();
self.docs.clear();
}
}
impl SubAggCache for HighCardSubAggCache {
fn new() -> Self {
Self {
partitions: Box::new(core::array::from_fn(|_| PartitionEntry::default())),
}
}
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
let idx = bucket_id % NUM_PARTITIONS as u32;
let slot = &mut self.partitions[idx as usize];
slot.bucket_ids.push(bucket_id);
slot.docs.push(doc_id);
}
fn flush_local(
&mut self,
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
agg_data: &mut AggregationsSegmentCtx,
_force: bool,
) -> crate::Result<()> {
let mut max_bucket = 0u32;
for partition in self.partitions.iter() {
if let Some(&local_max) = partition.bucket_ids.iter().max() {
max_bucket = max_bucket.max(local_max);
}
}
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
for slot in self.partitions.iter() {
if !slot.bucket_ids.is_empty() {
// Reduce dynamic dispatch overhead by collecting a full partition in one call.
sub_agg.collect_multiple(&slot.bucket_ids, &slot.docs, agg_data)?;
}
}
self.clear();
Ok(())
}
}
#[derive(Debug)]
pub(crate) struct LowCardSubAggCache {
/// Cache doc ids per bucket for sub-aggregations.
///
/// The outer Vec is indexed by BucketId.
per_bucket_docs: Vec<Vec<DocId>>,
}
impl LowCardSubAggCache {
#[inline]
fn clear(&mut self) {
for v in &mut self.per_bucket_docs {
v.clear();
}
}
}
impl SubAggCache for LowCardSubAggCache {
fn new() -> Self {
Self {
per_bucket_docs: Vec::new(),
}
}
fn push(&mut self, bucket_id: BucketId, doc_id: DocId) {
let idx = bucket_id as usize;
if self.per_bucket_docs.len() <= idx {
self.per_bucket_docs.resize_with(idx + 1, Vec::new);
}
self.per_bucket_docs[idx].push(doc_id);
}
fn flush_local(
&mut self,
sub_agg: &mut Box<dyn SegmentAggregationCollector>,
agg_data: &mut AggregationsSegmentCtx,
force: bool,
) -> crate::Result<()> {
// Pre-aggregated: call collect per bucket.
let max_bucket = (self.per_bucket_docs.len() as BucketId).saturating_sub(1);
sub_agg.prepare_max_bucket(max_bucket, agg_data)?;
// The threshold above which we flush buckets individually.
// Note: We need to make sure that we don't lock ourselves into a situation where we hit
// the FLUSH_THRESHOLD, but never flush any buckets. (except the final flush)
let mut bucket_treshold = FLUSH_THRESHOLD / (self.per_bucket_docs.len().max(1) * 2);
const _: () = {
// MAX_NUM_TERMS_FOR_VEC threshold is used for term aggregations
// Note: There may be other flexible values, for other aggregations, but we can use the
// const value here as a upper bound. (better than nothing)
let bucket_treshold_limit = FLUSH_THRESHOLD / (MAX_NUM_TERMS_FOR_VEC as usize * 2);
assert!(
bucket_treshold_limit > 0,
"Bucket threshold must be greater than 0"
);
};
if force {
bucket_treshold = 0;
}
for (bucket_id, docs) in self
.per_bucket_docs
.iter()
.enumerate()
.filter(|(_, docs)| docs.len() > bucket_treshold)
{
sub_agg.collect(bucket_id as BucketId, docs, agg_data)?;
}
self.clear();
Ok(())
}
}

View File

@@ -1,9 +1,9 @@
use super::agg_req::Aggregations;
use super::agg_result::AggregationResults;
use super::buf_collector::BufAggregationCollector;
use super::cached_sub_aggs::LowCardCachedSubAggs;
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::SegmentAggregationCollector;
use super::AggContextParams;
// group buffering strategy is chosen explicitly by callers; no need to hash-group on the fly.
use crate::aggregation::agg_data::{
build_aggregations_data_from_req, build_segment_agg_collectors_root, AggregationsSegmentCtx,
};
@@ -136,7 +136,7 @@ fn merge_fruits(
/// `AggregationSegmentCollector` does the aggregation collection on a segment.
pub struct AggregationSegmentCollector {
aggs_with_accessor: AggregationsSegmentCtx,
agg_collector: BufAggregationCollector,
agg_collector: LowCardCachedSubAggs,
error: Option<TantivyError>,
}
@@ -151,8 +151,11 @@ impl AggregationSegmentCollector {
) -> crate::Result<Self> {
let mut agg_data =
build_aggregations_data_from_req(agg, reader, segment_ordinal, context.clone())?;
let result =
BufAggregationCollector::new(build_segment_agg_collectors_root(&mut agg_data)?);
let mut result =
LowCardCachedSubAggs::new(build_segment_agg_collectors_root(&mut agg_data)?);
result
.get_sub_agg_collector()
.prepare_max_bucket(0, &agg_data)?; // prepare for bucket zero
Ok(AggregationSegmentCollector {
aggs_with_accessor: agg_data,
@@ -170,26 +173,31 @@ impl SegmentCollector for AggregationSegmentCollector {
if self.error.is_some() {
return;
}
if let Err(err) = self
self.agg_collector.push(0, doc);
match self
.agg_collector
.collect(doc, &mut self.aggs_with_accessor)
.check_flush_local(&mut self.aggs_with_accessor)
{
self.error = Some(err);
Ok(_) => {}
Err(e) => {
self.error = Some(e);
}
}
}
/// The query pushes the documents to the collector via this method.
///
/// Only valid for Collectors that ignore docs
fn collect_block(&mut self, docs: &[DocId]) {
if self.error.is_some() {
return;
}
if let Err(err) = self
.agg_collector
.collect_block(docs, &mut self.aggs_with_accessor)
{
self.error = Some(err);
match self.agg_collector.get_sub_agg_collector().collect(
0,
docs,
&mut self.aggs_with_accessor,
) {
Ok(_) => {}
Err(e) => {
self.error = Some(e);
}
}
}
@@ -200,10 +208,13 @@ impl SegmentCollector for AggregationSegmentCollector {
self.agg_collector.flush(&mut self.aggs_with_accessor)?;
let mut sub_aggregation_res = IntermediateAggregationResults::default();
Box::new(self.agg_collector).add_intermediate_aggregation_result(
&self.aggs_with_accessor,
&mut sub_aggregation_res,
)?;
self.agg_collector
.get_sub_agg_collector()
.add_intermediate_aggregation_result(
&self.aggs_with_accessor,
&mut sub_aggregation_res,
0,
)?;
Ok(sub_aggregation_res)
}

View File

@@ -792,7 +792,7 @@ pub struct IntermediateRangeBucketEntry {
/// The number of documents in the bucket.
pub doc_count: u64,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
pub sub_aggregation_res: IntermediateAggregationResults,
/// The from range of the bucket. Equals `f64::MIN` when `None`.
pub from: Option<f64>,
/// The to range of the bucket. Equals `f64::MAX` when `None`.
@@ -811,7 +811,7 @@ impl IntermediateRangeBucketEntry {
key: self.key.into(),
doc_count: self.doc_count,
sub_aggregation: self
.sub_aggregation
.sub_aggregation_res
.into_final_result_internal(req, limits)?,
to: self.to,
from: self.from,
@@ -857,7 +857,8 @@ impl MergeFruits for IntermediateTermBucketEntry {
impl MergeFruits for IntermediateRangeBucketEntry {
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) -> crate::Result<()> {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(other.sub_aggregation)?;
self.sub_aggregation_res
.merge_fruits(other.sub_aggregation_res)?;
Ok(())
}
}
@@ -887,7 +888,7 @@ mod tests {
IntermediateRangeBucketEntry {
key: IntermediateKey::Str(key.to_string()),
doc_count: *doc_count,
sub_aggregation: Default::default(),
sub_aggregation_res: Default::default(),
from: None,
to: None,
},
@@ -920,7 +921,7 @@ mod tests {
doc_count: *doc_count,
from: None,
to: None,
sub_aggregation: get_sub_test_tree(&[(
sub_aggregation_res: get_sub_test_tree(&[(
sub_aggregation_key.to_string(),
*sub_aggregation_count,
)]),

View File

@@ -52,10 +52,8 @@ pub struct IntermediateAverage {
impl IntermediateAverage {
/// Creates a new [`IntermediateAverage`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self {
stats: collector.stats,
}
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
Self { stats }
}
/// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateAverage) {

View File

@@ -2,7 +2,7 @@ use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hasher};
use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{Column, ColumnBlockAccessor, ColumnType, Dictionary, StrColumn};
use columnar::{Column, ColumnType, Dictionary, StrColumn};
use common::f64_to_u64;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use rustc_hash::FxHashSet;
@@ -106,8 +106,6 @@ pub struct CardinalityAggReqData {
pub str_dict_column: Option<StrColumn>,
/// The missing value normalized to the internal u64 representation of the field type.
pub missing_value_for_accessor: Option<u64>,
/// The column block accessor to access the fast field values.
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
/// The name of the aggregation.
pub name: String,
/// The aggregation request.
@@ -135,45 +133,34 @@ impl CardinalityAggregationReq {
}
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug)]
pub(crate) struct SegmentCardinalityCollector {
cardinality: CardinalityCollector,
entries: FxHashSet<u64>,
buckets: Vec<SegmentCardinalityCollectorBucket>,
accessor_idx: usize,
/// The column accessor to access the fast field values.
accessor: Column<u64>,
/// The column_type of the field.
column_type: ColumnType,
/// The missing value normalized to the internal u64 representation of the field type.
missing_value_for_accessor: Option<u64>,
}
impl SegmentCardinalityCollector {
pub fn from_req(column_type: ColumnType, accessor_idx: usize) -> Self {
#[derive(Clone, Debug, PartialEq, Default)]
pub(crate) struct SegmentCardinalityCollectorBucket {
cardinality: CardinalityCollector,
entries: FxHashSet<u64>,
}
impl SegmentCardinalityCollectorBucket {
pub fn new(column_type: ColumnType) -> Self {
Self {
cardinality: CardinalityCollector::new(column_type as u8),
entries: Default::default(),
accessor_idx,
entries: FxHashSet::default(),
}
}
fn fetch_block_with_field(
&mut self,
docs: &[crate::DocId],
agg_data: &mut CardinalityAggReqData,
) {
if let Some(missing) = agg_data.missing_value_for_accessor {
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&agg_data.accessor,
missing,
);
} else {
agg_data
.column_block_accessor
.fetch_block(docs, &agg_data.accessor);
}
}
fn into_intermediate_metric_result(
mut self,
agg_data: &AggregationsSegmentCtx,
req_data: &CardinalityAggReqData,
) -> crate::Result<IntermediateMetricResult> {
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
if req_data.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let dict = req_data
@@ -194,6 +181,7 @@ impl SegmentCardinalityCollector {
term_ids.push(term_ord as u32);
}
}
term_ids.sort_unstable();
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
self.cardinality.sketch.insert_any(&term);
@@ -227,16 +215,49 @@ impl SegmentCardinalityCollector {
}
}
impl SegmentCardinalityCollector {
pub fn from_req(
column_type: ColumnType,
accessor_idx: usize,
accessor: Column<u64>,
missing_value_for_accessor: Option<u64>,
) -> Self {
Self {
buckets: vec![SegmentCardinalityCollectorBucket::new(column_type); 1],
column_type,
accessor_idx,
accessor,
missing_value_for_accessor,
}
}
fn fetch_block_with_field(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) {
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&self.accessor,
self.missing_value_for_accessor,
);
}
}
impl SegmentAggregationCollector for SegmentCardinalityCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let req_data = &agg_data.get_cardinality_req_data(self.accessor_idx);
let name = req_data.name.to_string();
// take the bucket in buckets and replace it with a new empty one
let bucket = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
let intermediate_result = self.into_intermediate_metric_result(agg_data)?;
let intermediate_result = bucket.into_intermediate_metric_result(req_data)?;
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
@@ -247,27 +268,20 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)
}
fn collect_block(
&mut self,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_cardinality_req_data_mut(self.accessor_idx);
self.fetch_block_with_field(docs, req_data);
self.fetch_block_with_field(docs, agg_data);
let bucket = &mut self.buckets[parent_bucket_id as usize];
let col_block_accessor = &req_data.column_block_accessor;
if req_data.column_type == ColumnType::Str {
let col_block_accessor = &agg_data.column_block_accessor;
if self.column_type == ColumnType::Str {
for term_ord in col_block_accessor.iter_vals() {
self.entries.insert(term_ord);
bucket.entries.insert(term_ord);
}
} else if req_data.column_type == ColumnType::IpAddr {
let compact_space_accessor = req_data
} else if self.column_type == ColumnType::IpAddr {
let compact_space_accessor = self
.accessor
.values
.clone()
@@ -282,16 +296,29 @@ impl SegmentAggregationCollector for SegmentCardinalityCollector {
})?;
for val in col_block_accessor.iter_vals() {
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
self.cardinality.sketch.insert_any(&val);
bucket.cardinality.sketch.insert_any(&val);
}
} else {
for val in col_block_accessor.iter_vals() {
self.cardinality.sketch.insert_any(&val);
bucket.cardinality.sketch.insert_any(&val);
}
}
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
if max_bucket as usize >= self.buckets.len() {
self.buckets.resize_with(max_bucket as usize + 1, || {
SegmentCardinalityCollectorBucket::new(self.column_type)
});
}
Ok(())
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]

View File

@@ -52,10 +52,8 @@ pub struct IntermediateCount {
impl IntermediateCount {
/// Creates a new [`IntermediateCount`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self {
stats: collector.stats,
}
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
Self { stats }
}
/// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateCount) {

View File

@@ -8,10 +8,9 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::metric::MetricAggReqData;
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*;
use crate::{DocId, TantivyError};
use crate::TantivyError;
/// A multi-value metric aggregation that computes a collection of extended statistics
/// on numeric values that are extracted
@@ -318,51 +317,28 @@ impl IntermediateExtendedStats {
}
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug)]
pub(crate) struct SegmentExtendedStatsCollector {
name: String,
missing: Option<u64>,
field_type: ColumnType,
pub(crate) extended_stats: IntermediateExtendedStats,
pub(crate) accessor_idx: usize,
val_cache: Vec<u64>,
accessor: columnar::Column<u64>,
buckets: Vec<IntermediateExtendedStats>,
sigma: Option<f64>,
}
impl SegmentExtendedStatsCollector {
pub fn from_req(
field_type: ColumnType,
sigma: Option<f64>,
accessor_idx: usize,
missing: Option<f64>,
) -> Self {
let missing = missing.and_then(|val| f64_to_fastfield_u64(val, &field_type));
pub fn from_req(req: &MetricAggReqData, sigma: Option<f64>) -> Self {
let missing = req
.missing
.and_then(|val| f64_to_fastfield_u64(val, &req.field_type));
Self {
field_type,
extended_stats: IntermediateExtendedStats::with_sigma(sigma),
accessor_idx,
name: req.name.clone(),
field_type: req.field_type,
accessor: req.accessor.clone(),
missing,
val_cache: Default::default(),
}
}
#[inline]
pub(crate) fn collect_block_with_field(
&mut self,
docs: &[DocId],
req_data: &mut MetricAggReqData,
) {
if let Some(missing) = self.missing.as_ref() {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
*missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
for val in req_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.extended_stats.collect(val1);
buckets: vec![IntermediateExtendedStats::with_sigma(sigma); 16],
sigma,
}
}
}
@@ -370,15 +346,18 @@ impl SegmentExtendedStatsCollector {
impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
#[inline]
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone();
let name = self.name.clone();
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let extended_stats = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
results.push(
name,
IntermediateAggregationResult::Metric(IntermediateMetricResult::ExtendedStats(
self.extended_stats,
extended_stats,
)),
)?;
@@ -388,39 +367,36 @@ impl SegmentAggregationCollector for SegmentExtendedStatsCollector {
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_metric_req_data(self.accessor_idx);
if let Some(missing) = self.missing {
let mut has_val = false;
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.extended_stats.collect(val1);
has_val = true;
}
if !has_val {
self.extended_stats
.collect(f64_from_fastfield_u64(missing, &self.field_type));
}
} else {
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.extended_stats.collect(val1);
}
let mut extended_stats = self.buckets[parent_bucket_id as usize].clone();
agg_data
.column_block_accessor
.fetch_block_with_missing(docs, &self.accessor, self.missing);
for val in agg_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, self.field_type);
extended_stats.collect(val1);
}
// store back
self.buckets[parent_bucket_id as usize] = extended_stats;
Ok(())
}
#[inline]
fn collect_block(
fn prepare_max_bucket(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
self.collect_block_with_field(docs, req_data);
if self.buckets.len() <= max_bucket as usize {
self.buckets.resize_with(max_bucket as usize + 1, || {
IntermediateExtendedStats::with_sigma(self.sigma)
});
}
Ok(())
}
}

View File

@@ -52,10 +52,8 @@ pub struct IntermediateMax {
impl IntermediateMax {
/// Creates a new [`IntermediateMax`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self {
stats: collector.stats,
}
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
Self { stats }
}
/// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateMax) {

View File

@@ -52,10 +52,8 @@ pub struct IntermediateMin {
impl IntermediateMin {
/// Creates a new [`IntermediateMin`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self {
stats: collector.stats,
}
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
Self { stats }
}
/// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateMin) {

View File

@@ -31,7 +31,7 @@ use std::collections::HashMap;
pub use average::*;
pub use cardinality::*;
use columnar::{Column, ColumnBlockAccessor, ColumnType};
use columnar::{Column, ColumnType};
pub use count::*;
pub use extended_stats::*;
pub use max::*;
@@ -55,8 +55,6 @@ pub struct MetricAggReqData {
pub field_type: ColumnType,
/// The missing value normalized to the internal u64 representation of the field type.
pub missing_u64: Option<u64>,
/// The column block accessor to access the fast field values.
pub column_block_accessor: ColumnBlockAccessor<u64>,
/// The column accessor to access the fast field values.
pub accessor: Column<u64>,
/// Used when converting to intermediate result

View File

@@ -7,10 +7,9 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::metric::MetricAggReqData;
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*;
use crate::{DocId, TantivyError};
use crate::TantivyError;
/// # Percentiles
///
@@ -131,10 +130,16 @@ impl PercentilesAggregationReq {
}
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug)]
pub(crate) struct SegmentPercentilesCollector {
pub(crate) percentiles: PercentilesCollector,
pub(crate) buckets: Vec<PercentilesCollector>,
pub(crate) accessor_idx: usize,
/// The type of the field.
pub field_type: ColumnType,
/// The missing value normalized to the internal u64 representation of the field type.
pub missing_u64: Option<u64>,
/// The column accessor to access the fast field values.
pub accessor: Column<u64>,
}
#[derive(Clone, Serialize, Deserialize)]
@@ -229,33 +234,18 @@ impl PercentilesCollector {
}
impl SegmentPercentilesCollector {
pub fn from_req_and_validate(accessor_idx: usize) -> crate::Result<Self> {
Ok(Self {
percentiles: PercentilesCollector::new(),
pub fn from_req_and_validate(
field_type: ColumnType,
missing_u64: Option<u64>,
accessor: Column<u64>,
accessor_idx: usize,
) -> Self {
Self {
buckets: Vec::with_capacity(64),
field_type,
missing_u64,
accessor,
accessor_idx,
})
}
#[inline]
pub(crate) fn collect_block_with_field(
&mut self,
docs: &[DocId],
req_data: &mut MetricAggReqData,
) {
if let Some(missing) = req_data.missing_u64.as_ref() {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
*missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
for val in req_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.percentiles.collect(val1);
}
}
}
@@ -263,12 +253,18 @@ impl SegmentPercentilesCollector {
impl SegmentAggregationCollector for SegmentPercentilesCollector {
#[inline]
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
let name = agg_data.get_metric_req_data(self.accessor_idx).name.clone();
let intermediate_metric_result = IntermediateMetricResult::Percentiles(self.percentiles);
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
// Swap collector with an empty one to avoid cloning
let percentiles_collector = std::mem::take(&mut self.buckets[parent_bucket_id as usize]);
let intermediate_metric_result =
IntermediateMetricResult::Percentiles(percentiles_collector);
results.push(
name,
@@ -281,40 +277,33 @@ impl SegmentAggregationCollector for SegmentPercentilesCollector {
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_metric_req_data(self.accessor_idx);
let percentiles = &mut self.buckets[parent_bucket_id as usize];
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&self.accessor,
self.missing_u64,
);
if let Some(missing) = req_data.missing_u64 {
let mut has_val = false;
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.percentiles.collect(val1);
has_val = true;
}
if !has_val {
self.percentiles
.collect(f64_from_fastfield_u64(missing, &req_data.field_type));
}
} else {
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.percentiles.collect(val1);
}
for val in agg_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, self.field_type);
percentiles.collect(val1);
}
Ok(())
}
#[inline]
fn collect_block(
fn prepare_max_bucket(
&mut self,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
self.collect_block_with_field(docs, req_data);
while self.buckets.len() <= max_bucket as usize {
self.buckets.push(PercentilesCollector::new());
}
Ok(())
}
}

View File

@@ -1,5 +1,6 @@
use std::fmt::Debug;
use columnar::{Column, ColumnType};
use serde::{Deserialize, Serialize};
use super::*;
@@ -7,10 +8,9 @@ use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::metric::MetricAggReqData;
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*;
use crate::{DocId, TantivyError};
use crate::TantivyError;
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that
/// are extracted from the aggregated documents.
@@ -83,7 +83,7 @@ impl Stats {
/// Intermediate result of the stats aggregation that can be combined with other intermediate
/// results.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {
/// The number of extracted values.
pub(crate) count: u64,
@@ -187,75 +187,75 @@ pub enum StatsType {
Percentiles,
}
fn create_collector<const TYPE_ID: u8>(
req: &MetricAggReqData,
) -> Box<dyn SegmentAggregationCollector> {
Box::new(SegmentStatsCollector::<TYPE_ID> {
name: req.name.clone(),
collecting_for: req.collecting_for,
is_number_or_date_type: req.is_number_or_date_type,
missing_u64: req.missing_u64,
accessor: req.accessor.clone(),
buckets: vec![IntermediateStats::default()],
})
}
/// Build a concrete `SegmentStatsCollector` depending on the column type.
pub(crate) fn build_segment_stats_collector(
req: &MetricAggReqData,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
match req.field_type {
ColumnType::I64 => Ok(create_collector::<{ ColumnType::I64 as u8 }>(req)),
ColumnType::U64 => Ok(create_collector::<{ ColumnType::U64 as u8 }>(req)),
ColumnType::F64 => Ok(create_collector::<{ ColumnType::F64 as u8 }>(req)),
ColumnType::Bool => Ok(create_collector::<{ ColumnType::Bool as u8 }>(req)),
ColumnType::DateTime => Ok(create_collector::<{ ColumnType::DateTime as u8 }>(req)),
ColumnType::Bytes => Ok(create_collector::<{ ColumnType::Bytes as u8 }>(req)),
ColumnType::Str => Ok(create_collector::<{ ColumnType::Str as u8 }>(req)),
ColumnType::IpAddr => Ok(create_collector::<{ ColumnType::IpAddr as u8 }>(req)),
}
}
#[repr(C)]
#[derive(Clone, Debug)]
pub(crate) struct SegmentStatsCollector {
pub(crate) stats: IntermediateStats,
pub(crate) accessor_idx: usize,
pub(crate) struct SegmentStatsCollector<const COLUMN_TYPE_ID: u8> {
pub(crate) missing_u64: Option<u64>,
pub(crate) accessor: Column<u64>,
pub(crate) is_number_or_date_type: bool,
pub(crate) buckets: Vec<IntermediateStats>,
pub(crate) name: String,
pub(crate) collecting_for: StatsType,
}
impl SegmentStatsCollector {
pub fn from_req(accessor_idx: usize) -> Self {
Self {
stats: IntermediateStats::default(),
accessor_idx,
}
}
#[inline]
pub(crate) fn collect_block_with_field(
&mut self,
docs: &[DocId],
req_data: &mut MetricAggReqData,
) {
if let Some(missing) = req_data.missing_u64.as_ref() {
req_data.column_block_accessor.fetch_block_with_missing(
docs,
&req_data.accessor,
*missing,
);
} else {
req_data
.column_block_accessor
.fetch_block(docs, &req_data.accessor);
}
if req_data.is_number_or_date_type {
for val in req_data.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.stats.collect(val1);
}
} else {
for _val in req_data.column_block_accessor.iter_vals() {
// we ignore the value and simply record that we got something
self.stats.collect(0.0);
}
}
}
}
impl SegmentAggregationCollector for SegmentStatsCollector {
impl<const COLUMN_TYPE_ID: u8> SegmentAggregationCollector
for SegmentStatsCollector<COLUMN_TYPE_ID>
{
#[inline]
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
let req = agg_data.get_metric_req_data(self.accessor_idx);
let name = req.name.clone();
let name = self.name.clone();
let intermediate_metric_result = match req.collecting_for {
self.prepare_max_bucket(parent_bucket_id, agg_data)?;
let stats = self.buckets[parent_bucket_id as usize];
let intermediate_metric_result = match self.collecting_for {
StatsType::Average => {
IntermediateMetricResult::Average(IntermediateAverage::from_collector(*self))
IntermediateMetricResult::Average(IntermediateAverage::from_stats(stats))
}
StatsType::Count => {
IntermediateMetricResult::Count(IntermediateCount::from_collector(*self))
IntermediateMetricResult::Count(IntermediateCount::from_stats(stats))
}
StatsType::Max => IntermediateMetricResult::Max(IntermediateMax::from_collector(*self)),
StatsType::Min => IntermediateMetricResult::Min(IntermediateMin::from_collector(*self)),
StatsType::Stats => IntermediateMetricResult::Stats(self.stats),
StatsType::Sum => IntermediateMetricResult::Sum(IntermediateSum::from_collector(*self)),
StatsType::Max => IntermediateMetricResult::Max(IntermediateMax::from_stats(stats)),
StatsType::Min => IntermediateMetricResult::Min(IntermediateMin::from_stats(stats)),
StatsType::Stats => IntermediateMetricResult::Stats(stats),
StatsType::Sum => IntermediateMetricResult::Sum(IntermediateSum::from_stats(stats)),
_ => {
return Err(TantivyError::InvalidArgument(format!(
"Unsupported stats type for stats aggregation: {:?}",
req.collecting_for
self.collecting_for
)))
}
};
@@ -271,41 +271,67 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_metric_req_data(self.accessor_idx);
if let Some(missing) = req_data.missing_u64 {
let mut has_val = false;
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.stats.collect(val1);
has_val = true;
}
if !has_val {
self.stats
.collect(f64_from_fastfield_u64(missing, &req_data.field_type));
}
} else {
for val in req_data.accessor.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &req_data.field_type);
self.stats.collect(val1);
}
}
Ok(())
}
#[inline]
fn collect_block(
&mut self,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_metric_req_data_mut(self.accessor_idx);
self.collect_block_with_field(docs, req_data);
// TODO: remove once we fetch all values for all bucket ids in one go
if docs.len() == 1 && self.missing_u64.is_none() {
collect_stats::<COLUMN_TYPE_ID>(
&mut self.buckets[parent_bucket_id as usize],
self.accessor.values_for_doc(docs[0]),
self.is_number_or_date_type,
)?;
return Ok(());
}
agg_data.column_block_accessor.fetch_block_with_missing(
docs,
&self.accessor,
self.missing_u64,
);
collect_stats::<COLUMN_TYPE_ID>(
&mut self.buckets[parent_bucket_id as usize],
agg_data.column_block_accessor.iter_vals(),
self.is_number_or_date_type,
)?;
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
let required_buckets = (max_bucket as usize) + 1;
if self.buckets.len() < required_buckets {
self.buckets
.resize_with(required_buckets, IntermediateStats::default);
}
Ok(())
}
}
#[inline]
fn collect_stats<const COLUMN_TYPE_ID: u8>(
stats: &mut IntermediateStats,
vals: impl Iterator<Item = u64>,
is_number_or_date_type: bool,
) -> crate::Result<()> {
if is_number_or_date_type {
for val in vals {
let val1 = convert_to_f64::<COLUMN_TYPE_ID>(val);
stats.collect(val1);
}
} else {
for _val in vals {
// we ignore the value and simply record that we got something
stats.collect(0.0);
}
}
Ok(())
}
#[cfg(test)]

View File

@@ -52,10 +52,8 @@ pub struct IntermediateSum {
impl IntermediateSum {
/// Creates a new [`IntermediateSum`] instance from a [`SegmentStatsCollector`].
pub(crate) fn from_collector(collector: SegmentStatsCollector) -> Self {
Self {
stats: collector.stats,
}
pub(crate) fn from_stats(stats: IntermediateStats) -> Self {
Self { stats }
}
/// Merges the other intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateSum) {

View File

@@ -15,12 +15,11 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::AggregationError;
use crate::aggregation::{AggregationError, BucketId};
use crate::collector::sort_key::ReverseComparator;
use crate::collector::TopNComputer;
use crate::schema::OwnedValue;
use crate::{DocAddress, DocId, SegmentOrdinal};
// duplicate import removed; already imported above
/// Contains all information required by the TopHitsSegmentCollector to perform the
/// top_hits aggregation on a segment.
@@ -472,7 +471,10 @@ impl TopHitsTopNComputer {
/// Create a new TopHitsCollector
pub fn new(req: &TopHitsAggregationReq) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
top_n: TopNComputer::new_with_comparator(
req.size + req.from.unwrap_or(0),
ReverseComparator,
),
req: req.clone(),
}
}
@@ -518,7 +520,8 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
accessor_idx: usize,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, ReverseComparator>,
buckets: Vec<TopNComputer<Vec<DocValueAndOrder>, DocAddress, ReverseComparator>>,
num_hits: usize,
}
impl TopHitsSegmentCollector {
@@ -527,19 +530,29 @@ impl TopHitsSegmentCollector {
accessor_idx: usize,
segment_ordinal: SegmentOrdinal,
) -> Self {
let num_hits = req.size + req.from.unwrap_or(0);
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
num_hits,
segment_ordinal,
accessor_idx,
buckets: vec![TopNComputer::new_with_comparator(num_hits, ReverseComparator); 1],
}
}
fn into_top_hits_collector(
self,
fn get_top_hits_computer(
&mut self,
parent_bucket_id: BucketId,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregationReq,
) -> TopHitsTopNComputer {
if parent_bucket_id as usize >= self.buckets.len() {
return TopHitsTopNComputer::new(req);
}
let top_n = std::mem::replace(
&mut self.buckets[parent_bucket_id as usize],
TopNComputer::new(0),
);
let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec();
let top_results = top_n.into_vec();
for res in top_results {
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
@@ -554,54 +567,24 @@ impl TopHitsSegmentCollector {
top_hits_computer
}
/// TODO add a specialized variant for a single sort field
fn collect_with(
&mut self,
doc_id: crate::DocId,
req: &TopHitsAggregationReq,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
.map(|(idx, KeyOrder { order, .. })| {
let order = *order;
let value = accessors
.get(idx)
.expect("could not find field in accessors")
.0
.values_for_doc(doc_id)
.next();
DocValueAndOrder { value, order }
})
.collect();
self.top_n.push(
sorts,
DocAddress {
segment_ord: self.segment_ordinal,
doc_id,
},
);
Ok(())
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
let value_accessors = &req_data.value_accessors;
let intermediate_result = IntermediateMetricResult::TopHits(
self.into_top_hits_collector(value_accessors, &req_data.req),
);
let intermediate_result = IntermediateMetricResult::TopHits(self.get_top_hits_computer(
parent_bucket_id,
value_accessors,
&req_data.req,
));
results.push(
req_data.name.to_string(),
IntermediateAggregationResult::Metric(intermediate_result),
@@ -611,26 +594,56 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
/// TODO: Consider a caching layer to reduce the call overhead
fn collect(
&mut self,
doc_id: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
self.collect_with(doc_id, &req_data.req, &req_data.accessors)?;
Ok(())
}
fn collect_block(
&mut self,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
let top_n = &mut self.buckets[parent_bucket_id as usize];
let req_data = agg_data.get_top_hits_req_data(self.accessor_idx);
// TODO: Consider getting fields with the column block accessor.
for doc in docs {
self.collect_with(*doc, &req_data.req, &req_data.accessors)?;
let req = &req_data.req;
let accessors = &req_data.accessors;
for &doc_id in docs {
// TODO: this is terrible, a new vec is allocated for every doc
// We can fetch blocks instead
// We don't need to store the order for every value
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
.map(|(idx, KeyOrder { order, .. })| {
let order = *order;
let value = accessors
.get(idx)
.expect("could not find field in accessors")
.0
.values_for_doc(doc_id)
.next();
DocValueAndOrder { value, order }
})
.collect();
top_n.push(
sorts,
DocAddress {
segment_ord: self.segment_ordinal,
doc_id,
},
);
}
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
_agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
self.buckets.resize(
(max_bucket as usize) + 1,
TopNComputer::new_with_comparator(self.num_hits, ReverseComparator),
);
Ok(())
}
}
#[cfg(test)]
@@ -746,7 +759,7 @@ mod tests {
],
"from": 0,
}
}
}
}))
.unwrap();
@@ -875,7 +888,7 @@ mod tests {
"mixed.*",
],
}
}
}
}))?;
let collector = AggregationCollector::from_aggs(d, Default::default());

View File

@@ -133,7 +133,7 @@ mod agg_limits;
pub mod agg_req;
pub mod agg_result;
pub mod bucket;
mod buf_collector;
pub(crate) mod cached_sub_aggs;
mod collector;
mod date;
mod error;
@@ -162,6 +162,19 @@ use serde::{Deserialize, Deserializer, Serialize};
use crate::tokenizer::TokenizerManager;
/// A bucket id is a dense identifier for a bucket within an aggregation.
/// It is used to index into a Vec that hold per-bucket data.
///
/// For example, in a terms aggregation, each unique term will be assigned a incremental BucketId.
/// This BucketId will be forwarded to sub-aggregations to identify the parent bucket.
///
/// This allows to have a single AggregationCollector instance per aggregation,
/// that can handle multiple buckets efficiently.
///
/// The API to call sub-aggregations is therefore a &[(BucketId, &[DocId])].
/// For that we'll need a buffer. One Vec per bucket aggregation is needed.
pub type BucketId = u32;
/// Context parameters for aggregation execution
///
/// This struct holds shared resources needed during aggregation execution:
@@ -335,19 +348,37 @@ impl Display for Key {
}
}
pub(crate) fn convert_to_f64<const COLUMN_TYPE_ID: u8>(val: u64) -> f64 {
if COLUMN_TYPE_ID == ColumnType::U64 as u8 {
val as f64
} else if COLUMN_TYPE_ID == ColumnType::I64 as u8
|| COLUMN_TYPE_ID == ColumnType::DateTime as u8
{
i64::from_u64(val) as f64
} else if COLUMN_TYPE_ID == ColumnType::F64 as u8 {
f64::from_u64(val)
} else if COLUMN_TYPE_ID == ColumnType::Bool as u8 {
val as f64
} else {
panic!(
"ColumnType ID {} cannot be converted to f64 metric",
COLUMN_TYPE_ID
)
}
}
/// Inverse of `to_fastfield_u64`. Used to convert to `f64` for metrics.
///
/// # Panics
/// Only `u64`, `f64`, `date`, and `i64` are supported.
pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &ColumnType) -> f64 {
pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: ColumnType) -> f64 {
match field_type {
ColumnType::U64 => val as f64,
ColumnType::I64 | ColumnType::DateTime => i64::from_u64(val) as f64,
ColumnType::F64 => f64::from_u64(val),
ColumnType::Bool => val as f64,
_ => {
panic!("unexpected type {field_type:?}. This should not happen")
}
ColumnType::U64 => convert_to_f64::<{ ColumnType::U64 as u8 }>(val),
ColumnType::I64 => convert_to_f64::<{ ColumnType::I64 as u8 }>(val),
ColumnType::F64 => convert_to_f64::<{ ColumnType::F64 as u8 }>(val),
ColumnType::Bool => convert_to_f64::<{ ColumnType::Bool as u8 }>(val),
ColumnType::DateTime => convert_to_f64::<{ ColumnType::DateTime as u8 }>(val),
_ => panic!("unexpected type {field_type:?}. This should not happen"),
}
}

View File

@@ -8,25 +8,67 @@ use std::fmt::Debug;
pub(crate) use super::agg_limits::AggregationLimitsGuard;
use super::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::agg_data::AggregationsSegmentCtx;
use crate::aggregation::BucketId;
/// Monotonically increasing provider of BucketIds.
#[derive(Debug, Clone, Default)]
pub struct BucketIdProvider(u32);
impl BucketIdProvider {
/// Get the next BucketId.
pub fn next_bucket_id(&mut self) -> BucketId {
let bucket_id = self.0;
self.0 += 1;
bucket_id
}
}
/// A SegmentAggregationCollector is used to collect aggregation results.
pub trait SegmentAggregationCollector: CollectorClone + Debug {
pub trait SegmentAggregationCollector: Debug {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()>;
/// Note: The caller needs to call `prepare_max_bucket` before calling `collect`.
fn collect(
&mut self,
doc: crate::DocId,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()>;
fn collect_block(
/// Collect docs for multiple buckets in one call.
/// Minimizes dynamic dispatch overhead when collecting many buckets.
///
/// Note: The caller needs to call `prepare_max_bucket` before calling `collect`.
fn collect_multiple(
&mut self,
bucket_ids: &[BucketId],
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
debug_assert_eq!(bucket_ids.len(), docs.len());
let mut start = 0;
while start < bucket_ids.len() {
let bucket_id = bucket_ids[start];
let mut end = start + 1;
while end < bucket_ids.len() && bucket_ids[end] == bucket_id {
end += 1;
}
self.collect(bucket_id, &docs[start..end], agg_data)?;
start = end;
}
Ok(())
}
/// Prepare the collector for collecting up to BucketId `max_bucket`.
/// This is useful so we can split allocation ahead of time of collecting.
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()>;
/// Finalize method. Some Aggregator collect blocks of docs before calling `collect_block`.
@@ -36,26 +78,7 @@ pub trait SegmentAggregationCollector: CollectorClone + Debug {
}
}
/// A helper trait to enable cloning of Box<dyn SegmentAggregationCollector>
pub trait CollectorClone {
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector>;
}
impl<T> CollectorClone for T
where T: 'static + SegmentAggregationCollector + Clone
{
fn clone_box(&self) -> Box<dyn SegmentAggregationCollector> {
Box::new(self.clone())
}
}
impl Clone for Box<dyn SegmentAggregationCollector> {
fn clone(&self) -> Box<dyn SegmentAggregationCollector> {
self.clone_box()
}
}
#[derive(Clone, Default)]
#[derive(Default)]
/// The GenericSegmentAggregationResultsCollector is the generic version of the collector, which
/// can handle arbitrary complexity of sub-aggregations. Ideally we never have to pick this one
/// and can provide specialized versions instead, that remove some of its overhead.
@@ -73,12 +96,13 @@ impl Debug for GenericSegmentAggregationResultsCollector {
impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
&mut self,
agg_data: &AggregationsSegmentCtx,
results: &mut IntermediateAggregationResults,
parent_bucket_id: BucketId,
) -> crate::Result<()> {
for agg in self.aggs {
agg.add_intermediate_aggregation_result(agg_data, results)?;
for agg in &mut self.aggs {
agg.add_intermediate_aggregation_result(agg_data, results, parent_bucket_id)?;
}
Ok(())
@@ -86,23 +110,13 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
fn collect(
&mut self,
doc: crate::DocId,
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_data)?;
Ok(())
}
fn collect_block(
&mut self,
parent_bucket_id: BucketId,
docs: &[crate::DocId],
agg_data: &mut AggregationsSegmentCtx,
) -> crate::Result<()> {
for collector in &mut self.aggs {
collector.collect_block(docs, agg_data)?;
collector.collect(parent_bucket_id, docs, agg_data)?;
}
Ok(())
}
@@ -112,4 +126,15 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
}
Ok(())
}
fn prepare_max_bucket(
&mut self,
max_bucket: BucketId,
agg_data: &AggregationsSegmentCtx,
) -> crate::Result<()> {
for collector in &mut self.aggs {
collector.prepare_max_bucket(max_bucket, agg_data)?;
}
Ok(())
}
}

169
src/codec/mod.rs Normal file
View File

@@ -0,0 +1,169 @@
/// Codec specific to postings data.
pub mod postings;
/// Standard tantivy codec. This is the codec you use by default.
pub mod standard;
use std::io;
pub use standard::StandardCodec;
use crate::codec::postings::PostingsCodec;
use crate::fieldnorm::FieldNormReader;
use crate::postings::{Postings, TermInfo};
use crate::query::{box_scorer, Bm25Weight, Scorer};
use crate::schema::IndexRecordOption;
use crate::{DocId, InvertedIndexReader, Score};
/// Codecs describes how data is layed out on disk.
///
/// For the moment, only postings codec can be custom.
pub trait Codec: Clone + std::fmt::Debug + Send + Sync + 'static {
/// The specific postings type used by this codec.
type PostingsCodec: PostingsCodec;
/// Name of the codec. It should be unique to your codec.
const NAME: &'static str;
/// Load codec based on the codec configuration.
fn from_json_props(json_value: &serde_json::Value) -> crate::Result<Self>;
/// Get codec configuration.
fn to_json_props(&self) -> serde_json::Value;
/// Returns the postings codec.
fn postings_codec(&self) -> &Self::PostingsCodec;
}
/// Object-safe codec is a Codec that can be used in a trait object.
///
/// The point of it is to offer a way to use a codec without a proliferation of generics.
pub trait ObjectSafeCodec: 'static + Send + Sync {
/// Loads a type-erased Postings object for the given term.
///
/// If the schema used to build the index did not provide enough
/// information to match the requested `option`, a Postings is still
/// returned in a best-effort manner.
fn load_postings_type_erased(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Postings>>;
/// Loads a type-erased TermScorer object for the given term.
///
/// If the schema used to build the index did not provide enough
/// information to match the requested `option`, a TermScorer is still
/// returned in a best-effort manner.
///
/// The point of this contraption is that the return TermScorer is backed,
/// not by Box<dyn Postings> but by the codec's concrete Postings type.
fn load_term_scorer_type_erased(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &InvertedIndexReader,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
) -> io::Result<Box<dyn Scorer>>;
/// Loads a type-erased PhraseScorer object for the given term.
///
/// If the schema used to build the index did not provide enough
/// information to match the requested `option`, a TermScorer is still
/// returned in a best-effort manner.
///
/// The point of this contraption is that the return PhraseScorer is backed,
/// not by Box<dyn Postings> but by the codec's concrete Postings type.
fn new_phrase_scorer_type_erased(
&self,
term_infos: &[(usize, TermInfo)],
similarity_weight: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Scorer>>;
/// Performs a for_each_pruning operation on the given scorer.
///
/// The function will go through matching documents and call the callback
/// function for all docs with a score exceeding the threshold.
///
/// The function itself will return a larger threshold value,
/// meant to update the threshold value.
///
/// If the codec and the scorer allow it, this function can rely on
/// optimizations like the block-max wand.
fn for_each_pruning(
&self,
threshold: Score,
scorer: Box<dyn Scorer>,
callback: &mut dyn FnMut(DocId, Score) -> Score,
);
}
impl<TCodec: Codec> ObjectSafeCodec for TCodec {
fn load_postings_type_erased(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Postings>> {
let postings = inverted_index_reader
.read_postings_from_terminfo_specialized(term_info, option, self)?;
Ok(Box::new(postings))
}
fn load_term_scorer_type_erased(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &InvertedIndexReader,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
) -> io::Result<Box<dyn Scorer>> {
let scorer = inverted_index_reader.new_term_scorer_specialized(
term_info,
option,
fieldnorm_reader,
similarity_weight,
self,
)?;
Ok(box_scorer(scorer))
}
fn new_phrase_scorer_type_erased(
&self,
term_infos: &[(usize, TermInfo)],
similarity_weight: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Scorer>> {
let scorer = inverted_index_reader.new_phrase_scorer_type_specialized(
term_infos,
similarity_weight,
fieldnorm_reader,
slop,
self,
)?;
Ok(box_scorer(scorer))
}
fn for_each_pruning(
&self,
threshold: Score,
scorer: Box<dyn Scorer>,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) {
let accerelerated_foreach_pruning_res =
<TCodec as Codec>::PostingsCodec::try_accelerated_for_each_pruning(
threshold, scorer, callback,
);
if let Err(mut scorer) = accerelerated_foreach_pruning_res {
// No acceleration available. We need to do things manually.
scorer.for_each_pruning(threshold, callback);
}
}
}

View File

@@ -1,5 +1,6 @@
use std::ops::{Deref, DerefMut};
use crate::codec::postings::PostingsWithBlockMax;
use crate::query::term_query::TermScorer;
use crate::query::Scorer;
use crate::{DocId, DocSet, Score, TERMINATED};
@@ -13,8 +14,8 @@ use crate::{DocId, DocSet, Score, TERMINATED};
/// We always have `before_pivot_len` < `pivot_len`.
///
/// `None` is returned if we establish that no document can exceed the threshold.
fn find_pivot_doc(
term_scorers: &[TermScorerWithMaxScore],
fn find_pivot_doc<TPostings: PostingsWithBlockMax>(
term_scorers: &[TermScorerWithMaxScore<TPostings>],
threshold: Score,
) -> Option<(usize, usize, DocId)> {
let mut max_score = 0.0;
@@ -46,8 +47,8 @@ fn find_pivot_doc(
/// the next doc candidate defined by the min of `last_doc_in_block + 1` for
/// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..].
/// Note: before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer(
scorers: &mut [TermScorerWithMaxScore],
fn block_max_was_too_low_advance_one_scorer<TPostings: PostingsWithBlockMax>(
scorers: &mut [TermScorerWithMaxScore<TPostings>],
pivot_len: usize,
) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
@@ -82,7 +83,10 @@ fn block_max_was_too_low_advance_one_scorer(
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
// except term_scorers[ord] that might be in advance compared to its ranks,
// bubble up term_scorers[ord] in order to restore the ordering.
fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) {
fn restore_ordering<TPostings: PostingsWithBlockMax>(
term_scorers: &mut [TermScorerWithMaxScore<TPostings>],
ord: usize,
) {
let doc = term_scorers[ord].doc();
for i in ord + 1..term_scorers.len() {
if term_scorers[i].doc() >= doc {
@@ -97,9 +101,10 @@ fn restore_ordering(term_scorers: &mut [TermScorerWithMaxScore], ord: usize) {
// If this works, return true.
// If this fails (ie: one of the term_scorer does not contain `pivot_doc` and seek goes past the
// pivot), reorder the term_scorers to ensure the list is still sorted and returns `false`.
// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and return.
fn align_scorers(
term_scorers: &mut Vec<TermScorerWithMaxScore>,
// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and
// return.
fn align_scorers<TPostings: PostingsWithBlockMax>(
term_scorers: &mut Vec<TermScorerWithMaxScore<TPostings>>,
pivot_doc: DocId,
before_pivot_len: usize,
) -> bool {
@@ -126,7 +131,10 @@ fn align_scorers(
// Assumes terms_scorers[..pivot_len] are positioned on the same doc (pivot_doc).
// Advance term_scorers[..pivot_len] and out of these removes the terminated scores.
// Restores the ordering of term_scorers.
fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>, pivot_len: usize) {
fn advance_all_scorers_on_pivot<TPostings: PostingsWithBlockMax>(
term_scorers: &mut Vec<TermScorerWithMaxScore<TPostings>>,
pivot_len: usize,
) {
for term_scorer in &mut term_scorers[..pivot_len] {
term_scorer.advance();
}
@@ -145,12 +153,12 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>,
/// Implements the WAND (Weak AND) algorithm for dynamic pruning
/// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes".
/// Link: <http://engineering.nyu.edu/~suel/papers/bmw.pdf>
pub fn block_wand(
mut scorers: Vec<TermScorer>,
pub fn block_wand<TPostings: PostingsWithBlockMax>(
mut scorers: Vec<TermScorer<TPostings>>,
mut threshold: Score,
callback: &mut dyn FnMut(u32, Score) -> Score,
) {
let mut scorers: Vec<TermScorerWithMaxScore> = scorers
let mut scorers: Vec<TermScorerWithMaxScore<TPostings>> = scorers
.iter_mut()
.map(TermScorerWithMaxScore::from)
.collect();
@@ -166,10 +174,7 @@ pub fn block_wand(
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.seek_block(pivot_doc);
scorer.block_max_score()
})
.map(|scorer| scorer.seek_block_max(pivot_doc))
.sum();
// Beware after shallow advance, skip readers can be in advance compared to
@@ -220,21 +225,22 @@ pub fn block_wand(
/// - On a block, advance until the end and execute `callback` when the doc score is greater or
/// equal to the `threshold`.
pub fn block_wand_single_scorer(
mut scorer: TermScorer,
mut scorer: TermScorer<impl PostingsWithBlockMax>,
mut threshold: Score,
callback: &mut dyn FnMut(u32, Score) -> Score,
) {
let mut doc = scorer.doc();
let mut block_max_score = scorer.seek_block_max(doc);
loop {
// We position the scorer on a block that can reach
// the threshold.
while scorer.block_max_score() < threshold {
while block_max_score < threshold {
let last_doc_in_block = scorer.last_doc_in_block();
if last_doc_in_block == TERMINATED {
return;
}
doc = last_doc_in_block + 1;
scorer.seek_block(doc);
block_max_score = scorer.seek_block_max(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
@@ -256,31 +262,33 @@ pub fn block_wand_single_scorer(
}
}
doc += 1;
scorer.seek_block(doc);
block_max_score = scorer.seek_block_max(doc);
}
}
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
struct TermScorerWithMaxScore<'a, TPostings: PostingsWithBlockMax> {
scorer: &'a mut TermScorer<TPostings>,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
impl<'a, TPostings: PostingsWithBlockMax> From<&'a mut TermScorer<TPostings>>
for TermScorerWithMaxScore<'a, TPostings>
{
fn from(scorer: &'a mut TermScorer<TPostings>) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl Deref for TermScorerWithMaxScore<'_> {
type Target = TermScorer;
impl<TPostings: PostingsWithBlockMax> Deref for TermScorerWithMaxScore<'_, TPostings> {
type Target = TermScorer<TPostings>;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl DerefMut for TermScorerWithMaxScore<'_> {
impl<TPostings: PostingsWithBlockMax> DerefMut for TermScorerWithMaxScore<'_, TPostings> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}

119
src/codec/postings/mod.rs Normal file
View File

@@ -0,0 +1,119 @@
use std::io;
/// Block-max WAND algorithm.
pub mod block_wand;
use common::OwnedBytes;
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::{Bm25Weight, Scorer};
use crate::schema::IndexRecordOption;
use crate::{DocId, Score};
/// Postings codec.
pub trait PostingsCodec: Send + Sync + 'static {
/// Serializer type for the postings codec.
type PostingsSerializer: PostingsSerializer;
/// Postings type for the postings codec.
type Postings: Postings + Clone;
/// Creates a new postings serializer.
fn new_serializer(
&self,
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> Self::PostingsSerializer;
/// Loads postings
///
/// Record option is the option that was passed at indexing time.
/// Requested option is the option that is requested.
///
/// For instance, we may have term_freq in the posting list
/// but we can skip decompressing as we read the posting list.
///
/// If record option does not support the requested option,
/// this method does NOT return an error and will in fact restrict
/// requested_option to what is available.
fn load_postings(
&self,
doc_freq: u32,
postings_data: OwnedBytes,
record_option: IndexRecordOption,
requested_option: IndexRecordOption,
positions_data: Option<OwnedBytes>,
) -> io::Result<Self::Postings>;
/// If your codec supports different ways to accelerate `for_each_pruning` that's
/// where you should implement it.
///
/// Returning `Err(scorer)` without mutating the scorer nor calling the callback function,
/// is never "wrong". It just leaves the responsability to the caller to call a fallback
/// implementation on the scorer.
///
/// If your codec supports BlockMax-Wand, you just need to have your
/// postings implement `PostingsWithBlockMax` and copy what is done in the StandardPostings
/// codec to enable it.
fn try_accelerated_for_each_pruning(
_threshold: Score,
scorer: Box<dyn Scorer>,
_callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> Result<(), Box<dyn Scorer>> {
Err(scorer)
}
}
/// A postings serializer is a listener that is in charge of serializing postings
///
/// IO is done only once per postings, once all of the data has been received.
/// A serializer will therefore contain internal buffers.
///
/// A serializer is created once and recycled for all postings.
///
/// Clients should use PostingsSerializer as follows.
/// ```
/// // First postings list
/// serializer.new_term(2, true);
/// serializer.write_doc(2, 1);
/// serializer.write_doc(6, 2);
/// serializer.close_term(3);
/// serializer.clear();
/// // Second postings list
/// serializer.new_term(1, true);
/// serializer.write_doc(3, 1);
/// serializer.close_term(3);
/// ```
pub trait PostingsSerializer {
/// The term_doc_freq here is the number of documents
/// in the postings lists.
///
/// It can be used to compute the idf that will be used for the
/// blockmax parameters.
///
/// If not available (e.g. if we do not collect `term_frequencies`
/// blockwand is disabled), the term_doc_freq passed will be set 0.
fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool);
/// Records a new document id for the current term.
/// The serializer may ignore it.
fn write_doc(&mut self, doc_id: DocId, term_freq: u32);
/// Closes the current term and writes the postings list associated.
fn close_term(&mut self, doc_freq: u32, wrt: &mut impl io::Write) -> io::Result<()>;
}
/// A light complement interface to Postings to allow block-max wand acceleration.
pub trait PostingsWithBlockMax: Postings {
/// Moves the postings to the block containign `target_doc` and returns
/// an upperbound of the score for documents in the block.
///
/// `Warning`: Calling this method may leave the postings in an invalid state.
/// callers are required to call seek before calling any other of the
/// `Postings` method (like doc / advance etc.).
fn seek_block_max(&mut self, target_doc: crate::DocId, similarity_weight: &Bm25Weight)
-> Score;
/// Returns the last document in the current block (or Terminated if this
/// is the last block).
fn last_doc_in_block(&self) -> crate::DocId;
}

35
src/codec/standard/mod.rs Normal file
View File

@@ -0,0 +1,35 @@
use serde::{Deserialize, Serialize};
use crate::codec::standard::postings::StandardPostingsCodec;
use crate::codec::Codec;
/// Tantivy's default postings codec.
pub mod postings;
/// Tantivy's default codec.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct StandardCodec;
impl Codec for StandardCodec {
type PostingsCodec = StandardPostingsCodec;
const NAME: &'static str = "standard";
fn from_json_props(json_value: &serde_json::Value) -> crate::Result<Self> {
if !json_value.is_null() {
return Err(crate::TantivyError::InvalidArgument(format!(
"Codec property for the StandardCodec are unexpected. expected null, got {}",
json_value.as_str().unwrap_or("null")
)));
}
Ok(StandardCodec)
}
fn to_json_props(&self) -> serde_json::Value {
serde_json::Value::Null
}
fn postings_codec(&self) -> &Self::PostingsCodec {
&StandardPostingsCodec
}
}

View File

@@ -0,0 +1,50 @@
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::DocId;
pub struct Block {
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
len: usize,
}
impl Block {
pub fn new() -> Self {
Block {
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
len: 0,
}
}
pub fn doc_ids(&self) -> &[DocId] {
&self.doc_ids[..self.len]
}
pub fn term_freqs(&self) -> &[u32] {
&self.term_freqs[..self.len]
}
pub fn clear(&mut self) {
self.len = 0;
}
pub fn append_doc(&mut self, doc: DocId, term_freq: u32) {
let len = self.len;
self.doc_ids[len] = doc;
self.term_freqs[len] = term_freq;
self.len = len + 1;
}
pub fn is_full(&self) -> bool {
self.len == COMPRESSION_BLOCK_SIZE
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn last_doc(&self) -> DocId {
assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
}
}

View File

@@ -1,28 +1,18 @@
use std::io;
use common::VInt;
use common::{OwnedBytes, VInt};
use crate::directory::{FileSlice, OwnedBytes};
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
use crate::codec::standard::postings::skip::{BlockInfo, SkipReader};
use crate::codec::standard::postings::FreqReadingOption;
use crate::postings::compression::{BlockDecoder, VIntDecoder as _, COMPRESSION_BLOCK_SIZE};
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
it.next().map(|first| it.fold(first, Score::max))
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
#[derive(Clone)]
pub struct BlockSegmentPostings {
pub(crate) struct BlockSegmentPostings {
pub(crate) doc_decoder: BlockDecoder,
block_loaded: bool,
freq_decoder: BlockDecoder,
@@ -88,7 +78,7 @@ fn split_into_skips_and_postings(
}
impl BlockSegmentPostings {
/// Opens a `BlockSegmentPostings`.
/// Opens a `StandardPostingsReader`.
/// `doc_freq` is the number of documents in the posting list.
/// `record_option` represents the amount of data available according to the schema.
/// `requested_option` is the amount of data requested by the user.
@@ -96,12 +86,10 @@ impl BlockSegmentPostings {
/// term frequency blocks.
pub(crate) fn open(
doc_freq: u32,
data: FileSlice,
bytes: OwnedBytes,
mut record_option: IndexRecordOption,
requested_option: IndexRecordOption,
seek_doc: DocId,
) -> io::Result<(BlockSegmentPostings, usize)> {
let bytes = data.read_bytes()?;
) -> io::Result<BlockSegmentPostings> {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
let skip_reader = match skip_data_opt {
Some(skip_data) => {
@@ -126,7 +114,7 @@ impl BlockSegmentPostings {
(_, _) => FreqReadingOption::ReadFreq,
};
let mut block_segment_postings: BlockSegmentPostings = BlockSegmentPostings {
let mut block_segment_postings = BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
block_loaded: false,
freq_decoder: BlockDecoder::with_val(1),
@@ -136,84 +124,12 @@ impl BlockSegmentPostings {
data: postings_data,
skip_reader,
};
let inner_pos = if seek_doc == 0 {
block_segment_postings.load_block();
0
} else {
block_segment_postings.seek(seek_doc)
};
Ok((block_segment_postings, inner_pos))
}
/// Returns the block_max_score for the current block.
/// It does not require the block to be loaded. For instance, it is ok to call this method
/// after having called `.shallow_advance(..)`.
///
/// See `TermScorer::block_max_score(..)` for more information.
pub fn block_max_score(
&mut self,
fieldnorm_reader: &FieldNormReader,
bm25_weight: &Bm25Weight,
) -> Score {
if let Some(score) = self.block_max_score_cache {
return score;
}
if let Some(skip_reader_max_score) = self.skip_reader.block_max_score(bm25_weight) {
// if we are on a full block, the skip reader should have the block max information
// for us
self.block_max_score_cache = Some(skip_reader_max_score);
return skip_reader_max_score;
}
// this is the last block of the segment posting list.
// If it is actually loaded, we can compute block max manually.
if self.block_is_loaded() {
let docs = self.doc_decoder.output_array().iter().cloned();
let freqs = self.freq_decoder.output_array().iter().cloned();
let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| {
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
bm25_weight.score(fieldnorm_id, term_freq)
});
let block_max_score = max_score(bm25_scores).unwrap_or(0.0);
self.block_max_score_cache = Some(block_max_score);
return block_max_score;
}
// We do not have access to any good block max value. We return bm25_weight.max_score()
// as it is a valid upperbound.
//
// We do not cache it however, so that it gets computed when once block is loaded.
bm25_weight.max_score()
}
pub(crate) fn freq_reading_option(&self) -> FreqReadingOption {
self.freq_reading_option
}
// Resets the block segment postings on another position
// in the postings file.
//
// This is useful for enumerating through a list of terms,
// and consuming the associated posting lists while avoiding
// reallocating a `BlockSegmentPostings`.
//
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) -> io::Result<()> {
let (skip_data_opt, postings_data) =
split_into_skips_and_postings(doc_freq, postings_data)?;
self.data = postings_data;
self.block_max_score_cache = None;
self.block_loaded = false;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq);
} else {
self.skip_reader.reset(OwnedBytes::empty(), doc_freq);
}
self.doc_freq = doc_freq;
self.load_block();
Ok(())
block_segment_postings.load_block();
Ok(block_segment_postings)
}
}
impl BlockSegmentPostings {
/// Returns the overall number of documents in the block postings.
/// It does not take in account whether documents are deleted or not.
///
@@ -229,7 +145,7 @@ impl BlockSegmentPostings {
/// returned by `.docs()` is empty.
#[inline]
pub fn docs(&self) -> &[DocId] {
debug_assert!(self.block_is_loaded());
debug_assert!(self.block_loaded);
self.doc_decoder.output_array()
}
@@ -242,37 +158,24 @@ impl BlockSegmentPostings {
/// Return the array of `term freq` in the block.
#[inline]
pub fn freqs(&self) -> &[u32] {
debug_assert!(self.block_is_loaded());
debug_assert!(self.block_loaded);
self.freq_decoder.output_array()
}
/// Return the frequency at index `idx` of the block.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
debug_assert!(self.block_is_loaded());
debug_assert!(self.block_loaded);
self.freq_decoder.output(idx)
}
/// Returns the length of the current block.
///
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
/// except the last block that may have a length
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
#[inline]
pub fn block_len(&self) -> usize {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_len
}
/// Position on a block that may contains `target_doc`, and returns the
/// position of the first document greater than or equal to `target_doc`
/// within that block.
/// Position on a block that may contains `target_doc`.
///
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last an incomplete VInt block.
pub fn seek(&mut self, target_doc: DocId) -> usize {
// Move to the block that might contain our document.
self.seek_block(target_doc);
self.seek_block_without_loading(target_doc);
self.load_block();
// At this point we are on the block that might contain our document.
@@ -289,32 +192,79 @@ impl BlockSegmentPostings {
doc
}
pub(crate) fn position_offset(&self) -> u64 {
pub fn position_offset(&self) -> u64 {
self.skip_reader.position_offset()
}
/// Advance to the next block.
pub fn advance(&mut self) {
self.skip_reader.advance();
self.block_loaded = false;
self.block_max_score_cache = None;
self.load_block();
}
/// Returns the block_max_score for the current block.
/// It does not require the block to be loaded. For instance, it is ok to call this method
/// after having called `.shallow_advance(..)`.
///
/// See `TermScorer::block_max_score(..)` for more information.
pub fn block_max_score(&mut self, bm25_weight: &Bm25Weight) -> Score {
if let Some(score) = self.block_max_score_cache {
return score;
}
if let Some(skip_reader_max_score) = self.skip_reader.block_max_score(bm25_weight) {
// if we are on a full block, the skip reader should have the block max information
// for us
self.block_max_score_cache = Some(skip_reader_max_score);
return skip_reader_max_score;
}
// We do not have access to any good block max value.
// It happens if this is the last block.
// We return bm25_weight.max_score() as it is a valid upperbound.
//
// We do not cache it however, so that it gets computed when once block is loaded.
bm25_weight.max_score()
}
}
impl BlockSegmentPostings {
/// Returns an empty segment postings object
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
block_loaded: true,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
block_max_score_cache: None,
doc_freq: 0,
data: OwnedBytes::empty(),
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
}
}
pub(crate) fn skip_reader(&self) -> &SkipReader {
&self.skip_reader
}
/// Dangerous API! This calls seeks the next block on the skip list,
/// but does not `.load_block()` afterwards.
///
/// `.load_block()` needs to be called manually afterwards.
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last an incomplete VInt block.
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
pub(crate) fn seek_block_without_loading(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
self.block_loaded = false;
}
}
pub(crate) fn block_is_loaded(&self) -> bool {
self.block_loaded
}
pub(crate) fn load_block(&mut self) {
let offset = self.skip_reader.byte_offset();
if self.block_is_loaded() {
if self.block_loaded {
return;
}
let offset = self.skip_reader.byte_offset();
match self.skip_reader.block_info() {
BlockInfo::BitPacked {
doc_num_bits,
@@ -359,68 +309,40 @@ impl BlockSegmentPostings {
}
self.block_loaded = true;
}
/// Advance to the next block.
pub fn advance(&mut self) {
self.skip_reader.advance();
self.block_loaded = false;
self.block_max_score_cache = None;
self.load_block();
}
/// Returns an empty segment postings object
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
block_loaded: true,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
block_max_score_cache: None,
doc_freq: 0,
data: OwnedBytes::empty(),
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
}
}
pub(crate) fn skip_reader(&self) -> &SkipReader {
&self.skip_reader
}
}
#[cfg(test)]
mod tests {
use common::HasLen;
use common::OwnedBytes;
use super::BlockSegmentPostings;
use crate::codec::postings::PostingsSerializer;
use crate::codec::standard::postings::segment_postings::SegmentPostings;
use crate::codec::standard::postings::StandardPostingsSerializer;
use crate::docset::{DocSet, TERMINATED};
use crate::index::Index;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::postings::Postings;
use crate::postings::SegmentPostings;
use crate::schema::{IndexRecordOption, Schema, Term, INDEXED};
use crate::DocId;
use crate::schema::IndexRecordOption;
#[test]
fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty();
assert_eq!(postings.doc(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.doc_freq(), 0);
assert_eq!(postings.len(), 0);
}
#[test]
fn test_empty_postings_doc_returns_terminated() {
let mut postings = SegmentPostings::empty();
assert_eq!(postings.doc(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
}
#[test]
fn test_empty_postings_doc_term_freq_returns_0() {
let postings = SegmentPostings::empty();
assert_eq!(postings.term_freq(), 1);
#[cfg(test)]
fn build_block_postings(docs: &[u32]) -> BlockSegmentPostings {
let doc_freq = docs.len() as u32;
let mut postings_serializer =
StandardPostingsSerializer::new(1.0f32, IndexRecordOption::Basic, None);
postings_serializer.new_term(docs.len() as u32, false);
for doc in docs {
postings_serializer.write_doc(*doc, 1u32);
}
let mut buffer: Vec<u8> = Vec::new();
postings_serializer
.close_term(doc_freq, &mut buffer)
.unwrap();
BlockSegmentPostings::open(
doc_freq,
OwnedBytes::new(buffer),
IndexRecordOption::Basic,
IndexRecordOption::Basic,
)
.unwrap()
}
#[test]
@@ -435,7 +357,7 @@ mod tests {
#[test]
fn test_block_segment_postings() -> crate::Result<()> {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>())?;
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000);
@@ -460,8 +382,8 @@ mod tests {
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids)?;
let mut docset = SegmentPostings::from_block_postings(block_segments, None, 0);
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(128), 129);
assert_eq!(docset.doc(), 129);
assert_eq!(docset.advance(), 130);
@@ -469,8 +391,8 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids)?;
let mut docset = SegmentPostings::from_block_postings(block_segments, None, 0);
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(129), 129);
assert_eq!(docset.doc(), 129);
assert_eq!(docset.advance(), 130);
@@ -478,8 +400,8 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids)?;
let mut docset = SegmentPostings::from_block_postings(block_segments, None, 0);
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.doc(), 0);
assert_eq!(docset.seek(131), TERMINATED);
assert_eq!(docset.doc(), TERMINATED);
@@ -487,38 +409,13 @@ mod tests {
Ok(())
}
fn build_block_postings(docs: &[DocId]) -> crate::Result<BlockSegmentPostings> {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64))?;
}
index_writer.add_document(doc!(int_field=>0u64))?;
last_doc = doc + 1;
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term)?.unwrap();
let block_postings = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
Ok(block_postings)
}
#[test]
fn test_block_segment_postings_seek() -> crate::Result<()> {
let mut docs = vec![0];
let mut docs = Vec::new();
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..])?;
let mut block_postings = build_block_postings(&docs[..]);
for i in &[0, 424, 10000] {
block_postings.seek(*i);
let docs = block_postings.docs();
@@ -529,40 +426,4 @@ mod tests {
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
Ok(())
}
#[test]
fn test_reset_block_segment_postings() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
// create two postings list, one containing even number,
// the other containing odd numbers.
for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc)?;
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let mut block_segments;
{
let term = Term::from_field_u64(int_field, 0u64);
let inverted_index = segment_reader.inverted_index(int_field)?;
let term_info = inverted_index.get_term_info(&term)?.unwrap();
block_segments = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
}
assert_eq!(block_segments.docs(), &[0, 2, 4]);
{
let term = Term::from_field_u64(int_field, 1u64);
let inverted_index = segment_reader.inverted_index(int_field)?;
let term_info = inverted_index.get_term_info(&term)?.unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?;
}
assert_eq!(block_segments.docs(), &[1, 3, 5]);
Ok(())
}
}

View File

@@ -0,0 +1,107 @@
use std::io;
use crate::codec::postings::block_wand::{block_wand, block_wand_single_scorer};
use crate::codec::postings::PostingsCodec;
use crate::codec::standard::postings::block_segment_postings::BlockSegmentPostings;
use crate::codec::standard::postings::segment_postings::SegmentPostings;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionReader;
use crate::query::term_query::TermScorer;
use crate::query::{BufferedUnionScorer, Scorer, SumCombiner};
use crate::schema::IndexRecordOption;
use crate::{DocSet as _, Score, TERMINATED};
mod block;
mod block_segment_postings;
mod segment_postings;
mod skip;
mod standard_postings_serializer;
pub use segment_postings::SegmentPostings as StandardPostings;
pub use standard_postings_serializer::StandardPostingsSerializer;
/// The default postings codec for tantivy.
pub struct StandardPostingsCodec;
#[expect(clippy::enum_variant_names)]
#[derive(Debug, PartialEq, Clone, Copy, Eq)]
pub(crate) enum FreqReadingOption {
NoFreq,
SkipFreq,
ReadFreq,
}
impl PostingsCodec for StandardPostingsCodec {
type PostingsSerializer = StandardPostingsSerializer;
type Postings = SegmentPostings;
fn new_serializer(
&self,
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> Self::PostingsSerializer {
StandardPostingsSerializer::new(avg_fieldnorm, mode, fieldnorm_reader)
}
fn load_postings(
&self,
doc_freq: u32,
postings_data: common::OwnedBytes,
record_option: IndexRecordOption,
requested_option: IndexRecordOption,
positions_data_opt: Option<common::OwnedBytes>,
) -> io::Result<Self::Postings> {
// Rationalize record_option/requested_option.
let requested_option = requested_option.downgrade(record_option);
let block_segment_postings =
BlockSegmentPostings::open(doc_freq, postings_data, record_option, requested_option)?;
let position_reader = positions_data_opt.map(PositionReader::open).transpose()?;
Ok(SegmentPostings::from_block_postings(
block_segment_postings,
position_reader,
))
}
fn try_accelerated_for_each_pruning(
mut threshold: Score,
mut scorer: Box<dyn Scorer>,
callback: &mut dyn FnMut(crate::DocId, Score) -> Score,
) -> Result<(), Box<dyn Scorer>> {
scorer = match scorer.downcast::<TermScorer<Self::Postings>>() {
Ok(term_scorer) => {
block_wand_single_scorer(*term_scorer, threshold, callback);
return Ok(());
}
Err(scorer) => scorer,
};
let mut union_scorer =
scorer.downcast::<BufferedUnionScorer<Box<dyn Scorer>, SumCombiner>>()?;
if !union_scorer
.scorers()
.iter()
.all(|scorer| scorer.is::<TermScorer<Self::Postings>>())
{
return Err(union_scorer);
}
let doc = union_scorer.doc();
if doc == TERMINATED {
return Ok(());
}
let score = union_scorer.score();
if score > threshold {
threshold = callback(doc, score);
}
let boxed_scorers: Vec<Box<dyn Scorer>> = union_scorer.into_scorers();
let scorers: Vec<TermScorer<Self::Postings>> = boxed_scorers
.into_iter()
.map(|scorer| {
*scorer.downcast::<TermScorer<Self::Postings>>().ok().expect(
"Downcast failed despite the fact we already checked the type was correct",
)
})
.collect();
block_wand(scorers, threshold, callback);
Ok(())
}
}

View File

@@ -1,11 +1,13 @@
use common::HasLen;
use common::{BitSet, HasLen};
use super::BlockSegmentPostings;
use crate::codec::postings::PostingsWithBlockMax;
use crate::docset::DocSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{BlockSegmentPostings, Postings};
use crate::{DocId, TERMINATED};
use crate::postings::{DocFreq, Postings};
use crate::query::Bm25Weight;
use crate::{DocId, Score};
/// `SegmentPostings` represents the inverted list or postings associated with
/// a term in a `Segment`.
@@ -29,31 +31,6 @@ impl SegmentPostings {
}
}
/// Compute the number of non-deleted documents.
///
/// This method will clone and scan through the posting lists.
/// (this is a rather expensive operation).
pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 {
let mut docset = self.clone();
let mut doc_freq = 0;
loop {
let doc = docset.doc();
if doc == TERMINATED {
return doc_freq;
}
if alive_bitset.is_alive(doc) {
doc_freq += 1u32;
}
docset.advance();
}
}
/// Returns the overall number of documents in the block postings.
/// It does not take in account whether documents are deleted or not.
pub fn doc_freq(&self) -> u32 {
self.block_cursor.doc_freq()
}
/// Creates a segment postings object with the given documents
/// and no frequency encoded.
///
@@ -64,30 +41,35 @@ impl SegmentPostings {
/// buffer with the serialized data.
#[cfg(test)]
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
use crate::directory::FileSlice;
use crate::postings::serializer::PostingsSerializer;
use common::OwnedBytes;
use crate::schema::IndexRecordOption;
let mut buffer = Vec::new();
{
use crate::codec::postings::PostingsSerializer;
let mut postings_serializer =
PostingsSerializer::new(&mut buffer, 0.0, IndexRecordOption::Basic, None);
crate::codec::standard::postings::StandardPostingsSerializer::new(
0.0,
IndexRecordOption::Basic,
None,
);
postings_serializer.new_term(docs.len() as u32, false);
for &doc in docs {
postings_serializer.write_doc(doc, 1u32);
}
postings_serializer
.close_term(docs.len() as u32)
.close_term(docs.len() as u32, &mut buffer)
.expect("In memory Serialization should never fail.");
}
let (block_segment_postings, position_within_block) = BlockSegmentPostings::open(
let block_segment_postings = BlockSegmentPostings::open(
docs.len() as u32,
FileSlice::from(buffer),
OwnedBytes::new(buffer),
IndexRecordOption::Basic,
IndexRecordOption::Basic,
0u32,
)
.unwrap();
SegmentPostings::from_block_postings(block_segment_postings, None, position_within_block)
SegmentPostings::from_block_postings(block_segment_postings, None)
}
/// Helper functions to create `SegmentPostings` for tests.
@@ -96,9 +78,11 @@ impl SegmentPostings {
doc_and_tfs: &[(u32, u32)],
fieldnorms: Option<&[u32]>,
) -> SegmentPostings {
use crate::directory::FileSlice;
use common::OwnedBytes;
use crate::codec::postings::PostingsSerializer as _;
use crate::codec::standard::postings::StandardPostingsSerializer;
use crate::fieldnorm::FieldNormReader;
use crate::postings::serializer::PostingsSerializer;
use crate::schema::IndexRecordOption;
use crate::Score;
let mut buffer: Vec<u8> = Vec::new();
@@ -115,8 +99,7 @@ impl SegmentPostings {
total_num_tokens as Score / fieldnorms.len() as Score
})
.unwrap_or(0.0);
let mut postings_serializer = PostingsSerializer::new(
&mut buffer,
let mut postings_serializer = StandardPostingsSerializer::new(
average_field_norm,
IndexRecordOption::WithFreqs,
fieldnorm_reader,
@@ -126,31 +109,30 @@ impl SegmentPostings {
postings_serializer.write_doc(doc, tf);
}
postings_serializer
.close_term(doc_and_tfs.len() as u32)
.close_term(doc_and_tfs.len() as u32, &mut buffer)
.unwrap();
let (block_segment_postings, position_within_block) = BlockSegmentPostings::open(
let block_segment_postings = BlockSegmentPostings::open(
doc_and_tfs.len() as u32,
FileSlice::from(buffer),
OwnedBytes::new(buffer),
IndexRecordOption::WithFreqs,
IndexRecordOption::WithFreqs,
0u32,
)
.unwrap();
SegmentPostings::from_block_postings(block_segment_postings, None, position_within_block)
SegmentPostings::from_block_postings(block_segment_postings, None)
}
/// Creates a Segment Postings from a
/// - `BlockSegmentPostings`,
/// - a position reader
/// - a target document to seek to
/// Reads a Segment postings from an &[u8]
///
/// * `len` - number of document in the posting lists.
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding frequencies and/or positions
pub(crate) fn from_block_postings(
segment_block_postings: BlockSegmentPostings,
position_reader: Option<PositionReader>,
position_within_block: usize,
) -> SegmentPostings {
SegmentPostings {
block_cursor: segment_block_postings,
cur: position_within_block,
cur: 0, // cursor within the block
position_reader,
}
}
@@ -161,7 +143,6 @@ impl DocSet for SegmentPostings {
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> DocId {
debug_assert!(self.block_cursor.block_is_loaded());
if self.cur == COMPRESSION_BLOCK_SIZE - 1 {
self.cur = 0;
self.block_cursor.advance();
@@ -194,6 +175,19 @@ impl DocSet for SegmentPostings {
fn size_hint(&self) -> u32 {
self.len() as u32
}
fn fill_bitset(&mut self, bitset: &mut BitSet) {
loop {
let docs = self.block_cursor.docs();
if docs.is_empty() {
break;
}
for &doc in docs {
bitset.insert(doc);
}
self.block_cursor.advance();
}
}
}
impl HasLen for SegmentPostings {
@@ -209,7 +203,7 @@ impl Postings for SegmentPostings {
///
/// # Panics
///
/// Will panics if called without having called advance before.
/// Will panics if called without having cagled advance before.
fn term_freq(&self) -> u32 {
debug_assert!(
// Here we do not use the len of `freqs()`
@@ -224,6 +218,13 @@ impl Postings for SegmentPostings {
self.block_cursor.freq(self.cur)
}
/// Returns the overall number of documents in the block postings.
/// It does not take in account whether documents are deleted or not.
#[inline(always)]
fn doc_freq(&self) -> DocFreq {
DocFreq::Exact(self.block_cursor.doc_freq())
}
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
let term_freq = self.term_freq();
let prev_len = output.len();
@@ -247,6 +248,25 @@ impl Postings for SegmentPostings {
}
}
}
fn has_freq(&self) -> bool {
!self.block_cursor.freqs().is_empty()
}
}
impl PostingsWithBlockMax for SegmentPostings {
fn seek_block_max(
&mut self,
target_doc: crate::DocId,
similarity_weight: &Bm25Weight,
) -> Score {
self.block_cursor.seek_block_without_loading(target_doc);
self.block_cursor.block_max_score(similarity_weight)
}
fn last_doc_in_block(&self) -> crate::DocId {
self.block_cursor.skip_reader().last_doc_in_block()
}
}
#[cfg(test)]
@@ -256,14 +276,15 @@ mod tests {
use super::SegmentPostings;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::AliveBitSet;
use crate::postings::postings::Postings;
use crate::postings::Postings;
#[test]
fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty();
assert_eq!(postings.doc(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.doc_freq(), crate::postings::DocFreq::Exact(0));
assert_eq!(postings.len(), 0);
}
@@ -279,15 +300,4 @@ mod tests {
let postings = SegmentPostings::empty();
assert_eq!(postings.term_freq(), 1);
}
#[test]
fn test_doc_freq() {
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
assert_eq!(docs.doc_freq(), 3);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2);
let all_deleted =
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
}
}

View File

@@ -6,17 +6,21 @@ use crate::{DocId, Score, TERMINATED};
// doc num bits uses the following encoding:
// given 0b a b cdefgh
// |1|2| 3 |
// |1|2|3| 4 |
// - 1: unused
// - 2: is delta-1 encoded. 0 if not, 1, if yes
// - 3: a 6 bit number in 0..=32, the actual bitwidth
// - 3: unused
// - 4: a 5 bit number in 0..32, the actual bitwidth. Bitpacking could in theory say this is 32
// (requiring a 6th bit), but the biggest doc_id we can want to encode is TERMINATED-1, which can
// be represented on 31b without delta encoding.
fn encode_bitwidth(bitwidth: u8, delta_1: bool) -> u8 {
assert!(bitwidth < 32);
bitwidth | ((delta_1 as u8) << 6)
}
fn decode_bitwidth(raw_bitwidth: u8) -> (u8, bool) {
let delta_1 = ((raw_bitwidth >> 6) & 1) != 0;
let bitwidth = raw_bitwidth & 0x3f;
let bitwidth = raw_bitwidth & 0x1f;
(bitwidth, delta_1)
}
@@ -138,23 +142,6 @@ impl SkipReader {
skip_reader
}
pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) {
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
} else {
TERMINATED
};
self.last_doc_in_previous_block = 0u32;
self.owned_read = data;
self.block_info = BlockInfo::VInt { num_docs: doc_freq };
self.byte_offset = 0;
self.remaining_docs = doc_freq;
self.position_offset = 0u64;
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
}
}
// Returns the block max score for this block if available.
//
// The block max score is available for all full bitpacked block,
@@ -430,7 +417,7 @@ mod tests {
#[test]
fn test_encode_decode_bitwidth() {
for bitwidth in 0..=32 {
for bitwidth in 0..32 {
for delta_1 in [false, true] {
assert_eq!(
(bitwidth, delta_1),

View File

@@ -0,0 +1,185 @@
use std::cmp::Ordering;
use std::io::{self, Write as _};
use common::{BinarySerializable as _, VInt};
use crate::codec::postings::PostingsSerializer;
use crate::codec::standard::postings::block::Block;
use crate::codec::standard::postings::skip::SkipSerializer;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{BlockEncoder, VIntEncoder as _, COMPRESSION_BLOCK_SIZE};
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score};
/// Default tantivy postings codec serializer.
pub struct StandardPostingsSerializer {
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
block: Box<Block>,
postings_write: Vec<u8>,
skip_write: SkipSerializer,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<Bm25Weight>,
avg_fieldnorm: Score, /* Average number of term in the field for that segment.
* this value is used to compute the block wand information. */
term_has_freq: bool,
}
impl StandardPostingsSerializer {
/// Creates a new instance of `StandardPostingsSerializer`.
pub fn new(
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> StandardPostingsSerializer {
Self {
last_doc_id_encoded: 0,
block_encoder: BlockEncoder::new(),
block: Box::new(Block::new()),
postings_write: Vec::new(),
skip_write: SkipSerializer::new(),
mode,
fieldnorm_reader,
bm25_weight: None,
avg_fieldnorm,
term_has_freq: false,
}
}
}
impl PostingsSerializer for StandardPostingsSerializer {
fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool) {
self.clear();
self.term_has_freq = self.mode.has_freq() && record_term_freq;
if !self.term_has_freq {
return;
}
let num_docs_in_segment: u64 =
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
fieldnorm_reader.num_docs() as u64
} else {
return;
};
if num_docs_in_segment == 0 {
return;
}
self.bm25_weight = Some(Bm25Weight::for_one_term_without_explain(
term_doc_freq as u64,
num_docs_in_segment,
self.avg_fieldnorm,
));
}
fn write_doc(&mut self, doc_id: DocId, term_freq: u32) {
self.block.append_doc(doc_id, term_freq);
if self.block.is_full() {
self.write_block();
}
}
fn close_term(&mut self, doc_freq: u32, output_write: &mut impl io::Write) -> io::Result<()> {
if !self.block.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self
.block_encoder
.compress_vint_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
if self.term_has_freq {
let block_encoded = self
.block_encoder
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
self.block.clear();
}
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(output_write)?;
output_write.write_all(skip_data)?;
}
output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;
Ok(())
}
}
impl StandardPostingsSerializer {
fn clear(&mut self) {
self.bm25_weight = None;
self.block.clear();
self.last_doc_id_encoded = 0;
}
fn write_block(&mut self) {
{
// encode the doc ids
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write
.write_doc(self.last_doc_id_encoded, num_bits);
// last el block 0, offset block 1,
self.postings_write.extend(block_encoded);
}
if self.term_has_freq {
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(self.block.term_freqs(), true);
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.mode.has_positions() {
// We serialize the sum of term freqs within the skip information
// in order to navigate through positions.
let sum_freq = self.block.term_freqs().iter().cloned().sum();
self.skip_write.write_total_term_freq(sum_freq);
}
let mut blockwand_params = (0u8, 0u32);
if let Some(bm25_weight) = self.bm25_weight.as_ref() {
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
let docs = self.block.doc_ids().iter().cloned();
let term_freqs = self.block.term_freqs().iter().cloned();
let fieldnorms = docs.map(|doc| fieldnorm_reader.fieldnorm_id(doc));
blockwand_params = fieldnorms
.zip(term_freqs)
.max_by(
|(left_fieldnorm_id, left_term_freq),
(right_fieldnorm_id, right_term_freq)| {
let left_score =
bm25_weight.tf_factor(*left_fieldnorm_id, *left_term_freq);
let right_score =
bm25_weight.tf_factor(*right_fieldnorm_id, *right_term_freq);
left_score
.partial_cmp(&right_score)
.unwrap_or(Ordering::Equal)
},
)
.unwrap();
}
}
let (fieldnorm_id, term_freq) = blockwand_params;
self.skip_write.write_blockwand_max(fieldnorm_id, term_freq);
}
self.block.clear();
}
}

View File

@@ -1,10 +1,12 @@
mod order;
mod sort_by_erased_type;
mod sort_by_score;
mod sort_by_static_fast_value;
mod sort_by_string;
mod sort_key_computer;
pub use order::*;
pub use sort_by_erased_type::SortByErasedType;
pub use sort_by_score::SortBySimilarityScore;
pub use sort_by_static_fast_value::SortByStaticFastValue;
pub use sort_by_string::SortByString;
@@ -34,11 +36,13 @@ pub(crate) mod tests {
use std::collections::HashMap;
use std::ops::Range;
use crate::collector::sort_key::{SortBySimilarityScore, SortByStaticFastValue, SortByString};
use crate::collector::sort_key::{
SortByErasedType, SortBySimilarityScore, SortByStaticFastValue, SortByString,
};
use crate::collector::{ComparableDoc, DocSetCollector, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Schema, FAST, TEXT};
use crate::schema::{OwnedValue, Schema, FAST, TEXT};
use crate::{DocAddress, Document, Index, Order, Score, Searcher};
fn make_index() -> crate::Result<Index> {
@@ -313,11 +317,9 @@ pub(crate) mod tests {
(SortBySimilarityScore, score_order),
(SortByString::for_field("city"), city_order),
));
Ok(searcher
.search(&AllQuery, &top_collector)?
.into_iter()
.map(|(f, doc)| (f, ids[&doc]))
.collect())
let results: Vec<((Score, Option<String>), DocAddress)> =
searcher.search(&AllQuery, &top_collector)?;
Ok(results.into_iter().map(|(f, doc)| (f, ids[&doc])).collect())
}
assert_eq!(
@@ -342,6 +344,51 @@ pub(crate) mod tests {
Ok(())
}
#[test]
fn test_order_by_score_then_owned_value() -> crate::Result<()> {
let index = make_index()?;
type SortKey = (Score, OwnedValue);
fn query(
index: &Index,
score_order: Order,
city_order: Order,
) -> crate::Result<Vec<(SortKey, u64)>> {
let searcher = index.reader()?.searcher();
let ids = id_mapping(&searcher);
let top_collector = TopDocs::with_limit(4).order_by::<(Score, OwnedValue)>((
(SortBySimilarityScore, score_order),
(SortByErasedType::for_field("city"), city_order),
));
let results: Vec<((Score, OwnedValue), DocAddress)> =
searcher.search(&AllQuery, &top_collector)?;
Ok(results.into_iter().map(|(f, doc)| (f, ids[&doc])).collect())
}
assert_eq!(
&query(&index, Order::Asc, Order::Asc)?,
&[
((1.0, OwnedValue::Str("austin".to_owned())), 0),
((1.0, OwnedValue::Str("greenville".to_owned())), 1),
((1.0, OwnedValue::Str("tokyo".to_owned())), 2),
((1.0, OwnedValue::Null), 3),
]
);
assert_eq!(
&query(&index, Order::Asc, Order::Desc)?,
&[
((1.0, OwnedValue::Str("tokyo".to_owned())), 2),
((1.0, OwnedValue::Str("greenville".to_owned())), 1),
((1.0, OwnedValue::Str("austin".to_owned())), 0),
((1.0, OwnedValue::Null), 3),
]
);
Ok(())
}
use proptest::prelude::*;
proptest! {

View File

@@ -1,11 +1,70 @@
use std::cmp::Ordering;
use columnar::MonotonicallyMappableToU64;
use serde::{Deserialize, Serialize};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::schema::Schema;
use crate::schema::{OwnedValue, Schema};
use crate::{DocId, Order, Score};
fn compare_owned_value<const NULLS_FIRST: bool>(lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
match (lhs, rhs) {
(OwnedValue::Null, OwnedValue::Null) => Ordering::Equal,
(OwnedValue::Null, _) => {
if NULLS_FIRST {
Ordering::Less
} else {
Ordering::Greater
}
}
(_, OwnedValue::Null) => {
if NULLS_FIRST {
Ordering::Greater
} else {
Ordering::Less
}
}
(OwnedValue::Str(a), OwnedValue::Str(b)) => a.cmp(b),
(OwnedValue::PreTokStr(a), OwnedValue::PreTokStr(b)) => a.cmp(b),
(OwnedValue::U64(a), OwnedValue::U64(b)) => a.cmp(b),
(OwnedValue::I64(a), OwnedValue::I64(b)) => a.cmp(b),
(OwnedValue::F64(a), OwnedValue::F64(b)) => a.to_u64().cmp(&b.to_u64()),
(OwnedValue::Bool(a), OwnedValue::Bool(b)) => a.cmp(b),
(OwnedValue::Date(a), OwnedValue::Date(b)) => a.cmp(b),
(OwnedValue::Facet(a), OwnedValue::Facet(b)) => a.cmp(b),
(OwnedValue::Bytes(a), OwnedValue::Bytes(b)) => a.cmp(b),
(OwnedValue::IpAddr(a), OwnedValue::IpAddr(b)) => a.cmp(b),
(OwnedValue::U64(a), OwnedValue::I64(b)) => {
if *b < 0 {
Ordering::Greater
} else {
a.cmp(&(*b as u64))
}
}
(OwnedValue::I64(a), OwnedValue::U64(b)) => {
if *a < 0 {
Ordering::Less
} else {
(*a as u64).cmp(b)
}
}
(OwnedValue::U64(a), OwnedValue::F64(b)) => (*a as f64).to_u64().cmp(&b.to_u64()),
(OwnedValue::F64(a), OwnedValue::U64(b)) => a.to_u64().cmp(&(*b as f64).to_u64()),
(OwnedValue::I64(a), OwnedValue::F64(b)) => (*a as f64).to_u64().cmp(&b.to_u64()),
(OwnedValue::F64(a), OwnedValue::I64(b)) => a.to_u64().cmp(&(*b as f64).to_u64()),
(a, b) => {
let ord = a.discriminant_value().cmp(&b.discriminant_value());
// If the discriminant is equal, it's because a new type was added, but hasn't been
// included in this `match` statement.
assert!(
ord != Ordering::Equal,
"Unimplemented comparison for type of {a:?}, {b:?}"
);
ord
}
}
}
/// Comparator trait defining the order in which documents should be ordered.
pub trait Comparator<T>: Send + Sync + std::fmt::Debug + Default {
/// Return the order between two values.
@@ -25,7 +84,18 @@ pub struct NaturalComparator;
impl<T: PartialOrd> Comparator<T> for NaturalComparator {
#[inline(always)]
fn compare(&self, lhs: &T, rhs: &T) -> Ordering {
lhs.partial_cmp(rhs).unwrap()
lhs.partial_cmp(rhs).unwrap_or(Ordering::Equal)
}
}
/// A (partial) implementation of comparison for OwnedValue.
///
/// Intended for use within columns of homogenous types, and so will panic for OwnedValues with
/// mismatched types. The one exception is Null, for which we do define all comparisons.
impl Comparator<OwnedValue> for NaturalComparator {
#[inline(always)]
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ true>(lhs, rhs)
}
}
@@ -121,6 +191,13 @@ impl Comparator<String> for ReverseNoneIsLowerComparator {
}
}
impl Comparator<OwnedValue> for ReverseNoneIsLowerComparator {
#[inline(always)]
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ false>(rhs, lhs)
}
}
/// Compare values naturally, but treating `None` as higher than `Some`.
///
/// When used with `TopDocs`, which reverses the order, this results in a
@@ -185,6 +262,13 @@ impl Comparator<String> for NaturalNoneIsHigherComparator {
}
}
impl Comparator<OwnedValue> for NaturalNoneIsHigherComparator {
#[inline(always)]
fn compare(&self, lhs: &OwnedValue, rhs: &OwnedValue) -> Ordering {
compare_owned_value::</* NULLS_FIRST= */ false>(lhs, rhs)
}
}
/// An enum representing the different sort orders.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Default)]
pub enum ComparatorEnum {
@@ -404,11 +488,12 @@ impl<TSegmentSortKeyComputer, TSegmentSortKey, TComparator> SegmentSortKeyComput
for SegmentSortKeyComputerWithComparator<TSegmentSortKeyComputer, TComparator>
where
TSegmentSortKeyComputer: SegmentSortKeyComputer<SegmentSortKey = TSegmentSortKey>,
TSegmentSortKey: PartialOrd + Clone + 'static + Sync + Send,
TSegmentSortKey: Clone + 'static + Sync + Send,
TComparator: Comparator<TSegmentSortKey> + 'static + Sync + Send,
{
type SortKey = TSegmentSortKeyComputer::SortKey;
type SegmentSortKey = TSegmentSortKey;
type SegmentComparator = TComparator;
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
self.segment_sort_key_computer.segment_sort_key(doc, score)
@@ -432,6 +517,7 @@ where
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::OwnedValue;
#[test]
fn test_natural_none_is_higher() {
@@ -455,4 +541,27 @@ mod tests {
// compare(None, None) should be Equal.
assert_eq!(comp.compare(&null, &null), Ordering::Equal);
}
#[test]
fn test_mixed_ownedvalue_compare() {
let u = OwnedValue::U64(10);
let i = OwnedValue::I64(10);
let f = OwnedValue::F64(10.0);
let nc = NaturalComparator;
assert_eq!(nc.compare(&u, &i), Ordering::Equal);
assert_eq!(nc.compare(&u, &f), Ordering::Equal);
assert_eq!(nc.compare(&i, &f), Ordering::Equal);
let u2 = OwnedValue::U64(11);
assert_eq!(nc.compare(&u2, &f), Ordering::Greater);
let s = OwnedValue::Str("a".to_string());
// Str < U64
assert_eq!(nc.compare(&s, &u), Ordering::Less);
// Str < I64
assert_eq!(nc.compare(&s, &i), Ordering::Less);
// Str < F64
assert_eq!(nc.compare(&s, &f), Ordering::Less);
}
}

View File

@@ -0,0 +1,361 @@
use columnar::{ColumnType, MonotonicallyMappableToU64};
use crate::collector::sort_key::{
NaturalComparator, SortBySimilarityScore, SortByStaticFastValue, SortByString,
};
use crate::collector::{SegmentSortKeyComputer, SortKeyComputer};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::OwnedValue;
use crate::{DateTime, DocId, Score};
/// Sort by the boxed / OwnedValue representation of either a fast field, or of the score.
///
/// Using the OwnedValue representation allows for type erasure, and can be useful when sort orders
/// are not known until runtime. But it comes with a performance cost: wherever possible, prefer to
/// use a SortKeyComputer implementation with a known-type at compile time.
#[derive(Debug, Clone)]
pub enum SortByErasedType {
/// Sort by a fast field
Field(String),
/// Sort by score
Score,
}
impl SortByErasedType {
/// Creates a new sort key computer which will sort by the given fast field column, with type
/// erasure.
pub fn for_field(column_name: impl ToString) -> Self {
Self::Field(column_name.to_string())
}
/// Creates a new sort key computer which will sort by score, with type erasure.
pub fn for_score() -> Self {
Self::Score
}
}
trait ErasedSegmentSortKeyComputer: Send + Sync {
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64>;
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue;
}
struct ErasedSegmentSortKeyComputerWrapper<C, F> {
inner: C,
converter: F,
}
impl<C, F> ErasedSegmentSortKeyComputer for ErasedSegmentSortKeyComputerWrapper<C, F>
where
C: SegmentSortKeyComputer<SegmentSortKey = Option<u64>> + Send + Sync,
F: Fn(C::SortKey) -> OwnedValue + Send + Sync + 'static,
{
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
self.inner.segment_sort_key(doc, score)
}
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
let val = self.inner.convert_segment_sort_key(sort_key);
(self.converter)(val)
}
}
struct ScoreSegmentSortKeyComputer {
segment_computer: SortBySimilarityScore,
}
impl ErasedSegmentSortKeyComputer for ScoreSegmentSortKeyComputer {
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
let score_value: f64 = self.segment_computer.segment_sort_key(doc, score).into();
Some(score_value.to_u64())
}
fn convert_segment_sort_key(&self, sort_key: Option<u64>) -> OwnedValue {
let score_value: u64 = sort_key.expect("This implementation always produces a score.");
OwnedValue::F64(f64::from_u64(score_value))
}
}
impl SortKeyComputer for SortByErasedType {
type SortKey = OwnedValue;
type Child = ErasedColumnSegmentSortKeyComputer;
type Comparator = NaturalComparator;
fn requires_scoring(&self) -> bool {
matches!(self, Self::Score)
}
fn segment_sort_key_computer(
&self,
segment_reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let inner: Box<dyn ErasedSegmentSortKeyComputer> = match self {
Self::Field(column_name) => {
let fast_fields = segment_reader.fast_fields();
// TODO: We currently double-open the column to avoid relying on the implementation
// details of `SortByString` or `SortByStaticFastValue`. Once
// https://github.com/quickwit-oss/tantivy/issues/2776 is resolved, we should
// consider directly constructing the appropriate `SegmentSortKeyComputer` type for
// the column that we open here.
let (_column, column_type) =
fast_fields.u64_lenient(column_name)?.ok_or_else(|| {
FastFieldNotAvailableError {
field_name: column_name.to_owned(),
}
})?;
match column_type {
ColumnType::Str => {
let computer = SortByString::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<String>| {
val.map(OwnedValue::Str).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::U64 => {
let computer = SortByStaticFastValue::<u64>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<u64>| {
val.map(OwnedValue::U64).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::I64 => {
let computer = SortByStaticFastValue::<i64>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<i64>| {
val.map(OwnedValue::I64).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::F64 => {
let computer = SortByStaticFastValue::<f64>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<f64>| {
val.map(OwnedValue::F64).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::Bool => {
let computer = SortByStaticFastValue::<bool>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<bool>| {
val.map(OwnedValue::Bool).unwrap_or(OwnedValue::Null)
},
})
}
ColumnType::DateTime => {
let computer = SortByStaticFastValue::<DateTime>::for_field(column_name);
let inner = computer.segment_sort_key_computer(segment_reader)?;
Box::new(ErasedSegmentSortKeyComputerWrapper {
inner,
converter: |val: Option<DateTime>| {
val.map(OwnedValue::Date).unwrap_or(OwnedValue::Null)
},
})
}
column_type => {
return Err(crate::TantivyError::SchemaError(format!(
"Field `{}` is of type {column_type:?}, which is not supported for \
sorting by owned value yet.",
column_name
)))
}
}
}
Self::Score => Box::new(ScoreSegmentSortKeyComputer {
segment_computer: SortBySimilarityScore,
}),
};
Ok(ErasedColumnSegmentSortKeyComputer { inner })
}
}
pub struct ErasedColumnSegmentSortKeyComputer {
inner: Box<dyn ErasedSegmentSortKeyComputer>,
}
impl SegmentSortKeyComputer for ErasedColumnSegmentSortKeyComputer {
type SortKey = OwnedValue;
type SegmentSortKey = Option<u64>;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Option<u64> {
self.inner.segment_sort_key(doc, score)
}
fn convert_segment_sort_key(&self, segment_sort_key: Self::SegmentSortKey) -> OwnedValue {
self.inner.convert_segment_sort_key(segment_sort_key)
}
}
#[cfg(test)]
mod tests {
use crate::collector::sort_key::{ComparatorEnum, SortByErasedType};
use crate::collector::TopDocs;
use crate::query::AllQuery;
use crate::schema::{OwnedValue, Schema, FAST, TEXT};
use crate::Index;
#[test]
fn test_sort_by_owned_u64() {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(id_field => 10u64)).unwrap();
writer.add_document(doc!(id_field => 2u64)).unwrap();
writer.add_document(doc!()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let collector = TopDocs::with_limit(10)
.order_by((SortByErasedType::for_field("id"), ComparatorEnum::Natural));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![OwnedValue::U64(10), OwnedValue::U64(2), OwnedValue::Null]
);
let collector = TopDocs::with_limit(10).order_by((
SortByErasedType::for_field("id"),
ComparatorEnum::ReverseNoneLower,
));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![OwnedValue::U64(2), OwnedValue::U64(10), OwnedValue::Null]
);
}
#[test]
fn test_sort_by_owned_string() {
let mut schema_builder = Schema::builder();
let city_field = schema_builder.add_text_field("city", FAST | TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(city_field => "tokyo")).unwrap();
writer.add_document(doc!(city_field => "austin")).unwrap();
writer.add_document(doc!()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let collector = TopDocs::with_limit(10).order_by((
SortByErasedType::for_field("city"),
ComparatorEnum::ReverseNoneLower,
));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![
OwnedValue::Str("austin".to_string()),
OwnedValue::Str("tokyo".to_string()),
OwnedValue::Null
]
);
}
#[test]
fn test_sort_by_owned_reverse() {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(id_field => 10u64)).unwrap();
writer.add_document(doc!(id_field => 2u64)).unwrap();
writer.add_document(doc!()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let collector = TopDocs::with_limit(10)
.order_by((SortByErasedType::for_field("id"), ComparatorEnum::Reverse));
let top_docs = searcher.search(&AllQuery, &collector).unwrap();
let values: Vec<OwnedValue> = top_docs.into_iter().map(|(key, _)| key).collect();
assert_eq!(
values,
vec![OwnedValue::Null, OwnedValue::U64(2), OwnedValue::U64(10)]
);
}
#[test]
fn test_sort_by_owned_score() {
let mut schema_builder = Schema::builder();
let body_field = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(body_field => "a a")).unwrap();
writer.add_document(doc!(body_field => "a")).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = crate::query::QueryParser::for_index(&index, vec![body_field]);
let query = query_parser.parse_query("a").unwrap();
// Sort by score descending (Natural)
let collector = TopDocs::with_limit(10)
.order_by((SortByErasedType::for_score(), ComparatorEnum::Natural));
let top_docs = searcher.search(&query, &collector).unwrap();
let values: Vec<f64> = top_docs
.into_iter()
.map(|(key, _)| match key {
OwnedValue::F64(val) => val,
_ => panic!("Wrong type {key:?}"),
})
.collect();
assert_eq!(values.len(), 2);
assert!(values[0] > values[1]);
// Sort by score ascending (ReverseNoneLower)
let collector = TopDocs::with_limit(10).order_by((
SortByErasedType::for_score(),
ComparatorEnum::ReverseNoneLower,
));
let top_docs = searcher.search(&query, &collector).unwrap();
let values: Vec<f64> = top_docs
.into_iter()
.map(|(key, _)| match key {
OwnedValue::F64(val) => val,
_ => panic!("Wrong type {key:?}"),
})
.collect();
assert_eq!(values.len(), 2);
assert!(values[0] < values[1]);
}
}

View File

@@ -63,8 +63,8 @@ impl SortKeyComputer for SortBySimilarityScore {
impl SegmentSortKeyComputer for SortBySimilarityScore {
type SortKey = Score;
type SegmentSortKey = Score;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, _doc: DocId, score: Score) -> Score {

View File

@@ -34,9 +34,7 @@ impl<T: FastValue> SortByStaticFastValue<T> {
impl<T: FastValue> SortKeyComputer for SortByStaticFastValue<T> {
type Child = SortByFastValueSegmentSortKeyComputer<T>;
type SortKey = Option<T>;
type Comparator = NaturalComparator;
fn check_schema(&self, schema: &crate::schema::Schema) -> crate::Result<()> {
@@ -84,8 +82,8 @@ pub struct SortByFastValueSegmentSortKeyComputer<T> {
impl<T: FastValue> SegmentSortKeyComputer for SortByFastValueSegmentSortKeyComputer<T> {
type SortKey = Option<T>;
type SegmentSortKey = Option<u64>;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Self::SegmentSortKey {

View File

@@ -30,9 +30,7 @@ impl SortByString {
impl SortKeyComputer for SortByString {
type SortKey = Option<String>;
type Child = ByStringColumnSegmentSortKeyComputer;
type Comparator = NaturalComparator;
fn segment_sort_key_computer(
@@ -50,8 +48,8 @@ pub struct ByStringColumnSegmentSortKeyComputer {
impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
type SortKey = Option<String>;
type SegmentSortKey = Option<TermOrdinal>;
type SegmentComparator = NaturalComparator;
#[inline(always)]
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> Option<TermOrdinal> {
@@ -60,6 +58,8 @@ impl SegmentSortKeyComputer for ByStringColumnSegmentSortKeyComputer {
}
fn convert_segment_sort_key(&self, term_ord_opt: Option<TermOrdinal>) -> Option<String> {
// TODO: Individual lookups to the dictionary like this are very likely to repeatedly
// decompress the same blocks. See https://github.com/quickwit-oss/tantivy/issues/2776
let term_ord = term_ord_opt?;
let str_column = self.str_column_opt.as_ref()?;
let mut bytes = Vec::new();

View File

@@ -12,13 +12,21 @@ use crate::{DocAddress, DocId, Result, Score, SegmentReader};
/// It is the segment local version of the [`SortKeyComputer`].
pub trait SegmentSortKeyComputer: 'static {
/// The final score being emitted.
type SortKey: 'static + PartialOrd + Send + Sync + Clone;
type SortKey: 'static + Send + Sync + Clone;
/// Sort key used by at the segment level by the `SegmentSortKeyComputer`.
///
/// It is typically small like a `u64`, and is meant to be converted
/// to the final score at the end of the collection of the segment.
type SegmentSortKey: 'static + PartialOrd + Clone + Send + Sync + Clone;
type SegmentSortKey: 'static + Clone + Send + Sync + Clone;
/// Comparator type.
type SegmentComparator: Comparator<Self::SegmentSortKey> + 'static;
/// Returns the segment sort key comparator.
fn segment_comparator(&self) -> Self::SegmentComparator {
Self::SegmentComparator::default()
}
/// Computes the sort key for the given document and score.
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey;
@@ -47,7 +55,7 @@ pub trait SegmentSortKeyComputer: 'static {
left: &Self::SegmentSortKey,
right: &Self::SegmentSortKey,
) -> Ordering {
NaturalComparator.compare(left, right)
self.segment_comparator().compare(left, right)
}
/// Implementing this method makes it possible to avoid computing
@@ -81,7 +89,7 @@ pub trait SegmentSortKeyComputer: 'static {
/// the sort key at a segment scale.
pub trait SortKeyComputer: Sync {
/// The sort key type.
type SortKey: 'static + Send + Sync + PartialOrd + Clone + std::fmt::Debug;
type SortKey: 'static + Send + Sync + Clone + std::fmt::Debug;
/// Type of the associated [`SegmentSortKeyComputer`].
type Child: SegmentSortKeyComputer<SortKey = Self::SortKey>;
/// Comparator type.
@@ -136,10 +144,7 @@ where
HeadSortKeyComputer: SortKeyComputer,
TailSortKeyComputer: SortKeyComputer,
{
type SortKey = (
<HeadSortKeyComputer::Child as SegmentSortKeyComputer>::SortKey,
<TailSortKeyComputer::Child as SegmentSortKeyComputer>::SortKey,
);
type SortKey = (HeadSortKeyComputer::SortKey, TailSortKeyComputer::SortKey);
type Child = (HeadSortKeyComputer::Child, TailSortKeyComputer::Child);
type Comparator = (
@@ -188,6 +193,11 @@ where
TailSegmentSortKeyComputer::SegmentSortKey,
);
type SegmentComparator = (
HeadSegmentSortKeyComputer::SegmentComparator,
TailSegmentSortKeyComputer::SegmentComparator,
);
/// A SegmentSortKeyComputer maps to a SegmentSortKey, but it can also decide on
/// its ordering.
///
@@ -269,11 +279,12 @@ impl<T, PreviousScore, NewScore> SegmentSortKeyComputer
for MappedSegmentSortKeyComputer<T, PreviousScore, NewScore>
where
T: SegmentSortKeyComputer<SortKey = PreviousScore>,
PreviousScore: 'static + Clone + Send + Sync + PartialOrd,
NewScore: 'static + Clone + Send + Sync + PartialOrd,
PreviousScore: 'static + Clone + Send + Sync,
NewScore: 'static + Clone + Send + Sync,
{
type SortKey = NewScore;
type SegmentSortKey = T::SegmentSortKey;
type SegmentComparator = T::SegmentComparator;
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> Self::SegmentSortKey {
self.sort_key_computer.segment_sort_key(doc, score)
@@ -463,6 +474,7 @@ where
{
type SortKey = TSortKey;
type SegmentSortKey = TSortKey;
type SegmentComparator = NaturalComparator;
fn segment_sort_key(&mut self, doc: DocId, _score: Score) -> TSortKey {
(self)(doc)

View File

@@ -324,7 +324,7 @@ impl TopDocs {
sort_key_computer: impl SortKeyComputer<SortKey = TSortKey> + Send + 'static,
) -> impl Collector<Fruit = Vec<(TSortKey, DocAddress)>>
where
TSortKey: 'static + Clone + Send + Sync + PartialOrd + std::fmt::Debug,
TSortKey: 'static + Clone + Send + Sync + std::fmt::Debug,
{
TopBySortKeyCollector::new(sort_key_computer, self.doc_range())
}
@@ -445,7 +445,7 @@ where
F: 'static + Send + Sync + Fn(&SegmentReader) -> TTweakScoreSortKeyFn,
TTweakScoreSortKeyFn: 'static + Fn(DocId, Score) -> TSortKey,
TweakScoreSegmentSortKeyComputer<TTweakScoreSortKeyFn>:
SegmentSortKeyComputer<SortKey = TSortKey>,
SegmentSortKeyComputer<SortKey = TSortKey, SegmentSortKey = TSortKey>,
TSortKey: 'static + PartialOrd + Clone + Send + Sync + std::fmt::Debug,
{
type SortKey = TSortKey;
@@ -480,6 +480,7 @@ where
{
type SortKey = TSortKey;
type SegmentSortKey = TSortKey;
type SegmentComparator = NaturalComparator;
fn segment_sort_key(&mut self, doc: DocId, score: Score) -> TSortKey {
(self.sort_key_fn)(doc, score)

View File

@@ -48,7 +48,15 @@ impl Executor {
F: Sized + Sync + Fn(A) -> crate::Result<R>,
{
match self {
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
Executor::SingleThread => {
// Avoid `collect`, since the stacktrace is blown up by it, which makes profiling
// harder.
let mut result = Vec::with_capacity(args.size_hint().0);
for arg in args {
result.push(f(arg)?);
}
Ok(result)
}
Executor::ThreadPool(pool) => {
let args: Vec<A> = args.collect();
let num_fruits = args.len();

View File

@@ -4,7 +4,7 @@ use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
use crate::indexer::indexing_term::IndexingTerm;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter as _, PostingsWriterEnum};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
@@ -80,7 +80,7 @@ fn index_json_object<'a, V: Value<'a>>(
text_analyzer: &mut TextAnalyzer,
term_buffer: &mut IndexingTerm,
json_path_writer: &mut JsonPathWriter,
postings_writer: &mut dyn PostingsWriter,
postings_writer: &mut PostingsWriterEnum,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {
@@ -110,7 +110,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
text_analyzer: &mut TextAnalyzer,
term_buffer: &mut IndexingTerm,
json_path_writer: &mut JsonPathWriter,
postings_writer: &mut dyn PostingsWriter,
postings_writer: &mut PostingsWriterEnum,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {

View File

@@ -1,3 +1,5 @@
mod file_watcher;
use std::collections::HashMap;
use std::fmt;
use std::fs::{self, File, OpenOptions};
@@ -7,6 +9,7 @@ use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock, Weak};
use common::StableDeref;
use file_watcher::FileWatcher;
use fs4::fs_std::FileExt;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
@@ -18,7 +21,6 @@ use crate::core::META_FILEPATH;
use crate::directory::error::{
DeleteError, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::directory::file_watcher::FileWatcher;
use crate::directory::{
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
WatchCallback, WatchHandle, WritePtr,

View File

@@ -5,7 +5,6 @@ mod mmap_directory;
mod directory;
mod directory_lock;
mod file_watcher;
pub mod footer;
mod managed_directory;
mod ram_directory;

View File

@@ -1,5 +1,7 @@
use std::borrow::{Borrow, BorrowMut};
use common::BitSet;
use crate::fastfield::AliveBitSet;
use crate::DocId;
@@ -42,7 +44,6 @@ pub trait DocSet: Send {
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
///
/// `target` has to be larger or equal to `.doc()` when calling `seek`.
/// If `target` is equal to `.doc()` then the DocSet should not advance.
fn seek(&mut self, target: DocId) -> DocId {
let mut doc = self.doc();
debug_assert!(doc <= target);
@@ -107,6 +108,15 @@ pub trait DocSet: Send {
buffer.len()
}
/// TODO comment on the size of the bitset
fn fill_bitset(&mut self, bitset: &mut BitSet) {
let mut doc = self.doc();
while doc != TERMINATED {
bitset.insert(doc);
doc = self.advance();
}
}
/// Returns the current document
/// Right after creating a new `DocSet`, the docset points to the first document.
///
@@ -167,19 +177,6 @@ pub trait DocSet: Send {
}
}
/// Consumes the `DocSet` and returns a Vec with all of the docs in the DocSet
/// including the current doc.
#[cfg(test)]
pub fn docset_to_doc_vec(mut doc_set: Box<dyn DocSet>) -> Vec<DocId> {
let mut output = Vec::new();
let mut doc = doc_set.doc();
while doc != TERMINATED {
output.push(doc);
doc = doc_set.advance();
}
output
}
impl DocSet for &mut dyn DocSet {
fn advance(&mut self) -> u32 {
(**self).advance()

View File

@@ -113,7 +113,7 @@ mod tests {
IndexRecordOption::WithFreqs,
);
let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32, 0)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);
@@ -142,7 +142,7 @@ mod tests {
IndexRecordOption::WithFreqs,
);
let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32, 0)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);

View File

@@ -0,0 +1,46 @@
use std::borrow::Cow;
use serde::{Deserialize, Serialize};
use crate::codec::{Codec, StandardCodec};
/// A small struct representing a codec configuration.
/// It is meant to be serialized in the index metadata file.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct CodecConfiguration {
name: Cow<'static, str>,
#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
props: serde_json::Value,
}
impl CodecConfiguration {
/// Builds the codec given a codec_configuration.
///
/// If the configuration codec name does not match the type C,
/// a tantivy error is returned.
pub fn to_codec<C: Codec>(&self) -> crate::Result<C> {
if self.name != C::NAME {
return Err(crate::TantivyError::InvalidArgument(format!(
"Codec name mismatch: expected {}, got {}",
C::NAME,
self.name
)));
}
C::from_json_props(&self.props)
}
}
impl<'a, C: Codec> From<&'a C> for CodecConfiguration {
fn from(codec: &'a C) -> Self {
CodecConfiguration {
name: Cow::Borrowed(C::NAME),
props: codec.to_json_props(),
}
}
}
impl Default for CodecConfiguration {
fn default() -> Self {
CodecConfiguration::from(&StandardCodec)
}
}

View File

@@ -8,12 +8,14 @@ use std::thread::available_parallelism;
use super::segment::Segment;
use super::segment_reader::merge_field_meta_data;
use super::{FieldMetadata, IndexSettings};
use crate::codec::StandardCodec;
use crate::core::{Executor, META_FILEPATH};
use crate::directory::error::OpenReadError;
#[cfg(feature = "mmap")]
use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::index::codec_configuration::CodecConfiguration;
use crate::index::{IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory};
use crate::indexer::index_writer::{
IndexWriterOptions, MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN,
@@ -59,6 +61,7 @@ fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
codec: CodecConfiguration,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
@@ -67,6 +70,7 @@ fn save_new_metas(
schema,
opstamp: 0u64,
payload: None,
codec,
},
directory,
)?;
@@ -101,18 +105,21 @@ fn save_new_metas(
/// };
/// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
/// ```
pub struct IndexBuilder {
pub struct IndexBuilder<Codec: crate::codec::Codec = StandardCodec> {
schema: Option<Schema>,
index_settings: IndexSettings,
tokenizer_manager: TokenizerManager,
fast_field_tokenizer_manager: TokenizerManager,
codec: Codec,
}
impl Default for IndexBuilder {
impl Default for IndexBuilder<StandardCodec> {
fn default() -> Self {
IndexBuilder::new()
}
}
impl IndexBuilder {
impl IndexBuilder<StandardCodec> {
/// Creates a new `IndexBuilder`
pub fn new() -> Self {
Self {
@@ -120,6 +127,21 @@ impl IndexBuilder {
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default(),
fast_field_tokenizer_manager: TokenizerManager::default(),
codec: StandardCodec,
}
}
}
impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
/// Set the codec
#[must_use]
pub fn codec<NewCodec: crate::codec::Codec>(self, codec: NewCodec) -> IndexBuilder<NewCodec> {
IndexBuilder {
schema: self.schema,
index_settings: self.index_settings,
tokenizer_manager: self.tokenizer_manager,
fast_field_tokenizer_manager: self.fast_field_tokenizer_manager,
codec,
}
}
@@ -154,7 +176,7 @@ impl IndexBuilder {
/// The index will be allocated in anonymous memory.
/// This is useful for indexing small set of documents
/// for instances like unit test or temporary in memory index.
pub fn create_in_ram(self) -> Result<Index, TantivyError> {
pub fn create_in_ram(self) -> Result<Index<Codec>, TantivyError> {
let ram_directory = RamDirectory::create();
self.create(ram_directory)
}
@@ -165,7 +187,7 @@ impl IndexBuilder {
/// If a previous index was in this directory, it returns an
/// [`TantivyError::IndexAlreadyExists`] error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index<Codec>> {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?);
if Index::exists(&*mmap_directory)? {
return Err(TantivyError::IndexAlreadyExists);
@@ -186,7 +208,7 @@ impl IndexBuilder {
self,
dir: impl Into<Box<dyn Directory>>,
mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter<D>> {
) -> crate::Result<SingleSegmentIndexWriter<Codec, D>> {
let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer)
@@ -202,7 +224,7 @@ impl IndexBuilder {
/// For other unit tests, prefer the [`RamDirectory`], see:
/// [`IndexBuilder::create_in_ram()`].
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(self) -> crate::Result<Index> {
pub fn create_from_tempdir(self) -> crate::Result<Index<Codec>> {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?);
self.create(mmap_directory)
}
@@ -215,12 +237,15 @@ impl IndexBuilder {
}
/// Opens or creates a new index in the provided directory
pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
pub fn open_or_create<T: Into<Box<dyn Directory>>>(
self,
dir: T,
) -> crate::Result<Index<Codec>> {
let dir: Box<dyn Directory> = dir.into();
if !Index::exists(&*dir)? {
return self.create(dir);
}
let mut index = Index::open(dir)?;
let mut index: Index<Codec> = Index::<Codec>::open_with_codec(dir)?;
index.set_tokenizers(self.tokenizer_manager.clone());
if index.schema() == self.get_expect_schema()? {
Ok(index)
@@ -244,18 +269,25 @@ impl IndexBuilder {
/// Creates a new index given an implementation of the trait `Directory`.
///
/// If a directory previously existed, it will be erased.
fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
pub fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index<Codec>> {
self.create_avoid_monomorphization(dir.into())
}
fn create_avoid_monomorphization(self, dir: Box<dyn Directory>) -> crate::Result<Index<Codec>> {
self.validate()?;
let dir = dir.into();
let directory = ManagedDirectory::wrap(dir)?;
let codec: CodecConfiguration = CodecConfiguration::from(&self.codec);
save_new_metas(
self.get_expect_schema()?,
self.index_settings.clone(),
&directory,
codec,
)?;
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
let schema = self.get_expect_schema()?;
let mut metas = IndexMeta::with_schema_and_codec(schema, &self.codec);
metas.index_settings = self.index_settings;
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
let mut index: Index<Codec> =
Index::<Codec>::open_from_metas(directory, &metas, SegmentMetaInventory::default())?;
index.set_tokenizers(self.tokenizer_manager);
index.set_fast_field_tokenizers(self.fast_field_tokenizer_manager);
Ok(index)
@@ -264,7 +296,7 @@ impl IndexBuilder {
/// Search Index
#[derive(Clone)]
pub struct Index {
pub struct Index<Codec: crate::codec::Codec = crate::codec::StandardCodec> {
directory: ManagedDirectory,
schema: Schema,
settings: IndexSettings,
@@ -272,6 +304,7 @@ pub struct Index {
tokenizers: TokenizerManager,
fast_field_tokenizers: TokenizerManager,
inventory: SegmentMetaInventory,
codec: Codec,
}
impl Index {
@@ -279,41 +312,6 @@ impl Index {
pub fn builder() -> IndexBuilder {
IndexBuilder::new()
}
/// Examines the directory to see if it contains an index.
///
/// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> {
dir.exists(&META_FILEPATH)
}
/// Accessor to the search executor.
///
/// This pool is used by default when calling `searcher.search(...)`
/// to perform search on the individual segments.
///
/// By default the executor is single thread, and simply runs in the calling thread.
pub fn search_executor(&self) -> &Executor {
&self.executor
}
/// Replace the default single thread search executor pool
/// by a thread pool with a given number of threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
self.executor = Executor::multi_thread(num_threads, "tantivy-search-")?;
Ok(())
}
/// Custom thread pool by a outer thread pool.
pub fn set_executor(&mut self, executor: Executor) {
self.executor = executor;
}
/// Replace the default single thread search executor pool
/// by a thread pool with as many threads as there are CPUs on the system.
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
let default_num_threads = available_parallelism()?.get();
self.set_multithread_executor(default_num_threads)
}
/// Creates a new index using the [`RamDirectory`].
///
@@ -324,6 +322,13 @@ impl Index {
IndexBuilder::new().schema(schema).create_in_ram().unwrap()
}
/// Examines the directory to see if it contains an index.
///
/// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists(directory: &dyn Directory) -> Result<bool, OpenReadError> {
directory.exists(&META_FILEPATH)
}
/// Creates a new index in a given filepath.
/// The index will use the [`MmapDirectory`].
///
@@ -370,20 +375,107 @@ impl Index {
schema: Schema,
settings: IndexSettings,
) -> crate::Result<Index> {
let dir: Box<dyn Directory> = dir.into();
Self::create_to_avoid_monomorphization(dir.into(), schema, settings)
}
fn create_to_avoid_monomorphization(
dir: Box<dyn Directory>,
schema: Schema,
settings: IndexSettings,
) -> crate::Result<Index> {
let mut builder = IndexBuilder::new().schema(schema);
builder = builder.settings(settings);
builder.create(dir)
}
/// Opens a new directory from an index path.
#[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> crate::Result<Index> {
Self::open_in_dir_to_avoid_monomorphization(directory_path.as_ref())
}
#[inline(never)]
fn open_in_dir_to_avoid_monomorphization(directory_path: &Path) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
Index::open(mmap_directory)
}
/// Open the index using the provided directory
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
Index::<StandardCodec>::open_with_codec(directory.into())
}
}
impl<Codec: crate::codec::Codec> Index<Codec> {
/// Returns a version of this index with the standard codec.
/// This is useful when you need to pass the index to APIs that
/// don't care about the codec (e.g., for reading).
pub(crate) fn with_standard_codec(&self) -> Index<StandardCodec> {
Index {
directory: self.directory.clone(),
schema: self.schema.clone(),
settings: self.settings.clone(),
executor: self.executor.clone(),
tokenizers: self.tokenizers.clone(),
fast_field_tokenizers: self.fast_field_tokenizers.clone(),
inventory: self.inventory.clone(),
codec: StandardCodec,
}
}
/// Open the index using the provided directory
#[inline(never)]
pub fn open_with_codec(directory: Box<dyn Directory>) -> crate::Result<Index<Codec>> {
let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?;
let index: Index<Codec> = Index::<Codec>::open_from_metas(directory, &metas, inventory)?;
Ok(index)
}
/// Accessor to the codec.
pub fn codec(&self) -> &Codec {
&self.codec
}
/// Accessor to the search executor.
///
/// This pool is used by default when calling `searcher.search(...)`
/// to perform search on the individual segments.
///
/// By default the executor is single thread, and simply runs in the calling thread.
pub fn search_executor(&self) -> &Executor {
&self.executor
}
/// Replace the default single thread search executor pool
/// by a thread pool with a given number of threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
self.executor = Executor::multi_thread(num_threads, "tantivy-search-")?;
Ok(())
}
/// Custom thread pool by a outer thread pool.
pub fn set_executor(&mut self, executor: Executor) {
self.executor = executor;
}
/// Replace the default single thread search executor pool
/// by a thread pool with as many threads as there are CPUs on the system.
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
let default_num_threads = available_parallelism()?.get();
self.set_multithread_executor(default_num_threads)
}
/// Creates a new index given a directory and an [`IndexMeta`].
fn open_from_metas(
fn open_from_metas<C: crate::codec::Codec>(
directory: ManagedDirectory,
metas: &IndexMeta,
inventory: SegmentMetaInventory,
) -> Index {
) -> crate::Result<Index<C>> {
let schema = metas.schema.clone();
Index {
let codec = metas.codec.to_codec::<C>()?;
Ok(Index {
settings: metas.index_settings.clone(),
directory,
schema,
@@ -391,7 +483,8 @@ impl Index {
fast_field_tokenizers: TokenizerManager::default(),
executor: Executor::single_thread(),
inventory,
}
codec,
})
}
/// Setter for the tokenizer manager.
@@ -447,7 +540,7 @@ impl Index {
/// Create a default [`IndexReader`] for the given index.
///
/// See [`Index.reader_builder()`].
pub fn reader(&self) -> crate::Result<IndexReader> {
pub fn reader(&self) -> crate::Result<IndexReader<Codec>> {
self.reader_builder().try_into()
}
@@ -455,17 +548,10 @@ impl Index {
///
/// Most project should create at most one reader for a given index.
/// This method is typically called only once per `Index` instance.
pub fn reader_builder(&self) -> IndexReaderBuilder {
pub fn reader_builder(&self) -> IndexReaderBuilder<Codec> {
IndexReaderBuilder::new(self.clone())
}
/// Opens a new directory from an index path.
#[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
Index::open(mmap_directory)
}
/// Returns the list of the segment metas tracked by the index.
///
/// Such segments can of course be part of the index,
@@ -506,16 +592,6 @@ impl Index {
self.inventory.new_segment_meta(segment_id, max_doc)
}
/// Open the index using the provided directory
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
let directory = directory.into();
let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?;
let index = Index::open_from_metas(directory, &metas, inventory);
Ok(index)
}
/// Reads the index meta file from the directory.
pub fn load_metas(&self) -> crate::Result<IndexMeta> {
load_metas(self.directory(), &self.inventory)
@@ -539,7 +615,7 @@ impl Index {
pub fn writer_with_options<D: Document>(
&self,
options: IndexWriterOptions,
) -> crate::Result<IndexWriter<D>> {
) -> crate::Result<IndexWriter<Codec, D>> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
@@ -581,7 +657,7 @@ impl Index {
&self,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
) -> crate::Result<IndexWriter<Codec, D>> {
let memory_arena_in_bytes_per_thread = overall_memory_budget_in_bytes / num_threads;
let options = IndexWriterOptions::builder()
.num_worker_threads(num_threads)
@@ -595,7 +671,7 @@ impl Index {
/// That index writer only simply has a single thread and a memory budget of 15 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<D>> {
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<Codec, D>> {
self.writer_with_num_threads(1, MEMORY_BUDGET_NUM_BYTES_MIN)
}
@@ -613,7 +689,7 @@ impl Index {
pub fn writer<D: Document>(
&self,
memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
) -> crate::Result<IndexWriter<Codec, D>> {
let mut num_threads = std::cmp::min(available_parallelism()?.get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
@@ -640,7 +716,7 @@ impl Index {
}
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> crate::Result<Vec<Segment>> {
pub fn searchable_segments(&self) -> crate::Result<Vec<Segment<Codec>>> {
Ok(self
.searchable_segment_metas()?
.into_iter()
@@ -649,12 +725,12 @@ impl Index {
}
#[doc(hidden)]
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment<Codec> {
Segment::for_index(self.clone(), segment_meta)
}
/// Creates a new segment.
pub fn new_segment(&self) -> Segment {
pub fn new_segment(&self) -> Segment<Codec> {
let segment_meta = self
.inventory
.new_segment_meta(SegmentId::generate_random(), 0);

View File

@@ -7,7 +7,8 @@ use std::sync::Arc;
use serde::{Deserialize, Serialize};
use super::SegmentComponent;
use crate::index::SegmentId;
use crate::codec::Codec;
use crate::index::{CodecConfiguration, SegmentId};
use crate::schema::Schema;
use crate::store::Compressor;
use crate::{Inventory, Opstamp, TrackedObject};
@@ -320,6 +321,8 @@ pub struct IndexMeta {
/// This payload is entirely unused by tantivy.
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
/// Codec configuration for the index.
pub codec: CodecConfiguration,
}
#[derive(Deserialize, Debug)]
@@ -331,6 +334,8 @@ struct UntrackedIndexMeta {
pub opstamp: Opstamp,
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
#[serde(default)]
pub codec: CodecConfiguration,
}
impl UntrackedIndexMeta {
@@ -345,6 +350,7 @@ impl UntrackedIndexMeta {
schema: self.schema,
opstamp: self.opstamp,
payload: self.payload,
codec: self.codec,
}
}
}
@@ -355,13 +361,14 @@ impl IndexMeta {
///
/// This new index does not contains any segments.
/// Opstamp will the value `0u64`.
pub fn with_schema(schema: Schema) -> IndexMeta {
pub fn with_schema_and_codec<C: Codec>(schema: Schema, codec: &C) -> IndexMeta {
IndexMeta {
index_settings: IndexSettings::default(),
segments: vec![],
schema,
opstamp: 0u64,
payload: None,
codec: CodecConfiguration::from(codec),
}
}
@@ -404,16 +411,20 @@ mod tests {
schema_builder.build()
};
let index_metas = IndexMeta {
index_settings: IndexSettings::default(),
index_settings: IndexSettings {
docstore_compression: Compressor::None,
..Default::default()
},
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
codec: Default::default(),
};
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
r#"{"index_settings":{"docstore_compression":"none","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0,"codec":{"name":"standard"}}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
@@ -494,6 +505,8 @@ mod tests {
#[test]
#[cfg(feature = "lz4-compression")]
fn test_index_settings_default() {
use crate::store::Compressor;
let mut index_settings = IndexSettings::default();
assert_eq!(
index_settings,

View File

@@ -1,7 +1,8 @@
use std::io;
use std::sync::Arc;
use common::json_path_writer::JSON_END_OF_PATH;
use common::{BinarySerializable, ByteCount};
use common::{BinarySerializable, ByteCount, OwnedBytes};
#[cfg(feature = "quickwit")]
use futures_util::{FutureExt, StreamExt, TryStreamExt};
#[cfg(feature = "quickwit")]
@@ -9,12 +10,15 @@ use itertools::Itertools;
#[cfg(feature = "quickwit")]
use tantivy_fst::automaton::{AlwaysMatch, Automaton};
use crate::codec::postings::PostingsCodec;
use crate::codec::{Codec, ObjectSafeCodec, StandardCodec};
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
use crate::fieldnorm::FieldNormReader;
use crate::postings::{Postings, TermInfo};
use crate::query::term_query::TermScorer;
use crate::query::{Bm25Weight, PhraseScorer, Scorer};
use crate::schema::{IndexRecordOption, Term, Type};
use crate::termdict::TermDictionary;
use crate::DocId;
/// The inverted index reader is in charge of accessing
/// the inverted index associated with a specific field.
@@ -34,6 +38,7 @@ pub struct InvertedIndexReader {
positions_file_slice: FileSlice,
record_option: IndexRecordOption,
total_num_tokens: u64,
codec: Arc<dyn ObjectSafeCodec>,
}
/// Object that records the amount of space used by a field in an inverted index.
@@ -69,6 +74,7 @@ impl InvertedIndexReader {
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
record_option: IndexRecordOption,
codec: Arc<dyn ObjectSafeCodec>,
) -> io::Result<InvertedIndexReader> {
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?;
@@ -78,6 +84,7 @@ impl InvertedIndexReader {
positions_file_slice,
record_option,
total_num_tokens,
codec,
})
}
@@ -90,6 +97,7 @@ impl InvertedIndexReader {
positions_file_slice: FileSlice::empty(),
record_option,
total_num_tokens: 0u64,
codec: Arc::new(StandardCodec),
}
}
@@ -161,80 +169,98 @@ impl InvertedIndexReader {
Ok(fields)
}
/// Resets the block segment to another position of the postings
/// file.
///
/// This is useful for enumerating through a list of terms,
/// and consuming the associated posting lists while avoiding
/// reallocating a [`BlockSegmentPostings`].
///
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo(
pub(crate) fn new_term_scorer_specialized<C: Codec>(
&self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) -> io::Result<()> {
let postings_slice = self
.postings_file_slice
.slice(term_info.postings_range.clone());
let postings_bytes = postings_slice.read_bytes()?;
block_postings.reset(term_info.doc_freq, postings_bytes)?;
Ok(())
}
/// Returns a block postings given a `Term`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
pub fn read_block_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<BlockSegmentPostings>> {
let Some(term_info) = self.get_term_info(term)? else {
return Ok(None);
};
let block_postings_not_loaded =
self.read_block_postings_from_terminfo(&term_info, option)?;
Ok(Some(block_postings_not_loaded))
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
codec: &C,
) -> io::Result<TermScorer<<<C as Codec>::PostingsCodec as PostingsCodec>::Postings>> {
let postings = self.read_postings_from_terminfo_specialized(term_info, option, codec)?;
let term_scorer = TermScorer::new(postings, fieldnorm_reader, similarity_weight);
Ok(term_scorer)
}
/// Returns a block postings given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
pub(crate) fn read_block_postings_from_terminfo_with_seek(
pub(crate) fn new_phrase_scorer_type_specialized<C: Codec>(
&self,
term_infos: &[(usize, TermInfo)],
similarity_weight_opt: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
codec: &C,
) -> io::Result<PhraseScorer<<<C as Codec>::PostingsCodec as PostingsCodec>::Postings>> {
let mut offset_and_term_postings: Vec<(
usize,
<<C as Codec>::PostingsCodec as PostingsCodec>::Postings,
)> = Vec::with_capacity(term_infos.len());
for (offset, term_info) in term_infos {
let postings = self.read_postings_from_terminfo_specialized(
term_info,
IndexRecordOption::WithFreqsAndPositions,
codec,
)?;
offset_and_term_postings.push((*offset, postings));
}
let phrase_scorer = PhraseScorer::new(
offset_and_term_postings,
similarity_weight_opt,
fieldnorm_reader,
slop,
);
Ok(phrase_scorer)
}
/// Build a new term scorer.
pub fn new_term_scorer(
&self,
term_info: &TermInfo,
requested_option: IndexRecordOption,
seek_doc: DocId,
) -> io::Result<(BlockSegmentPostings, usize)> {
option: IndexRecordOption,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
) -> io::Result<Box<dyn Scorer>> {
let term_scorer = self.codec.load_term_scorer_type_erased(
term_info,
option,
self,
fieldnorm_reader,
similarity_weight,
)?;
Ok(term_scorer)
}
/// Returns a postings object specific with a concrete type.
///
/// This requires you to provied the actual codec.
pub fn read_postings_from_terminfo_specialized<C: Codec>(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
codec: &C,
) -> io::Result<<<C as Codec>::PostingsCodec as PostingsCodec>::Postings> {
let option = option.downgrade(self.record_option);
let postings_data = self
.postings_file_slice
.slice(term_info.postings_range.clone());
BlockSegmentPostings::open(
term_info.doc_freq,
postings_data,
self.record_option,
requested_option,
seek_doc,
)
}
/// Returns a block postings given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
pub fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings> {
let (block_segment_postings, _) =
self.read_block_postings_from_terminfo_with_seek(term_info, requested_option, 0)?;
Ok(block_segment_postings)
.slice(term_info.postings_range.clone())
.read_bytes()?;
let positions_data: Option<OwnedBytes> = if option.has_positions() {
let positions_data = self
.positions_file_slice
.slice(term_info.positions_range.clone())
.read_bytes()?;
Some(positions_data)
} else {
None
};
let postings: <<C as Codec>::PostingsCodec as PostingsCodec>::Postings =
codec.postings_codec().load_postings(
term_info.doc_freq,
postings_data,
self.record_option,
option,
positions_data,
)?;
Ok(postings)
}
/// Returns a posting object given a `term_info`.
@@ -244,27 +270,10 @@ impl InvertedIndexReader {
pub fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
record_option: IndexRecordOption,
seek_doc: DocId,
) -> io::Result<SegmentPostings> {
let (block_segment_postings, position_within_block) =
self.read_block_postings_from_terminfo_with_seek(term_info, record_option, seek_doc)?;
let position_reader = {
if record_option.has_positions() {
let positions_data = self
.positions_file_slice
.read_bytes_slice(term_info.positions_range.clone())?;
let position_reader = PositionReader::open(positions_data)?;
Some(position_reader)
} else {
None
}
};
Ok(SegmentPostings::from_block_postings(
block_segment_postings,
position_reader,
position_within_block,
))
option: IndexRecordOption,
) -> io::Result<Box<dyn Postings>> {
self.codec
.load_postings_type_erased(term_info, option, self)
}
/// Returns the total number of tokens recorded for all documents
@@ -287,9 +296,9 @@ impl InvertedIndexReader {
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>> {
) -> io::Result<Option<Box<dyn Postings>>> {
self.get_term_info(term)?
.map(move |term_info| self.read_postings_from_terminfo(&term_info, option, 0u32))
.map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
.transpose()
}

View File

@@ -2,6 +2,7 @@
//!
//! It contains `Index` and `Segment`, where a `Index` consists of one or more `Segment`s.
mod codec_configuration;
mod index;
mod index_meta;
mod inverted_index_reader;
@@ -10,6 +11,7 @@ mod segment_component;
mod segment_id;
mod segment_reader;
pub use self::codec_configuration::CodecConfiguration;
pub use self::index::{Index, IndexBuilder};
pub(crate) use self::index_meta::SegmentMetaInventory;
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};

View File

@@ -2,6 +2,7 @@ use std::fmt;
use std::path::PathBuf;
use super::SegmentComponent;
use crate::codec::StandardCodec;
use crate::directory::error::{OpenReadError, OpenWriteError};
use crate::directory::{Directory, FileSlice, WritePtr};
use crate::index::{Index, SegmentId, SegmentMeta};
@@ -10,25 +11,25 @@ use crate::Opstamp;
/// A segment is a piece of the index.
#[derive(Clone)]
pub struct Segment {
index: Index,
pub struct Segment<C: crate::codec::Codec = StandardCodec> {
index: Index<C>,
meta: SegmentMeta,
}
impl fmt::Debug for Segment {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
impl<C: crate::codec::Codec> fmt::Debug for Segment<C> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Segment({:?})", self.id().uuid_string())
}
}
impl Segment {
impl<C: crate::codec::Codec> Segment<C> {
/// Creates a new segment given an `Index` and a `SegmentId`
pub(crate) fn for_index(index: Index, meta: SegmentMeta) -> Segment {
pub(crate) fn for_index(index: Index<C>, meta: SegmentMeta) -> Segment<C> {
Segment { index, meta }
}
/// Returns the index the segment belongs to.
pub fn index(&self) -> &Index {
pub fn index(&self) -> &Index<C> {
&self.index
}
@@ -46,7 +47,7 @@ impl Segment {
///
/// This method is only used when updating `max_doc` from 0
/// as we finalize a fresh new segment.
pub fn with_max_doc(self, max_doc: u32) -> Segment {
pub fn with_max_doc(self, max_doc: u32) -> Segment<C> {
Segment {
index: self.index,
meta: self.meta.with_max_doc(max_doc),
@@ -55,7 +56,7 @@ impl Segment {
#[doc(hidden)]
#[must_use]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment<C> {
Segment {
index: self.index,
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),

View File

@@ -6,6 +6,7 @@ use common::{ByteCount, HasLen};
use fnv::FnvHashMap;
use itertools::Itertools;
use crate::codec::ObjectSafeCodec;
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
@@ -47,6 +48,8 @@ pub struct SegmentReader {
store_file: FileSlice,
alive_bitset_opt: Option<AliveBitSet>,
schema: Schema,
pub(crate) codec: Arc<dyn ObjectSafeCodec>,
}
impl SegmentReader {
@@ -140,15 +143,16 @@ impl SegmentReader {
}
/// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
pub fn open<C: crate::codec::Codec>(segment: &Segment<C>) -> crate::Result<SegmentReader> {
Self::open_with_custom_alive_set(segment, None)
}
/// Open a new segment for reading.
pub fn open_with_custom_alive_set(
segment: &Segment,
pub fn open_with_custom_alive_set<C: crate::codec::Codec>(
segment: &Segment<C>,
custom_bitset: Option<AliveBitSet>,
) -> crate::Result<SegmentReader> {
let codec: Arc<dyn ObjectSafeCodec> = Arc::new(segment.index().codec().clone());
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
let termdict_composite = CompositeFile::open(&termdict_file)?;
@@ -204,6 +208,7 @@ impl SegmentReader {
alive_bitset_opt,
positions_composite,
schema,
codec,
})
}
@@ -273,6 +278,7 @@ impl SegmentReader {
postings_file,
positions_file,
record_option,
self.codec.clone(),
)?);
// by releasing the lock in between, we may end up opening the inverting index

View File

@@ -4,19 +4,20 @@ use std::sync::{Arc, RwLock, Weak};
use super::operation::DeleteOperation;
use crate::Opstamp;
// The DeleteQueue is similar in conceptually to a multiple
// consumer single producer broadcast channel.
//
// All consumer will receive all messages.
//
// Consumer of the delete queue are holding a `DeleteCursor`,
// which points to a specific place of the `DeleteQueue`.
//
// New consumer can be created in two ways
// - calling `delete_queue.cursor()` returns a cursor, that will include all future delete operation
// (and some or none of the past operations... The client is in charge of checking the opstamps.).
// - cloning an existing cursor returns a new cursor, that is at the exact same position, and can
// now advance independently from the original cursor.
/// The DeleteQueue is similar in conceptually to a multiple
/// consumer single producer broadcast channel.
///
/// All consumer will receive all messages.
///
/// Consumer of the delete queue are holding a `DeleteCursor`,
/// which points to a specific place of the `DeleteQueue`.
///
/// New consumer can be created in two ways
/// - calling `delete_queue.cursor()` returns a cursor, that will include all future delete
/// operation (and some or none of the past operations... The client is in charge of checking the
/// opstamps.).
/// - cloning an existing cursor returns a new cursor, that is at the exact same position, and can
/// now advance independently from the original cursor.
#[derive(Default)]
struct InnerDeleteQueue {
writer: Vec<DeleteOperation>,
@@ -249,12 +250,7 @@ mod tests {
struct DummyWeight;
impl Weight for DummyWeight {
fn scorer(
&self,
_reader: &SegmentReader,
_boost: Score,
_seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
Err(crate::TantivyError::InternalError("dummy impl".to_owned()))
}

View File

@@ -9,6 +9,7 @@ use smallvec::smallvec;
use super::operation::{AddOperation, UserOperation};
use super::segment_updater::SegmentUpdater;
use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit};
use crate::codec::{Codec, StandardCodec};
use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite};
use crate::error::TantivyError;
use crate::fastfield::write_alive_bitset;
@@ -68,12 +69,12 @@ pub struct IndexWriterOptions {
/// indexing queue.
/// Each indexing thread builds its own independent [`Segment`], via
/// a `SegmentWriter` object.
pub struct IndexWriter<D: Document = TantivyDocument> {
pub struct IndexWriter<C: Codec = StandardCodec, D: Document = TantivyDocument> {
// the lock is just used to bind the
// lifetime of the lock with that of the IndexWriter.
_directory_lock: Option<DirectoryLock>,
index: Index,
index: Index<C>,
options: IndexWriterOptions,
@@ -82,7 +83,7 @@ pub struct IndexWriter<D: Document = TantivyDocument> {
index_writer_status: IndexWriterStatus<D>,
operation_sender: AddBatchSender<D>,
segment_updater: SegmentUpdater,
segment_updater: SegmentUpdater<C>,
worker_id: usize,
@@ -128,8 +129,8 @@ fn compute_deleted_bitset(
/// is `==` target_opstamp.
/// For instance, there was no delete operation between the state of the `segment_entry` and
/// the `target_opstamp`, `segment_entry` is not updated.
pub fn advance_deletes(
mut segment: Segment,
pub fn advance_deletes<C: Codec>(
mut segment: Segment<C>,
segment_entry: &mut SegmentEntry,
target_opstamp: Opstamp,
) -> crate::Result<()> {
@@ -179,11 +180,11 @@ pub fn advance_deletes(
Ok(())
}
fn index_documents<D: Document>(
fn index_documents<C: crate::codec::Codec, D: Document>(
memory_budget: usize,
segment: Segment,
segment: Segment<C>,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch<D>>,
segment_updater: &SegmentUpdater,
segment_updater: &SegmentUpdater<C>,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
@@ -226,8 +227,8 @@ fn index_documents<D: Document>(
}
/// `doc_opstamps` is required to be non-empty.
fn apply_deletes(
segment: &Segment,
fn apply_deletes<C: crate::codec::Codec>(
segment: &Segment<C>,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &[Opstamp],
) -> crate::Result<Option<BitSet>> {
@@ -262,7 +263,7 @@ fn apply_deletes(
})
}
impl<D: Document> IndexWriter<D> {
impl<C: Codec, D: Document> IndexWriter<C, D> {
/// Create a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
@@ -278,7 +279,7 @@ impl<D: Document> IndexWriter<D> {
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub(crate) fn new(
index: &Index,
index: &Index<C>,
options: IndexWriterOptions,
directory_lock: DirectoryLock,
) -> crate::Result<Self> {
@@ -345,7 +346,7 @@ impl<D: Document> IndexWriter<D> {
}
/// Accessor to the index.
pub fn index(&self) -> &Index {
pub fn index(&self) -> &Index<C> {
&self.index
}
@@ -393,7 +394,7 @@ impl<D: Document> IndexWriter<D> {
/// It is safe to start writing file associated with the new `Segment`.
/// These will not be garbage collected as long as an instance object of
/// `SegmentMeta` object associated with the new `Segment` is "alive".
pub fn new_segment(&self) -> Segment {
pub fn new_segment(&self) -> Segment<C> {
self.index.new_segment()
}
@@ -615,7 +616,7 @@ impl<D: Document> IndexWriter<D> {
/// It is also possible to add a payload to the `commit`
/// using this API.
/// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, D>> {
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, C, D>> {
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.
@@ -665,7 +666,7 @@ impl<D: Document> IndexWriter<D> {
self.prepare_commit()?.commit()
}
pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
pub(crate) fn segment_updater(&self) -> &SegmentUpdater<C> {
&self.segment_updater
}
@@ -804,7 +805,7 @@ impl<D: Document> IndexWriter<D> {
}
}
impl<D: Document> Drop for IndexWriter<D> {
impl<C: Codec, D: Document> Drop for IndexWriter<C, D> {
fn drop(&mut self) {
self.segment_updater.kill();
self.drop_sender();

View File

@@ -1,9 +1,10 @@
#[cfg(test)]
mod tests {
use crate::codec::StandardCodec;
use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet;
use crate::index::Index;
use crate::postings::Postings;
use crate::postings::{DocFreq, Postings};
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
@@ -121,21 +122,26 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let term_info = inverted_index.get_term_info(&term_a).unwrap().unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.read_postings_from_terminfo_specialized(
&term_info,
IndexRecordOption::WithFreqsAndPositions,
&StandardCodec,
)
.unwrap();
assert_eq!(postings.doc_freq(), 2);
assert_eq!(postings.doc_freq(), DocFreq::Exact(2));
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
crate::indexer::merger::doc_freq_given_deletes(
&postings,
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
let mut output = Vec::new();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();

View File

@@ -7,6 +7,8 @@ use common::ReadOnlyBitSet;
use itertools::Itertools;
use measure_time::debug_time;
use crate::codec::postings::PostingsCodec;
use crate::codec::{Codec, StandardCodec};
use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
@@ -15,7 +17,7 @@ use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer,
use crate::index::{Segment, SegmentComponent, SegmentReader};
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
use crate::indexer::SegmentSerializer;
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
use crate::postings::{InvertedIndexSerializer, Postings};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
@@ -76,10 +78,11 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
Ok(total_num_tokens)
}
pub struct IndexMerger {
pub struct IndexMerger<C: Codec = StandardCodec> {
schema: Schema,
pub(crate) readers: Vec<SegmentReader>,
max_doc: u32,
codec: C,
}
struct DeltaComputer {
@@ -144,8 +147,8 @@ fn extract_fast_field_required_columns(schema: &Schema) -> Vec<(String, ColumnTy
.collect()
}
impl IndexMerger {
pub fn open(schema: Schema, segments: &[Segment]) -> crate::Result<IndexMerger> {
impl<C: Codec> IndexMerger<C> {
pub fn open(schema: Schema, segments: &[Segment<C>]) -> crate::Result<IndexMerger<C>> {
let alive_bitset = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, segments, alive_bitset)
}
@@ -162,11 +165,15 @@ impl IndexMerger {
// This can be used to merge but also apply an additional filter.
// One use case is demux, which is basically taking a list of
// segments and partitions them e.g. by a value in a field.
//
// # Panics if segments is empty.
pub fn open_with_custom_alive_set(
schema: Schema,
segments: &[Segment],
segments: &[Segment<C>],
alive_bitset_opt: Vec<Option<AliveBitSet>>,
) -> crate::Result<IndexMerger> {
) -> crate::Result<IndexMerger<C>> {
assert!(!segments.is_empty());
let codec = segments[0].index().codec().clone();
let mut readers = vec![];
for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt) {
if segment.meta().num_docs() > 0 {
@@ -189,6 +196,7 @@ impl IndexMerger {
schema,
readers,
max_doc,
codec,
})
}
@@ -287,7 +295,7 @@ impl IndexMerger {
&self,
indexed_field: Field,
_field_type: &FieldType,
serializer: &mut InvertedIndexSerializer,
serializer: &mut InvertedIndexSerializer<C>,
fieldnorm_reader: Option<FieldNormReader>,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
@@ -355,7 +363,10 @@ impl IndexMerger {
indexed. Have you modified the schema?",
);
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
let mut segment_postings_containing_the_term: Vec<(
usize,
<C::PostingsCodec as PostingsCodec>::Postings,
)> = Vec::with_capacity(self.readers.len());
while merged_terms.advance() {
segment_postings_containing_the_term.clear();
@@ -367,20 +378,24 @@ impl IndexMerger {
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
let segment_reader = &self.readers[segment_ord];
let inverted_index: &InvertedIndexReader = &field_readers[segment_ord];
let segment_postings = inverted_index.read_postings_from_terminfo(
let postings = inverted_index.read_postings_from_terminfo_specialized(
&term_info,
segment_postings_option,
0u32,
&self.codec,
)?;
let alive_bitset_opt = segment_reader.alive_bitset();
let doc_freq = if let Some(alive_bitset) = alive_bitset_opt {
segment_postings.doc_freq_given_deletes(alive_bitset)
doc_freq_given_deletes(&postings, alive_bitset)
} else {
segment_postings.doc_freq()
// We do not an exact document frequency here.
match postings.doc_freq() {
crate::postings::DocFreq::Approximate(_) => exact_doc_freq(&postings),
crate::postings::DocFreq::Exact(doc_freq) => doc_freq,
}
};
if doc_freq > 0u32 {
total_doc_freq += doc_freq;
segment_postings_containing_the_term.push((segment_ord, segment_postings));
segment_postings_containing_the_term.push((segment_ord, postings));
}
}
@@ -398,11 +413,7 @@ impl IndexMerger {
assert!(!segment_postings_containing_the_term.is_empty());
let has_term_freq = {
let has_term_freq = !segment_postings_containing_the_term[0]
.1
.block_cursor
.freqs()
.is_empty();
let has_term_freq = segment_postings_containing_the_term[0].1.has_freq();
for (_, postings) in &segment_postings_containing_the_term[1..] {
// This may look at a strange way to test whether we have term freq or not.
// With JSON object, the schema is not sufficient to know whether a term
@@ -418,7 +429,7 @@ impl IndexMerger {
//
// Overall the reliable way to know if we have actual frequencies loaded or not
// is to check whether the actual decoded array is empty or not.
if has_term_freq == postings.block_cursor.freqs().is_empty() {
if postings.has_freq() != has_term_freq {
return Err(DataCorruption::comment_only(
"Term freqs are inconsistent across segments",
)
@@ -470,7 +481,7 @@ impl IndexMerger {
fn write_postings(
&self,
serializer: &mut InvertedIndexSerializer,
serializer: &mut InvertedIndexSerializer<C>,
fieldnorm_readers: FieldNormReaders,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
@@ -528,7 +539,7 @@ impl IndexMerger {
///
/// # Returns
/// The number of documents in the resulting segment.
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
pub fn write(&self, mut serializer: SegmentSerializer<C>) -> crate::Result<u32> {
let doc_id_mapping = self.get_doc_id_from_concatenated_data()?;
debug!("write-fieldnorms");
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
@@ -556,6 +567,43 @@ impl IndexMerger {
}
}
/// Compute the number of non-deleted documents.
///
/// This method will clone and scan through the posting lists.
/// (this is a rather expensive operation).
pub(crate) fn doc_freq_given_deletes<P: Postings + Clone>(
postings: &P,
alive_bitset: &AliveBitSet,
) -> u32 {
let mut docset = postings.clone();
let mut doc_freq = 0;
loop {
let doc = docset.doc();
if doc == TERMINATED {
return doc_freq;
}
if alive_bitset.is_alive(doc) {
doc_freq += 1u32;
}
docset.advance();
}
}
/// If the postings is not able to inform us of the document frequency,
/// we just scan through it.
pub(crate) fn exact_doc_freq<P: Postings + Clone>(postings: &P) -> u32 {
let mut docset = postings.clone();
let mut doc_freq = 0;
loop {
let doc = docset.doc();
if doc == TERMINATED {
return doc_freq;
}
doc_freq += 1u32;
docset.advance();
}
}
#[cfg(test)]
mod tests {
@@ -564,12 +612,16 @@ mod tests {
use proptest::strategy::Strategy;
use schema::FAST;
use crate::codec::postings::PostingsCodec;
use crate::codec::standard::postings::StandardPostingsCodec;
use crate::collector::tests::{
BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
};
use crate::collector::{Count, FacetCollector};
use crate::fastfield::AliveBitSet;
use crate::index::{Index, SegmentId};
use crate::indexer::NoMergePolicy;
use crate::postings::{DocFreq, Postings as _};
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
@@ -1521,10 +1573,10 @@ mod tests {
let searcher = reader.searcher();
let mut term_scorer = term_query
.specialized_weight(EnableScoring::enabled_from_searcher(&searcher))?
.term_scorer_for_test(searcher.segment_reader(0u32), 1.0)?
.term_scorer_for_test(searcher.segment_reader(0u32), 1.0)
.unwrap();
assert_eq!(term_scorer.doc(), 0);
assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855);
assert_nearly_equals!(term_scorer.seek_block_max(0), 0.0079681855);
assert_nearly_equals!(term_scorer.score(), 0.0079681855);
for _ in 0..81 {
writer.add_document(doc!(text=>"hello happy tax payer"))?;
@@ -1537,13 +1589,13 @@ mod tests {
for segment_reader in searcher.segment_readers() {
let mut term_scorer = term_query
.specialized_weight(EnableScoring::enabled_from_searcher(&searcher))?
.term_scorer_for_test(segment_reader, 1.0)?
.term_scorer_for_test(segment_reader, 1.0)
.unwrap();
// the difference compared to before is intrinsic to the bm25 formula. no worries
// there.
for doc in segment_reader.doc_ids_alive() {
assert_eq!(term_scorer.doc(), doc);
assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312);
assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312);
assert_nearly_equals!(term_scorer.score(), 0.003478312);
term_scorer.advance();
}
@@ -1563,12 +1615,12 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let mut term_scorer = term_query
.specialized_weight(EnableScoring::enabled_from_searcher(&searcher))?
.term_scorer_for_test(segment_reader, 1.0)?
.term_scorer_for_test(segment_reader, 1.0)
.unwrap();
// the difference compared to before is intrinsic to the bm25 formula. no worries there.
for doc in segment_reader.doc_ids_alive() {
assert_eq!(term_scorer.doc(), doc);
assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312);
assert_nearly_equals!(term_scorer.seek_block_max(doc), 0.003478312);
assert_nearly_equals!(term_scorer.score(), 0.003478312);
term_scorer.advance();
}
@@ -1582,4 +1634,16 @@ mod tests {
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
#[test]
fn test_doc_freq_given_delete() {
let docs =
<StandardPostingsCodec as PostingsCodec>::Postings::create_from_docs(&[0, 2, 10]);
assert_eq!(docs.doc_freq(), DocFreq::Exact(3));
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
assert_eq!(super::doc_freq_given_deletes(&docs, &alive_bitset), 2);
let all_deleted =
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(super::doc_freq_given_deletes(&docs, &all_deleted), 0);
}
}

View File

@@ -4,6 +4,7 @@
//! `IndexWriter` is the main entry point for that, which created from
//! [`Index::writer`](crate::Index::writer).
/// Delete queue implementation for broadcasting delete operations to consumers.
pub(crate) mod delete_queue;
pub(crate) mod path_to_unordered_id;

View File

@@ -1,16 +1,17 @@
use super::IndexWriter;
use crate::codec::Codec;
use crate::schema::document::Document;
use crate::{FutureResult, Opstamp, TantivyDocument};
/// A prepared commit
pub struct PreparedCommit<'a, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<D>,
pub struct PreparedCommit<'a, C: Codec, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<C, D>,
payload: Option<String>,
opstamp: Opstamp,
}
impl<'a, D: Document> PreparedCommit<'a, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<D>, opstamp: Opstamp) -> Self {
impl<'a, C: Codec, D: Document> PreparedCommit<'a, C, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<C, D>, opstamp: Opstamp) -> Self {
Self {
index_writer,
payload: None,

View File

@@ -8,17 +8,17 @@ use crate::store::StoreWriter;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer {
segment: Segment,
pub struct SegmentSerializer<C: crate::codec::Codec> {
segment: Segment<C>,
pub(crate) store_writer: StoreWriter,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
postings_serializer: InvertedIndexSerializer<C>,
}
impl SegmentSerializer {
impl<C: crate::codec::Codec> SegmentSerializer<C> {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
pub fn for_segment(mut segment: Segment<C>) -> crate::Result<SegmentSerializer<C>> {
let settings = segment.index().settings().clone();
let store_writer = {
let store_write = segment.open_write(SegmentComponent::Store)?;
@@ -50,12 +50,12 @@ impl SegmentSerializer {
self.store_writer.mem_usage()
}
pub fn segment(&self) -> &Segment {
pub fn segment(&self) -> &Segment<C> {
&self.segment
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer<C> {
&mut self.postings_serializer
}

View File

@@ -10,10 +10,13 @@ use std::sync::{Arc, RwLock};
use rayon::{ThreadPool, ThreadPoolBuilder};
use super::segment_manager::SegmentManager;
use crate::codec::Codec;
use crate::core::META_FILEPATH;
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::fastfield::AliveBitSet;
use crate::index::{Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta};
use crate::index::{
CodecConfiguration, Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta,
};
use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::index_writer::advance_deletes;
use crate::indexer::merge_operation::MergeOperationInventory;
@@ -61,10 +64,10 @@ pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate:
// We voluntarily pass a merge_operation ref to guarantee that
// the merge_operation is alive during the process
#[derive(Clone)]
pub(crate) struct SegmentUpdater(Arc<InnerSegmentUpdater>);
pub(crate) struct SegmentUpdater<C: Codec>(Arc<InnerSegmentUpdater<C>>);
impl Deref for SegmentUpdater {
type Target = InnerSegmentUpdater;
impl<C: Codec> Deref for SegmentUpdater<C> {
type Target = InnerSegmentUpdater<C>;
#[inline]
fn deref(&self) -> &Self::Target {
@@ -72,8 +75,8 @@ impl Deref for SegmentUpdater {
}
}
fn garbage_collect_files(
segment_updater: SegmentUpdater,
fn garbage_collect_files<C: Codec>(
segment_updater: SegmentUpdater<C>,
) -> crate::Result<GarbageCollectionResult> {
info!("Running garbage collection");
let mut index = segment_updater.index.clone();
@@ -84,8 +87,8 @@ fn garbage_collect_files(
/// Merges a list of segments the list of segment givens in the `segment_entries`.
/// This function happens in the calling thread and is computationally expensive.
fn merge(
index: &Index,
fn merge<Codec: crate::codec::Codec>(
index: &Index<Codec>,
mut segment_entries: Vec<SegmentEntry>,
target_opstamp: Opstamp,
) -> crate::Result<Option<SegmentEntry>> {
@@ -108,13 +111,13 @@ fn merge(
let delete_cursor = segment_entries[0].delete_cursor().clone();
let segments: Vec<Segment> = segment_entries
let segments: Vec<Segment<Codec>> = segment_entries
.iter()
.map(|segment_entry| index.segment(segment_entry.meta().clone()))
.collect();
// An IndexMerger is like a "view" of our merged segments.
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
let merger: IndexMerger<Codec> = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the segments.
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
@@ -139,10 +142,10 @@ fn merge(
/// meant to work if you have an `IndexWriter` running for the origin indices, or
/// the destination `Index`.
#[doc(hidden)]
pub fn merge_indices<T: Into<Box<dyn Directory>>>(
indices: &[Index],
output_directory: T,
) -> crate::Result<Index> {
pub fn merge_indices<Codec: crate::codec::Codec>(
indices: &[Index<Codec>],
output_directory: Box<dyn Directory>,
) -> crate::Result<Index<Codec>> {
if indices.is_empty() {
// If there are no indices to merge, there is no need to do anything.
return Err(crate::TantivyError::InvalidArgument(
@@ -163,7 +166,7 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
));
}
let mut segments: Vec<Segment> = Vec::new();
let mut segments: Vec<Segment<Codec>> = Vec::new();
for index in indices {
segments.extend(index.searchable_segments()?);
}
@@ -185,12 +188,12 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
/// meant to work if you have an `IndexWriter` running for the origin indices, or
/// the destination `Index`.
#[doc(hidden)]
pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
segments: &[Segment],
pub fn merge_filtered_segments<C: crate::codec::Codec, T: Into<Box<dyn Directory>>>(
segments: &[Segment<C>],
target_settings: IndexSettings,
filter_doc_ids: Vec<Option<AliveBitSet>>,
output_directory: T,
) -> crate::Result<Index> {
) -> crate::Result<Index<C>> {
if segments.is_empty() {
// If there are no indices to merge, there is no need to do anything.
return Err(crate::TantivyError::InvalidArgument(
@@ -211,14 +214,15 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
));
}
let mut merged_index = Index::create(
output_directory,
target_schema.clone(),
target_settings.clone(),
)?;
let mut merged_index: Index<C> = Index::builder()
.schema(target_schema.clone())
.codec(segments[0].index().codec().clone())
.settings(target_settings.clone())
.create(output_directory.into())?;
let merged_segment = merged_index.new_segment();
let merged_segment_id = merged_segment.id();
let merger: IndexMerger =
let merger: IndexMerger<C> =
IndexMerger::open_with_custom_alive_set(merged_index.schema(), segments, filter_doc_ids)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
let num_docs = merger.write(segment_serializer)?;
@@ -235,6 +239,7 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
))
.trim_end()
);
let codec_configuration = CodecConfiguration::from(segments[0].index().codec());
let index_meta = IndexMeta {
index_settings: target_settings, // index_settings of all segments should be the same
@@ -242,6 +247,7 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
schema: target_schema,
opstamp: 0u64,
payload: Some(stats),
codec: codec_configuration,
};
// save the meta.json
@@ -250,7 +256,7 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
Ok(merged_index)
}
pub(crate) struct InnerSegmentUpdater {
pub(crate) struct InnerSegmentUpdater<C: Codec> {
// we keep a copy of the current active IndexMeta to
// avoid loading the file every time we need it in the
// `SegmentUpdater`.
@@ -261,7 +267,7 @@ pub(crate) struct InnerSegmentUpdater {
pool: ThreadPool,
merge_thread_pool: ThreadPool,
index: Index,
index: Index<C>,
segment_manager: SegmentManager,
merge_policy: RwLock<Arc<dyn MergePolicy>>,
killed: AtomicBool,
@@ -269,13 +275,13 @@ pub(crate) struct InnerSegmentUpdater {
merge_operations: MergeOperationInventory,
}
impl SegmentUpdater {
impl<Codec: crate::codec::Codec> SegmentUpdater<Codec> {
pub fn create(
index: Index,
index: Index<Codec>,
stamper: Stamper,
delete_cursor: &DeleteCursor,
num_merge_threads: usize,
) -> crate::Result<SegmentUpdater> {
) -> crate::Result<Self> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
let pool = ThreadPoolBuilder::new()
@@ -404,12 +410,14 @@ impl SegmentUpdater {
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
committed_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let codec = CodecConfiguration::from(index.codec());
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: committed_segment_metas,
schema: index.schema(),
opstamp,
payload: commit_message,
codec,
};
// TODO add context to the error.
save_metas(&index_meta, directory.box_clone().borrow_mut())?;
@@ -443,7 +451,7 @@ impl SegmentUpdater {
opstamp: Opstamp,
payload: Option<String>,
) -> FutureResult<Opstamp> {
let segment_updater: SegmentUpdater = self.clone();
let segment_updater: SegmentUpdater<Codec> = self.clone();
self.schedule_task(move || {
let segment_entries = segment_updater.purge_deletes(opstamp)?;
segment_updater.segment_manager.commit(segment_entries);
@@ -702,6 +710,7 @@ impl SegmentUpdater {
#[cfg(test)]
mod tests {
use super::merge_indices;
use crate::codec::StandardCodec;
use crate::collector::TopDocs;
use crate::directory::RamDirectory;
use crate::fastfield::AliveBitSet;
@@ -915,7 +924,7 @@ mod tests {
#[test]
fn test_merge_empty_indices_array() {
let merge_result = merge_indices(&[], RamDirectory::default());
let merge_result = merge_indices::<StandardCodec>(&[], Box::new(RamDirectory::default()));
assert!(merge_result.is_err());
}
@@ -942,7 +951,10 @@ mod tests {
};
// mismatched schema index list
let result = merge_indices(&[first_index, second_index], RamDirectory::default());
let result = merge_indices(
&[first_index, second_index],
Box::new(RamDirectory::default()),
);
assert!(result.is_err());
Ok(())

View File

@@ -4,6 +4,7 @@ use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
use super::operation::AddOperation;
use crate::codec::Codec;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::index::{Segment, SegmentComponent};
@@ -12,7 +13,7 @@ use crate::indexer::segment_serializer::SegmentSerializer;
use crate::json_utils::{index_json_value, IndexingPositionsPerPath};
use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
PerFieldPostingsWriter, PostingsWriter, PostingsWriterEnum,
};
use crate::schema::document::{Document, Value};
use crate::schema::{FieldEntry, FieldType, Schema, DATE_TIME_PRECISION_INDEXED};
@@ -45,11 +46,11 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
///
/// They creates the postings list in anonymous memory.
/// The segment is laid on disk when the segment gets `finalized`.
pub struct SegmentWriter {
pub struct SegmentWriter<Codec: crate::codec::Codec> {
pub(crate) max_doc: DocId,
pub(crate) ctx: IndexingContext,
pub(crate) per_field_postings_writers: PerFieldPostingsWriter,
pub(crate) segment_serializer: SegmentSerializer,
pub(crate) segment_serializer: SegmentSerializer<Codec>,
pub(crate) fast_field_writers: FastFieldsWriter,
pub(crate) fieldnorms_writer: FieldNormsWriter,
pub(crate) json_path_writer: JsonPathWriter,
@@ -60,7 +61,7 @@ pub struct SegmentWriter {
schema: Schema,
}
impl SegmentWriter {
impl<Codec: crate::codec::Codec> SegmentWriter<Codec> {
/// Creates a new `SegmentWriter`
///
/// The arguments are defined as follows
@@ -70,7 +71,10 @@ impl SegmentWriter {
/// behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment<Codec>,
) -> crate::Result<Self> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
@@ -169,7 +173,7 @@ impl SegmentWriter {
}
let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
let postings_writer: &mut dyn PostingsWriter =
let postings_writer: &mut PostingsWriterEnum =
self.per_field_postings_writers.get_for_field_mut(field);
term_buffer.clear_with_field(field);
@@ -386,13 +390,13 @@ impl SegmentWriter {
/// to the `SegmentSerializer`.
///
/// `doc_id_map` is used to map to the new doc_id order.
fn remap_and_write(
fn remap_and_write<C: Codec>(
schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter,
ctx: IndexingContext,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer,
mut serializer: SegmentSerializer<C>,
) -> crate::Result<()> {
debug!("remap-and-write");
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
@@ -421,10 +425,9 @@ fn remap_and_write(
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use std::path::Path;
use columnar::ColumnType;
use tempfile::TempDir;
use crate::collector::{Count, TopDocs};
use crate::directory::RamDirectory;
@@ -1067,10 +1070,7 @@ mod tests {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
Index::create_in_dir(&tempdir_path, schema).unwrap();
let index = Index::open_in_dir(tempdir_path).unwrap();
let index = Index::create_in_ram(schema);
let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap();

View File

@@ -1,5 +1,7 @@
use std::marker::PhantomData;
use crate::codec::StandardCodec;
use crate::index::CodecConfiguration;
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
@@ -7,22 +9,25 @@ use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
segment_writer: SegmentWriter,
segment: Segment,
pub struct SingleSegmentIndexWriter<
Codec: crate::codec::Codec = StandardCodec,
D: Document = TantivyDocument,
> {
segment_writer: SegmentWriter<Codec>,
segment: Segment<Codec>,
opstamp: Opstamp,
_phantom: PhantomData<D>,
_doc: PhantomData<D>,
}
impl<D: Document> SingleSegmentIndexWriter<D> {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
impl<Codec: crate::codec::Codec, D: Document> SingleSegmentIndexWriter<Codec, D> {
pub fn new(index: Index<Codec>, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
Ok(Self {
segment_writer,
segment,
opstamp: 0,
_phantom: PhantomData,
_doc: PhantomData,
})
}
@@ -37,10 +42,10 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
.add_document(AddOperation { opstamp, document })
}
pub fn finalize(self) -> crate::Result<Index> {
pub fn finalize(self) -> crate::Result<Index<Codec>> {
let max_doc = self.segment_writer.max_doc();
self.segment_writer.finalize()?;
let segment: Segment = self.segment.with_max_doc(max_doc);
let segment: Segment<Codec> = self.segment.with_max_doc(max_doc);
let index = segment.index();
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
@@ -48,6 +53,7 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
schema: index.schema(),
opstamp: 0,
payload: None,
codec: CodecConfiguration::from(index.codec()),
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;

View File

@@ -17,6 +17,7 @@
//!
//! ```rust
//! # use std::path::Path;
//! # use std::fs;
//! # use tempfile::TempDir;
//! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser;
@@ -27,8 +28,11 @@
//! # // Let's create a temporary directory for the
//! # // sake of this example
//! # if let Ok(dir) = TempDir::new() {
//! # run_example(dir.path()).unwrap();
//! # dir.close().unwrap();
//! # let index_path = dir.path().join("index");
//! # // In case the directory already exists, we remove it
//! # let _ = fs::remove_dir_all(&index_path);
//! # fs::create_dir_all(&index_path).unwrap();
//! # run_example(&index_path).unwrap();
//! # }
//! # }
//! #
@@ -162,6 +166,9 @@ mod functional_test;
#[macro_use]
mod macros;
/// Tantivy codecs describes how data is layed out on disk.
pub mod codec;
mod future_result;
// Re-exports
@@ -203,6 +210,7 @@ mod docset;
mod reader;
#[cfg(test)]
#[cfg(feature = "mmap")]
mod compat_tests;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
@@ -1170,12 +1178,11 @@ pub mod tests {
#[test]
fn test_validate_checksum() -> crate::Result<()> {
let index_path = tempfile::tempdir().expect("dir");
let mut builder = Schema::builder();
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer: IndexWriter = index.writer(50_000_000)?;
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests()?;
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?;

View File

@@ -3,6 +3,7 @@ use std::io;
use common::json_path_writer::JSON_END_OF_PATH;
use stacker::Addr;
use crate::codec::Codec;
use crate::indexer::indexing_term::IndexingTerm;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter;
@@ -22,12 +23,6 @@ pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
non_str_posting_writer: SpecializedPostingsWriter<DocIdRecorder>,
}
impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
fn from(json_postings_writer: JsonPostingsWriter<Rec>) -> Box<dyn PostingsWriter> {
Box::new(json_postings_writer)
}
}
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
#[inline]
fn subscribe(
@@ -58,12 +53,12 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
}
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
fn serialize<C: Codec>(
&self,
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
let mut term_buffer = JsonTermSerializer(Vec::with_capacity(48));
let mut buffer_lender = BufferLender::default();

View File

@@ -1,5 +1,5 @@
use crate::docset::{DocSet, TERMINATED};
use crate::postings::{Postings, SegmentPostings};
use crate::postings::{DocFreq, Postings};
use crate::DocId;
/// `LoadedPostings` is a `DocSet` and `Postings` implementation.
@@ -25,16 +25,16 @@ impl LoadedPostings {
/// Creates a new `LoadedPostings` from a `SegmentPostings`.
///
/// It will also preload positions, if positions are available in the SegmentPostings.
pub fn load(segment_postings: &mut SegmentPostings) -> LoadedPostings {
let num_docs = segment_postings.doc_freq() as usize;
pub fn load(postings: &mut Box<dyn Postings>) -> LoadedPostings {
let num_docs: usize = u32::from(postings.doc_freq()) as usize;
let mut doc_ids = Vec::with_capacity(num_docs);
let mut positions = Vec::with_capacity(num_docs);
let mut position_offsets = Vec::with_capacity(num_docs);
while segment_postings.doc() != TERMINATED {
while postings.doc() != TERMINATED {
position_offsets.push(positions.len() as u32);
doc_ids.push(segment_postings.doc());
segment_postings.append_positions_with_offset(0, &mut positions);
segment_postings.advance();
doc_ids.push(postings.doc());
postings.append_positions_with_offset(0, &mut positions);
postings.advance();
}
position_offsets.push(positions.len() as u32);
LoadedPostings {
@@ -101,6 +101,14 @@ impl Postings for LoadedPostings {
output.push(*pos + offset);
}
}
fn has_freq(&self) -> bool {
true
}
fn doc_freq(&self) -> DocFreq {
DocFreq::Exact(self.doc_ids.len() as u32)
}
}
#[cfg(test)]

View File

@@ -4,7 +4,6 @@ mod block_search;
pub(crate) use self::block_search::branchless_binary_search;
mod block_segment_postings;
pub(crate) mod compression;
mod indexing_context;
mod json_postings_writer;
@@ -13,32 +12,22 @@ mod per_field_postings_writer;
mod postings;
mod postings_writer;
mod recorder;
mod segment_postings;
mod serializer;
mod skip;
mod term_info;
pub(crate) use loaded_postings::LoadedPostings;
pub use postings::DocFreq;
pub(crate) use stacker::compute_table_memory_size;
pub use self::block_segment_postings::BlockSegmentPostings;
pub(crate) use self::indexing_context::IndexingContext;
pub(crate) use self::per_field_postings_writer::PerFieldPostingsWriter;
pub use self::postings::Postings;
pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, PostingsWriter};
pub use self::segment_postings::SegmentPostings;
pub(crate) use self::postings_writer::{
serialize_postings, IndexingPosition, PostingsWriter, PostingsWriterEnum,
};
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub(crate) use self::skip::{BlockInfo, SkipReader};
pub use self::term_info::TermInfo;
#[expect(clippy::enum_variant_names)]
#[derive(Debug, PartialEq, Clone, Copy, Eq)]
pub(crate) enum FreqReadingOption {
NoFreq,
SkipFreq,
ReadFreq,
}
#[cfg(test)]
pub(crate) mod tests {
use std::mem;
@@ -49,6 +38,7 @@ pub(crate) mod tests {
use crate::index::{Index, SegmentComponent, SegmentReader};
use crate::indexer::operation::AddOperation;
use crate::indexer::SegmentWriter;
use crate::postings::DocFreq;
use crate::query::Scorer;
use crate::schema::{
Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT,
@@ -279,11 +269,11 @@ pub(crate) mod tests {
}
{
let term_a = Term::from_field_text(text_field, "a");
let mut postings_a = segment_reader
let mut postings_a: Box<dyn Postings> = segment_reader
.inverted_index(term_a.field())?
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings_a.len(), 1000);
assert_eq!(postings_a.doc_freq(), DocFreq::Exact(1000));
assert_eq!(postings_a.doc(), 0);
assert_eq!(postings_a.term_freq(), 6);
postings_a.positions(&mut positions);
@@ -306,7 +296,7 @@ pub(crate) mod tests {
.inverted_index(term_e.field())?
.read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
assert_eq!(postings_e.doc_freq(), DocFreq::Exact(1000 - 2));
for i in 2u32..1000u32 {
assert_eq!(postings_e.term_freq(), i);
postings_e.positions(&mut positions);
@@ -527,6 +517,7 @@ pub(crate) mod tests {
}
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
#[inline]
fn score(&mut self) -> Score {
self.0.score()
}

View File

@@ -1,16 +1,15 @@
use crate::postings::json_postings_writer::JsonPostingsWriter;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::postings_writer::{PostingsWriterEnum, SpecializedPostingsWriter};
use crate::postings::recorder::{DocIdRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
use crate::postings::PostingsWriter;
use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
pub(crate) struct PerFieldPostingsWriter {
per_field_postings_writers: Vec<Box<dyn PostingsWriter>>,
per_field_postings_writers: Vec<PostingsWriterEnum>,
}
impl PerFieldPostingsWriter {
pub fn for_schema(schema: &Schema) -> Self {
let per_field_postings_writers = schema
let per_field_postings_writers: Vec<PostingsWriterEnum> = schema
.fields()
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry))
.collect();
@@ -19,16 +18,16 @@ impl PerFieldPostingsWriter {
}
}
pub(crate) fn get_for_field(&self, field: Field) -> &dyn PostingsWriter {
self.per_field_postings_writers[field.field_id() as usize].as_ref()
pub(crate) fn get_for_field(&self, field: Field) -> &PostingsWriterEnum {
&self.per_field_postings_writers[field.field_id() as usize]
}
pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter {
self.per_field_postings_writers[field.field_id() as usize].as_mut()
pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut PostingsWriterEnum {
&mut self.per_field_postings_writers[field.field_id() as usize]
}
}
fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> PostingsWriterEnum {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()
@@ -51,7 +50,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn Postings
| FieldType::Date(_)
| FieldType::Bytes(_)
| FieldType::IpAddr(_)
| FieldType::Facet(_) => Box::<SpecializedPostingsWriter<DocIdRecorder>>::default(),
| FieldType::Facet(_) => <SpecializedPostingsWriter<DocIdRecorder>>::default().into(),
FieldType::JsonObject(ref json_object_options) => {
if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() {
match text_indexing_option.index_option() {

View File

@@ -1,5 +1,25 @@
use crate::docset::DocSet;
/// Result of the doc_freq method.
///
/// Postings can inform us that the document frequency is approximate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocFreq {
/// The document frequency is approximate.
Approximate(u32),
/// The document frequency is exact.
Exact(u32),
}
impl From<DocFreq> for u32 {
fn from(doc_freq: DocFreq) -> Self {
match doc_freq {
DocFreq::Approximate(approximate_doc_freq) => approximate_doc_freq,
DocFreq::Exact(doc_freq) => doc_freq,
}
}
}
/// Postings (also called inverted list)
///
/// For a given term, it is the list of doc ids of the doc
@@ -14,6 +34,9 @@ pub trait Postings: DocSet + 'static {
/// The number of times the term appears in the document.
fn term_freq(&self) -> u32;
/// Returns the number of documents containing the term in the segment.
fn doc_freq(&self) -> DocFreq;
/// Returns the positions offsetted with a given value.
/// It is not necessary to clear the `output` before calling this method.
/// The output vector will be resized to the `term_freq`.
@@ -31,6 +54,16 @@ pub trait Postings: DocSet + 'static {
fn positions(&mut self, output: &mut Vec<u32>) {
self.positions_with_offset(0u32, output);
}
/// Returns true if the term_frequency is available.
///
/// This is a tricky question, because on JSON fields, it is possible
/// for a text term to have term freq, whereas a number term in the field has none.
///
/// This function returns whether the actual term has term frequencies or not.
/// In this above JSON field example, `has_freq` should return true for the
/// earlier and false for the latter.
fn has_freq(&self) -> bool;
}
impl Postings for Box<dyn Postings> {
@@ -41,4 +74,12 @@ impl Postings for Box<dyn Postings> {
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
(**self).append_positions_with_offset(offset, output);
}
fn has_freq(&self) -> bool {
(**self).has_freq()
}
fn doc_freq(&self) -> DocFreq {
(**self).doc_freq()
}
}

View File

@@ -4,10 +4,14 @@ use std::ops::Range;
use stacker::Addr;
use crate::codec::Codec;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::indexing_term::IndexingTerm;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::recorder::{BufferLender, Recorder};
use crate::postings::json_postings_writer::JsonPostingsWriter;
use crate::postings::recorder::{
BufferLender, DocIdRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder,
};
use crate::postings::{
FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
};
@@ -45,12 +49,12 @@ fn make_field_partition(
/// Serialize the inverted index.
/// It pushes all term, one field at a time, towards the
/// postings serializer.
pub(crate) fn serialize_postings(
pub(crate) fn serialize_postings<C: Codec>(
ctx: IndexingContext,
schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders,
serializer: &mut InvertedIndexSerializer,
serializer: &mut InvertedIndexSerializer<C>,
) -> crate::Result<()> {
// Replace unordered ids by ordered ids to be able to sort
let unordered_id_to_ordered_id: Vec<OrderedPathId> =
@@ -100,6 +104,141 @@ pub(crate) struct IndexingPosition {
pub end_position: u32,
}
pub enum PostingsWriterEnum {
DocId(SpecializedPostingsWriter<DocIdRecorder>),
DocIdTf(SpecializedPostingsWriter<TermFrequencyRecorder>),
DocTfAndPosition(SpecializedPostingsWriter<TfAndPositionRecorder>),
JsonDocId(JsonPostingsWriter<DocIdRecorder>),
JsonDocIdTf(JsonPostingsWriter<TermFrequencyRecorder>),
JsonDocTfAndPosition(JsonPostingsWriter<TfAndPositionRecorder>),
}
impl From<SpecializedPostingsWriter<DocIdRecorder>> for PostingsWriterEnum {
fn from(doc_id_recorder_writer: SpecializedPostingsWriter<DocIdRecorder>) -> Self {
PostingsWriterEnum::DocId(doc_id_recorder_writer)
}
}
impl From<SpecializedPostingsWriter<TermFrequencyRecorder>> for PostingsWriterEnum {
fn from(doc_id_tf_recorder_writer: SpecializedPostingsWriter<TermFrequencyRecorder>) -> Self {
PostingsWriterEnum::DocIdTf(doc_id_tf_recorder_writer)
}
}
impl From<SpecializedPostingsWriter<TfAndPositionRecorder>> for PostingsWriterEnum {
fn from(
doc_id_tf_and_positions_recorder_writer: SpecializedPostingsWriter<TfAndPositionRecorder>,
) -> Self {
PostingsWriterEnum::DocTfAndPosition(doc_id_tf_and_positions_recorder_writer)
}
}
impl From<JsonPostingsWriter<DocIdRecorder>> for PostingsWriterEnum {
fn from(doc_id_recorder_writer: JsonPostingsWriter<DocIdRecorder>) -> Self {
PostingsWriterEnum::JsonDocId(doc_id_recorder_writer)
}
}
impl From<JsonPostingsWriter<TermFrequencyRecorder>> for PostingsWriterEnum {
fn from(doc_id_tf_recorder_writer: JsonPostingsWriter<TermFrequencyRecorder>) -> Self {
PostingsWriterEnum::JsonDocIdTf(doc_id_tf_recorder_writer)
}
}
impl From<JsonPostingsWriter<TfAndPositionRecorder>> for PostingsWriterEnum {
fn from(
doc_id_tf_and_positions_recorder_writer: JsonPostingsWriter<TfAndPositionRecorder>,
) -> Self {
PostingsWriterEnum::JsonDocTfAndPosition(doc_id_tf_and_positions_recorder_writer)
}
}
impl PostingsWriter for PostingsWriterEnum {
fn subscribe(&mut self, doc: DocId, pos: u32, term: &IndexingTerm, ctx: &mut IndexingContext) {
match self {
PostingsWriterEnum::DocId(writer) => writer.subscribe(doc, pos, term, ctx),
PostingsWriterEnum::DocIdTf(writer) => writer.subscribe(doc, pos, term, ctx),
PostingsWriterEnum::DocTfAndPosition(writer) => writer.subscribe(doc, pos, term, ctx),
PostingsWriterEnum::JsonDocId(writer) => writer.subscribe(doc, pos, term, ctx),
PostingsWriterEnum::JsonDocIdTf(writer) => writer.subscribe(doc, pos, term, ctx),
PostingsWriterEnum::JsonDocTfAndPosition(writer) => {
writer.subscribe(doc, pos, term, ctx)
}
}
}
fn serialize<C: Codec>(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
match self {
PostingsWriterEnum::DocId(writer) => {
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
}
PostingsWriterEnum::DocIdTf(writer) => {
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
}
PostingsWriterEnum::DocTfAndPosition(writer) => {
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
}
PostingsWriterEnum::JsonDocId(writer) => {
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
}
PostingsWriterEnum::JsonDocIdTf(writer) => {
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
}
PostingsWriterEnum::JsonDocTfAndPosition(writer) => {
writer.serialize(term_addrs, ordered_id_to_path, ctx, serializer)
}
}
}
/// Tokenize a text and subscribe all of its token.
fn index_text(
&mut self,
doc_id: DocId,
token_stream: &mut dyn TokenStream,
term_buffer: &mut IndexingTerm,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
) {
match self {
PostingsWriterEnum::DocId(writer) => {
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
}
PostingsWriterEnum::DocIdTf(writer) => {
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
}
PostingsWriterEnum::DocTfAndPosition(writer) => {
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
}
PostingsWriterEnum::JsonDocId(writer) => {
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
}
PostingsWriterEnum::JsonDocIdTf(writer) => {
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
}
PostingsWriterEnum::JsonDocTfAndPosition(writer) => {
writer.index_text(doc_id, token_stream, term_buffer, ctx, indexing_position)
}
}
}
fn total_num_tokens(&self) -> u64 {
match self {
PostingsWriterEnum::DocId(writer) => writer.total_num_tokens(),
PostingsWriterEnum::DocIdTf(writer) => writer.total_num_tokens(),
PostingsWriterEnum::DocTfAndPosition(writer) => writer.total_num_tokens(),
PostingsWriterEnum::JsonDocId(writer) => writer.total_num_tokens(),
PostingsWriterEnum::JsonDocIdTf(writer) => writer.total_num_tokens(),
PostingsWriterEnum::JsonDocTfAndPosition(writer) => writer.total_num_tokens(),
}
}
}
/// The `PostingsWriter` is in charge of receiving documenting
/// and building a `Segment` in anonymous memory.
///
@@ -116,12 +255,12 @@ pub(crate) trait PostingsWriter: Send + Sync {
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
fn serialize<C: Codec>(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()>;
/// Tokenize a text and subscribe all of its token.
@@ -171,22 +310,14 @@ pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
_recorder_type: PhantomData<Rec>,
}
impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
fn from(
specialized_postings_writer: SpecializedPostingsWriter<Rec>,
) -> Box<dyn PostingsWriter> {
Box::new(specialized_postings_writer)
}
}
impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
#[inline]
pub(crate) fn serialize_one_term(
pub(crate) fn serialize_one_term<C: Codec>(
term: &[u8],
addr: Addr,
buffer_lender: &mut BufferLender,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
let recorder: Rec = ctx.term_index.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
@@ -227,12 +358,12 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
});
}
fn serialize(
fn serialize<C: Codec>(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
_ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (_field, _path_id, term, addr) in term_addrs {

View File

@@ -1,6 +1,7 @@
use common::read_u32_vint;
use stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::codec::Codec;
use crate::postings::FieldSerializer;
use crate::DocId;
@@ -67,10 +68,10 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, arena: &mut MemoryArena);
/// Pushes the postings information to the serializer.
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer<'_>,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
);
/// Returns the number of document containing this term.
@@ -110,10 +111,10 @@ impl Recorder for DocIdRecorder {
#[inline]
fn close_doc(&mut self, _arena: &mut MemoryArena) {}
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer<'_>,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
) {
let buffer = buffer_lender.lend_u8();
@@ -178,10 +179,10 @@ impl Recorder for TermFrequencyRecorder {
self.current_tf = 0;
}
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer<'_>,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
) {
let buffer = buffer_lender.lend_u8();
@@ -235,10 +236,10 @@ impl Recorder for TfAndPositionRecorder {
self.stack.writer(arena).write_u32_vint(POSITION_END);
}
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer<'_>,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
) {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();

View File

@@ -1,16 +1,14 @@
use std::cmp::Ordering;
use std::io::{self, Write};
use common::{BinarySerializable, CountingWriter, VInt};
use common::{BinarySerializable, CountingWriter};
use super::TermInfo;
use crate::codec::postings::PostingsSerializer;
use crate::codec::Codec;
use crate::directory::{CompositeWrite, WritePtr};
use crate::fieldnorm::FieldNormReader;
use crate::index::Segment;
use crate::positions::PositionSerializer;
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::skip::SkipSerializer;
use crate::query::Bm25Weight;
use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
use crate::termdict::TermDictionaryBuilder;
use crate::{DocId, Score};
@@ -46,22 +44,27 @@ use crate::{DocId, Score};
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct InvertedIndexSerializer {
pub struct InvertedIndexSerializer<C: Codec> {
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
codec: C,
}
impl InvertedIndexSerializer {
use crate::codec::postings::PostingsCodec;
impl<C: Codec> InvertedIndexSerializer<C> {
/// Open a new `InvertedIndexSerializer` for the given segment
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
pub fn open(segment: &mut Segment<C>) -> crate::Result<InvertedIndexSerializer<C>> {
use crate::index::SegmentComponent::{Positions, Postings, Terms};
let codec = segment.index().codec().clone();
let inv_index_serializer = InvertedIndexSerializer {
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
postings_write: CompositeWrite::wrap(segment.open_write(Postings)?),
positions_write: CompositeWrite::wrap(segment.open_write(Positions)?),
schema: segment.schema(),
codec,
};
Ok(inv_index_serializer)
}
@@ -75,7 +78,7 @@ impl InvertedIndexSerializer {
field: Field,
total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> {
) -> io::Result<FieldSerializer<'_, C>> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -88,6 +91,7 @@ impl InvertedIndexSerializer {
postings_write,
positions_write,
fieldnorm_reader,
&self.codec,
)
}
@@ -102,15 +106,17 @@ impl InvertedIndexSerializer {
/// The field serializer is in charge of
/// the serialization of a specific field.
pub struct FieldSerializer<'a> {
pub struct FieldSerializer<'a, C: Codec> {
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
postings_serializer: <C::PostingsCodec as PostingsCodec>::PostingsSerializer,
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
current_term_info: TermInfo,
term_open: bool,
postings_write: &'a mut CountingWriter<WritePtr>,
postings_start_offset: u64,
}
impl<'a> FieldSerializer<'a> {
impl<'a, C: Codec> FieldSerializer<'a, C> {
fn create(
field_type: &FieldType,
total_num_tokens: u64,
@@ -118,7 +124,8 @@ impl<'a> FieldSerializer<'a> {
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'a>> {
codec: &C,
) -> io::Result<FieldSerializer<'a, C>> {
total_num_tokens.serialize(postings_write)?;
let index_record_option = field_type
.index_record_option()
@@ -128,8 +135,7 @@ impl<'a> FieldSerializer<'a> {
.as_ref()
.map(|ff_reader| total_num_tokens as Score / ff_reader.num_docs() as Score)
.unwrap_or(0.0);
let postings_serializer = PostingsSerializer::new(
postings_write,
let postings_serializer = codec.postings_codec().new_serializer(
average_fieldnorm,
index_record_option,
fieldnorm_reader,
@@ -140,15 +146,22 @@ impl<'a> FieldSerializer<'a> {
None
};
let postings_start_offset = postings_write.written_bytes();
Ok(FieldSerializer {
term_dictionary_builder,
postings_serializer,
positions_serializer_opt,
current_term_info: TermInfo::default(),
term_open: false,
postings_write,
postings_start_offset,
})
}
fn postings_offset(&self) -> usize {
(self.postings_write.written_bytes() - self.postings_start_offset) as usize
}
fn current_term_info(&self) -> TermInfo {
let positions_start =
if let Some(positions_serializer) = self.positions_serializer_opt.as_ref() {
@@ -156,7 +169,7 @@ impl<'a> FieldSerializer<'a> {
} else {
0u64
} as usize;
let addr = self.postings_serializer.written_bytes() as usize;
let addr = self.postings_offset();
TermInfo {
doc_freq: 0,
postings_range: addr..addr,
@@ -179,7 +192,6 @@ impl<'a> FieldSerializer<'a> {
"Called new_term, while the previous term was not closed."
);
self.term_open = true;
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)?;
self.postings_serializer
@@ -213,21 +225,22 @@ impl<'a> FieldSerializer<'a> {
crate::fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{msg:?}")))
});
if self.term_open {
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
self.current_term_info.postings_range.end =
self.postings_serializer.written_bytes() as usize;
if let Some(positions_serializer) = self.positions_serializer_opt.as_mut() {
positions_serializer.close_term()?;
self.current_term_info.positions_range.end =
positions_serializer.written_bytes() as usize;
}
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.term_open = false;
if !self.term_open {
return Ok(());
};
self.postings_serializer
.close_term(self.current_term_info.doc_freq, self.postings_write)?;
self.current_term_info.postings_range.end = self.postings_offset();
if let Some(positions_serializer) = self.positions_serializer_opt.as_mut() {
positions_serializer.close_term()?;
self.current_term_info.positions_range.end =
positions_serializer.written_bytes() as usize;
}
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.term_open = false;
Ok(())
}
@@ -237,242 +250,8 @@ impl<'a> FieldSerializer<'a> {
if let Some(positions_serializer) = self.positions_serializer_opt {
positions_serializer.close()?;
}
self.postings_serializer.close()?;
self.postings_write.flush()?;
self.term_dictionary_builder.finish()?;
Ok(())
}
}
struct Block {
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
len: usize,
}
impl Block {
fn new() -> Self {
Block {
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
len: 0,
}
}
fn doc_ids(&self) -> &[DocId] {
&self.doc_ids[..self.len]
}
fn term_freqs(&self) -> &[u32] {
&self.term_freqs[..self.len]
}
fn clear(&mut self) {
self.len = 0;
}
fn append_doc(&mut self, doc: DocId, term_freq: u32) {
let len = self.len;
self.doc_ids[len] = doc;
self.term_freqs[len] = term_freq;
self.len = len + 1;
}
fn is_full(&self) -> bool {
self.len == COMPRESSION_BLOCK_SIZE
}
fn is_empty(&self) -> bool {
self.len == 0
}
fn last_doc(&self) -> DocId {
assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
}
}
pub struct PostingsSerializer<W: Write> {
output_write: CountingWriter<W>,
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
block: Box<Block>,
postings_write: Vec<u8>,
skip_write: SkipSerializer,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<Bm25Weight>,
avg_fieldnorm: Score, /* Average number of term in the field for that segment.
* this value is used to compute the block wand information. */
term_has_freq: bool,
}
impl<W: Write> PostingsSerializer<W> {
pub fn new(
write: W,
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer<W> {
PostingsSerializer {
output_write: CountingWriter::wrap(write),
block_encoder: BlockEncoder::new(),
block: Box::new(Block::new()),
postings_write: Vec::new(),
skip_write: SkipSerializer::new(),
last_doc_id_encoded: 0u32,
mode,
fieldnorm_reader,
bm25_weight: None,
avg_fieldnorm,
term_has_freq: false,
}
}
pub fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool) {
self.bm25_weight = None;
self.term_has_freq = self.mode.has_freq() && record_term_freq;
if !self.term_has_freq {
return;
}
let num_docs_in_segment: u64 =
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
fieldnorm_reader.num_docs() as u64
} else {
return;
};
if num_docs_in_segment == 0 {
return;
}
self.bm25_weight = Some(Bm25Weight::for_one_term_without_explain(
term_doc_freq as u64,
num_docs_in_segment,
self.avg_fieldnorm,
));
}
fn write_block(&mut self) {
{
// encode the doc ids
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write
.write_doc(self.last_doc_id_encoded, num_bits);
// last el block 0, offset block 1,
self.postings_write.extend(block_encoded);
}
if self.term_has_freq {
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(self.block.term_freqs(), true);
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.mode.has_positions() {
// We serialize the sum of term freqs within the skip information
// in order to navigate through positions.
let sum_freq = self.block.term_freqs().iter().cloned().sum();
self.skip_write.write_total_term_freq(sum_freq);
}
let mut blockwand_params = (0u8, 0u32);
if let Some(bm25_weight) = self.bm25_weight.as_ref() {
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
let docs = self.block.doc_ids().iter().cloned();
let term_freqs = self.block.term_freqs().iter().cloned();
let fieldnorms = docs.map(|doc| fieldnorm_reader.fieldnorm_id(doc));
blockwand_params = fieldnorms
.zip(term_freqs)
.max_by(
|(left_fieldnorm_id, left_term_freq),
(right_fieldnorm_id, right_term_freq)| {
let left_score =
bm25_weight.tf_factor(*left_fieldnorm_id, *left_term_freq);
let right_score =
bm25_weight.tf_factor(*right_fieldnorm_id, *right_term_freq);
left_score
.partial_cmp(&right_score)
.unwrap_or(Ordering::Equal)
},
)
.unwrap();
}
}
let (fieldnorm_id, term_freq) = blockwand_params;
self.skip_write.write_blockwand_max(fieldnorm_id, term_freq);
}
self.block.clear();
}
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) {
self.block.append_doc(doc_id, term_freq);
if self.block.is_full() {
self.write_block();
}
}
fn close(mut self) -> io::Result<()> {
self.postings_write.flush()
}
pub fn close_term(&mut self, doc_freq: u32) -> io::Result<()> {
if !self.block.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self
.block_encoder
.compress_vint_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
if self.term_has_freq {
let block_encoded = self
.block_encoder
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
self.block.clear();
}
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?;
}
self.output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;
Ok(())
}
/// Returns the number of bytes written in the postings write object
/// at this point.
/// When called before writing the postings of a term, this value is used as
/// start offset.
/// When called after writing the postings of a term, this value is used as a
/// end offset.
fn written_bytes(&self) -> u64 {
self.output_write.written_bytes()
}
fn clear(&mut self) {
self.block.clear();
self.last_doc_id_encoded = 0;
}
}

View File

@@ -2,7 +2,7 @@ use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
use crate::index::SegmentReader;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, Score};
/// Query that matches all of the documents.
@@ -21,17 +21,12 @@ impl Query for AllQuery {
pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(
&self,
reader: &SegmentReader,
boost: Score,
_seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer::new(reader.max_doc());
if boost != 1.0 {
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
Ok(box_scorer(BoostScorer::new(all_scorer, boost)))
} else {
Ok(Box::new(all_scorer))
Ok(box_scorer(all_scorer))
}
}
@@ -110,6 +105,7 @@ impl DocSet for AllScorer {
}
impl Scorer for AllScorer {
#[inline]
fn score(&mut self) -> Score {
1.0
}
@@ -145,7 +141,7 @@ mod tests {
let weight = AllQuery.weight(EnableScoring::disabled_from_schema(&index.schema()))?;
{
let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader, 1.0, 0)?;
let mut scorer = weight.scorer(reader, 1.0)?;
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), 1u32);
assert_eq!(scorer.doc(), 1u32);
@@ -153,7 +149,7 @@ mod tests {
}
{
let reader = searcher.segment_reader(1);
let mut scorer = weight.scorer(reader, 1.0, 0)?;
let mut scorer = weight.scorer(reader, 1.0)?;
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), TERMINATED);
}
@@ -168,12 +164,12 @@ mod tests {
let weight = AllQuery.weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
let reader = searcher.segment_reader(0);
{
let mut scorer = weight.scorer(reader, 2.0, 0)?;
let mut scorer = weight.scorer(reader, 2.0)?;
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 2.0);
}
{
let mut scorer = weight.scorer(reader, 1.5, 0)?;
let mut scorer = weight.scorer(reader, 1.5)?;
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.5);
}

View File

@@ -10,7 +10,7 @@ use crate::postings::TermInfo;
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DocId, Score, TantivyError};
use crate::{DocId, DocSet, Score, TantivyError};
/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A> {
@@ -84,12 +84,7 @@ where
A: Automaton + Send + Sync + 'static,
A::State: Clone,
{
fn scorer(
&self,
reader: &SegmentReader,
boost: Score,
seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field)?;
@@ -97,22 +92,9 @@ where
let mut term_stream = self.automaton_stream(term_dict)?;
while term_stream.advance() {
let term_info = term_stream.value();
let (mut block_segment_postings, _) = inverted_index
.read_block_postings_from_terminfo_with_seek(
term_info,
IndexRecordOption::Basic,
seek_doc,
)?;
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
for &doc in docs {
doc_bitset.insert(doc);
}
block_segment_postings.advance();
}
let mut block_segment_postings =
inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
block_segment_postings.fill_bitset(&mut doc_bitset);
}
let doc_bitset = BitSetDocSet::from(doc_bitset);
let const_scorer = ConstScorer::new(doc_bitset, boost);
@@ -120,7 +102,7 @@ where
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0, 0)?;
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) == doc {
Ok(Explanation::new("AutomatonScorer", 1.0))
} else {
@@ -195,7 +177,7 @@ mod tests {
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader()?;
let searcher = reader.searcher();
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.0);
assert_eq!(scorer.advance(), 2u32);
@@ -212,7 +194,7 @@ mod tests {
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader()?;
let searcher = reader.searcher();
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.32, 0)?;
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.32)?;
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.32);
Ok(())

View File

@@ -24,6 +24,13 @@ impl BitSetDocSet {
self.cursor_bucket = bucket_addr;
self.cursor_tinybitset = self.docs.tinyset(bucket_addr);
}
/// Returns the number of documents in the bitset.
///
/// This call is not free: it will bitcount the number of bits in the bitset.
pub fn doc_freq(&self) -> u32 {
self.docs.len() as u32
}
}
impl From<BitSet> for BitSetDocSet {

View File

@@ -2,22 +2,15 @@ use std::collections::HashMap;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::index::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::disjunction::Disjunction;
use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::term_query::TermScorer;
use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
use crate::query::weight::{for_each_docset_buffered, for_each_scorer};
use crate::query::{
intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
RequiredOptionalScorer, Scorer, Weight,
box_scorer, intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude,
Explanation, Occur, RequiredOptionalScorer, Scorer, Weight,
};
use crate::{DocId, Score, TERMINATED};
enum SpecializedScorer {
TermUnion(Vec<TermScorer>),
Other(Box<dyn Scorer>),
}
use crate::{DocId, Score};
fn scorer_disjunction<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
@@ -32,7 +25,7 @@ where
if scorers.len() == 1 {
return scorers.into_iter().next().unwrap(); // Safe unwrap.
}
Box::new(Disjunction::new(
box_scorer(Disjunction::new(
scorers,
score_combiner,
minimum_match_required,
@@ -44,56 +37,18 @@ fn scorer_union<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner_fn: impl Fn() -> TScoreCombiner,
num_docs: u32,
) -> SpecializedScorer
) -> Box<dyn Scorer>
where
TScoreCombiner: ScoreCombiner,
{
assert!(!scorers.is_empty());
if scorers.len() == 1 {
return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand
}
{
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
if is_all_term_queries {
let scorers: Vec<TermScorer> = scorers
.into_iter()
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
.collect();
if scorers
.iter()
.all(|scorer| scorer.freq_reading_option() == FreqReadingOption::ReadFreq)
{
// Block wand is only available if we read frequencies.
return SpecializedScorer::TermUnion(scorers);
} else {
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
num_docs,
)));
}
}
}
SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
num_docs,
)))
}
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
scorer: SpecializedScorer,
score_combiner_fn: impl Fn() -> TScoreCombiner,
num_docs: u32,
) -> Box<dyn Scorer> {
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer =
BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
Box::new(union_scorer)
}
SpecializedScorer::Other(scorer) => scorer,
match scorers.len() {
0 => box_scorer(EmptyScorer),
1 => scorers.into_iter().next().unwrap(),
_ => box_scorer(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
num_docs,
)),
}
}
@@ -110,7 +65,7 @@ fn effective_must_scorer(
if must_scorers.is_empty() {
if removed_all_scorer_count > 0 {
// Had AllScorer(s) only - all docs match
Some(Box::new(AllScorer::new(max_doc)))
Some(box_scorer(AllScorer::new(max_doc)))
} else {
// No MUST constraint at all
None
@@ -128,54 +83,39 @@ fn effective_must_scorer(
/// When `scoring_enabled` is false, we can just return AllScorer alone since
/// we don't need score contributions from the should_scorer.
fn effective_should_scorer_for_union<TScoreCombiner: ScoreCombiner>(
should_scorer: SpecializedScorer,
should_scorer: Box<dyn Scorer>,
removed_all_scorer_count: usize,
max_doc: DocId,
num_docs: u32,
score_combiner_fn: impl Fn() -> TScoreCombiner,
scoring_enabled: bool,
) -> SpecializedScorer {
) -> Box<dyn Scorer> {
if removed_all_scorer_count > 0 {
if scoring_enabled {
// Need to union to get score contributions from both
let all_scorers: Vec<Box<dyn Scorer>> = vec![
into_box_scorer(should_scorer, &score_combiner_fn, num_docs),
Box::new(AllScorer::new(max_doc)),
];
SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
let all_scorers: Vec<Box<dyn Scorer>> =
vec![should_scorer, box_scorer(AllScorer::new(max_doc))];
box_scorer(BufferedUnionScorer::build(
all_scorers,
score_combiner_fn,
num_docs,
)))
))
} else {
// Scoring disabled - AllScorer alone is sufficient
SpecializedScorer::Other(Box::new(AllScorer::new(max_doc)))
box_scorer(AllScorer::new(max_doc))
}
} else {
should_scorer
}
}
fn create_scorer(
weight: &dyn Weight,
reader: &SegmentReader,
boost: Score,
target_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
if target_doc >= reader.max_doc() {
Ok(Box::new(EmptyScorer))
} else {
weight.scorer(reader, boost, target_doc)
}
}
enum ShouldScorersCombinationMethod {
// Should scorers are irrelevant.
Ignored,
// Only contributes to final score.
Optional(SpecializedScorer),
Optional(Box<dyn Scorer>),
// Regardless of score, the should scorers may impact whether a document is matching or not.
Required(SpecializedScorer),
Required(Box<dyn Scorer>),
}
/// Weight associated to the `BoolQuery`.
@@ -220,29 +160,10 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
&self,
reader: &SegmentReader,
boost: Score,
mut seek_first_doc: DocId,
) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
let (mut must_weights, other_weights): (Vec<(Occur, _)>, Vec<(Occur, _)>) = self
.weights
.iter()
.map(|(occur, weight)| (*occur, weight))
.partition(|(occur, _weight)| *occur == Occur::Must);
// We start by must weights in order to get the best "seek_first_doc" so that we
// can skip the first few documents of the other scorers.
must_weights.sort_by_key(|weight| weight.1.intersection_priority());
for (_, must_sub_weight) in must_weights {
let sub_scorer: Box<dyn Scorer> =
create_scorer(must_sub_weight.as_ref(), reader, boost, seek_first_doc)?;
seek_first_doc = seek_first_doc.max(sub_scorer.doc());
per_occur_scorers
.entry(Occur::Must)
.or_default()
.push(sub_scorer);
}
for (occur, sub_weight) in &other_weights {
let sub_scorer: Box<dyn Scorer> =
create_scorer(sub_weight.as_ref(), reader, boost, seek_first_doc)?;
for (occur, subweight) in &self.weights {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
per_occur_scorers
.entry(*occur)
.or_default()
@@ -256,10 +177,9 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
reader: &SegmentReader,
boost: Score,
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
seek_doc: u32,
) -> crate::Result<SpecializedScorer> {
) -> crate::Result<Box<dyn Scorer>> {
let num_docs = reader.num_docs();
let mut per_occur_scorers = self.per_occur_scorers(reader, boost, seek_doc)?;
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
// Indicate how should clauses are combined with must clauses.
let mut must_scorers: Vec<Box<dyn Scorer>> =
@@ -267,7 +187,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
let must_special_scorer_counts = remove_and_count_all_and_empty_scorers(&mut must_scorers);
if must_special_scorer_counts.num_empty_scorers > 0 {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
return Ok(box_scorer(EmptyScorer));
}
let mut should_scorers = per_occur_scorers.remove(&Occur::Should).unwrap_or_default();
@@ -282,7 +202,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
if exclude_special_scorer_counts.num_all_scorers > 0 {
// We exclude all documents at one point.
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
return Ok(box_scorer(EmptyScorer));
}
let effective_minimum_number_should_match = self
@@ -294,7 +214,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
if effective_minimum_number_should_match > num_of_should_scorers {
// We don't have enough scorers to satisfy the minimum number of should matches.
// The request will match no documents.
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
return Ok(box_scorer(EmptyScorer));
}
match effective_minimum_number_should_match {
0 if num_of_should_scorers == 0 => ShouldScorersCombinationMethod::Ignored,
@@ -314,12 +234,10 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
must_scorers.append(&mut should_scorers);
ShouldScorersCombinationMethod::Ignored
}
_ => ShouldScorersCombinationMethod::Required(SpecializedScorer::Other(
scorer_disjunction(
should_scorers,
score_combiner_fn(),
effective_minimum_number_should_match,
),
_ => ShouldScorersCombinationMethod::Required(scorer_disjunction(
should_scorers,
score_combiner_fn(),
effective_minimum_number_should_match,
)),
}
};
@@ -327,13 +245,9 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
let exclude_scorer_opt: Option<Box<dyn Scorer>> = if exclude_scorers.is_empty() {
None
} else {
let exclude_specialized_scorer: SpecializedScorer =
let exclude_scorers_union: Box<dyn Scorer> =
scorer_union(exclude_scorers, DoNothingCombiner::default, num_docs);
Some(into_box_scorer(
exclude_specialized_scorer,
DoNothingCombiner::default,
num_docs,
))
Some(exclude_scorers_union)
};
let include_scorer = match (should_scorers, must_scorers) {
@@ -348,8 +262,8 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
reader.max_doc(),
num_docs,
)
.unwrap_or_else(|| Box::new(EmptyScorer));
SpecializedScorer::Other(boxed_scorer)
.unwrap_or_else(|| box_scorer(EmptyScorer));
boxed_scorer
}
(ShouldScorersCombinationMethod::Optional(should_scorer), must_scorers) => {
// Optional SHOULD: contributes to scoring but not required for matching.
@@ -374,16 +288,12 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
Some(must_scorer) => {
// Has MUST constraint: SHOULD only affects scoring.
if self.scoring_enabled {
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
_,
_,
TScoreCombiner,
>::new(
box_scorer(RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn, num_docs),
)))
should_scorer,
))
} else {
SpecializedScorer::Other(must_scorer)
must_scorer
}
}
}
@@ -403,23 +313,13 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
}
Some(must_scorer) => {
// Has MUST constraint: intersect MUST with SHOULD.
let should_boxed =
into_box_scorer(should_scorer, &score_combiner_fn, num_docs);
SpecializedScorer::Other(intersect_scorers(
vec![must_scorer, should_boxed],
num_docs,
))
intersect_scorers(vec![must_scorer, should_scorer], num_docs)
}
}
}
};
if let Some(exclude_scorer) = exclude_scorer_opt {
let include_scorer_boxed =
into_box_scorer(include_scorer, &score_combiner_fn, num_docs);
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
include_scorer_boxed,
exclude_scorer,
))))
Ok(box_scorer(Exclude::new(include_scorer, exclude_scorer)))
} else {
Ok(include_scorer)
}
@@ -440,7 +340,7 @@ fn remove_and_count_all_and_empty_scorers(
if scorer.is::<AllScorer>() {
counts.num_all_scorers += 1;
false
} else if scorer.doc() == TERMINATED {
} else if scorer.is::<EmptyScorer>() {
counts.num_empty_scorers += 1;
false
} else {
@@ -451,13 +351,7 @@ fn remove_and_count_all_and_empty_scorers(
}
impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombiner> {
fn scorer(
&self,
reader: &SegmentReader,
boost: Score,
seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
let num_docs = reader.num_docs();
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if self.weights.is_empty() {
Ok(Box::new(EmptyScorer))
} else if self.weights.len() == 1 {
@@ -465,23 +359,17 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
if occur == Occur::MustNot {
Ok(Box::new(EmptyScorer))
} else {
weight.scorer(reader, boost, seek_doc)
weight.scorer(reader, boost)
}
} else if self.scoring_enabled {
self.complex_scorer(reader, boost, &self.score_combiner_fn, seek_doc)
.map(|specialized_scorer| {
into_box_scorer(specialized_scorer, &self.score_combiner_fn, num_docs)
})
self.complex_scorer(reader, boost, &self.score_combiner_fn)
} else {
self.complex_scorer(reader, boost, DoNothingCombiner::default, seek_doc)
.map(|specialized_scorer| {
into_box_scorer(specialized_scorer, DoNothingCombiner::default, num_docs)
})
self.complex_scorer(reader, boost, DoNothingCombiner::default)
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0, 0)?;
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
@@ -505,20 +393,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn, 0)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer = BufferedUnionScorer::build(
term_scorers,
&self.score_combiner_fn,
reader.num_docs(),
);
for_each_scorer(&mut union_scorer, callback);
}
SpecializedScorer::Other(mut scorer) => {
for_each_scorer(scorer.as_mut(), callback);
}
}
let mut scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
for_each_scorer(scorer.as_mut(), callback);
Ok(())
}
@@ -527,22 +403,9 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
reader: &SegmentReader,
callback: &mut dyn FnMut(&[DocId]),
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner, 0u32)?;
let mut scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer = BufferedUnionScorer::build(
term_scorers,
&self.score_combiner_fn,
reader.num_docs(),
);
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
}
SpecializedScorer::Other(mut scorer) => {
for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
}
}
for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
Ok(())
}
@@ -562,15 +425,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn, 0u32)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
super::block_wand(term_scorers, threshold, callback);
}
SpecializedScorer::Other(mut scorer) => {
for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
}
}
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
reader.codec.for_each_pruning(threshold, scorer, callback);
Ok(())
}
}

View File

@@ -1,8 +1,6 @@
mod block_wand;
mod boolean_query;
mod boolean_weight;
pub(crate) use self::block_wand::{block_wand, block_wand_single_scorer};
pub use self::boolean_query::BooleanQuery;
pub use self::boolean_weight::BooleanWeight;
@@ -57,7 +55,7 @@ mod tests {
let query = query_parser.parse_query("+a")?;
let searcher = index.reader()?.searcher();
let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<TermScorer>());
Ok(())
}
@@ -70,13 +68,13 @@ mod tests {
{
let query = query_parser.parse_query("+a +b +c")?;
let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<Intersection<TermScorer>>());
}
{
let query = query_parser.parse_query("+a +(b c)")?;
let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
}
Ok(())
@@ -90,14 +88,14 @@ mod tests {
{
let query = query_parser.parse_query("+a b")?;
let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer
.is::<RequiredOptionalScorer<Box<dyn Scorer>, Box<dyn Scorer>, SumCombiner>>());
}
{
let query = query_parser.parse_query("+a b")?;
let weight = query.weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<TermScorer>());
}
Ok(())
@@ -244,14 +242,12 @@ mod tests {
.weight(EnableScoring::enabled_from_searcher(&searcher))
.unwrap();
{
let mut boolean_scorer =
boolean_weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445);
}
{
let mut boolean_scorer =
boolean_weight.scorer(searcher.segment_reader(0u32), 2.0, 0)?;
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 2.0)?;
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689);
}
@@ -345,7 +341,7 @@ mod tests {
(Occur::Must, term_match_some.box_clone()),
]);
let weight = query.weight(EnableScoring::disabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32)?;
assert!(scorer.is::<TermScorer>());
}
{
@@ -355,7 +351,7 @@ mod tests {
(Occur::Must, term_match_none.box_clone()),
]);
let weight = query.weight(EnableScoring::disabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32)?;
assert!(scorer.is::<EmptyScorer>());
}
{
@@ -364,7 +360,7 @@ mod tests {
(Occur::Should, term_match_none.box_clone()),
]);
let weight = query.weight(EnableScoring::disabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32)?;
assert!(scorer.is::<AllScorer>());
}
{
@@ -373,7 +369,7 @@ mod tests {
(Occur::Should, term_match_none.box_clone()),
]);
let weight = query.weight(EnableScoring::disabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32, 0)?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0f32)?;
assert!(scorer.is::<TermScorer>());
}
Ok(())
@@ -613,134 +609,6 @@ mod tests {
Ok(())
}
/// Test that the seek_doc parameter correctly skips documents in BooleanWeight::scorer.
///
/// When seek_doc is provided, the scorer should start from that document (or the first
/// matching document >= seek_doc), skipping earlier documents.
#[test]
pub fn test_boolean_weight_seek_doc() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let value_field = schema_builder.add_u64_field("value", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// Create 11 documents:
// doc 0: value=0
// doc 1: value=10
// doc 2: value=20
// ...
// doc 9: value=90
// doc 10: value=50 (matches range 30-70)
for i in 0..10 {
index_writer.add_document(doc!(
text_field => "hello",
value_field => (i * 10) as u64
))?;
}
index_writer.add_document(doc!(
text_field => "hello",
value_field => 50u64
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
// Create a Boolean query: MUST(term "hello") AND MUST(range 30..=70)
// This should match docs with value in [30, 70]: docs 3, 4, 5, 6, 7, 10
let term_query: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
));
let range_query: Box<dyn Query> = Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(value_field, 30)),
Bound::Included(Term::from_field_u64(value_field, 70)),
));
let boolean_query =
BooleanQuery::new(vec![(Occur::Must, term_query), (Occur::Must, range_query)]);
let weight =
boolean_query.weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
let doc_when_seeking_from = |seek_from: DocId| {
let scorer = weight.scorer(segment_reader, 1.0f32, seek_from).unwrap();
crate::docset::docset_to_doc_vec(scorer)
};
// Expected matching docs: 3, 4, 5, 6, 7, 10 (values 30, 40, 50, 60, 70, 50)
assert_eq!(doc_when_seeking_from(0), vec![3, 4, 5, 6, 7, 10]);
assert_eq!(doc_when_seeking_from(1), vec![3, 4, 5, 6, 7, 10]);
assert_eq!(doc_when_seeking_from(3), vec![3, 4, 5, 6, 7, 10]);
assert_eq!(doc_when_seeking_from(4), vec![4, 5, 6, 7, 10]);
assert_eq!(doc_when_seeking_from(7), vec![7, 10]);
assert_eq!(doc_when_seeking_from(8), vec![10]);
assert_eq!(doc_when_seeking_from(10), vec![10]);
assert_eq!(doc_when_seeking_from(11), Vec::<DocId>::new());
Ok(())
}
/// Test that the seek_doc parameter works correctly with SHOULD clauses.
#[test]
pub fn test_boolean_weight_seek_doc_with_should() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// Create documents:
// doc 0: "a b"
// doc 1: "a"
// doc 2: "b"
// doc 3: "c"
// doc 4: "a b c"
index_writer.add_document(doc!(text_field => "a b"))?;
index_writer.add_document(doc!(text_field => "a"))?;
index_writer.add_document(doc!(text_field => "b"))?;
index_writer.add_document(doc!(text_field => "c"))?;
index_writer.add_document(doc!(text_field => "a b c"))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
// Create a Boolean query: SHOULD(term "a") OR SHOULD(term "b")
// This should match docs 0, 1, 2, 4
let term_a: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(text_field, "a"),
IndexRecordOption::Basic,
));
let term_b: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(text_field, "b"),
IndexRecordOption::Basic,
));
let boolean_query =
BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let weight =
boolean_query.weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
let doc_when_seeking_from = |seek_from: DocId| {
let scorer = weight.scorer(segment_reader, 1.0f32, seek_from).unwrap();
crate::docset::docset_to_doc_vec(scorer)
};
// Expected matching docs: 0, 1, 2, 4
assert_eq!(doc_when_seeking_from(0), vec![0, 1, 2, 4]);
assert_eq!(doc_when_seeking_from(1), vec![1, 2, 4]);
assert_eq!(doc_when_seeking_from(2), vec![2, 4]);
assert_eq!(doc_when_seeking_from(3), vec![4]);
assert_eq!(doc_when_seeking_from(4), vec![4]);
assert_eq!(doc_when_seeking_from(5), Vec::<DocId>::new());
Ok(())
}
/// Test multiple AllScorer instances in different clause types.
///
/// Verifies correct behavior when AllScorers appear in multiple positions.

View File

@@ -67,13 +67,8 @@ impl BoostWeight {
}
impl Weight for BoostWeight {
fn scorer(
&self,
reader: &SegmentReader,
boost: Score,
seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
self.weight.scorer(reader, boost * self.boost, seek_doc)
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
self.weight.scorer(reader, boost * self.boost)
}
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
@@ -88,10 +83,6 @@ impl Weight for BoostWeight {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
self.weight.count(reader)
}
fn intersection_priority(&self) -> u32 {
self.weight.intersection_priority()
}
}
pub(crate) struct BoostScorer<S: Scorer> {
@@ -143,6 +134,7 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
}
impl<S: Scorer> Scorer for BoostScorer<S> {
#[inline]
fn score(&mut self) -> Score {
self.underlying.score() * self.boost
}

View File

@@ -1,7 +1,7 @@
use std::fmt;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{box_scorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
/// `ConstScoreQuery` is a wrapper over a query to provide a constant score.
@@ -63,18 +63,16 @@ impl ConstWeight {
}
impl Weight for ConstWeight {
fn scorer(
&self,
reader: &SegmentReader,
boost: Score,
seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
let inner_scorer = self.weight.scorer(reader, boost, seek_doc)?;
Ok(Box::new(ConstScorer::new(inner_scorer, boost * self.score)))
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let inner_scorer = self.weight.scorer(reader, boost)?;
Ok(box_scorer(ConstScorer::new(
inner_scorer,
boost * self.score,
)))
}
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0, 0)?;
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
"Document #({doc}) does not match"
@@ -89,10 +87,6 @@ impl Weight for ConstWeight {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
self.weight.count(reader)
}
fn intersection_priority(&self) -> u32 {
self.weight.intersection_priority()
}
}
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
@@ -146,6 +140,7 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
}
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {
#[inline]
fn score(&mut self) -> Score {
self.score
}

View File

@@ -173,6 +173,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
for Disjunction<TScorer, TScoreCombiner>
{
#[inline]
fn score(&mut self) -> Score {
self.current_score
}
@@ -307,6 +308,7 @@ mod tests {
}
impl Scorer for DummyScorer {
#[inline]
fn score(&mut self) -> Score {
self.foo.get(self.cursor).map(|x| x.1).unwrap_or(0.0)
}

View File

@@ -2,7 +2,7 @@ use super::Scorer;
use crate::docset::TERMINATED;
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::query::{box_scorer, EnableScoring, Explanation, Query, Weight};
use crate::{DocId, DocSet, Score, Searcher};
/// `EmptyQuery` is a dummy `Query` in which no document matches.
@@ -26,24 +26,13 @@ impl Query for EmptyQuery {
/// It is useful for tests and handling edge cases.
pub struct EmptyWeight;
impl Weight for EmptyWeight {
fn scorer(
&self,
_reader: &SegmentReader,
_boost: Score,
_seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
Ok(Box::new(EmptyScorer))
fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
Ok(box_scorer(EmptyScorer))
}
fn explain(&self, _reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
Err(does_not_match(doc))
}
/// Returns a priority number used to sort weights when running an
/// intersection.
fn intersection_priority(&self) -> u32 {
0u32
}
}
/// `EmptyScorer` is a dummy `Scorer` in which no document matches.
@@ -66,6 +55,7 @@ impl DocSet for EmptyScorer {
}
impl Scorer for EmptyScorer {
#[inline]
fn score(&mut self) -> Score {
0.0
}

View File

@@ -84,6 +84,7 @@ where
TScorer: Scorer,
TDocSetExclude: DocSet + 'static,
{
#[inline]
fn score(&mut self) -> Score {
self.underlying_docset.score()
}

View File

@@ -3,7 +3,7 @@ use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use common::BitSet;
use super::{ConstScorer, EmptyScorer};
use super::{box_scorer, ConstScorer, EmptyScorer};
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::query::all_query::AllScorer;
@@ -98,12 +98,7 @@ pub struct ExistsWeight {
}
impl Weight for ExistsWeight {
fn scorer(
&self,
reader: &SegmentReader,
boost: Score,
_seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?;
if self.field_type == Type::Json && self.json_subpaths {
@@ -122,7 +117,7 @@ impl Weight for ExistsWeight {
}
}
if non_empty_columns.is_empty() {
return Ok(Box::new(EmptyScorer));
return Ok(box_scorer(EmptyScorer));
}
// If any column is full, all docs match.
@@ -133,9 +128,9 @@ impl Weight for ExistsWeight {
{
let all_scorer = AllScorer::new(max_doc);
if boost != 1.0f32 {
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
return Ok(box_scorer(BoostScorer::new(all_scorer, boost)));
} else {
return Ok(Box::new(all_scorer));
return Ok(box_scorer(all_scorer));
}
}
@@ -143,7 +138,7 @@ impl Weight for ExistsWeight {
// NOTE: A lower number may be better for very sparse columns
if non_empty_columns.len() < 4 {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
return Ok(Box::new(ConstScorer::new(docset, boost)));
return Ok(box_scorer(ConstScorer::new(docset, boost)));
}
// If we have many dynamic columns, precompute a bitset of matching docs
@@ -167,11 +162,11 @@ impl Weight for ExistsWeight {
}
}
let docset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(docset, boost)))
Ok(box_scorer(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0, 0)?;
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}

View File

@@ -1,7 +1,7 @@
use super::size_hint::estimate_intersection;
use crate::docset::{DocSet, TERMINATED};
use crate::query::term_query::TermScorer;
use crate::query::{EmptyScorer, Scorer};
use crate::query::{box_scorer, EmptyScorer, Scorer};
use crate::{DocId, Score};
/// Returns the intersection scorer.
@@ -20,7 +20,7 @@ pub fn intersect_scorers(
num_docs_segment: u32,
) -> Box<dyn Scorer> {
if scorers.is_empty() {
return Box::new(EmptyScorer);
return box_scorer(EmptyScorer);
}
if scorers.len() == 1 {
return scorers.pop().unwrap();
@@ -29,7 +29,7 @@ pub fn intersect_scorers(
scorers.sort_by_key(|scorer| scorer.cost());
let doc = go_to_first_doc(&mut scorers[..]);
if doc == TERMINATED {
return Box::new(EmptyScorer);
return box_scorer(EmptyScorer);
}
// We know that we have at least 2 elements.
let left = scorers.remove(0);
@@ -38,14 +38,14 @@ pub fn intersect_scorers(
.iter()
.all(|&scorer| scorer.is::<TermScorer>());
if all_term_scorers {
return Box::new(Intersection {
return box_scorer(Intersection {
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
others: scorers,
num_docs: num_docs_segment,
});
}
Box::new(Intersection {
box_scorer(Intersection {
left,
right,
others: scorers,
@@ -105,6 +105,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
}
impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOtherDocSet> {
#[inline]
fn advance(&mut self) -> DocId {
let (left, right) = (&mut self.left, &mut self.right);
let mut candidate = left.advance();
@@ -174,6 +175,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
.all(|docset| docset.seek_into_the_danger_zone(target))
}
#[inline]
fn doc(&self) -> DocId {
self.left.doc()
}
@@ -200,6 +202,7 @@ where
TScorer: Scorer,
TOtherScorer: Scorer,
{
#[inline]
fn score(&mut self) -> Score {
self.left.score()
+ self.right.score()

View File

@@ -24,7 +24,7 @@ mod reqopt_scorer;
mod scorer;
mod set_query;
mod size_hint;
mod term_query;
pub(crate) mod term_query;
mod union;
mod weight;
@@ -54,13 +54,14 @@ pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery};
pub use self::phrase_query::PhraseQuery;
pub(crate) use self::phrase_query::PhraseScorer;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::*;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner};
pub use self::scorer::Scorer;
pub use self::scorer::{box_scorer, Scorer};
pub use self::set_query::TermSetQuery;
pub use self::term_query::TermQuery;
pub use self::union::BufferedUnionScorer;

View File

@@ -2,7 +2,7 @@ use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
use crate::query::phrase_query::{intersection_count, PhraseScorer};
use crate::query::phrase_query::{intersection_exists, PhraseScorer};
use crate::query::Scorer;
use crate::{DocId, Score};
@@ -81,6 +81,7 @@ impl<TPostings: Postings> DocSet for PhraseKind<TPostings> {
}
impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
#[inline]
fn score(&mut self) -> Score {
match self {
PhraseKind::SinglePrefix { positions, .. } => {
@@ -99,7 +100,6 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
phrase_scorer: PhraseKind<TPostings>,
suffixes: Vec<TPostings>,
suffix_offset: u32,
phrase_count: u32,
suffix_position_buffer: Vec<u32>,
}
@@ -143,7 +143,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
phrase_scorer,
suffixes,
suffix_offset: (max_offset - suffix_pos) as u32,
phrase_count: 0,
suffix_position_buffer: Vec::with_capacity(100),
};
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
@@ -152,12 +151,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
phrase_prefix_scorer
}
pub fn phrase_count(&self) -> u32 {
self.phrase_count
}
fn matches_prefix(&mut self) -> bool {
let mut count = 0;
let current_doc = self.doc();
let pos_matching = self.phrase_scorer.get_intersection();
for suffix in &mut self.suffixes {
@@ -167,11 +161,12 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
let doc = suffix.seek(current_doc);
if doc == current_doc {
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
count += intersection_count(pos_matching, &self.suffix_position_buffer);
if intersection_exists(pos_matching, &self.suffix_position_buffer) {
return true;
}
}
}
self.phrase_count = count as u32;
count != 0
false
}
}
@@ -215,6 +210,7 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
}
impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
#[inline]
fn score(&mut self) -> Score {
// TODO modify score??
self.phrase_scorer.score()

View File

@@ -1,12 +1,11 @@
use super::{prefix_end, PhrasePrefixScorer};
use crate::fieldnorm::FieldNormReader;
use crate::index::SegmentReader;
use crate::postings::SegmentPostings;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
use crate::query::explanation::does_not_match;
use crate::query::{EmptyScorer, Explanation, Scorer, Weight};
use crate::query::{box_scorer, EmptyScorer, Explanation, Scorer, Weight};
use crate::schema::{IndexRecordOption, Term};
use crate::{DocId, DocSet, Score};
use crate::{DocId, Score};
pub struct PhrasePrefixWeight {
phrase_terms: Vec<(usize, Term)>,
@@ -42,29 +41,26 @@ impl PhrasePrefixWeight {
Ok(FieldNormReader::constant(reader.max_doc(), 1))
}
pub(crate) fn prefix_phrase_scorer(
pub(crate) fn phrase_scorer(
&self,
reader: &SegmentReader,
boost: Score,
seek_doc: DocId,
) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
) -> crate::Result<Option<Box<dyn Scorer>>> {
let similarity_weight_opt = self
.similarity_weight_opt
.as_ref()
.map(|similarity_weight| similarity_weight.boost_by(boost));
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let mut term_postings_list = Vec::new();
let mut term_postings_list: Vec<(usize, Box<dyn Postings>)> = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
let inverted_index = reader.inverted_index(term.field())?;
let Some(term_info) = inverted_index.get_term_info(term)? else {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
};
let postings = inverted_index.read_postings_from_terminfo(
&term_info,
IndexRecordOption::WithFreqsAndPositions,
seek_doc,
)?;
term_postings_list.push((offset, postings));
}
}
let inv_index = reader.inverted_index(self.prefix.1.field())?;
@@ -106,51 +102,44 @@ impl PhrasePrefixWeight {
}
}
Ok(Some(PhrasePrefixScorer::new(
// TODO make this specialized.
Ok(Some(box_scorer(PhrasePrefixScorer::new(
term_postings_list,
similarity_weight_opt,
fieldnorm_reader,
suffixes,
self.prefix.0,
)))
))))
}
}
impl Weight for PhrasePrefixWeight {
fn scorer(
&self,
reader: &SegmentReader,
boost: Score,
seek_doc: DocId,
) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.prefix_phrase_scorer(reader, boost, seek_doc)? {
Ok(Box::new(scorer))
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(scorer)
} else {
Ok(Box::new(EmptyScorer))
Ok(box_scorer(EmptyScorer))
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.prefix_phrase_scorer(reader, 1.0, doc)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));
}
let mut scorer = scorer_opt.unwrap();
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score());
if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
}
Ok(explanation)
}
fn intersection_priority(&self) -> u32 {
50u32
fn explain(&self, _reader: &SegmentReader, _doc: DocId) -> crate::Result<Explanation> {
todo!();
// let scorer_opt = self.phrase_scorer(reader, 1.0)?;
// if scorer_opt.is_none() {
// return Err(does_not_match(doc));
// }
// let mut scorer = scorer_opt.unwrap();
// if scorer.seek(doc) != doc {
// return Err(does_not_match(doc));
// }
// let fieldnorm_reader = self.fieldnorm_reader(reader)?;
// let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
// let phrase_count = scorer.phrase_count();
// let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score());
// if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
// explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
// }
// Ok(explanation)
}
}
@@ -158,6 +147,8 @@ impl Weight for PhrasePrefixWeight {
mod tests {
use crate::docset::TERMINATED;
use crate::index::Index;
use crate::postings::Postings;
use crate::query::phrase_prefix_query::PhrasePrefixScorer;
use crate::query::{EnableScoring, PhrasePrefixQuery, Query};
use crate::schema::{Schema, TEXT};
use crate::{DocSet, IndexWriter, Term};
@@ -198,14 +189,14 @@ mod tests {
.phrase_prefix_query_weight(enable_scoring)
.unwrap()
.unwrap();
let mut phrase_scorer = phrase_weight
.prefix_phrase_scorer(searcher.segment_reader(0u32), 1.0, 0u32)?
let mut phrase_scorer_boxed = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
let phrase_scorer: &mut PhrasePrefixScorer<Box<dyn Postings>> =
phrase_scorer_boxed.as_any_mut().downcast_mut().unwrap();
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
assert_eq!(phrase_scorer.advance(), 2);
assert_eq!(phrase_scorer.doc(), 2);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
@@ -225,14 +216,15 @@ mod tests {
.phrase_prefix_query_weight(enable_scoring)
.unwrap()
.unwrap();
let mut phrase_scorer = phrase_weight
.prefix_phrase_scorer(searcher.segment_reader(0u32), 1.0, 0u32)?
let mut phrase_scorer_boxed = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
let phrase_scorer = phrase_scorer_boxed
.downcast_mut::<PhrasePrefixScorer<Box<dyn Postings>>>()
.unwrap();
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
assert_eq!(phrase_scorer.advance(), 2);
assert_eq!(phrase_scorer.doc(), 2);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
@@ -250,7 +242,7 @@ mod tests {
.unwrap()
.is_none());
let weight = phrase_query.weight(enable_scoring).unwrap();
let mut phrase_scorer = weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let mut phrase_scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.advance(), 2);
assert_eq!(phrase_scorer.doc(), 2);
@@ -271,7 +263,7 @@ mod tests {
]);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let weight = phrase_query.weight(enable_scoring).unwrap();
let mut phrase_scorer = weight.scorer(searcher.segment_reader(0u32), 1.0, 0)?;
let mut phrase_scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}

View File

@@ -5,7 +5,7 @@ pub mod regex_phrase_query;
mod regex_phrase_weight;
pub use self::phrase_query::PhraseQuery;
pub(crate) use self::phrase_scorer::intersection_count;
pub(crate) use self::phrase_scorer::intersection_exists;
pub use self::phrase_scorer::PhraseScorer;
pub use self::phrase_weight::PhraseWeight;
@@ -84,7 +84,7 @@ pub(crate) mod tests {
let phrase_query = PhraseQuery::new(terms);
let phrase_weight =
phrase_query.phrase_weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0, 0)?;
let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0)?;
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
@@ -343,43 +343,6 @@ pub(crate) mod tests {
Ok(())
}
#[test]
pub fn test_phrase_weight_seek_doc() -> crate::Result<()> {
// Create an index with documents where the phrase "a b" appears in some of them.
// Documents: 0: "c d", 1: "a b", 2: "e f", 3: "a b c", 4: "g h", 5: "a b", 6: "i j"
let index = create_index(&["c d", "a b", "e f", "a b c", "g h", "a b", "i j"])?;
let text_field = index.schema().get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let phrase_query = PhraseQuery::new(vec![
Term::from_field_text(text_field, "a"),
Term::from_field_text(text_field, "b"),
]);
let phrase_weight =
phrase_query.phrase_weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
// Helper function to collect all docs from a scorer created with a given seek_doc
let docs_when_seeking_from = |seek_from: DocId| {
let scorer = phrase_weight
.scorer(segment_reader, 1.0f32, seek_from)
.unwrap();
crate::docset::docset_to_doc_vec(scorer)
};
// Documents with "a b": 1, 3, 5
assert_eq!(docs_when_seeking_from(0), vec![1, 3, 5]);
assert_eq!(docs_when_seeking_from(1), vec![1, 3, 5]);
assert_eq!(docs_when_seeking_from(2), vec![3, 5]);
assert_eq!(docs_when_seeking_from(3), vec![3, 5]);
assert_eq!(docs_when_seeking_from(4), vec![5]);
assert_eq!(docs_when_seeking_from(5), vec![5]);
assert_eq!(docs_when_seeking_from(6), Vec::<DocId>::new());
assert_eq!(docs_when_seeking_from(7), Vec::<DocId>::new());
Ok(())
}
#[test]
pub fn test_phrase_query_on_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
@@ -410,7 +373,7 @@ pub(crate) mod tests {
.weight(EnableScoring::disabled_from_schema(searcher.schema()))
.unwrap();
let mut phrase_scorer = phrase_weight
.scorer(searcher.segment_reader(0), 1.0f32, 0)
.scorer(searcher.segment_reader(0), 1.0f32)
.unwrap();
let mut docs = Vec::new();
loop {

View File

@@ -126,7 +126,7 @@ impl PhraseQuery {
};
let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight_opt);
if self.slop > 0 {
weight.slop(self.slop);
weight.set_slop(self.slop);
}
Ok(weight)
}

Some files were not shown because too many files have changed in this diff Show More