perf: use term ordinal order when sorting by keys

term ordinals are sorted lexicographically. we can use than to sort instead of reading the terms from the dictionary.
This commit is contained in:
Pascal Seitz
2023-02-09 16:27:18 +08:00
parent cdffce906c
commit 6e664d071f
5 changed files with 174 additions and 8 deletions

View File

@@ -0,0 +1,123 @@
use serde::{Deserialize, Serialize};
/// DateHistogramAggregation is similar to `HistogramAggregation`, but it can only be used with date type.
///
/// Currently only **fixed time** intervals are supported. Calendar-aware time intervals are not
/// supported.
///
/// Like the histogram, values are rounded down into the closest bucket.
///
/// For this calculation all fastfield values are converted to f64.
///
/// # Limitations/Compatibility
/// Only fixed time intervals are supported.
///
/// # JSON Format
/// ```json
/// {
/// "prices": {
/// "date_histogram": {
/// "field": "price",
/// "fixed_interval": "30d"
/// }
/// }
/// }
/// ```
///
/// Response
/// See [`BucketEntry`](crate::aggregation::agg_result::BucketEntry)
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct DateHistogramAggregationReq {
/// The field to aggregate on.
pub field: String,
/// The interval to chunk your data range. Each bucket spans a value range of [0..fixed_interval).
/// Accepted values
///
/// Fixed intervals are configured with the `fixed_interval` parameter.
/// In contrast to calendar-aware intervals, fixed intervals are a fixed number of SI units and never deviate, regardless of where they fall on the calendar.
/// One second is always composed of 1000ms. This allows fixed intervals to be specified in any multiple of the supported units.
/// However, it means fixed intervals cannot express other units such as months, since the duration of a month is not a fixed quantity.
/// Attempting to specify a calendar interval like month or quarter will return an Error.
///
/// The accepted units for fixed intervals are:
/// * `ms`: milliseconds
/// * `s`: seconds. Defined as 1000 milliseconds each.
/// * `m`: minutes. Defined as 60 seconds each (60_000 milliseconds).
/// * `h`: hours. Defined as 60 minutes each (3_600_000 milliseconds).
/// * `d`: days. Defined as 24 hours (86_400_000 milliseconds).
///
/// Fractional time values are not supported, but you can address this by shifting to another time unit
/// (e.g., `1.5h` could instead be specified as `90m`).
pub fixed_interval: String,
/// Intervals implicitly defines an absolute grid of buckets `[interval * k, interval * (k + 1))`.
///
pub offset: Option<String>,
/// Whether to return the buckets as a hash map
#[serde(default)]
pub keyed: bool,
}
impl DateHistogramAggregationReq {
fn validate(&self) -> crate::Result<()> {
Ok(())
}
}
#[derive(Debug, PartialEq, Eq)]
/// Errors when parsing the fixed interval for `DateHistogramAggregationReq`.
pub enum DateHistogramParseError {
/// Unit not recognized in passed String
UnitNotRecognized(String),
/// Number not found in passed String
NumberMissing(String),
/// Unit not found in passed String
UnitMissing(String),
}
fn parse_into_milliseconds(input: &str) -> Result<u64, DateHistogramParseError> {
let split_boundary = input
.char_indices()
.take_while(|(pos, el)| el.is_numeric())
.count();
let (number, unit) = input.split_at(split_boundary);
if number.is_empty() {
return Err(DateHistogramParseError::NumberMissing(input.to_string()));
}
if unit.is_empty() {
return Err(DateHistogramParseError::UnitMissing(input.to_string()));
}
let number: u64 = number.parse().unwrap();
let multiplier_from_unit = match unit {
"ms" => 1,
"s" => 1000,
"m" => 60 * 1000,
"h" => 60 * 60 * 1000,
"d" => 24 * 60 * 60 * 1000,
_ => return Err(DateHistogramParseError::UnitNotRecognized(unit.to_string())),
};
Ok(number * multiplier_from_unit)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parser_test() {
assert_eq!(parse_into_milliseconds("1m").unwrap(), 60_000);
assert_eq!(parse_into_milliseconds("2m").unwrap(), 120_000);
assert_eq!(
parse_into_milliseconds("2y").unwrap_err(),
DateHistogramParseError::UnitNotRecognized("y".to_string())
);
assert_eq!(
parse_into_milliseconds("2000").unwrap_err(),
DateHistogramParseError::UnitMissing("2000".to_string())
);
assert_eq!(
parse_into_milliseconds("ms").unwrap_err(),
DateHistogramParseError::NumberMissing("ms".to_string())
);
}
}

View File

@@ -64,7 +64,6 @@ use crate::{DocId, TantivyError};
///
/// Response
/// See [`BucketEntry`](crate::aggregation::agg_result::BucketEntry)
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct HistogramAggregation {
/// The field to aggregate on.

View File

@@ -1,2 +1,4 @@
mod date_histogram;
mod histogram;
pub use date_histogram::*;
pub use histogram::*;

View File

@@ -328,7 +328,14 @@ impl SegmentTermCollector {
match self.req.order.target {
OrderTarget::Key => {
// defer order and cut_off after loading the texts from the dictionary
// We rely on the fact, that term ordinals match the order of the strings
// TODO: We could have a special collector, that keeps only TOP n results at any
// time.
if self.req.order.order == Order::Desc {
entries.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.0));
} else {
entries.sort_unstable_by_key(|bucket| bucket.0);
}
}
OrderTarget::SubAggregation(_name) => {
// don't sort and cut off since it's hard to make assumptions on the quality of the
@@ -344,12 +351,11 @@ impl SegmentTermCollector {
}
}
let (term_doc_count_before_cutoff, mut sum_other_doc_count) =
if order_by_key || order_by_sub_aggregation {
(0, 0)
} else {
cut_off_buckets(&mut entries, self.req.segment_size as usize)
};
let (term_doc_count_before_cutoff, mut sum_other_doc_count) = if order_by_sub_aggregation {
(0, 0)
} else {
cut_off_buckets(&mut entries, self.req.segment_size as usize)
};
let inverted_index = agg_with_accessor
.str_dict_column
@@ -372,6 +378,7 @@ impl SegmentTermCollector {
);
}
if self.req.min_doc_count == 0 {
// TODO: Handle rev streaming for descending sorting by keys
let mut stream = term_dict.dictionary().stream()?;
while let Some((key, _ord)) = stream.next() {
if dict.len() >= self.req.segment_size as usize {

View File

@@ -333,6 +333,7 @@ mod tests {
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
use crate::aggregation::DistributedAggregationCollector;
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{DateTime, Index, Term};
@@ -441,6 +442,7 @@ mod tests {
{
// let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_values {
for (i, term) in values {
let i = *i;
@@ -1162,6 +1164,9 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use crate::aggregation::bucket::CustomOrder;
use crate::aggregation::bucket::Order;
use crate::aggregation::bucket::OrderTarget;
use rand::prelude::SliceRandom;
use rand::{thread_rng, Rng};
use test::{self, Bencher};
@@ -1421,6 +1426,36 @@ mod tests {
});
}
#[bench]
fn bench_aggregation_terms_many_order_by_term(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
order: Some(CustomOrder {
order: Order::Desc,
target: OrderTarget::Key,
}),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
#[bench]
fn bench_aggregation_range_only(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();