perf: use term ordinal order when sorting by keys

term ordinals are sorted lexicographically. we can use than to sort instead of reading the terms from the dictionary.
2026-05-28 22:20:42 +00:00 · 2023-02-09 16:27:18 +08:00
parent cdffce906c
commit 6e664d071f
5 changed files with 174 additions and 8 deletions
--- a/src/aggregation/bucket/histogram/date_histogram.rs
+++ b/src/aggregation/bucket/histogram/date_histogram.rs
@@ -0,0 +1,123 @@
+use serde::{Deserialize, Serialize};
+
+/// DateHistogramAggregation is similar to `HistogramAggregation`, but it can only be used with date type.
+///
+/// Currently only **fixed time** intervals are supported. Calendar-aware time intervals are not
+/// supported.
+///
+/// Like the histogram, values are rounded down into the closest bucket.
+///
+/// For this calculation all fastfield values are converted to f64.
+///
+/// # Limitations/Compatibility
+/// Only fixed time intervals are supported.
+///
+/// # JSON Format
+/// ```json
+/// {
+///     "prices": {
+///         "date_histogram": {
+///             "field": "price",
+///             "fixed_interval": "30d"
+///         }
+///     }
+/// }
+/// ```
+///
+/// Response
+/// See [`BucketEntry`](crate::aggregation::agg_result::BucketEntry)
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct DateHistogramAggregationReq {
+    /// The field to aggregate on.
+    pub field: String,
+    /// The interval to chunk your data range. Each bucket spans a value range of [0..fixed_interval).
+    /// Accepted values
+    ///
+    /// Fixed intervals are configured with the `fixed_interval` parameter.
+    /// In contrast to calendar-aware intervals, fixed intervals are a fixed number of SI units and never deviate, regardless of where they fall on the calendar.
+    /// One second is always composed of 1000ms. This allows fixed intervals to be specified in any multiple of the supported units.
+    /// However, it means fixed intervals cannot express other units such as months, since the duration of a month is not a fixed quantity.
+    /// Attempting to specify a calendar interval like month or quarter will return an Error.
+    ///
+    /// The accepted units for fixed intervals are:
+    /// * `ms`: milliseconds
+    /// * `s`: seconds. Defined as 1000 milliseconds each.
+    /// * `m`: minutes. Defined as 60 seconds each (60_000 milliseconds).
+    /// * `h`: hours. Defined as 60 minutes each (3_600_000 milliseconds).
+    /// * `d`: days. Defined as 24 hours (86_400_000 milliseconds).
+    ///
+    /// Fractional time values are not supported, but you can address this by shifting to another time unit
+    /// (e.g., `1.5h` could instead be specified as `90m`).
+    pub fixed_interval: String,
+    /// Intervals implicitly defines an absolute grid of buckets `[interval * k, interval * (k + 1))`.
+    ///
+    pub offset: Option<String>,
+    /// Whether to return the buckets as a hash map
+    #[serde(default)]
+    pub keyed: bool,
+}
+
+impl DateHistogramAggregationReq {
+    fn validate(&self) -> crate::Result<()> {
+        Ok(())
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+/// Errors when parsing the fixed interval for `DateHistogramAggregationReq`.
+pub enum DateHistogramParseError {
+    /// Unit not recognized in passed String
+    UnitNotRecognized(String),
+    /// Number not found in passed String
+    NumberMissing(String),
+    /// Unit not found in passed String
+    UnitMissing(String),
+}
+
+fn parse_into_milliseconds(input: &str) -> Result<u64, DateHistogramParseError> {
+    let split_boundary = input
+        .char_indices()
+        .take_while(|(pos, el)| el.is_numeric())
+        .count();
+    let (number, unit) = input.split_at(split_boundary);
+    if number.is_empty() {
+        return Err(DateHistogramParseError::NumberMissing(input.to_string()));
+    }
+    if unit.is_empty() {
+        return Err(DateHistogramParseError::UnitMissing(input.to_string()));
+    }
+    let number: u64 = number.parse().unwrap();
+    let multiplier_from_unit = match unit {
+        "ms" => 1,
+        "s" => 1000,
+        "m" => 60 * 1000,
+        "h" => 60 * 60 * 1000,
+        "d" => 24 * 60 * 60 * 1000,
+        _ => return Err(DateHistogramParseError::UnitNotRecognized(unit.to_string())),
+    };
+
+    Ok(number * multiplier_from_unit)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parser_test() {
+        assert_eq!(parse_into_milliseconds("1m").unwrap(), 60_000);
+        assert_eq!(parse_into_milliseconds("2m").unwrap(), 120_000);
+        assert_eq!(
+            parse_into_milliseconds("2y").unwrap_err(),
+            DateHistogramParseError::UnitNotRecognized("y".to_string())
+        );
+        assert_eq!(
+            parse_into_milliseconds("2000").unwrap_err(),
+            DateHistogramParseError::UnitMissing("2000".to_string())
+        );
+        assert_eq!(
+            parse_into_milliseconds("ms").unwrap_err(),
+            DateHistogramParseError::NumberMissing("ms".to_string())
+        );
+    }
+}
--- a/src/aggregation/bucket/histogram/histogram.rs
+++ b/src/aggregation/bucket/histogram/histogram.rs
@@ -64,7 +64,6 @@ use crate::{DocId, TantivyError};
 ///
 /// Response
 /// See [`BucketEntry`](crate::aggregation::agg_result::BucketEntry)
-
 #[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
 pub struct HistogramAggregation {
    /// The field to aggregate on.
--- a/src/aggregation/bucket/histogram/mod.rs
+++ b/src/aggregation/bucket/histogram/mod.rs
@@ -1,2 +1,4 @@
+mod date_histogram;
 mod histogram;
+pub use date_histogram::*;
 pub use histogram::*;
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -328,7 +328,14 @@ impl SegmentTermCollector {

        match self.req.order.target {
            OrderTarget::Key => {
-                // defer order and cut_off after loading the texts from the dictionary
+                // We rely on the fact, that term ordinals match the order of the strings
+                // TODO: We could have a special collector, that keeps only TOP n results at any
+                // time.
+                if self.req.order.order == Order::Desc {
+                    entries.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.0));
+                } else {
+                    entries.sort_unstable_by_key(|bucket| bucket.0);
+                }
            }
            OrderTarget::SubAggregation(_name) => {
                // don't sort and cut off since it's hard to make assumptions on the quality of the
@@ -344,12 +351,11 @@ impl SegmentTermCollector {
            }
        }

-        let (term_doc_count_before_cutoff, mut sum_other_doc_count) =
-            if order_by_key || order_by_sub_aggregation {
-                (0, 0)
-            } else {
-                cut_off_buckets(&mut entries, self.req.segment_size as usize)
-            };
+        let (term_doc_count_before_cutoff, mut sum_other_doc_count) = if order_by_sub_aggregation {
+            (0, 0)
+        } else {
+            cut_off_buckets(&mut entries, self.req.segment_size as usize)
+        };

        let inverted_index = agg_with_accessor
            .str_dict_column
@@ -372,6 +378,7 @@ impl SegmentTermCollector {
            );
        }
        if self.req.min_doc_count == 0 {
+            // TODO: Handle rev streaming for descending sorting by keys
            let mut stream = term_dict.dictionary().stream()?;
            while let Some((key, _ord)) = stream.next() {
                if dict.len() >= self.req.segment_size as usize {
--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -333,6 +333,7 @@ mod tests {
    use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
    use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
    use crate::aggregation::DistributedAggregationCollector;
+    use crate::indexer::NoMergePolicy;
    use crate::query::{AllQuery, TermQuery};
    use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
    use crate::{DateTime, Index, Term};
@@ -441,6 +442,7 @@ mod tests {
        {
            // let mut index_writer = index.writer_for_tests()?;
            let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
+            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            for values in segment_and_values {
                for (i, term) in values {
                    let i = *i;
@@ -1162,6 +1164,9 @@ mod tests {
    #[cfg(all(test, feature = "unstable"))]
    mod bench {

+        use crate::aggregation::bucket::CustomOrder;
+        use crate::aggregation::bucket::Order;
+        use crate::aggregation::bucket::OrderTarget;
        use rand::prelude::SliceRandom;
        use rand::{thread_rng, Rng};
        use test::{self, Bencher};
@@ -1421,6 +1426,36 @@ mod tests {
            });
        }

+        #[bench]
+        fn bench_aggregation_terms_many_order_by_term(b: &mut Bencher) {
+            let index = get_test_index_bench(false).unwrap();
+            let reader = index.reader().unwrap();
+
+            b.iter(|| {
+                let agg_req: Aggregations = vec![(
+                    "my_texts".to_string(),
+                    Aggregation::Bucket(BucketAggregation {
+                        bucket_agg: BucketAggregationType::Terms(TermsAggregation {
+                            field: "text_many_terms".to_string(),
+                            order: Some(CustomOrder {
+                                order: Order::Desc,
+                                target: OrderTarget::Key,
+                            }),
+                            ..Default::default()
+                        }),
+                        sub_aggregation: Default::default(),
+                    }),
+                )]
+                .into_iter()
+                .collect();
+
+                let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
+
+                let searcher = reader.searcher();
+                searcher.search(&AllQuery, &collector).unwrap()
+            });
+        }
+
        #[bench]
        fn bench_aggregation_range_only(b: &mut Bencher) {
            let index = get_test_index_bench(false).unwrap();