fix overflow on large jumps in linear sequence

The new limit prevents an overflow in eval, which caused the residuals to be 64 bits wide when a slope of zero would have given smaller ones.
Built SUM final result in each branch directly.
2026-06-23 02:40:44 +00:00 · 2026-06-23 00:13:27 +02:00 · 2026-06-16 03:10:30 +08:00 · 2026-06-16 03:10:30 +08:00 · 2026-06-16 03:10:30 +08:00 · 2026-06-16 03:10:30 +08:00
7 changed files with 221 additions and 4 deletions
--- a/columnar/src/column_values/u64_based/blockwise_linear.rs
+++ b/columnar/src/column_values/u64_based/blockwise_linear.rs
@@ -241,6 +241,28 @@ mod tests {
    use super::*;
    use crate::column_values::u64_based::tests::create_and_validate;

+    // A block boundary where a high run ends and a low run begins: y0 ≈ 2^32, y511 ≈ 0.
+    // This large jump used to cause an overflow, which made us store every value on 64 bits
+    // when 32 bits were enough.
+    fn large_descending_jump_vals() -> Vec<u64> {
+        let high_start: u64 = 4_294_967_039; // = 2^32 - 257
+        (0u64..256)
+            .map(|i| high_start + i)
+            .chain(0u64..256)
+            .collect()
+    }
+
+    #[test]
+    fn test_blockwise_linear_large_descending_jump_uses_at_most_32bit() {
+        let vals = large_descending_jump_vals();
+        let (_, actual_rate) =
+            create_and_validate::<BlockwiseLinearCodec>(&vals, "large descending jump").unwrap();
+        assert!(
+            actual_rate <= 0.6,
+            "compression rate {actual_rate:.3} is too high (bug: 64-bit residuals)"
+        );
+    }
+
    #[test]
    fn test_with_codec_data_sets_simple() {
        create_and_validate::<BlockwiseLinearCodec>(
--- a/columnar/src/column_values/u64_based/line.rs
+++ b/columnar/src/column_values/u64_based/line.rs
@@ -37,7 +37,7 @@ fn compute_slope(y0: u64, y1: u64, num_vals: NonZeroU32) -> u64 {
    } else {
        y0.wrapping_sub(y1)
    };
-    if abs_dy >= 1 << 32 {
+    if abs_dy >= 1 << 31 {
        // This is outside of realm we handle.
        // Let's just bail.
        return 0u64;
--- a/src/aggregation/agg_req.rs
+++ b/src/aggregation/agg_req.rs
@@ -299,6 +299,12 @@ impl AggregationVariants {
            _ => None,
        }
    }
+    pub(crate) fn as_sum(&self) -> Option<&SumAggregation> {
+        match &self {
+            AggregationVariants::Sum(sum) => Some(sum),
+            _ => None,
+        }
+    }
 }

 #[cfg(test)]
--- a/src/aggregation/intermediate_agg_result.rs
+++ b/src/aggregation/intermediate_agg_result.rs
@@ -377,7 +377,22 @@ impl IntermediateMetricResult {
                MetricResult::ExtendedStats(intermediate_stats.finalize())
            }
            IntermediateMetricResult::Sum(intermediate_sum) => {
-                MetricResult::Sum(intermediate_sum.finalize().into())
+                // By default match Elasticsearch: empty / all-missing sum
+                // buckets serialize as `"value": 0`, not `"value": null`.
+                // The non-ES `none_if_no_match` flag on `SumAggregation`
+                // opts into SQL-style `null` for downstream consumers.
+                let none_if_no_match = req
+                    .agg
+                    .as_sum()
+                    .and_then(|sum| sum.none_if_no_match)
+                    .unwrap_or(false);
+                let value = intermediate_sum.finalize();
+                if none_if_no_match {
+                    MetricResult::Sum(value.into())
+                } else {
+                    let value = Some(value.unwrap_or(0.0));
+                    MetricResult::Sum(value.into())
+                }
            }
            IntermediateMetricResult::Percentiles(percentiles) => MetricResult::Percentiles(
                percentiles
--- a/src/aggregation/metric/sum.rs
+++ b/src/aggregation/metric/sum.rs
@@ -27,6 +27,16 @@ pub struct SumAggregation {
    /// { "field": "my_numbers", "missing": "10.0" }
    #[serde(default, deserialize_with = "deserialize_option_f64")]
    pub missing: Option<f64>,
+    /// Non-Elasticsearch extension. When `Some(true)`, the serialized result
+    /// returns `"value": null` if no values were collected (all documents had
+    /// missing/NULL values for the field), matching the behavior of `min`,
+    /// `max`, and `avg`. When `None` or `Some(false)` (the default) the
+    /// result returns `"value": 0`, matching Elasticsearch.
+    ///
+    /// Intended for SQL-style consumers where `SUM` of zero rows is `NULL`
+    /// and must be distinguishable from a bucket that genuinely sums to `0`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub none_if_no_match: Option<bool>,
 }

 impl SumAggregation {
@@ -35,6 +45,7 @@ impl SumAggregation {
        Self {
            field: field_name,
            missing: None,
+            none_if_no_match: None,
        }
    }
    /// Returns the field name the aggregation is computed on.
@@ -59,8 +70,104 @@ impl IntermediateSum {
    pub fn merge_fruits(&mut self, other: IntermediateSum) {
        self.stats.merge_fruits(other.stats);
    }
-    /// Computes the final minimum value.
+    /// Computes the final sum value.
+    ///
+    /// Returns `None` when no values were collected, matching the Rust-side
+    /// behavior of `IntermediateMin`, `IntermediateMax`, and
+    /// `IntermediateAvg`. The Elasticsearch-vs-SQL choice for the
+    /// user-visible result is made at the boundary in
+    /// [`IntermediateMetricResult::into_final_metric_result`]: by default
+    /// `None` is coerced to `Some(0.0)` to match Elasticsearch
+    /// (`"value": 0`), and the [`SumAggregation::none_if_no_match`] flag
+    /// opts out of that coercion for SQL-style consumers.
    pub fn finalize(&self) -> Option<f64> {
-        Some(self.stats.finalize().sum)
+        let stats = self.stats.finalize();
+        if stats.count == 0 {
+            None
+        } else {
+            Some(stats.sum)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_sum_finalize_returns_none_when_no_values() {
+        // Default IntermediateSum has count=0 — finalize should return None,
+        // matching MIN/MAX/AVG behavior for all-NULL groups.
+        let sum = IntermediateSum::default();
+        assert_eq!(sum.finalize(), None);
+    }
+
+    #[test]
+    fn test_sum_finalize_returns_value_when_has_values() {
+        let mut sum = IntermediateSum::default();
+        // Merge in a result that has actual values
+        let stats = IntermediateStats {
+            count: 3,
+            sum: 42.0,
+            min: 10.0,
+            max: 20.0,
+            ..Default::default()
+        };
+        let other = IntermediateSum::from_stats(stats);
+        sum.merge_fruits(other);
+        assert_eq!(sum.finalize(), Some(42.0));
+    }
+
+    #[test]
+    fn test_sum_merge_two_empty_still_none() {
+        let mut a = IntermediateSum::default();
+        let b = IntermediateSum::default();
+        a.merge_fruits(b);
+        assert_eq!(a.finalize(), None);
+    }
+
+    #[test]
+    fn test_sum_aggregation_empty_index_default_matches_es() -> crate::Result<()> {
+        use serde_json::json;
+
+        use crate::aggregation::agg_req::Aggregations;
+        use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
+
+        // Empty index — sum has no values to collect.
+        let values: Vec<Vec<&str>> = vec![];
+        let index = get_test_index_from_terms(false, &values)?;
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "score_sum": { "sum": { "field": "score" } }
+        }))
+        .unwrap();
+
+        let res = exec_request(agg_req, &index)?;
+        // Default: match Elasticsearch — empty sum serializes as 0, not null.
+        assert_eq!(res["score_sum"]["value"], 0.0);
+        Ok(())
+    }
+
+    #[test]
+    fn test_sum_aggregation_empty_index_none_if_no_match_opt_in() -> crate::Result<()> {
+        use serde_json::json;
+
+        use crate::aggregation::agg_req::Aggregations;
+        use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
+
+        let values: Vec<Vec<&str>> = vec![];
+        let index = get_test_index_from_terms(false, &values)?;
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "score_sum": { "sum": { "field": "score", "none_if_no_match": true } }
+        }))
+        .unwrap();
+
+        let res = exec_request(agg_req, &index)?;
+        // Opt-in non-ES extension — empty sum serializes as null.
+        assert!(
+            res["score_sum"]["value"].is_null(),
+            "expected null, got {:?}",
+            res["score_sum"]["value"]
+        );
+        Ok(())
    }
 }
--- a/src/postings/block_segment_postings.rs
+++ b/src/postings/block_segment_postings.rs
@@ -287,6 +287,33 @@ impl BlockSegmentPostings {
        doc
    }

+    /// Returns the number of documents with a doc id strictly smaller than `target`
+    /// (i.e. the *rank* of `target` in this posting list).
+    ///
+    /// This jumps to the block that may contain `target` through the skip list, so no
+    /// skipped block is decoded; a single block is then decoded to locate `target`
+    /// within it. The cost is therefore `O(number_of_skip_list_entries)` plus one block
+    /// decode, rather than `O(doc_freq)`.
+    ///
+    /// Like [`Self::seek`], the underlying cursor only ever moves forward. This method
+    /// must be called with **non-decreasing** `target` values (galloping); calling it
+    /// with a `target` smaller than a previous one yields an incorrect result. `target`
+    /// must be a valid doc id (i.e. `target <= TERMINATED`), exactly as for `seek`.
+    ///
+    /// Edge cases: returns `0` when `target` is smaller than every doc id, and
+    /// `doc_freq()` when `target` is larger than every doc id.
+    pub fn rank(&mut self, target: DocId) -> u32 {
+        if self.doc_freq == 0 {
+            return 0;
+        }
+        // `within` = number of docs in the landed block with a doc id < target.
+        let within = self.seek(target);
+        // `remaining_docs` counts the landed block and everything after it, so the
+        // difference is the number of docs in all blocks strictly before it.
+        let docs_before_block = self.doc_freq - self.skip_reader.remaining_docs();
+        docs_before_block + within as u32
+    }
+
    pub(crate) fn position_offset(&self) -> u64 {
        self.skip_reader.position_offset()
    }
@@ -568,4 +595,38 @@ mod tests {
        assert_eq!(block_segments.docs(), &[1, 3, 5]);
        Ok(())
    }
+
+    #[test]
+    fn test_block_segment_postings_rank() -> crate::Result<()> {
+        // ~8 blocks worth of docs so the skip list is actually exercised.
+        let docs: Vec<DocId> = (0..1000u32).map(|i| i * 3).collect();
+        let mut block_postings = build_block_postings(&docs[..])?;
+        let doc_freq = block_postings.doc_freq();
+
+        // rank(target) must equal the number of docs strictly below target.
+        // Targets are queried in non-decreasing order, as the API requires.
+        // Each target is also a valid doc id (<= TERMINATED), as `rank` requires.
+        let targets = [
+            0u32, 1, 2, 3, 4, 299, 300, 301, 1500, 2996, 2997, 3000, 10_000,
+        ];
+        for &target in &targets {
+            let expected = docs.iter().filter(|&&d| d < target).count() as u32;
+            assert_eq!(
+                block_postings.rank(target),
+                expected,
+                "rank({target}) mismatch"
+            );
+        }
+
+        // Edge cases: target equal to the first doc -> 0, above the last doc -> doc_freq.
+        let mut fresh = build_block_postings(&docs[..])?;
+        assert_eq!(fresh.rank(0), 0);
+        let mut fresh = build_block_postings(&docs[..])?;
+        assert_eq!(fresh.rank(1_000_000), doc_freq);
+
+        // Empty postings: rank is always 0.
+        let mut empty = BlockSegmentPostings::empty();
+        assert_eq!(empty.rank(42), 0);
+        Ok(())
+    }
 }
--- a/src/postings/skip.rs
+++ b/src/postings/skip.rs
@@ -187,6 +187,12 @@ impl SkipReader {
        self.last_doc_in_block
    }

+    /// Number of docs from the start of the current block to the end of the postings
+    /// (i.e. the current block plus every block after it).
+    pub(crate) fn remaining_docs(&self) -> u32 {
+        self.remaining_docs
+    }
+
    pub fn position_offset(&self) -> u64 {
        self.position_offset
    }
Author	SHA1	Message	Date
trinity-1686a	4031d97bac	fix overflow on large jumps in linear sequence new limit prevent an overflow in eval which caused the residual to be 64b when a slop of zero would give a smaller one	2026-06-23 00:13:27 +02:00
Mohammad Dashti	799f7b4646	Built SUM final result in each branch directly. Keeps the empty-bucket coercion visible at the boundary instead of a shared binding, following the reviewer's suggested shape.	2026-06-16 03:10:30 +08:00
Mohammad Dashti	fc88d80726	docs: drop downstream-specific name from none_if_no_match doc The flag's purpose is described well enough by "SQL-style consumers"; no need to call out a specific downstream.	2026-06-16 03:10:30 +08:00
Mohammad Dashti	6a684e7c38	feat: opt-in none_if_no_match flag on SumAggregation for SQL-style null Switch the default serialized output of `sum` on empty / all-missing buckets back to `"value": 0` to match Elasticsearch, and gate the SQL-style `"value": null` behavior behind a new `none_if_no_match: Option<bool>` flag on `SumAggregation`. `IntermediateSum::finalize` still returns `Option<f64>` internally so the Rust API stays parallel to min/max/avg, but the ES-vs-SQL choice is made at the boundary in `IntermediateMetricResult::into_final_metric_result`: `None` is coerced to `Some(0.0)` unless `none_if_no_match` is set on the aggregation request. Adds `AggregationVariants::as_sum()` accessor for that boundary check and two end-to-end tests covering both the default ES behavior and the opt-in null behavior on an empty index.	2026-06-16 03:10:30 +08:00
Mohammad Dashti	94fe52cc67	docs: clarify SUM finalize returning None diverges from Elasticsearch Surface the trade-off in the doc comment so future reviewers see why this differs from ES (which returns "value": 0 for sum over empty/all-missing buckets) and what consumers (ParadeDB SQL NULL) the None variant is meant to serve.	2026-06-16 03:10:30 +08:00
Mohammad Dashti	2ff39f6f7f	fix: return None from SUM when no values were collected IntermediateSum::finalize() returned Some(0.0) even when count==0 (all documents had missing/NULL values). This differs from MIN, MAX, and AVG which all return None for count==0. The 0.0 came from IntermediateStats' default sum initialization. Consumers (like ParadeDB) that map None to SQL NULL were incorrectly getting 0 for SUM on all-NULL groups. Fixes paradedb/paradedb#4621	2026-06-16 03:10:30 +08:00
Windforce17	1d06328cb3	Add BlockSegmentPostings::rank() for skip-list-based positional counting Add a public rank(target) method on BlockSegmentPostings that returns the number of docs with a doc id strictly smaller than target. It jumps to the candidate block through the skip list and decodes a single block, so the cost is O(skip-list entries) + one block decode rather than O(doc_freq). This is a useful primitive for range counting over a posting list (e.g. number of matches in a [lo, hi) doc-id window) without iterating every matched doc. To support it, expose SkipReader::remaining_docs() (pub(crate)). Like seek(), rank() advances the cursor forward only and must be called with non-decreasing, valid (<= TERMINATED) targets. Adds a unit test covering multi-block lists and the below-first / above-last / empty edge cases.	2026-06-15 18:56:49 +08:00