fix!: fix regression caused by unbalanced partitions and splitting ranges (#5090)

* feat: assign partition ranges by rows * feat: balance partition rows * feat: get uppoer bound for part nums * feat: only split in non-compaction seq scan * fix: parallel scan on multiple sources * fix: can split check * feat: scanner prepare by request * feat: remove scan_parallelism * docs: upate docs * chore: update comment * style: fix clippy * feat: skip merge and dedup if there is only one source * chore: Revert "feat: skip merge and dedup if there is only one source" Since memtable won't do dedup jobs This reverts commit 2fc7a54b11. * test: avoid compaction in sqlness window sort test * chore: do not create semaphore if num partitions is enough * chore: more assertions * chore: fix typo * fix: compaction flag not set * chore: address review comments
2026-05-25 09:20:40 +00:00 · 2024-12-09 20:50:57 +08:00
parent 1b642ea6a9
commit 2fcb95f50a
19 changed files with 560 additions and 309 deletions
--- a/src/query/src/optimizer/parallelize_scan.rs
+++ b/src/query/src/optimizer/parallelize_scan.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::collections::BinaryHeap;
 use std::sync::Arc;

 use common_telemetry::debug;
@@ -93,7 +94,7 @@ impl ParallelizeScan {

                    // update the partition ranges
                    let new_exec = region_scan_exec
-                        .with_new_partitions(partition_ranges)
+                        .with_new_partitions(partition_ranges, expected_partition_num)
                        .map_err(|e| DataFusionError::External(e.into_inner()))?;
                    return Ok(Transformed::yes(Arc::new(new_exec)));
                }
@@ -109,21 +110,71 @@ impl ParallelizeScan {

    /// Distribute [`PartitionRange`]s to each partition.
    ///
-    /// Currently we use a simple round-robin strategy to assign ranges to partitions.
+    /// Currently we assign ranges to partitions according to their rows so each partition
+    /// has similar number of rows.
    /// This method may return partitions with smaller number than `expected_partition_num`
    /// if the number of ranges is smaller than `expected_partition_num`. But this will
    /// return at least one partition.
    fn assign_partition_range(
-        ranges: Vec<PartitionRange>,
+        mut ranges: Vec<PartitionRange>,
        expected_partition_num: usize,
    ) -> Vec<Vec<PartitionRange>> {
-        let actual_partition_num = expected_partition_num.min(ranges.len()).max(1);
+        if ranges.is_empty() {
+            // Returns a single partition with no range.
+            return vec![vec![]];
+        }
+
+        if ranges.len() == 1 {
+            return vec![ranges];
+        }
+
+        // Sort ranges by number of rows in descending order.
+        ranges.sort_by(|a, b| b.num_rows.cmp(&a.num_rows));
+        // Get the max row number of the ranges. Note that the number of rows may be 0 if statistics are not available.
+        let max_rows = ranges[0].num_rows;
+        let total_rows = ranges.iter().map(|range| range.num_rows).sum::<usize>();
+        // Computes the partition num by the max row number. This eliminates the unbalance of the partitions.
+        let balanced_partition_num = if max_rows > 0 {
+            total_rows.div_ceil(max_rows)
+        } else {
+            ranges.len()
+        };
+        let actual_partition_num = expected_partition_num.min(balanced_partition_num).max(1);
        let mut partition_ranges = vec![vec![]; actual_partition_num];

-        // round-robin assignment
-        for (i, range) in ranges.into_iter().enumerate() {
-            let partition_idx = i % expected_partition_num;
+        #[derive(Eq, PartialEq)]
+        struct HeapNode {
+            num_rows: usize,
+            partition_idx: usize,
+        }
+
+        impl Ord for HeapNode {
+            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+                // Reverse for min-heap.
+                self.num_rows.cmp(&other.num_rows).reverse()
+            }
+        }
+
+        impl PartialOrd for HeapNode {
+            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+                Some(self.cmp(other))
+            }
+        }
+
+        let mut part_heap =
+            BinaryHeap::from_iter((0..actual_partition_num).map(|partition_idx| HeapNode {
+                num_rows: 0,
+                partition_idx,
+            }));
+
+        // Assigns the range to the partition with the smallest number of rows.
+        for range in ranges {
+            // Safety: actual_partition_num always > 0.
+            let mut node = part_heap.pop().unwrap();
+            let partition_idx = node.partition_idx;
+            node.num_rows += range.num_rows;
            partition_ranges[partition_idx].push(range);
+            part_heap.push(node);
        }

        partition_ranges
@@ -172,18 +223,18 @@ mod test {
            ParallelizeScan::assign_partition_range(ranges.clone(), expected_partition_num);
        let expected = vec![
            vec![
+                PartitionRange {
+                    start: Timestamp::new(30, TimeUnit::Second),
+                    end: Timestamp::new(40, TimeUnit::Second),
+                    num_rows: 250,
+                    identifier: 4,
+                },
                PartitionRange {
                    start: Timestamp::new(0, TimeUnit::Second),
                    end: Timestamp::new(10, TimeUnit::Second),
                    num_rows: 100,
                    identifier: 1,
                },
-                PartitionRange {
-                    start: Timestamp::new(20, TimeUnit::Second),
-                    end: Timestamp::new(30, TimeUnit::Second),
-                    num_rows: 150,
-                    identifier: 3,
-                },
            ],
            vec![
                PartitionRange {
@@ -193,10 +244,10 @@ mod test {
                    identifier: 2,
                },
                PartitionRange {
-                    start: Timestamp::new(30, TimeUnit::Second),
-                    end: Timestamp::new(40, TimeUnit::Second),
-                    num_rows: 250,
-                    identifier: 4,
+                    start: Timestamp::new(20, TimeUnit::Second),
+                    end: Timestamp::new(30, TimeUnit::Second),
+                    num_rows: 150,
+                    identifier: 3,
                },
            ],
        ];
@@ -207,10 +258,10 @@ mod test {
        let result = ParallelizeScan::assign_partition_range(ranges, expected_partition_num);
        let expected = vec![
            vec![PartitionRange {
-                start: Timestamp::new(0, TimeUnit::Second),
-                end: Timestamp::new(10, TimeUnit::Second),
-                num_rows: 100,
-                identifier: 1,
+                start: Timestamp::new(30, TimeUnit::Second),
+                end: Timestamp::new(40, TimeUnit::Second),
+                num_rows: 250,
+                identifier: 4,
            }],
            vec![PartitionRange {
                start: Timestamp::new(10, TimeUnit::Second),
@@ -218,18 +269,20 @@ mod test {
                num_rows: 200,
                identifier: 2,
            }],
-            vec![PartitionRange {
-                start: Timestamp::new(20, TimeUnit::Second),
-                end: Timestamp::new(30, TimeUnit::Second),
-                num_rows: 150,
-                identifier: 3,
-            }],
-            vec![PartitionRange {
-                start: Timestamp::new(30, TimeUnit::Second),
-                end: Timestamp::new(40, TimeUnit::Second),
-                num_rows: 250,
-                identifier: 4,
-            }],
+            vec![
+                PartitionRange {
+                    start: Timestamp::new(20, TimeUnit::Second),
+                    end: Timestamp::new(30, TimeUnit::Second),
+                    num_rows: 150,
+                    identifier: 3,
+                },
+                PartitionRange {
+                    start: Timestamp::new(0, TimeUnit::Second),
+                    end: Timestamp::new(10, TimeUnit::Second),
+                    num_rows: 100,
+                    identifier: 1,
+                },
+            ],
        ];
        assert_eq!(result, expected);

@@ -237,4 +290,68 @@ mod test {
        let result = ParallelizeScan::assign_partition_range(vec![], 5);
        assert_eq!(result.len(), 1);
    }
+
+    #[test]
+    fn test_assign_unbalance_partition_range() {
+        let ranges = vec![
+            PartitionRange {
+                start: Timestamp::new(0, TimeUnit::Second),
+                end: Timestamp::new(10, TimeUnit::Second),
+                num_rows: 100,
+                identifier: 1,
+            },
+            PartitionRange {
+                start: Timestamp::new(10, TimeUnit::Second),
+                end: Timestamp::new(20, TimeUnit::Second),
+                num_rows: 200,
+                identifier: 2,
+            },
+            PartitionRange {
+                start: Timestamp::new(20, TimeUnit::Second),
+                end: Timestamp::new(30, TimeUnit::Second),
+                num_rows: 150,
+                identifier: 3,
+            },
+            PartitionRange {
+                start: Timestamp::new(30, TimeUnit::Second),
+                end: Timestamp::new(40, TimeUnit::Second),
+                num_rows: 2500,
+                identifier: 4,
+            },
+        ];
+
+        // assign to 2 partitions
+        let expected_partition_num = 2;
+        let result =
+            ParallelizeScan::assign_partition_range(ranges.clone(), expected_partition_num);
+        let expected = vec![
+            vec![PartitionRange {
+                start: Timestamp::new(30, TimeUnit::Second),
+                end: Timestamp::new(40, TimeUnit::Second),
+                num_rows: 2500,
+                identifier: 4,
+            }],
+            vec![
+                PartitionRange {
+                    start: Timestamp::new(10, TimeUnit::Second),
+                    end: Timestamp::new(20, TimeUnit::Second),
+                    num_rows: 200,
+                    identifier: 2,
+                },
+                PartitionRange {
+                    start: Timestamp::new(20, TimeUnit::Second),
+                    end: Timestamp::new(30, TimeUnit::Second),
+                    num_rows: 150,
+                    identifier: 3,
+                },
+                PartitionRange {
+                    start: Timestamp::new(0, TimeUnit::Second),
+                    end: Timestamp::new(10, TimeUnit::Second),
+                    num_rows: 100,
+                    identifier: 1,
+                },
+            ],
+        ];
+        assert_eq!(result, expected);
+    }
 }