Merge branch 'main' into transform-count-min-max

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2024-11-01 17:45:18 +08:00
16 changed files with 472 additions and 227 deletions

View File

@@ -0,0 +1,16 @@
# Change Log Level on the Fly
## HTTP API
example:
```bash
curl --data "trace;flow=debug" 127.0.0.1:4000/debug/log_level
```
The database will reply with something like:
```bash
Log Level changed from Some("info") to "trace;flow=debug"
```
The data is a string in the format `global_level;module1=level1;module2=level2;...`, following the same rules as `RUST_LOG`.
Each module is the module path whose log level is being set, and each level is the log level to apply to it. The log level can be one of the following: `trace`, `debug`, `info`, `warn`, `error`, `off` (case-insensitive).

View File

@@ -18,7 +18,6 @@ use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use common_meta::DatanodeId;
use common_runtime::JoinError;
use rand::distributions::WeightedError;
use snafu::{Location, Snafu};
use store_api::storage::RegionId;
use table::metadata::TableId;
@@ -32,6 +31,14 @@ use crate::pubsub::Message;
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("Failed to choose items"))]
ChooseItems {
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: rand::distributions::WeightedError,
},
#[snafu(display("Exceeded deadline, operation: {}", operation))]
ExceededDeadline {
#[snafu(implicit)]
@@ -643,20 +650,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to set weight array"))]
WeightArray {
#[snafu(source)]
error: WeightedError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Weight array is not set"))]
NotSetWeightArray {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unexpected table route type: {}", err_msg))]
UnexpectedLogicalRouteTable {
#[snafu(implicit)]
@@ -759,10 +752,9 @@ impl ErrorExt for Error {
| Error::NoEnoughAvailableNode { .. }
| Error::PublishMessage { .. }
| Error::Join { .. }
| Error::WeightArray { .. }
| Error::NotSetWeightArray { .. }
| Error::PeerUnavailable { .. }
| Error::ExceededDeadline { .. } => StatusCode::Internal,
| Error::ExceededDeadline { .. }
| Error::ChooseItems { .. } => StatusCode::Internal,
Error::Unsupported { .. } => StatusCode::Unsupported,

View File

@@ -12,29 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use common_meta::peer::Peer;
use snafu::ensure;
use super::weighted_choose::{WeightedChoose, WeightedItem};
use super::weighted_choose::WeightedChoose;
use crate::error;
use crate::error::Result;
use crate::metasrv::SelectTarget;
use crate::selector::SelectorOptions;
/// According to the `opts`, choose peers from the `weight_array` through `weighted_choose`.
pub fn choose_peers<W>(
mut weight_array: Vec<WeightedItem<Peer>>,
opts: &SelectorOptions,
weighted_choose: &mut W,
) -> Result<Vec<Peer>>
pub fn choose_peers<W>(opts: &SelectorOptions, weighted_choose: &mut W) -> Result<Vec<Peer>>
where
W: WeightedChoose<Peer>,
{
let min_required_items = opts.min_required_items;
ensure!(
!weight_array.is_empty(),
!weighted_choose.is_empty(),
error::NoEnoughAvailableNodeSnafu {
required: min_required_items,
available: 0_usize,
@@ -43,12 +37,11 @@ where
);
if opts.allow_duplication {
weighted_choose.set_weight_array(weight_array)?;
(0..min_required_items)
.map(|_| weighted_choose.choose_one())
.collect::<Result<_>>()
} else {
let weight_array_len = weight_array.len();
let weight_array_len = weighted_choose.len();
// When opts.allow_duplication is false, we need to check that the length of the weighted array is greater than
// or equal to min_required_items, otherwise it may cause an infinite loop.
@@ -61,33 +54,7 @@ where
}
);
if weight_array_len == min_required_items {
return Ok(weight_array.into_iter().map(|item| item.item).collect());
}
weighted_choose.set_weight_array(weight_array.clone())?;
// Assume min_required_items is 3, weight_array_len is 100, then we can choose 3 items from the weight array
// and return. But assume min_required_items is 99, weight_array_len is 100. It's not cheap to choose 99 items
// from the weight array. So we can reverse choose 1 item from the weight array, and return the remaining 99
// items.
if min_required_items * 2 > weight_array_len {
let select_num = weight_array_len - min_required_items;
let mut selected = HashSet::with_capacity(select_num);
while selected.len() < select_num {
let item = weighted_choose.reverse_choose_one()?;
selected.insert(item);
}
weight_array.retain(|item| !selected.contains(&item.item));
Ok(weight_array.into_iter().map(|item| item.item).collect())
} else {
let mut selected = HashSet::with_capacity(min_required_items);
while selected.len() < min_required_items {
let item = weighted_choose.choose_one()?;
selected.insert(item);
}
Ok(selected.into_iter().collect())
}
weighted_choose.choose_multiple(min_required_items)
}
}
@@ -110,7 +77,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -118,7 +84,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -126,7 +91,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -134,7 +98,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -142,7 +105,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
];
@@ -152,14 +114,11 @@ mod tests {
allow_duplication: false,
};
let selected_peers: HashSet<_> = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
)
.unwrap()
.into_iter()
.collect();
let selected_peers: HashSet<_> =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone()))
.unwrap()
.into_iter()
.collect();
assert_eq!(i, selected_peers.len());
}
@@ -169,11 +128,8 @@ mod tests {
allow_duplication: false,
};
let selected_result = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
);
let selected_result =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone()));
assert!(selected_result.is_err());
for i in 1..=50 {
@@ -182,12 +138,8 @@ mod tests {
allow_duplication: true,
};
let selected_peers = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
)
.unwrap();
let selected_peers =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone())).unwrap();
assert_eq!(i, selected_peers.len());
}

View File

@@ -48,13 +48,12 @@ impl Selector for LeaseBasedSelector {
addr: v.node_addr.clone(),
},
weight: 1,
reverse_weight: 1,
})
.collect();
// 3. choose peers by weight_array.
let weighted_choose = &mut RandomWeightedChoose::default();
let selected = choose_peers(weight_array, &opts, weighted_choose)?;
let mut weighted_choose = RandomWeightedChoose::new(weight_array);
let selected = choose_peers(&opts, &mut weighted_choose)?;
Ok(selected)
}

View File

@@ -19,7 +19,6 @@ use common_meta::key::TableMetadataManager;
use common_meta::peer::Peer;
use common_meta::rpc::router::find_leaders;
use common_telemetry::{debug, info};
use parking_lot::RwLock;
use snafu::ResultExt;
use table::metadata::TableId;
@@ -29,36 +28,30 @@ use crate::lease;
use crate::metasrv::SelectorContext;
use crate::selector::common::choose_peers;
use crate::selector::weight_compute::{RegionNumsBasedWeightCompute, WeightCompute};
use crate::selector::weighted_choose::{RandomWeightedChoose, WeightedChoose};
use crate::selector::weighted_choose::RandomWeightedChoose;
use crate::selector::{Namespace, Selector, SelectorOptions};
pub struct LoadBasedSelector<W, C> {
weighted_choose: RwLock<W>,
pub struct LoadBasedSelector<C> {
weight_compute: C,
}
impl<W, C> LoadBasedSelector<W, C> {
pub fn new(weighted_choose: W, weight_compute: C) -> Self {
Self {
weighted_choose: RwLock::new(weighted_choose),
weight_compute,
}
impl<C> LoadBasedSelector<C> {
pub fn new(weight_compute: C) -> Self {
Self { weight_compute }
}
}
impl Default for LoadBasedSelector<RandomWeightedChoose<Peer>, RegionNumsBasedWeightCompute> {
impl Default for LoadBasedSelector<RegionNumsBasedWeightCompute> {
fn default() -> Self {
Self {
weighted_choose: RwLock::new(RandomWeightedChoose::default()),
weight_compute: RegionNumsBasedWeightCompute,
}
}
}
#[async_trait::async_trait]
impl<W, C> Selector for LoadBasedSelector<W, C>
impl<C> Selector for LoadBasedSelector<C>
where
W: WeightedChoose<Peer>,
C: WeightCompute<Source = HashMap<DatanodeStatKey, DatanodeStatValue>>,
{
type Context = SelectorContext;
@@ -100,8 +93,8 @@ where
let weight_array = self.weight_compute.compute(&stat_kvs);
// 5. choose peers by weight_array.
let mut weighted_choose = self.weighted_choose.write();
let selected = choose_peers(weight_array, &opts, &mut *weighted_choose)?;
let mut weighted_choose = RandomWeightedChoose::new(weight_array);
let selected = choose_peers(&opts, &mut weighted_choose)?;
debug!(
"LoadBasedSelector select peers: {:?}, namespace: {}, opts: {:?}.",

View File

@@ -85,7 +85,6 @@ impl WeightCompute for RegionNumsBasedWeightCompute {
.map(|(peer, region_num)| WeightedItem {
item: peer,
weight: (max_weight - region_num + base_weight) as usize,
reverse_weight: (region_num - min_weight + base_weight) as usize,
})
.collect()
}
@@ -181,10 +180,6 @@ mod tests {
},
4,
);
for weight in weight_array.iter() {
assert_eq!(weight.reverse_weight, *expected.get(&weight.item).unwrap());
}
}
fn mock_stat_1() -> Stat {

View File

@@ -12,41 +12,37 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use rand::distributions::WeightedIndex;
use rand::prelude::Distribution;
use rand::seq::SliceRandom;
use rand::thread_rng;
use snafu::{ensure, ResultExt};
use snafu::ResultExt;
use crate::error;
use crate::error::Result;
/// A common trait for weighted balance algorithm.
pub trait WeightedChoose<Item>: Send + Sync {
/// The method will re-set weight array.
///
/// Note:
/// 1. make sure weight_array is not empty.
/// 2. the total weight is greater than 0.
///
/// Otherwise an error will be returned.
fn set_weight_array(&mut self, weight_array: Vec<WeightedItem<Item>>) -> Result<()>;
/// The method will choose one item.
///
/// If not set weight_array before, an error will be returned.
fn choose_one(&mut self) -> Result<Item>;
/// The method will reverse choose one item.
/// The method will choose multiple items.
///
/// If not set weight_array before, an error will be returned.
fn reverse_choose_one(&mut self) -> Result<Item>;
/// Returns less than `amount` items if the weight_array is not enough.
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>>;
/// Returns the length of the weight_array.
fn len(&self) -> usize;
/// Returns whether the weight_array is empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
/// The struct represents a weighted item.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WeightedItem<Item> {
pub item: Item,
pub weight: usize,
pub reverse_weight: usize,
}
/// A implementation of weighted balance: random weighted choose.
@@ -64,16 +60,18 @@ pub struct WeightedItem<Item> {
/// ```
pub struct RandomWeightedChoose<Item> {
items: Vec<WeightedItem<Item>>,
weighted_index: Option<WeightedIndex<usize>>,
reverse_weighted_index: Option<WeightedIndex<usize>>,
}
impl<Item> RandomWeightedChoose<Item> {
pub fn new(items: Vec<WeightedItem<Item>>) -> Self {
Self { items }
}
}
impl<Item> Default for RandomWeightedChoose<Item> {
fn default() -> Self {
Self {
items: Vec::default(),
weighted_index: None,
reverse_weighted_index: None,
}
}
}
@@ -82,48 +80,29 @@ impl<Item> WeightedChoose<Item> for RandomWeightedChoose<Item>
where
Item: Clone + Send + Sync,
{
fn set_weight_array(&mut self, weight_array: Vec<WeightedItem<Item>>) -> Result<()> {
self.weighted_index = Some(
WeightedIndex::new(weight_array.iter().map(|item| item.weight))
.context(error::WeightArraySnafu)?,
);
self.reverse_weighted_index = Some(
WeightedIndex::new(weight_array.iter().map(|item| item.reverse_weight))
.context(error::WeightArraySnafu)?,
);
self.items = weight_array;
Ok(())
}
fn choose_one(&mut self) -> Result<Item> {
ensure!(
!self.items.is_empty() && self.weighted_index.is_some(),
error::NotSetWeightArraySnafu
);
// unwrap safety: whether weighted_index is none has been checked before.
let weighted_index = self.weighted_index.as_ref().unwrap();
Ok(self.items[weighted_index.sample(&mut thread_rng())]
let item = self
.items
.choose_weighted(&mut thread_rng(), |item| item.weight as f64)
.context(error::ChooseItemsSnafu)?
.item
.clone())
.clone();
Ok(item)
}
fn reverse_choose_one(&mut self) -> Result<Item> {
ensure!(
!self.items.is_empty() && self.reverse_weighted_index.is_some(),
error::NotSetWeightArraySnafu
);
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>> {
Ok(self
.items
.choose_multiple_weighted(&mut thread_rng(), amount, |item| item.weight as f64)
.context(error::ChooseItemsSnafu)?
.cloned()
.map(|item| item.item)
.collect::<Vec<_>>())
}
// unwrap safety: whether reverse_weighted_index is none has been checked before.
let reverse_weighted_index = self.reverse_weighted_index.as_ref().unwrap();
Ok(self.items[reverse_weighted_index.sample(&mut thread_rng())]
.item
.clone())
fn len(&self) -> usize {
self.items.len()
}
}
@@ -133,45 +112,22 @@ mod tests {
#[test]
fn test_random_weighted_choose() {
let mut choose = RandomWeightedChoose::default();
choose
.set_weight_array(vec![
WeightedItem {
item: 1,
weight: 100,
reverse_weight: 0,
},
WeightedItem {
item: 2,
weight: 0,
reverse_weight: 100,
},
])
.unwrap();
let mut choose = RandomWeightedChoose::new(vec![
WeightedItem {
item: 1,
weight: 100,
},
WeightedItem { item: 2, weight: 0 },
]);
for _ in 0..100 {
let ret = choose.choose_one().unwrap();
assert_eq!(1, ret);
}
for _ in 0..100 {
let ret = choose.reverse_choose_one().unwrap();
assert_eq!(2, ret);
let ret = choose.choose_multiple(3).unwrap();
assert_eq!(vec![1, 2], ret);
}
}
#[test]
#[should_panic]
fn test_random_weighted_choose_should_panic() {
let mut choose: RandomWeightedChoose<u32> = RandomWeightedChoose::default();
choose.set_weight_array(vec![]).unwrap();
let _ = choose.choose_one().unwrap();
}
#[test]
#[should_panic]
fn test_random_reverse_weighted_choose_should_panic() {
let mut choose: RandomWeightedChoose<u32> = RandomWeightedChoose::default();
choose.set_weight_array(vec![]).unwrap();
let _ = choose.reverse_choose_one().unwrap();
}
}

View File

@@ -80,18 +80,15 @@ impl CacheManager {
CacheManagerBuilder::default()
}
/// Gets cached [ParquetMetaData].
/// Gets cached [ParquetMetaData] from in-memory cache first.
/// If not found, tries to get it from write cache and fill the in-memory cache.
pub async fn get_parquet_meta_data(
&self,
region_id: RegionId,
file_id: FileId,
) -> Option<Arc<ParquetMetaData>> {
// Try to get metadata from sst meta cache
let metadata = self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
let value = sst_meta_cache.get(&SstMetaKey(region_id, file_id));
update_hit_miss(value, SST_META_TYPE)
});
let metadata = self.get_parquet_meta_data_from_mem_cache(region_id, file_id);
if metadata.is_some() {
return metadata;
}
@@ -110,6 +107,20 @@ impl CacheManager {
None
}
/// Gets cached [ParquetMetaData] from in-memory cache.
/// This method does not perform I/O.
pub fn get_parquet_meta_data_from_mem_cache(
&self,
region_id: RegionId,
file_id: FileId,
) -> Option<Arc<ParquetMetaData>> {
// Try to get metadata from sst meta cache
self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
let value = sst_meta_cache.get(&SstMetaKey(region_id, file_id));
update_hit_miss(value, SST_META_TYPE)
})
}
/// Puts [ParquetMetaData] into the cache.
pub fn put_parquet_meta_data(
&self,

View File

@@ -18,15 +18,17 @@ use common_time::Timestamp;
use smallvec::{smallvec, SmallVec};
use store_api::region_engine::PartitionRange;
use crate::cache::CacheManager;
use crate::memtable::MemtableRef;
use crate::read::scan_region::ScanInput;
use crate::sst::file::{overlaps, FileHandle, FileTimeRange};
use crate::sst::parquet::format::parquet_row_group_time_range;
use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
const ALL_ROW_GROUPS: i64 = -1;
/// Index to access a row group.
#[derive(Clone, Copy, PartialEq)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) struct RowGroupIndex {
/// Index to the memtable/file.
pub(crate) index: usize,
@@ -38,6 +40,7 @@ pub(crate) struct RowGroupIndex {
/// Meta data of a partition range.
/// If the scanner is [UnorderedScan], each meta only has one row group or memtable.
/// If the scanner is [SeqScan], each meta may have multiple row groups and memtables.
#[derive(Debug, PartialEq)]
pub(crate) struct RangeMeta {
/// The time range of the range.
pub(crate) time_range: FileTimeRange,
@@ -84,7 +87,12 @@ impl RangeMeta {
pub(crate) fn unordered_scan_ranges(input: &ScanInput) -> Vec<RangeMeta> {
let mut ranges = Vec::with_capacity(input.memtables.len() + input.files.len());
Self::push_unordered_mem_ranges(&input.memtables, &mut ranges);
Self::push_unordered_file_ranges(input.memtables.len(), &input.files, &mut ranges);
Self::push_unordered_file_ranges(
input.memtables.len(),
&input.files,
input.cache_manager.as_deref(),
&mut ranges,
);
ranges
}
@@ -164,12 +172,36 @@ impl RangeMeta {
fn push_unordered_file_ranges(
num_memtables: usize,
files: &[FileHandle],
cache: Option<&CacheManager>,
ranges: &mut Vec<RangeMeta>,
) {
// For append mode, we can parallelize reading row groups.
for (i, file) in files.iter().enumerate() {
let file_index = num_memtables + i;
if file.meta_ref().num_row_groups > 0 {
// Get parquet meta from the cache.
let parquet_meta = cache.and_then(|c| {
c.get_parquet_meta_data_from_mem_cache(file.region_id(), file.file_id())
});
if let Some(parquet_meta) = parquet_meta {
// Scans each row group.
for row_group_index in 0..file.meta_ref().num_row_groups {
let time_range = parquet_row_group_time_range(
file.meta_ref(),
&parquet_meta,
row_group_index as usize,
);
let num_rows = parquet_meta.row_group(row_group_index as usize).num_rows();
ranges.push(RangeMeta {
time_range: time_range.unwrap_or_else(|| file.time_range()),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: row_group_index as i64,
}],
num_rows: num_rows as usize,
});
}
} else if file.meta_ref().num_row_groups > 0 {
// Scans each row group.
for row_group_index in 0..file.meta_ref().num_row_groups {
ranges.push(RangeMeta {
@@ -217,7 +249,6 @@ impl RangeMeta {
}
}
// TODO(yingwen): Support multiple row groups in a range so we can split them later.
fn push_seq_file_ranges(
num_memtables: usize,
files: &[FileHandle],
@@ -226,15 +257,31 @@ impl RangeMeta {
// For non append-only mode, each range only contains one file.
for (i, file) in files.iter().enumerate() {
let file_index = num_memtables + i;
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: ALL_ROW_GROUPS,
}],
num_rows: file.meta_ref().num_rows as usize,
});
if file.meta_ref().num_row_groups > 0 {
// All row groups share the same time range.
let row_group_indices = (0..file.meta_ref().num_row_groups)
.map(|row_group_index| RowGroupIndex {
index: file_index,
row_group_index: row_group_index as i64,
})
.collect();
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices,
num_rows: file.meta_ref().num_rows as usize,
});
} else {
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: ALL_ROW_GROUPS,
}],
num_rows: file.meta_ref().num_rows as usize,
});
}
}
}
}
@@ -366,4 +413,212 @@ mod tests {
&[(vec![3], 0, 1000), (vec![1, 2], 3000, 6000)],
);
}
#[test]
fn test_merge_range() {
let mut left = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
}
],
num_rows: 5,
};
let right = RangeMeta {
time_range: (Timestamp::new_second(800), Timestamp::new_second(1200)),
indices: smallvec![2],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 2
}
],
num_rows: 4,
};
left.merge(right);
assert_eq!(
left,
RangeMeta {
time_range: (Timestamp::new_second(800), Timestamp::new_second(2000)),
indices: smallvec![1, 2],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
},
RowGroupIndex {
index: 2,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 2
},
],
num_rows: 9,
}
);
}
#[test]
fn test_split_range() {
let range = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
}
],
num_rows: 5,
};
assert!(range.can_split_preserve_order());
let mut output = Vec::new();
range.maybe_split(&mut output);
assert_eq!(
output,
&[
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 1
},],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 2
}],
num_rows: 2,
}
]
);
}
#[test]
fn test_not_split_range() {
let range = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1, 2],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 1
}
],
num_rows: 5,
};
assert!(!range.can_split_preserve_order());
let mut output = Vec::new();
range.maybe_split(&mut output);
assert_eq!(1, output.len());
}
#[test]
fn test_maybe_split_ranges() {
let ranges = vec![
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 0
},
RowGroupIndex {
index: 1,
row_group_index: 1
}
],
num_rows: 4,
},
RangeMeta {
time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)),
indices: smallvec![2, 3],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 0
},
RowGroupIndex {
index: 3,
row_group_index: 0
}
],
num_rows: 5,
},
];
let output = maybe_split_ranges_for_seq_scan(ranges);
assert_eq!(
output,
vec![
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 0
},],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 1
}],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)),
indices: smallvec![2, 3],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 0
},
RowGroupIndex {
index: 3,
row_group_index: 0
}
],
num_rows: 5,
},
]
)
}
}

View File

@@ -111,7 +111,8 @@ pub struct FileMeta {
pub region_id: RegionId,
/// Compared to normal file names, FileId ignore the extension
pub file_id: FileId,
/// Timestamp range of file.
/// Timestamp range of file. The timestamps have the same time unit as the
/// data in the SST.
pub time_range: FileTimeRange,
/// SST level of the file.
pub level: Level,

View File

@@ -62,7 +62,8 @@ impl Default for WriteOptions {
/// Parquet SST info returned by the writer.
pub struct SstInfo {
/// Time range of the SST.
/// Time range of the SST. The timestamps have the same time unit as the
/// data in the SST.
pub time_range: FileTimeRange,
/// File size in bytes.
pub file_size: u64,

View File

@@ -31,13 +31,14 @@ use std::collections::{HashMap, VecDeque};
use std::sync::Arc;
use api::v1::SemanticType;
use common_time::Timestamp;
use datafusion_common::ScalarValue;
use datatypes::arrow::array::{ArrayRef, BinaryArray, DictionaryArray, UInt32Array, UInt64Array};
use datatypes::arrow::datatypes::{SchemaRef, UInt32Type};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::DataType;
use datatypes::vectors::{Helper, Vector};
use parquet::file::metadata::RowGroupMetaData;
use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
use parquet::file::statistics::Statistics;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::metadata::{ColumnMetadata, RegionMetadataRef};
@@ -48,6 +49,7 @@ use crate::error::{
};
use crate::read::{Batch, BatchBuilder, BatchColumn};
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
use crate::sst::file::{FileMeta, FileTimeRange};
use crate::sst::to_sst_arrow_schema;
/// Arrow array type for the primary key dictionary.
@@ -558,6 +560,50 @@ fn new_primary_key_array(primary_key: &[u8], num_rows: usize) -> ArrayRef {
Arc::new(DictionaryArray::new(keys, values))
}
/// Gets the min/max time index of the row group from the parquet meta.
/// It assumes the parquet is created by the mito engine.
pub(crate) fn parquet_row_group_time_range(
file_meta: &FileMeta,
parquet_meta: &ParquetMetaData,
row_group_idx: usize,
) -> Option<FileTimeRange> {
let row_group_meta = parquet_meta.row_group(row_group_idx);
let num_columns = parquet_meta.file_metadata().schema_descr().num_columns();
assert!(
num_columns >= FIXED_POS_COLUMN_NUM,
"file only has {} columns",
num_columns
);
let time_index_pos = num_columns - FIXED_POS_COLUMN_NUM;
let stats = row_group_meta.column(time_index_pos).statistics()?;
if stats.has_min_max_set() {
// The physical type for the timestamp should be i64.
let (min, max) = match stats {
Statistics::Int64(value_stats) => (*value_stats.min(), *value_stats.max()),
Statistics::Int32(_)
| Statistics::Boolean(_)
| Statistics::Int96(_)
| Statistics::Float(_)
| Statistics::Double(_)
| Statistics::ByteArray(_)
| Statistics::FixedLenByteArray(_) => return None,
};
debug_assert!(
min >= file_meta.time_range.0.value() && min <= file_meta.time_range.1.value()
);
debug_assert!(
max >= file_meta.time_range.0.value() && max <= file_meta.time_range.1.value()
);
let unit = file_meta.time_range.0.unit();
Some((Timestamp::new(min, unit), Timestamp::new(max, unit)))
} else {
None
}
}
#[cfg(test)]
mod tests {
use api::v1::OpType;

View File

@@ -101,6 +101,7 @@ impl WindowedSortPhysicalRule {
} else {
Arc::new(PartSortExec::new(
first_sort_expr.clone(),
sort_exec.fetch(),
scanner_info.partition_ranges.clone(),
sort_exec.input().clone(),
))

View File

@@ -47,6 +47,7 @@ use crate::downcast_ts_array;
pub struct PartSortExec {
/// Physical sort expressions(that is, sort by timestamp)
expression: PhysicalSortExpr,
limit: Option<usize>,
input: Arc<dyn ExecutionPlan>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
@@ -57,6 +58,7 @@ pub struct PartSortExec {
impl PartSortExec {
pub fn new(
expression: PhysicalSortExpr,
limit: Option<usize>,
partition_ranges: Vec<Vec<PartitionRange>>,
input: Arc<dyn ExecutionPlan>,
) -> Self {
@@ -69,6 +71,7 @@ impl PartSortExec {
Self {
expression,
limit,
input,
metrics,
partition_ranges,
@@ -95,6 +98,7 @@ impl PartSortExec {
let df_stream = Box::pin(PartSortStream::new(
context,
self,
self.limit,
input_stream,
self.partition_ranges[partition].clone(),
partition,
@@ -106,7 +110,16 @@ impl PartSortExec {
impl DisplayAs for PartSortExec {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "PartSortExec {}", self.expression)
write!(
f,
"PartSortExec: expr={} num_ranges={}",
self.expression,
self.partition_ranges.len(),
)?;
if let Some(limit) = self.limit {
write!(f, " limit={}", limit)?;
}
Ok(())
}
}
@@ -138,6 +151,7 @@ impl ExecutionPlan for PartSortExec {
};
Ok(Arc::new(Self::new(
self.expression.clone(),
self.limit,
self.partition_ranges.clone(),
new_input.clone(),
)))
@@ -170,6 +184,7 @@ struct PartSortStream {
reservation: MemoryReservation,
buffer: Vec<DfRecordBatch>,
expression: PhysicalSortExpr,
limit: Option<usize>,
produced: usize,
input: DfSendableRecordBatchStream,
input_complete: bool,
@@ -185,6 +200,7 @@ impl PartSortStream {
fn new(
context: Arc<TaskContext>,
sort: &PartSortExec,
limit: Option<usize>,
input: DfSendableRecordBatchStream,
partition_ranges: Vec<PartitionRange>,
partition: usize,
@@ -194,6 +210,7 @@ impl PartSortStream {
.register(&context.runtime_env().memory_pool),
buffer: Vec::new(),
expression: sort.expression.clone(),
limit,
produced: 0,
input,
input_complete: false,
@@ -294,7 +311,7 @@ impl PartSortStream {
)
})?;
let indices = sort_to_indices(&sort_column, opt, None).map_err(|e| {
let indices = sort_to_indices(&sort_column, opt, self.limit).map_err(|e| {
DataFusionError::ArrowError(
e,
Some(format!("Fail to sort to indices at {}", location!())),
@@ -674,6 +691,7 @@ mod test {
expr: Arc::new(Column::new("ts", 0)),
options: opt,
},
None,
vec![ranges],
Arc::new(mock_input),
);

View File

@@ -169,7 +169,16 @@ impl WindowedSortExec {
impl DisplayAs for WindowedSortExec {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "WindowedSortExec")
write!(
f,
"WindowedSortExec: expr={} num_ranges={}",
self.expression,
self.ranges.len()
)?;
if let Some(fetch) = self.fetch {
write!(f, " fetch={}", fetch)?;
}
Ok(())
}
}

View File

@@ -69,7 +69,7 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=2 fetch=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
@@ -101,8 +101,8 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t DESC LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@1 DESC] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@1 DESC REDACTED
|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=2 fetch=5 REDACTED
|_|_|_PartSortExec: expr=t@1 DESC num_ranges=2 limit=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
@@ -183,8 +183,8 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@2 ASC NULLS LAST REDACTED
|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=2 fetch=5 REDACTED
|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=2 limit=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
@@ -216,8 +216,8 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t DESC LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@2 DESC] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@2 DESC REDACTED
|_|_|_WindowedSortExec: expr=t@2 DESC num_ranges=2 fetch=5 REDACTED
|_|_|_PartSortExec: expr=t@2 DESC num_ranges=2 limit=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|