GreptimeTeam/greptimedb mirror (https://github.com/GreptimeTeam/greptimedb.git)

Compare commits: feat/index...chore/test (18 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 5ce5e41296 | |
| | ba4eda40e5 | |
| | f06a64ff90 | |
| | b8c362ec65 | |
| | 84b4777925 | |
| | a26dee0ca1 | |
| | 276f6bf026 | |
| | 1d5291b06d | |
| | 564cc0c750 | |
| | f1abe5d215 | |
| | ab426cbf89 | |
| | cb0f1afb01 | |
| | a22d08f1b1 | |
| | 6817a376b5 | |
| | 4d1a587079 | |
| | 9f1aefe98f | |
| | 2f9130a2de | |
| | fa2b4e5e63 | |
Cargo.lock (generated): 36 changed lines

@@ -738,12 +738,12 @@ dependencies = [
"api",
"async-trait",
"common-base",
"common-config",
"common-error",
"common-macro",
"common-telemetry",
"common-test-util",
"digest",
"notify",
"sha1",
"snafu 0.8.6",
"sql",
@@ -2055,6 +2055,7 @@ dependencies = [
"datanode",
"humantime-serde",
"meta-client",
"notify",
"object-store",
"serde",
"serde_json",
@@ -2253,6 +2254,7 @@ dependencies = [
"arrow-flight",
"bytes",
"common-base",
"common-config",
"common-error",
"common-macro",
"common-recordbatch",
@@ -2266,7 +2268,6 @@ dependencies = [
"hyper 1.6.0",
"hyper-util",
"lazy_static",
"notify",
"prost 0.13.5",
"rand 0.9.1",
"serde",
@@ -2845,6 +2846,15 @@ dependencies = [
"unicode-segmentation",
]

[[package]]
name = "convert_case"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9"
dependencies = [
"unicode-segmentation",
]

[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -3741,9 +3751,9 @@ dependencies = [

[[package]]
name = "datafusion-pg-catalog"
version = "0.12.2"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "755393864c0c2dd95575ceed4b25e348686028e1b83d06f8f39914209999f821"
checksum = "09bfd1feed7ed335227af0b65955ed825e467cf67fad6ecd089123202024cfd1"
dependencies = [
"async-trait",
"datafusion",
@@ -4184,21 +4194,23 @@ dependencies = [

[[package]]
name = "derive_more"
version = "1.0.0"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618"
dependencies = [
"derive_more-impl",
]

[[package]]
name = "derive_more-impl"
version = "1.0.0"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b"
dependencies = [
"convert_case 0.10.0",
"proc-macro2",
"quote",
"rustc_version",
"syn 2.0.106",
"unicode-xid",
]
@@ -4915,6 +4927,7 @@ dependencies = [
"async-stream",
"async-trait",
"auth",
"axum 0.8.4",
"bytes",
"cache",
"catalog",
@@ -4949,9 +4962,11 @@ dependencies = [
"hostname 0.4.1",
"humantime",
"humantime-serde",
"hyper-util",
"lazy_static",
"log-query",
"meta-client",
"meta-srv",
"num_cpus",
"opentelemetry-proto",
"operator",
@@ -4963,6 +4978,7 @@ dependencies = [
"prost 0.13.5",
"query",
"rand 0.9.1",
"reqwest",
"serde",
"serde_json",
"servers",
@@ -5351,7 +5367,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0df99f09f1d6785055b2d9da96fc4ecc2bdf6803#0df99f09f1d6785055b2d9da96fc4ecc2bdf6803"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0423fa30203187c75e2937a668df1da699c8b96c#0423fa30203187c75e2937a668df1da699c8b96c"
dependencies = [
"prost 0.13.5",
"prost-types 0.13.5",
@@ -10837,7 +10853,7 @@ dependencies = [
[[package]]
name = "rskafka"
version = "0.6.0"
source = "git+https://github.com/WenyXu/rskafka.git?rev=7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76#7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76"
source = "git+https://github.com/GreptimeTeam/rskafka.git?rev=f5688f83e7da591cda3f2674c2408b4c0ed4ed50#f5688f83e7da591cda3f2674c2408b4c0ed4ed50"
dependencies = [
"bytes",
"chrono",

@@ -131,7 +131,7 @@ datafusion-functions = "50"
datafusion-functions-aggregate-common = "50"
datafusion-optimizer = "50"
datafusion-orc = "0.5"
datafusion-pg-catalog = "0.12.2"
datafusion-pg-catalog = "0.12.3"
datafusion-physical-expr = "50"
datafusion-physical-plan = "50"
datafusion-sql = "50"
@@ -139,6 +139,7 @@ datafusion-substrait = "50"
deadpool = "0.12"
deadpool-postgres = "0.14"
derive_builder = "0.20"
derive_more = { version = "2.1", features = ["full"] }
dotenv = "0.15"
either = "1.15"
etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62df834f0cffda355eba96691fe1a9a332b75a7", features = [
@@ -148,7 +149,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0df99f09f1d6785055b2d9da96fc4ecc2bdf6803" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0423fa30203187c75e2937a668df1da699c8b96c" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -200,7 +201,8 @@ reqwest = { version = "0.12", default-features = false, features = [
"stream",
"multipart",
] }
rskafka = { git = "https://github.com/WenyXu/rskafka.git", rev = "7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76", features = [
# Branch: feat/request-timeout
rskafka = { git = "https://github.com/GreptimeTeam/rskafka.git", rev = "f5688f83e7da591cda3f2674c2408b4c0ed4ed50", features = [
"transport-tls",
] }
rstest = "0.25"
flake.lock (generated): 20 changed lines

@@ -8,11 +8,11 @@
"rust-analyzer-src": "rust-analyzer-src"
},
"locked": {
"lastModified": 1760078406,
"narHash": "sha256-JeJK0ZA845PtkCHkfo4KjeI1mYrsr2s3cxBYKhF4BoE=",
"lastModified": 1765252472,
"narHash": "sha256-byMt/uMi7DJ8tRniFopDFZMO3leSjGp6GS4zWOFT+uQ=",
"owner": "nix-community",
"repo": "fenix",
"rev": "351277c60d104944122ee389cdf581c5ce2c6732",
"rev": "8456b985f6652e3eef0632ee9992b439735c5544",
"type": "github"
},
"original": {
@@ -41,16 +41,16 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1759994382,
"narHash": "sha256-wSK+3UkalDZRVHGCRikZ//CyZUJWDJkBDTQX1+G77Ow=",
"lastModified": 1764983851,
"narHash": "sha256-y7RPKl/jJ/KAP/VKLMghMgXTlvNIJMHKskl8/Uuar7o=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "5da4a26309e796daa7ffca72df93dbe53b8164c7",
"rev": "d9bc5c7dceb30d8d6fafa10aeb6aa8a48c218454",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.05",
"ref": "nixos-25.11",
"repo": "nixpkgs",
"type": "github"
}
@@ -65,11 +65,11 @@
"rust-analyzer-src": {
"flake": false,
"locked": {
"lastModified": 1760014945,
"narHash": "sha256-ySdl7F9+oeWNHVrg3QL/brazqmJvYFEdpGnF3pyoDH8=",
"lastModified": 1765120009,
"narHash": "sha256-nG76b87rkaDzibWbnB5bYDm6a52b78A+fpm+03pqYIw=",
"owner": "rust-lang",
"repo": "rust-analyzer",
"rev": "90d2e1ce4dfe7dc49250a8b88a0f08ffdb9cb23f",
"rev": "5e3e9c4e61bba8a5e72134b9ffefbef8f531d008",
"type": "github"
},
"original": {
@@ -2,7 +2,7 @@
description = "Development environment flake";

inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
fenix = {
url = "github:nix-community/fenix";
inputs.nixpkgs.follows = "nixpkgs";
@@ -48,7 +48,7 @@
gnuplot ## for cargo bench
];

LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath buildInputs;
buildInputs = buildInputs;
NIX_HARDENING_ENABLE = "";
};
});
@@ -708,6 +708,7 @@ fn ddl_request_type(request: &DdlRequest) -> &'static str {
Some(Expr::CreateView(_)) => "ddl.create_view",
Some(Expr::DropView(_)) => "ddl.drop_view",
Some(Expr::AlterDatabase(_)) => "ddl.alter_database",
Some(Expr::CommentOn(_)) => "ddl.comment_on",
None => "ddl.empty",
}
}
@@ -15,11 +15,11 @@ workspace = true
api.workspace = true
async-trait.workspace = true
common-base.workspace = true
common-config.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-telemetry.workspace = true
digest = "0.10"
notify.workspace = true
sha1 = "0.10"
snafu.workspace = true
sql.workspace = true
@@ -75,11 +75,12 @@ pub enum Error {
username: String,
},

#[snafu(display("Failed to initialize a watcher for file {}", path))]
#[snafu(display("Failed to initialize a file watcher"))]
FileWatch {
path: String,
#[snafu(source)]
error: notify::Error,
source: common_config::error::Error,
#[snafu(implicit)]
location: Location,
},

#[snafu(display("User is not authorized to perform this action"))]
@@ -12,16 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::path::Path;
use std::sync::mpsc::channel;
use std::sync::{Arc, Mutex};

use async_trait::async_trait;
use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};
use common_telemetry::{info, warn};
use notify::{EventKind, RecursiveMode, Watcher};
use snafu::{ResultExt, ensure};
use snafu::ResultExt;

use crate::error::{FileWatchSnafu, InvalidConfigSnafu, Result};
use crate::error::{FileWatchSnafu, Result};
use crate::user_provider::{UserInfoMap, authenticate_with_credential, load_credential_from_file};
use crate::{Identity, Password, UserInfoRef, UserProvider};

@@ -41,61 +39,36 @@ impl WatchFileUserProvider {
pub fn new(filepath: &str) -> Result<Self> {
let credential = load_credential_from_file(filepath)?;
let users = Arc::new(Mutex::new(credential));
let this = WatchFileUserProvider {
users: users.clone(),
};

let (tx, rx) = channel::<notify::Result<notify::Event>>();
let mut debouncer =
notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;
let mut dir = Path::new(filepath).to_path_buf();
ensure!(
dir.pop(),
InvalidConfigSnafu {
value: filepath,
msg: "UserProvider path must be a file path",
}
);
debouncer
.watch(&dir, RecursiveMode::NonRecursive)
.context(FileWatchSnafu { path: filepath })?;
let users_clone = users.clone();
let filepath_owned = filepath.to_string();

let filepath = filepath.to_string();
std::thread::spawn(move || {
let filename = Path::new(&filepath).file_name();
let _hold = debouncer;
while let Ok(res) = rx.recv() {
if let Ok(event) = res {
let is_this_file = event.paths.iter().any(|p| p.file_name() == filename);
let is_relevant_event = matches!(
event.kind,
EventKind::Modify(_) | EventKind::Create(_) | EventKind::Remove(_)
FileWatcherBuilder::new()
.watch_path(filepath)
.context(FileWatchSnafu)?
.config(FileWatcherConfig::new())
.spawn(move || match load_credential_from_file(&filepath_owned) {
Ok(credential) => {
let mut users = users_clone.lock().expect("users credential must be valid");
#[cfg(not(test))]
info!("User provider file {} reloaded", &filepath_owned);
#[cfg(test)]
info!(
"User provider file {} reloaded: {:?}",
&filepath_owned, credential
);
if is_this_file && is_relevant_event {
info!(?event.kind, "User provider file {} changed", &filepath);
match load_credential_from_file(&filepath) {
Ok(credential) => {
let mut users =
users.lock().expect("users credential must be valid");
#[cfg(not(test))]
info!("User provider file {filepath} reloaded");
#[cfg(test)]
info!("User provider file {filepath} reloaded: {credential:?}");
*users = credential;
}
Err(err) => {
warn!(
?err,
"Fail to load credential from file {filepath}; keep the old one",
)
}
}
}
*users = credential;
}
}
});
Err(err) => {
warn!(
?err,
"Fail to load credential from file {}; keep the old one", &filepath_owned
)
}
})
.context(FileWatchSnafu)?;

Ok(this)
Ok(WatchFileUserProvider { users })
}
}
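The rewritten provider boils down to one pattern: credentials behind an `Arc<Mutex<_>>`, refreshed from the watcher callback, keeping the old value when a reload fails. A minimal standalone sketch of that pattern, with a placeholder credential type and loader rather than the crate's real `load_credential_from_file` and `UserInfoMap`:

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};

// Placeholder for whatever the provider stores per user; the real code keeps a UserInfoMap.
type Credentials = HashMap<String, Vec<u8>>;

// Placeholder loader; the real provider uses `load_credential_from_file`.
fn load_credentials(path: &str) -> Option<Credentials> {
    std::fs::read_to_string(path).ok().map(|_| Credentials::new())
}

fn spawn_reloading_store(path: String) -> common_config::error::Result<Arc<Mutex<Credentials>>> {
    let store = Arc::new(Mutex::new(load_credentials(&path).unwrap_or_default()));
    let store_in_callback = store.clone();
    FileWatcherBuilder::new()
        .watch_path(&path)?
        .config(FileWatcherConfig::new())
        .spawn(move || {
            // Keep the previous credentials if the file is momentarily unreadable,
            // mirroring the "keep the old one" behaviour in the provider above.
            if let Some(new_credentials) = load_credentials(&path) {
                *store_in_callback.lock().unwrap() = new_credentials;
            }
        })?;
    Ok(store)
}
```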
@@ -89,6 +89,10 @@ wrap_with_clap_prefix! {
region: Option<String>,
#[doc = "Enable virtual host style for the object store."]
enable_virtual_host_style: bool = Default::default(),
#[doc = "Allow anonymous access (disable credential signing) for testing."]
allow_anonymous: bool = Default::default(),
#[doc = "Disable config load from environment and files for testing."]
disable_config_load: bool = Default::default(),
}
}
@@ -564,7 +564,7 @@ fn new_noop_file_purger() -> FilePurgerRef {
#[derive(Debug)]
struct Noop;
impl FilePurger for Noop {
fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {}
fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) {}
}
Arc::new(Noop)
}
@@ -35,6 +35,7 @@ use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder};
use common_meta::heartbeat::handler::HandlerGroupExecutor;
use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_meta::heartbeat::handler::suspend::SuspendHandler;
use common_query::prelude::set_default_prefix;
use common_stat::ResourceStatImpl;
use common_telemetry::info;
@@ -45,7 +46,7 @@ use frontend::frontend::Frontend;
use frontend::heartbeat::HeartbeatTask;
use frontend::instance::builder::FrontendBuilder;
use frontend::server::Services;
use meta_client::{MetaClientOptions, MetaClientType};
use meta_client::{MetaClientOptions, MetaClientRef, MetaClientType};
use plugins::frontend::context::{
CatalogManagerConfigureContext, DistributedCatalogManagerConfigureContext,
};
@@ -440,30 +441,13 @@ impl StartCommand {
};
let catalog_manager = builder.build();

let executor = HandlerGroupExecutor::new(vec![
Arc::new(ParseMailboxMessageHandler),
Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())),
]);

let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();

let heartbeat_task = HeartbeatTask::new(
&opts,
meta_client.clone(),
opts.heartbeat.clone(),
Arc::new(executor),
Arc::new(resource_stat),
);
let heartbeat_task = Some(heartbeat_task);

let instance = FrontendBuilder::new(
opts.clone(),
cached_meta_backend.clone(),
layered_cache_registry.clone(),
catalog_manager,
client,
meta_client,
meta_client.clone(),
process_manager,
)
.with_plugin(plugins.clone())
@@ -471,6 +455,9 @@ impl StartCommand {
.try_build()
.await
.context(error::StartFrontendSnafu)?;

let heartbeat_task = Some(create_heartbeat_task(&opts, meta_client, &instance));

let instance = Arc::new(instance);

let servers = Services::new(opts, instance.clone(), plugins)
@@ -487,6 +474,28 @@ impl StartCommand {
}
}

pub fn create_heartbeat_task(
options: &frontend::frontend::FrontendOptions,
meta_client: MetaClientRef,
instance: &frontend::instance::Instance,
) -> HeartbeatTask {
let executor = Arc::new(HandlerGroupExecutor::new(vec![
Arc::new(ParseMailboxMessageHandler),
Arc::new(SuspendHandler::new(instance.suspend_state())),
Arc::new(InvalidateCacheHandler::new(
instance.cache_invalidator().clone(),
)),
]));

let stat = {
let mut stat = ResourceStatImpl::default();
stat.start_collect_cpu_usage();
Arc::new(stat)
};

HeartbeatTask::new(options, meta_client, executor, stat)
}

#[cfg(test)]
mod tests {
use std::io::Write;
@@ -11,8 +11,10 @@ workspace = true
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-telemetry.workspace = true
config.workspace = true
humantime-serde.workspace = true
notify.workspace = true
object-store.workspace = true
serde.workspace = true
serde_json.workspace = true
@@ -49,14 +49,41 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},

#[snafu(display("Failed to watch file: {}", path))]
FileWatch {
path: String,
#[snafu(source)]
error: notify::Error,
#[snafu(implicit)]
location: Location,
},

#[snafu(display("Failed to canonicalize path: {}", path))]
CanonicalizePath {
path: String,
#[snafu(source)]
error: std::io::Error,
#[snafu(implicit)]
location: Location,
},

#[snafu(display("Invalid path '{}': expected a file, not a directory", path))]
InvalidPath {
path: String,
#[snafu(implicit)]
location: Location,
},
}

impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Error::TomlFormat { .. } | Error::LoadLayeredConfig { .. } => {
StatusCode::InvalidArguments
}
Error::TomlFormat { .. }
| Error::LoadLayeredConfig { .. }
| Error::FileWatch { .. }
| Error::InvalidPath { .. }
| Error::CanonicalizePath { .. } => StatusCode::InvalidArguments,
Error::SerdeJson { .. } => StatusCode::Unexpected,
}
}
src/common/config/src/file_watcher.rs (new file): 355 lines

@@ -0,0 +1,355 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Common file watching utilities for configuration hot-reloading.
//!
//! This module provides a generic file watcher that can be used to watch
//! files for changes and trigger callbacks when changes occur.
//!
//! The watcher monitors the parent directory of each file rather than the
//! file itself. This ensures that file deletions and recreations are properly
//! tracked, which is common with editors that use atomic saves or when
//! configuration files are replaced.

use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::mpsc::channel;

use common_telemetry::{error, info, warn};
use notify::{EventKind, RecursiveMode, Watcher};
use snafu::ResultExt;

use crate::error::{CanonicalizePathSnafu, FileWatchSnafu, InvalidPathSnafu, Result};

/// Configuration for the file watcher behavior.
#[derive(Debug, Clone, Default)]
pub struct FileWatcherConfig {
/// Whether to include Remove events in addition to Modify and Create.
pub include_remove_events: bool,
}

impl FileWatcherConfig {
pub fn new() -> Self {
Self::default()
}

pub fn with_modify_and_create(mut self) -> Self {
self.include_remove_events = false;
self
}

pub fn with_remove_events(mut self) -> Self {
self.include_remove_events = true;
self
}
}

/// A builder for creating file watchers with flexible configuration.
///
/// The watcher monitors the parent directory of each file to handle file
/// deletion and recreation properly. Events are filtered to only trigger
/// callbacks for the specific files being watched.
pub struct FileWatcherBuilder {
config: FileWatcherConfig,
/// Canonicalized paths of files to watch.
file_paths: Vec<PathBuf>,
}

impl FileWatcherBuilder {
/// Create a new builder with default configuration.
pub fn new() -> Self {
Self {
config: FileWatcherConfig::default(),
file_paths: Vec::new(),
}
}

/// Set the watcher configuration.
pub fn config(mut self, config: FileWatcherConfig) -> Self {
self.config = config;
self
}

/// Add a file path to watch.
///
/// Returns an error if the path is a directory.
/// The path is canonicalized for reliable comparison with events.
pub fn watch_path<P: AsRef<Path>>(mut self, path: P) -> Result<Self> {
let path = path.as_ref();
snafu::ensure!(
path.is_file(),
InvalidPathSnafu {
path: path.display().to_string(),
}
);
// Canonicalize the path for reliable comparison with event paths
let canonical = path.canonicalize().context(CanonicalizePathSnafu {
path: path.display().to_string(),
})?;
self.file_paths.push(canonical);
Ok(self)
}

/// Add multiple file paths to watch.
///
/// Returns an error if any path is a directory.
pub fn watch_paths<P: AsRef<Path>, I: IntoIterator<Item = P>>(
mut self,
paths: I,
) -> Result<Self> {
for path in paths {
self = self.watch_path(path)?;
}
Ok(self)
}

/// Build and spawn the file watcher with the given callback.
///
/// The callback is invoked when relevant file events are detected for
/// the watched files. The watcher monitors the parent directories to
/// handle file deletion and recreation properly.
///
/// The spawned watcher thread runs for the lifetime of the process.
pub fn spawn<F>(self, callback: F) -> Result<()>
where
F: Fn() + Send + 'static,
{
let (tx, rx) = channel::<notify::Result<notify::Event>>();
let mut watcher =
notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;

// Collect unique parent directories to watch
let mut watched_dirs: HashSet<PathBuf> = HashSet::new();
for file_path in &self.file_paths {
if let Some(parent) = file_path.parent()
&& watched_dirs.insert(parent.to_path_buf())
{
watcher
.watch(parent, RecursiveMode::NonRecursive)
.context(FileWatchSnafu {
path: parent.display().to_string(),
})?;
}
}

let config = self.config;
let watched_files: HashSet<PathBuf> = self.file_paths.iter().cloned().collect();

info!(
"Spawning file watcher for paths: {:?} (watching parent directories)",
self.file_paths
.iter()
.map(|p| p.display().to_string())
.collect::<Vec<_>>()
);

std::thread::spawn(move || {
// Keep watcher alive in the thread
let _watcher = watcher;

while let Ok(res) = rx.recv() {
match res {
Ok(event) => {
if !is_relevant_event(&event.kind, &config) {
continue;
}

// Check if any of the event paths match our watched files
let is_watched_file = event.paths.iter().any(|event_path| {
// Try to canonicalize the event path for comparison
// If the file was deleted, canonicalize will fail, so we also
// compare the raw path
if let Ok(canonical) = event_path.canonicalize()
&& watched_files.contains(&canonical)
{
return true;
}
// For deleted files, compare using the raw path
watched_files.contains(event_path)
});

if !is_watched_file {
continue;
}

info!(?event.kind, ?event.paths, "Detected file change");
callback();
}
Err(err) => {
warn!("File watcher error: {}", err);
}
}
}

error!("File watcher channel closed unexpectedly");
});

Ok(())
}
}

impl Default for FileWatcherBuilder {
fn default() -> Self {
Self::new()
}
}

/// Check if an event kind is relevant based on the configuration.
fn is_relevant_event(kind: &EventKind, config: &FileWatcherConfig) -> bool {
match kind {
EventKind::Modify(_) | EventKind::Create(_) => true,
EventKind::Remove(_) => config.include_remove_events,
_ => false,
}
}

#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Duration;

use common_test_util::temp_dir::create_temp_dir;

use super::*;

#[test]
fn test_file_watcher_detects_changes() {
common_telemetry::init_default_ut_logging();

let dir = create_temp_dir("test_file_watcher");
let file_path = dir.path().join("test_file.txt");

// Create initial file
std::fs::write(&file_path, "initial content").unwrap();

let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();

FileWatcherBuilder::new()
.watch_path(&file_path)
.unwrap()
.config(FileWatcherConfig::new())
.spawn(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
})
.unwrap();

// Give watcher time to start
std::thread::sleep(Duration::from_millis(100));

// Modify the file
std::fs::write(&file_path, "modified content").unwrap();

// Wait for the event to be processed
std::thread::sleep(Duration::from_millis(500));

assert!(
counter.load(Ordering::SeqCst) >= 1,
"Watcher should have detected at least one change"
);
}

#[test]
fn test_file_watcher_detects_delete_and_recreate() {
common_telemetry::init_default_ut_logging();

let dir = create_temp_dir("test_file_watcher_recreate");
let file_path = dir.path().join("test_file.txt");

// Create initial file
std::fs::write(&file_path, "initial content").unwrap();

let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();

FileWatcherBuilder::new()
.watch_path(&file_path)
.unwrap()
.config(FileWatcherConfig::new())
.spawn(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
})
.unwrap();

// Give watcher time to start
std::thread::sleep(Duration::from_millis(100));

// Delete the file
std::fs::remove_file(&file_path).unwrap();
std::thread::sleep(Duration::from_millis(100));

// Recreate the file - this should still be detected because we watch the directory
std::fs::write(&file_path, "recreated content").unwrap();

// Wait for the event to be processed
std::thread::sleep(Duration::from_millis(500));

assert!(
counter.load(Ordering::SeqCst) >= 1,
"Watcher should have detected file recreation"
);
}

#[test]
fn test_file_watcher_ignores_other_files() {
common_telemetry::init_default_ut_logging();

let dir = create_temp_dir("test_file_watcher_other");
let watched_file = dir.path().join("watched.txt");
let other_file = dir.path().join("other.txt");

// Create both files
std::fs::write(&watched_file, "watched content").unwrap();
std::fs::write(&other_file, "other content").unwrap();

let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();

FileWatcherBuilder::new()
.watch_path(&watched_file)
.unwrap()
.config(FileWatcherConfig::new())
.spawn(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
})
.unwrap();

// Give watcher time to start
std::thread::sleep(Duration::from_millis(100));

// Modify the other file - should NOT trigger callback
std::fs::write(&other_file, "modified other content").unwrap();

// Wait for potential event
std::thread::sleep(Duration::from_millis(500));

assert_eq!(
counter.load(Ordering::SeqCst),
0,
"Watcher should not have detected changes to other files"
);

// Now modify the watched file - SHOULD trigger callback
std::fs::write(&watched_file, "modified watched content").unwrap();

// Wait for the event to be processed
std::thread::sleep(Duration::from_millis(500));

assert!(
counter.load(Ordering::SeqCst) >= 1,
"Watcher should have detected change to watched file"
);
}
}
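For reference, a minimal usage sketch of the builder added above; the config path and the reload hook are hypothetical, not part of the diff:

```rust
use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};

// Hypothetical reload hook; a real caller would re-parse and swap its configuration here.
fn reload_config(path: &str) {
    println!("config file {path} changed, reloading");
}

fn watch_config(path: &'static str) -> common_config::error::Result<()> {
    FileWatcherBuilder::new()
        // Errors out if `path` is a directory; the path is canonicalized internally.
        .watch_path(path)?
        // The default config reacts to Modify/Create; use `.with_remove_events()` on the
        // config to also be notified about removals.
        .config(FileWatcherConfig::new())
        // The callback runs on a background thread for every relevant event on this file.
        .spawn(move || reload_config(path))
}
```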
@@ -14,6 +14,7 @@

pub mod config;
pub mod error;
pub mod file_watcher;

use std::time::Duration;
@@ -21,6 +21,8 @@ pub mod status_code;
use http::{HeaderMap, HeaderValue};
pub use snafu;

use crate::status_code::StatusCode;

// HACK - these headers are here for shared in gRPC services. For common HTTP headers,
// please define in `src/servers/src/http/header.rs`.
pub const GREPTIME_DB_HEADER_ERROR_CODE: &str = "x-greptime-err-code";
@@ -46,6 +48,29 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap {
header
}

/// Extract [StatusCode] and error message from [HeaderMap], if any.
///
/// Note that if the [StatusCode] is illegal, for example, a random number that is not pre-defined
/// as a [StatusCode], the result is still `None`.
pub fn from_header_to_err_code_msg(headers: &HeaderMap) -> Option<(StatusCode, &str)> {
let code = headers
.get(GREPTIME_DB_HEADER_ERROR_CODE)
.and_then(|value| {
value
.to_str()
.ok()
.and_then(|x| x.parse::<u32>().ok())
.and_then(StatusCode::from_u32)
});
let msg = headers
.get(GREPTIME_DB_HEADER_ERROR_MSG)
.and_then(|x| x.to_str().ok());
match (code, msg) {
(Some(code), Some(msg)) => Some((code, msg)),
_ => None,
}
}

/// Returns the external root cause of the source error (exclude the current error).
pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> {
// There are some divergence about the behavior of the `sources()` API
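A short round-trip sketch of the new helper together with the existing `from_err_code_msg_to_header`; the status code chosen here is only an example:

```rust
use common_error::status_code::StatusCode;
use common_error::{from_err_code_msg_to_header, from_header_to_err_code_msg};

fn demo() {
    // Encode a status code and message into the shared gRPC error headers...
    let headers = from_err_code_msg_to_header(StatusCode::External as u32, "external failure");
    // ...and read them back; a missing header or an unrecognised code yields `None`.
    if let Some((code, msg)) = from_header_to_err_code_msg(&headers) {
        println!("decoded {code:?}: {msg}");
    }
}
```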
@@ -42,6 +42,8 @@ pub enum StatusCode {
External = 1007,
/// The request is deadline exceeded (typically server-side).
DeadlineExceeded = 1008,
/// Service got suspended for various reason. For example, resources exceed limit.
Suspended = 1009,
// ====== End of common status code ================

// ====== Begin of SQL related status code =========
@@ -175,7 +177,8 @@ impl StatusCode {
| StatusCode::AccessDenied
| StatusCode::PermissionDenied
| StatusCode::RequestOutdated
| StatusCode::External => false,
| StatusCode::External
| StatusCode::Suspended => false,
}
}

@@ -223,7 +226,8 @@ impl StatusCode {
| StatusCode::InvalidAuthHeader
| StatusCode::AccessDenied
| StatusCode::PermissionDenied
| StatusCode::RequestOutdated => false,
| StatusCode::RequestOutdated
| StatusCode::Suspended => false,
}
}

@@ -347,7 +351,8 @@ pub fn status_to_tonic_code(status_code: StatusCode) -> Code {
| StatusCode::RegionNotReady => Code::Unavailable,
StatusCode::RuntimeResourcesExhausted
| StatusCode::RateLimited
| StatusCode::RegionBusy => Code::ResourceExhausted,
| StatusCode::RegionBusy
| StatusCode::Suspended => Code::ResourceExhausted,
StatusCode::UnsupportedPasswordType
| StatusCode::UserPasswordMismatch
| StatusCode::AuthHeaderNotFound
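Assuming `status_to_tonic_code` is exported from the same `status_code` module as the hunks above suggest, and that the `Code` it returns is `tonic::Code`, the new variant can be exercised like this:

```rust
use common_error::status_code::{StatusCode, status_to_tonic_code};

fn demo() {
    // The new Suspended code is grouped with the other resource-pressure codes at the gRPC layer.
    assert_eq!(
        status_to_tonic_code(StatusCode::Suspended),
        tonic::Code::ResourceExhausted
    );
}
```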
@@ -39,7 +39,7 @@ datafusion-functions-aggregate-common.workspace = true
datafusion-pg-catalog.workspace = true
datafusion-physical-expr.workspace = true
datatypes.workspace = true
derive_more = { version = "1", default-features = false, features = ["display"] }
derive_more.workspace = true
geo = { version = "0.29", optional = true }
geo-types = { version = "0.7", optional = true }
geohash = { version = "0.13", optional = true }
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fmt::Display;
use std::sync::Arc;

use datafusion_common::arrow::array::{Array, AsArray, BooleanBuilder};

@@ -387,6 +387,8 @@ impl PGCatalogFunction {
registry.register(pg_catalog::create_pg_stat_get_numscans());
registry.register(pg_catalog::create_pg_get_constraintdef());
registry.register(pg_catalog::create_pg_get_partition_ancestors_udf());
registry.register(pg_catalog::quote_ident_udf::create_quote_ident_udf());
registry.register(pg_catalog::quote_ident_udf::create_parse_ident_udf());
registry.register_scalar(ObjDescriptionFunction::new());
registry.register_scalar(ColDescriptionFunction::new());
registry.register_scalar(ShobjDescriptionFunction::new());
@@ -12,6 +12,7 @@ api.workspace = true
arrow-flight.workspace = true
bytes.workspace = true
common-base.workspace = true
common-config.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-recordbatch.workspace = true
@@ -23,7 +24,6 @@ datatypes.workspace = true
flatbuffers = "25.2"
hyper.workspace = true
lazy_static.workspace = true
notify.workspace = true
prost.workspace = true
serde.workspace = true
serde_json.workspace = true
@@ -38,11 +38,10 @@ pub enum Error {
location: Location,
},

#[snafu(display("Failed to watch config file path: {}", path))]
#[snafu(display("Failed to watch config file"))]
FileWatch {
path: String,
#[snafu(source)]
error: notify::Error,
source: common_config::error::Error,
#[snafu(implicit)]
location: Location,
},
@@ -15,11 +15,10 @@
use std::path::Path;
use std::result::Result as StdResult;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::mpsc::channel;
use std::sync::{Arc, RwLock};

use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};
use common_telemetry::{error, info};
use notify::{EventKind, RecursiveMode, Watcher};
use snafu::ResultExt;

use crate::error::{FileWatchSnafu, Result};
@@ -119,45 +118,28 @@ where
return Ok(());
}

let watch_paths: Vec<_> = tls_config
.get_tls_option()
.watch_paths()
.iter()
.map(|p| p.to_path_buf())
.collect();

let tls_config_for_watcher = tls_config.clone();

let (tx, rx) = channel::<notify::Result<notify::Event>>();
let mut watcher = notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;

// Watch all paths returned by the TlsConfigLoader
for path in tls_config.get_tls_option().watch_paths() {
watcher
.watch(path, RecursiveMode::NonRecursive)
.with_context(|_| FileWatchSnafu {
path: path.display().to_string(),
})?;
}

info!("Spawning background task for watching TLS cert/key file changes");
std::thread::spawn(move || {
let _watcher = watcher;
loop {
match rx.recv() {
Ok(Ok(event)) => {
if let EventKind::Modify(_) | EventKind::Create(_) = event.kind {
info!("Detected TLS cert/key file change: {:?}", event);
if let Err(err) = tls_config_for_watcher.reload() {
error!("Failed to reload TLS config: {}", err);
} else {
info!("Reloaded TLS cert/key file successfully.");
on_reload();
}
}
}
Ok(Err(err)) => {
error!("Failed to watch TLS cert/key file: {}", err);
}
Err(err) => {
error!("TLS cert/key file watcher channel closed: {}", err);
}
FileWatcherBuilder::new()
.watch_paths(&watch_paths)
.context(FileWatchSnafu)?
.config(FileWatcherConfig::new())
.spawn(move || {
if let Err(err) = tls_config_for_watcher.reload() {
error!("Failed to reload TLS config: {}", err);
} else {
info!("Reloaded TLS cert/key file successfully.");
on_reload();
}
}
});
})
.context(FileWatchSnafu)?;

Ok(())
}
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fmt::{Display, Formatter};
use std::hash::{DefaultHasher, Hash, Hasher};
use std::str::FromStr;

@@ -60,7 +61,7 @@ pub trait ClusterInfo {
}

/// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-0-{role}-{node_id}`.
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)]
pub struct NodeInfoKey {
/// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
pub role: Role,
@@ -135,7 +136,7 @@ pub struct NodeInfo {
pub hostname: String,
}

#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Role {
Datanode,
Frontend,
@@ -241,6 +242,12 @@ impl From<&NodeInfoKey> for Vec<u8> {
}
}

impl Display for NodeInfoKey {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}-{}", self.role, self.node_id)
}
}

impl FromStr for NodeInfo {
type Err = Error;
@@ -31,6 +31,7 @@ use crate::region_registry::LeaderRegionRegistryRef;
pub mod alter_database;
pub mod alter_logical_tables;
pub mod alter_table;
pub mod comment_on;
pub mod create_database;
pub mod create_flow;
pub mod create_logical_tables;

@@ -301,8 +301,8 @@ fn build_new_table_info(
| AlterKind::UnsetTableOptions { .. }
| AlterKind::SetIndexes { .. }
| AlterKind::UnsetIndexes { .. }
| AlterKind::DropDefaults { .. } => {}
AlterKind::SetDefaults { .. } => {}
| AlterKind::DropDefaults { .. }
| AlterKind::SetDefaults { .. } => {}
}

info!(
src/common/meta/src/ddl/comment_on.rs (new file): 509 lines

@@ -0,0 +1,509 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use async_trait::async_trait;
use chrono::Utc;
use common_catalog::format_full_table_name;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status};
use common_telemetry::tracing::info;
use datatypes::schema::COMMENT_KEY as COLUMN_COMMENT_KEY;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt, ensure};
use store_api::storage::TableId;
use strum::AsRefStr;
use table::metadata::RawTableInfo;
use table::requests::COMMENT_KEY as TABLE_COMMENT_KEY;
use table::table_name::TableName;

use crate::cache_invalidator::Context;
use crate::ddl::DdlContext;
use crate::ddl::utils::map_to_procedure_error;
use crate::error::{ColumnNotFoundSnafu, FlowNotFoundSnafu, Result, TableNotFoundSnafu};
use crate::instruction::CacheIdent;
use crate::key::flow::flow_info::{FlowInfoKey, FlowInfoValue};
use crate::key::table_info::{TableInfoKey, TableInfoValue};
use crate::key::table_name::TableNameKey;
use crate::key::{DeserializedValueWithBytes, FlowId, MetadataKey, MetadataValue};
use crate::lock_key::{CatalogLock, FlowNameLock, SchemaLock, TableNameLock};
use crate::rpc::ddl::{CommentObjectType, CommentOnTask};
use crate::rpc::store::PutRequest;

pub struct CommentOnProcedure {
pub context: DdlContext,
pub data: CommentOnData,
}

impl CommentOnProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::CommentOn";

pub fn new(task: CommentOnTask, context: DdlContext) -> Self {
Self {
context,
data: CommentOnData::new(task),
}
}

pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
let data = serde_json::from_str(json).context(FromJsonSnafu)?;

Ok(Self { context, data })
}

pub async fn on_prepare(&mut self) -> Result<Status> {
match self.data.object_type {
CommentObjectType::Table | CommentObjectType::Column => {
self.prepare_table_or_column().await?;
}
CommentObjectType::Flow => {
self.prepare_flow().await?;
}
}

// Fast path: if comment is unchanged, skip update
if self.data.is_unchanged {
let object_desc = match self.data.object_type {
CommentObjectType::Table => format!(
"table {}",
format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
)
),
CommentObjectType::Column => format!(
"column {}.{}",
format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
),
self.data.column_name.as_ref().unwrap()
),
CommentObjectType::Flow => {
format!("flow {}.{}", self.data.catalog_name, self.data.object_name)
}
};
info!("Comment unchanged for {}, skipping update", object_desc);
return Ok(Status::done());
}

self.data.state = CommentOnState::UpdateMetadata;
Ok(Status::executing(true))
}

async fn prepare_table_or_column(&mut self) -> Result<()> {
let table_name_key = TableNameKey::new(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
);

let table_id = self
.context
.table_metadata_manager
.table_name_manager()
.get(table_name_key)
.await?
.with_context(|| TableNotFoundSnafu {
table_name: format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
),
})?
.table_id();

let table_info = self
.context
.table_metadata_manager
.table_info_manager()
.get(table_id)
.await?
.with_context(|| TableNotFoundSnafu {
table_name: format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
),
})?;

// For column comments, validate the column exists
if self.data.object_type == CommentObjectType::Column {
let column_name = self.data.column_name.as_ref().unwrap();
let column_exists = table_info
.table_info
.meta
.schema
.column_schemas
.iter()
.any(|col| &col.name == column_name);

ensure!(
column_exists,
ColumnNotFoundSnafu {
column_name,
column_id: 0u32, // column_id is not known here
}
);
}

self.data.table_id = Some(table_id);

// Check if comment is unchanged for early exit optimization
match self.data.object_type {
CommentObjectType::Table => {
let current_comment = &table_info.table_info.desc;
if &self.data.comment == current_comment {
self.data.is_unchanged = true;
}
}
CommentObjectType::Column => {
let column_name = self.data.column_name.as_ref().unwrap();
let column_schema = table_info
.table_info
.meta
.schema
.column_schemas
.iter()
.find(|col| &col.name == column_name)
.unwrap(); // Safe: validated above

let current_comment = column_schema.metadata().get(COLUMN_COMMENT_KEY);
if self.data.comment.as_deref() == current_comment.map(String::as_str) {
self.data.is_unchanged = true;
}
}
CommentObjectType::Flow => {
// this branch is handled in `prepare_flow`
}
}

self.data.table_info = Some(table_info);

Ok(())
}

async fn prepare_flow(&mut self) -> Result<()> {
let flow_name_value = self
.context
.flow_metadata_manager
.flow_name_manager()
.get(&self.data.catalog_name, &self.data.object_name)
.await?
.with_context(|| FlowNotFoundSnafu {
flow_name: &self.data.object_name,
})?;

let flow_id = flow_name_value.flow_id();
let flow_info = self
.context
.flow_metadata_manager
.flow_info_manager()
.get_raw(flow_id)
.await?
.with_context(|| FlowNotFoundSnafu {
flow_name: &self.data.object_name,
})?;

self.data.flow_id = Some(flow_id);

// Check if comment is unchanged for early exit optimization
let current_comment = &flow_info.get_inner_ref().comment;
let new_comment = self.data.comment.as_deref().unwrap_or("");
if new_comment == current_comment.as_str() {
self.data.is_unchanged = true;
}

self.data.flow_info = Some(flow_info);

Ok(())
}

pub async fn on_update_metadata(&mut self) -> Result<Status> {
match self.data.object_type {
CommentObjectType::Table => {
self.update_table_comment().await?;
}
CommentObjectType::Column => {
self.update_column_comment().await?;
}
CommentObjectType::Flow => {
self.update_flow_comment().await?;
}
}

self.data.state = CommentOnState::InvalidateCache;
Ok(Status::executing(true))
}

async fn update_table_comment(&mut self) -> Result<()> {
let table_info_value = self.data.table_info.as_ref().unwrap();
let mut new_table_info = table_info_value.table_info.clone();

new_table_info.desc = self.data.comment.clone();

// Sync comment to table options
sync_table_comment_option(
&mut new_table_info.meta.options,
new_table_info.desc.as_deref(),
);

self.update_table_info(table_info_value, new_table_info)
.await?;

info!(
"Updated comment for table {}.{}.{}",
self.data.catalog_name, self.data.schema_name, self.data.object_name
);

Ok(())
}

async fn update_column_comment(&mut self) -> Result<()> {
let table_info_value = self.data.table_info.as_ref().unwrap();
let mut new_table_info = table_info_value.table_info.clone();

let column_name = self.data.column_name.as_ref().unwrap();
let column_schema = new_table_info
.meta
.schema
.column_schemas
.iter_mut()
.find(|col| &col.name == column_name)
.unwrap(); // Safe: validated in prepare

update_column_comment_metadata(column_schema, self.data.comment.clone());

self.update_table_info(table_info_value, new_table_info)
.await?;

info!(
"Updated comment for column {}.{}.{}.{}",
self.data.catalog_name, self.data.schema_name, self.data.object_name, column_name
);

Ok(())
}

async fn update_flow_comment(&mut self) -> Result<()> {
let flow_id = self.data.flow_id.unwrap();
let flow_info_value = self.data.flow_info.as_ref().unwrap();

let mut new_flow_info = flow_info_value.get_inner_ref().clone();
new_flow_info.comment = self.data.comment.clone().unwrap_or_default();
new_flow_info.updated_time = Utc::now();

let raw_value = new_flow_info.try_as_raw_value()?;

self.context
.table_metadata_manager
.kv_backend()
.put(
PutRequest::new()
.with_key(FlowInfoKey::new(flow_id).to_bytes())
.with_value(raw_value),
)
.await?;

info!(
"Updated comment for flow {}.{}",
self.data.catalog_name, self.data.object_name
);

Ok(())
}

async fn update_table_info(
&self,
current_table_info: &DeserializedValueWithBytes<TableInfoValue>,
new_table_info: RawTableInfo,
) -> Result<()> {
let table_id = current_table_info.table_info.ident.table_id;
let new_table_info_value = current_table_info.update(new_table_info);
let raw_value = new_table_info_value.try_as_raw_value()?;

self.context
.table_metadata_manager
.kv_backend()
.put(
PutRequest::new()
.with_key(TableInfoKey::new(table_id).to_bytes())
.with_value(raw_value),
)
.await?;

Ok(())
}

pub async fn on_invalidate_cache(&mut self) -> Result<Status> {
let cache_invalidator = &self.context.cache_invalidator;

match self.data.object_type {
CommentObjectType::Table | CommentObjectType::Column => {
let table_id = self.data.table_id.unwrap();
let table_name = TableName::new(
self.data.catalog_name.clone(),
self.data.schema_name.clone(),
self.data.object_name.clone(),
);

let cache_ident = vec![
CacheIdent::TableId(table_id),
CacheIdent::TableName(table_name),
];

cache_invalidator
.invalidate(&Context::default(), &cache_ident)
.await?;
}
CommentObjectType::Flow => {
let flow_id = self.data.flow_id.unwrap();
let cache_ident = vec![CacheIdent::FlowId(flow_id)];

cache_invalidator
.invalidate(&Context::default(), &cache_ident)
.await?;
}
}

Ok(Status::done())
}
}

#[async_trait]
impl Procedure for CommentOnProcedure {
fn type_name(&self) -> &str {
Self::TYPE_NAME
}

async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
match self.data.state {
CommentOnState::Prepare => self.on_prepare().await,
CommentOnState::UpdateMetadata => self.on_update_metadata().await,
CommentOnState::InvalidateCache => self.on_invalidate_cache().await,
}
.map_err(map_to_procedure_error)
}

fn dump(&self) -> ProcedureResult<String> {
serde_json::to_string(&self.data).context(ToJsonSnafu)
}

fn lock_key(&self) -> LockKey {
let catalog = &self.data.catalog_name;
let schema = &self.data.schema_name;

let lock_key = match self.data.object_type {
CommentObjectType::Table | CommentObjectType::Column => {
vec![
CatalogLock::Read(catalog).into(),
SchemaLock::read(catalog, schema).into(),
TableNameLock::new(catalog, schema, &self.data.object_name).into(),
]
}
CommentObjectType::Flow => {
vec![
CatalogLock::Read(catalog).into(),
FlowNameLock::new(catalog, &self.data.object_name).into(),
]
}
};

LockKey::new(lock_key)
}
}

#[derive(Debug, Serialize, Deserialize, AsRefStr)]
enum CommentOnState {
Prepare,
UpdateMetadata,
InvalidateCache,
}

/// The data of comment on procedure.
#[derive(Debug, Serialize, Deserialize)]
pub struct CommentOnData {
state: CommentOnState,
catalog_name: String,
schema_name: String,
object_type: CommentObjectType,
object_name: String,
/// Column name (only for Column comments)
column_name: Option<String>,
comment: Option<String>,
/// Cached table ID (for Table/Column)
#[serde(skip_serializing_if = "Option::is_none")]
table_id: Option<TableId>,
/// Cached table info (for Table/Column)
#[serde(skip)]
table_info: Option<DeserializedValueWithBytes<TableInfoValue>>,
/// Cached flow ID (for Flow)
#[serde(skip_serializing_if = "Option::is_none")]
flow_id: Option<FlowId>,
/// Cached flow info (for Flow)
#[serde(skip)]
flow_info: Option<DeserializedValueWithBytes<FlowInfoValue>>,
/// Whether the comment is unchanged (optimization for early exit)
#[serde(skip)]
is_unchanged: bool,
}

impl CommentOnData {
pub fn new(task: CommentOnTask) -> Self {
Self {
state: CommentOnState::Prepare,
catalog_name: task.catalog_name,
schema_name: task.schema_name,
object_type: task.object_type,
object_name: task.object_name,
column_name: task.column_name,
comment: task.comment,
table_id: None,
table_info: None,
flow_id: None,
flow_info: None,
is_unchanged: false,
}
}
}

fn update_column_comment_metadata(
column_schema: &mut datatypes::schema::ColumnSchema,
comment: Option<String>,
) {
match comment {
Some(value) => {
column_schema
.mut_metadata()
.insert(COLUMN_COMMENT_KEY.to_string(), value);
}
None => {
column_schema.mut_metadata().remove(COLUMN_COMMENT_KEY);
}
}
}

fn sync_table_comment_option(options: &mut table::requests::TableOptions, comment: Option<&str>) {
match comment {
Some(value) => {
options
.extra_options
.insert(TABLE_COMMENT_KEY.to_string(), value.to_string());
}
None => {
options.extra_options.remove(TABLE_COMMENT_KEY);
}
}
}
@@ -27,6 +27,7 @@ use store_api::storage::TableId;
use crate::ddl::alter_database::AlterDatabaseProcedure;
use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::ddl::comment_on::CommentOnProcedure;
use crate::ddl::create_database::CreateDatabaseProcedure;
use crate::ddl::create_flow::CreateFlowProcedure;
use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
@@ -52,18 +53,18 @@ use crate::rpc::ddl::DdlTask::CreateTrigger;
#[cfg(feature = "enterprise")]
use crate::rpc::ddl::DdlTask::DropTrigger;
use crate::rpc::ddl::DdlTask::{
    AlterDatabase, AlterLogicalTables, AlterTable, CreateDatabase, CreateFlow, CreateLogicalTables,
    CreateTable, CreateView, DropDatabase, DropFlow, DropLogicalTables, DropTable, DropView,
    TruncateTable,
    AlterDatabase, AlterLogicalTables, AlterTable, CommentOn, CreateDatabase, CreateFlow,
    CreateLogicalTables, CreateTable, CreateView, DropDatabase, DropFlow, DropLogicalTables,
    DropTable, DropView, TruncateTable,
};
#[cfg(feature = "enterprise")]
use crate::rpc::ddl::trigger::CreateTriggerTask;
#[cfg(feature = "enterprise")]
use crate::rpc::ddl::trigger::DropTriggerTask;
use crate::rpc::ddl::{
    AlterDatabaseTask, AlterTableTask, CreateDatabaseTask, CreateFlowTask, CreateTableTask,
    CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask, QueryContext,
    SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask,
    AlterDatabaseTask, AlterTableTask, CommentOnTask, CreateDatabaseTask, CreateFlowTask,
    CreateTableTask, CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask,
    QueryContext, SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask,
};
use crate::rpc::router::RegionRoute;

@@ -192,7 +193,8 @@ impl DdlManager {
        TruncateTableProcedure,
        CreateDatabaseProcedure,
        DropDatabaseProcedure,
        DropViewProcedure
        DropViewProcedure,
        CommentOnProcedure
    );

    for (type_name, loader_factory) in loaders {
@@ -408,6 +410,19 @@ impl DdlManager {
        self.submit_procedure(procedure_with_id).await
    }

    /// Submits and executes a comment on task.
    #[tracing::instrument(skip_all)]
    pub async fn submit_comment_on_task(
        &self,
        comment_on_task: CommentOnTask,
    ) -> Result<(ProcedureId, Option<Output>)> {
        let context = self.create_context();
        let procedure = CommentOnProcedure::new(comment_on_task, context);
        let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));

        self.submit_procedure(procedure_with_id).await
    }

    async fn submit_procedure(
        &self,
        procedure_with_id: ProcedureWithId,
@@ -476,6 +491,7 @@ impl DdlManager {
                handle_create_view_task(self, create_view_task).await
            }
            DropView(drop_view_task) => handle_drop_view_task(self, drop_view_task).await,
            CommentOn(comment_on_task) => handle_comment_on_task(self, comment_on_task).await,
            #[cfg(feature = "enterprise")]
            CreateTrigger(create_trigger_task) => {
                handle_create_trigger_task(
@@ -907,6 +923,26 @@ async fn handle_create_view_task(
    })
}

async fn handle_comment_on_task(
    ddl_manager: &DdlManager,
    comment_on_task: CommentOnTask,
) -> Result<SubmitDdlTaskResponse> {
    let (id, _) = ddl_manager
        .submit_comment_on_task(comment_on_task.clone())
        .await?;

    let procedure_id = id.to_string();
    info!(
        "Comment on {}.{}.{} is updated via procedure_id {id:?}",
        comment_on_task.catalog_name, comment_on_task.schema_name, comment_on_task.object_name
    );

    Ok(SubmitDdlTaskResponse {
        key: procedure_id.into(),
        ..Default::default()
    })
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

@@ -14,6 +14,8 @@

use std::time::Duration;

use etcd_client::ConnectOptions;

/// The heartbeat interval (the basic unit of the various timing constants).
pub const HEARTBEAT_INTERVAL_MILLIS: u64 = 3000;

@@ -45,12 +47,18 @@ pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2;
pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);

/// The keep-alive interval of the heartbeat channel.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration =
    Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration = Duration::from_secs(15);

/// The keep-alive timeout of the heartbeat channel.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration =
    Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration = Duration::from_secs(5);

/// The default options for the etcd client.
pub fn default_etcd_client_options() -> ConnectOptions {
    ConnectOptions::new()
        .with_keep_alive_while_idle(true)
        .with_keep_alive(Duration::from_secs(15), Duration::from_secs(5))
        .with_connect_timeout(Duration::from_secs(10))
}

/// The default mailbox round-trip timeout.
pub const MAILBOX_RTT_SECS: u64 = 1;

@@ -272,13 +272,6 @@ pub enum Error {
        location: Location,
    },

    #[snafu(display("Failed to send message: {err_msg}"))]
    SendMessage {
        err_msg: String,
        #[snafu(implicit)]
        location: Location,
    },

    #[snafu(display("Failed to serde json"))]
    SerdeJson {
        #[snafu(source)]
@@ -1118,7 +1111,7 @@ impl ErrorExt for Error {
            | DeserializeFlexbuffers { .. }
            | ConvertTimeRanges { .. } => StatusCode::Unexpected,

            SendMessage { .. } | GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,
            GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,

            SchemaAlreadyExists { .. } => StatusCode::DatabaseAlreadyExists,

@@ -23,6 +23,7 @@ use crate::heartbeat::mailbox::{IncomingMessage, MailboxRef};

pub mod invalidate_table_cache;
pub mod parse_mailbox_message;
pub mod suspend;
#[cfg(test)]
mod tests;

69 src/common/meta/src/heartbeat/handler/suspend.rs Normal file
@@ -0,0 +1,69 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

use async_trait::async_trait;
use common_telemetry::{info, warn};

use crate::error::Result;
use crate::heartbeat::handler::{
    HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext,
};
use crate::instruction::Instruction;

/// A heartbeat response handler that handles the special "suspend" instruction.
/// It simply sets or clears (if previously set) the inner suspend atomic state.
pub struct SuspendHandler {
    suspend: Arc<AtomicBool>,
}

impl SuspendHandler {
    pub fn new(suspend: Arc<AtomicBool>) -> Self {
        Self { suspend }
    }
}

#[async_trait]
impl HeartbeatResponseHandler for SuspendHandler {
    fn is_acceptable(&self, context: &HeartbeatResponseHandlerContext) -> bool {
        matches!(
            context.incoming_message,
            Some((_, Instruction::Suspend)) | None
        )
    }

    async fn handle(&self, context: &mut HeartbeatResponseHandlerContext) -> Result<HandleControl> {
        let flip_state = |expect: bool| {
            self.suspend
                .compare_exchange(expect, !expect, Ordering::Relaxed, Ordering::Relaxed)
                .is_ok()
        };

        if let Some((_, Instruction::Suspend)) = context.incoming_message.take() {
            if flip_state(false) {
                warn!("Suspend instruction received from meta, entering suspension state");
            }
        } else {
            // Suspended components should always try to get out of this state; we don't want an
            // explicit "un-suspend" instruction to be required to resume them, since that can be
            // error-prone. So if the "suspend" instruction is not found in the heartbeat, just
            // unset the state.
            if flip_state(true) {
                info!("clear suspend state");
            }
        }
        Ok(HandleControl::Continue)
    }
}
@@ -15,8 +15,8 @@
use std::sync::Arc;

use tokio::sync::mpsc::Sender;
use tokio::sync::mpsc::error::SendError;

use crate::error::{self, Result};
use crate::instruction::{Instruction, InstructionReply};

pub type IncomingMessage = (MessageMeta, Instruction);
@@ -51,13 +51,8 @@ impl HeartbeatMailbox {
        Self { sender }
    }

    pub async fn send(&self, message: OutgoingMessage) -> Result<()> {
        self.sender.send(message).await.map_err(|e| {
            error::SendMessageSnafu {
                err_msg: e.to_string(),
            }
            .build()
        })
    pub async fn send(&self, message: OutgoingMessage) -> Result<(), SendError<OutgoingMessage>> {
        self.sender.send(message).await
    }
}

@@ -539,6 +539,8 @@ pub enum Instruction {
    GetFileRefs(GetFileRefs),
    /// Triggers garbage collection for a region.
    GcRegions(GcRegions),
    /// Temporarily suspends serving reads or writes.
    Suspend,
}

impl Instruction {

@@ -94,7 +94,7 @@ impl TableInfoValue {
        }
    }

    pub(crate) fn update(&self, new_table_info: RawTableInfo) -> Self {
    pub fn update(&self, new_table_info: RawTableInfo) -> Self {
        Self {
            table_info: new_table_info,
            version: self.version + 1,

@@ -23,19 +23,20 @@ use api::v1::alter_database_expr::Kind as PbAlterDatabaseKind;
use api::v1::meta::ddl_task_request::Task;
use api::v1::meta::{
    AlterDatabaseTask as PbAlterDatabaseTask, AlterTableTask as PbAlterTableTask,
    AlterTableTasks as PbAlterTableTasks, CreateDatabaseTask as PbCreateDatabaseTask,
    CreateFlowTask as PbCreateFlowTask, CreateTableTask as PbCreateTableTask,
    CreateTableTasks as PbCreateTableTasks, CreateViewTask as PbCreateViewTask,
    DdlTaskRequest as PbDdlTaskRequest, DdlTaskResponse as PbDdlTaskResponse,
    DropDatabaseTask as PbDropDatabaseTask, DropFlowTask as PbDropFlowTask,
    DropTableTask as PbDropTableTask, DropTableTasks as PbDropTableTasks,
    DropViewTask as PbDropViewTask, Partition, ProcedureId,
    AlterTableTasks as PbAlterTableTasks, CommentOnTask as PbCommentOnTask,
    CreateDatabaseTask as PbCreateDatabaseTask, CreateFlowTask as PbCreateFlowTask,
    CreateTableTask as PbCreateTableTask, CreateTableTasks as PbCreateTableTasks,
    CreateViewTask as PbCreateViewTask, DdlTaskRequest as PbDdlTaskRequest,
    DdlTaskResponse as PbDdlTaskResponse, DropDatabaseTask as PbDropDatabaseTask,
    DropFlowTask as PbDropFlowTask, DropTableTask as PbDropTableTask,
    DropTableTasks as PbDropTableTasks, DropViewTask as PbDropViewTask, Partition, ProcedureId,
    TruncateTableTask as PbTruncateTableTask,
};
use api::v1::{
    AlterDatabaseExpr, AlterTableExpr, CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr,
    CreateViewExpr, DropDatabaseExpr, DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval,
    ExpireAfter, Option as PbOption, QueryContext as PbQueryContext, TruncateTableExpr,
    AlterDatabaseExpr, AlterTableExpr, CommentObjectType as PbCommentObjectType, CommentOnExpr,
    CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr, CreateViewExpr, DropDatabaseExpr,
    DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval, ExpireAfter, Option as PbOption,
    QueryContext as PbQueryContext, TruncateTableExpr,
};
use base64::Engine as _;
use base64::engine::general_purpose;
@@ -78,6 +79,7 @@ pub enum DdlTask {
    DropView(DropViewTask),
    #[cfg(feature = "enterprise")]
    CreateTrigger(trigger::CreateTriggerTask),
    CommentOn(CommentOnTask),
}

impl DdlTask {
@@ -200,6 +202,11 @@ impl DdlTask {
            view_info,
        })
    }

    /// Creates a [`DdlTask`] to comment on a table, column, or flow.
    pub fn new_comment_on(task: CommentOnTask) -> Self {
        DdlTask::CommentOn(task)
    }
}

impl TryFrom<Task> for DdlTask {
@@ -278,6 +285,7 @@ impl TryFrom<Task> for DdlTask {
                .fail()
            }
            }
            Task::CommentOnTask(comment_on) => Ok(DdlTask::CommentOn(comment_on.try_into()?)),
        }
    }
}
@@ -332,6 +340,7 @@ impl TryFrom<SubmitDdlTaskRequest> for PbDdlTaskRequest {
            DdlTask::CreateTrigger(task) => Task::CreateTriggerTask(task.try_into()?),
            #[cfg(feature = "enterprise")]
            DdlTask::DropTrigger(task) => Task::DropTriggerTask(task.into()),
            DdlTask::CommentOn(task) => Task::CommentOnTask(task.into()),
        };

        Ok(Self {
@@ -1277,6 +1286,119 @@ impl From<DropFlowTask> for PbDropFlowTask {
    }
}

/// Represents the ID of the object being commented on (Table or Flow).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CommentObjectId {
    Table(TableId),
    Flow(FlowId),
}

/// Comment on table, column, or flow
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct CommentOnTask {
    pub catalog_name: String,
    pub schema_name: String,
    pub object_type: CommentObjectType,
    pub object_name: String,
    /// Column name (only for Column comments)
    pub column_name: Option<String>,
    /// Object ID (Table or Flow) for validation and cache invalidation
    pub object_id: Option<CommentObjectId>,
    pub comment: Option<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CommentObjectType {
    Table,
    Column,
    Flow,
}

impl CommentOnTask {
    pub fn table_ref(&self) -> TableReference<'_> {
        TableReference {
            catalog: &self.catalog_name,
            schema: &self.schema_name,
            table: &self.object_name,
        }
    }
}

// Proto conversions for CommentObjectType
impl From<CommentObjectType> for PbCommentObjectType {
    fn from(object_type: CommentObjectType) -> Self {
        match object_type {
            CommentObjectType::Table => PbCommentObjectType::Table,
            CommentObjectType::Column => PbCommentObjectType::Column,
            CommentObjectType::Flow => PbCommentObjectType::Flow,
        }
    }
}

impl TryFrom<i32> for CommentObjectType {
    type Error = error::Error;

    fn try_from(value: i32) -> Result<Self> {
        match value {
            0 => Ok(CommentObjectType::Table),
            1 => Ok(CommentObjectType::Column),
            2 => Ok(CommentObjectType::Flow),
            _ => error::InvalidProtoMsgSnafu {
                err_msg: format!(
                    "Invalid CommentObjectType value: {}. Valid values are: 0 (Table), 1 (Column), 2 (Flow)",
                    value
                ),
            }
            .fail(),
        }
    }
}

// Proto conversions for CommentOnTask
impl TryFrom<PbCommentOnTask> for CommentOnTask {
    type Error = error::Error;

    fn try_from(pb: PbCommentOnTask) -> Result<Self> {
        let comment_on = pb.comment_on.context(error::InvalidProtoMsgSnafu {
            err_msg: "expected comment_on",
        })?;

        Ok(CommentOnTask {
            catalog_name: comment_on.catalog_name,
            schema_name: comment_on.schema_name,
            object_type: comment_on.object_type.try_into()?,
            object_name: comment_on.object_name,
            column_name: if comment_on.column_name.is_empty() {
                None
            } else {
                Some(comment_on.column_name)
            },
            comment: if comment_on.comment.is_empty() {
                None
            } else {
                Some(comment_on.comment)
            },
            object_id: None,
        })
    }
}

impl From<CommentOnTask> for PbCommentOnTask {
    fn from(task: CommentOnTask) -> Self {
        let pb_object_type: PbCommentObjectType = task.object_type.into();
        PbCommentOnTask {
            comment_on: Some(CommentOnExpr {
                catalog_name: task.catalog_name,
                schema_name: task.schema_name,
                object_type: pb_object_type as i32,
                object_name: task.object_name,
                column_name: task.column_name.unwrap_or_default(),
                comment: task.comment.unwrap_or_default(),
            }),
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct QueryContext {
    pub(crate) current_catalog: String,

@@ -14,7 +14,7 @@

use common_telemetry::{debug, error, info};
use common_wal::config::kafka::common::{
    DEFAULT_BACKOFF_CONFIG, KafkaConnectionConfig, KafkaTopicConfig,
    DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT, KafkaConnectionConfig, KafkaTopicConfig,
};
use rskafka::client::error::Error as RsKafkaError;
use rskafka::client::error::ProtocolError::TopicAlreadyExists;
@@ -205,11 +205,13 @@ impl KafkaTopicCreator {
        self.partition_client(topic).await.unwrap()
    }
}

/// Builds a Kafka [Client](rskafka::client::Client).
pub async fn build_kafka_client(connection: &KafkaConnectionConfig) -> Result<Client> {
    // Builds a Kafka controller client for creating topics.
    let mut builder = ClientBuilder::new(connection.broker_endpoints.clone())
        .backoff_config(DEFAULT_BACKOFF_CONFIG);
        .backoff_config(DEFAULT_BACKOFF_CONFIG)
        .connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
    if let Some(sasl) = &connection.sasl {
        builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
    };

@@ -331,8 +331,29 @@ impl Runner {
        }

        match status {
            Status::Executing { .. } => {}
            Status::Executing { .. } => {
                let prev_state = self.meta.state();
                if !matches!(prev_state, ProcedureState::Running) {
                    info!(
                        "Set Procedure {}-{} state to running, prev_state: {:?}",
                        self.procedure.type_name(),
                        self.meta.id,
                        prev_state
                    );
                    self.meta.set_state(ProcedureState::Running);
                }
            }
            Status::Suspended { subprocedures, .. } => {
                let prev_state = self.meta.state();
                if !matches!(prev_state, ProcedureState::Running) {
                    info!(
                        "Set Procedure {}-{} state to running, prev_state: {:?}",
                        self.procedure.type_name(),
                        self.meta.id,
                        prev_state
                    );
                    self.meta.set_state(ProcedureState::Running);
                }
                self.on_suspended(subprocedures).await;
            }
            Status::Done { output } => {
@@ -393,8 +414,12 @@ impl Runner {
            return;
        }

            self.meta
                .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
            if self.procedure.rollback_supported() {
                self.meta
                    .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
            } else {
                self.meta.set_state(ProcedureState::failed(Arc::new(e)));
            }
        }
    }
}
@@ -1080,20 +1105,10 @@ mod tests {
        let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
        runner.manager_ctx.start();

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_prepare_rollback(), "{state:?}");

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_failed(), "{state:?}");
        check_files(
            &object_store,
            &procedure_store,
            ctx.procedure_id,
            &["0000000000.rollback"],
        )
        .await;
        check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
    }

    #[tokio::test]
@@ -1146,6 +1161,8 @@ mod tests {
            async move {
                if times == 1 {
                    Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
                } else if times == 2 {
                    Ok(Status::executing(false))
                } else {
                    Ok(Status::done())
                }
@@ -1172,6 +1189,10 @@ mod tests {
        let state = runner.meta.state();
        assert!(state.is_retrying(), "{state:?}");

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_running(), "{state:?}");

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_done(), "{state:?}");
@@ -1185,6 +1206,86 @@ mod tests {
        .await;
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn test_execute_on_retry_later_error_with_child() {
        common_telemetry::init_default_ut_logging();
        let mut times = 0;
        let child_id = ProcedureId::random();

        let exec_fn = move |_| {
            times += 1;
            async move {
                debug!("times: {}", times);
                if times == 1 {
                    Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
                } else if times == 2 {
                    let exec_fn = |_| {
                        async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }
                            .boxed()
                    };
                    let fail = ProcedureAdapter {
                        data: "fail".to_string(),
                        lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
                        poison_keys: PoisonKeys::default(),
                        exec_fn,
                        rollback_fn: None,
                    };

                    Ok(Status::Suspended {
                        subprocedures: vec![ProcedureWithId {
                            id: child_id,
                            procedure: Box::new(fail),
                        }],
                        persist: true,
                    })
                } else {
                    Ok(Status::done())
                }
            }
            .boxed()
        };

        let retry_later = ProcedureAdapter {
            data: "retry_later".to_string(),
            lock_key: LockKey::single_exclusive("catalog.schema.table"),
            poison_keys: PoisonKeys::default(),
            exec_fn,
            rollback_fn: None,
        };

        let dir = create_temp_dir("retry_later");
        let meta = retry_later.new_meta(ROOT_ID);
        let ctx = context_without_provider(meta.id);
        let object_store = test_util::new_object_store(&dir);
        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
        let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
        runner.manager_ctx.start();
        debug!("execute_once 1");
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_retrying(), "{state:?}");

        let moved_meta = meta.clone();
        tokio::spawn(async move {
            moved_meta.child_notify.notify_one();
        });
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_running(), "{state:?}");

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_done(), "{state:?}");
        assert!(meta.state().is_done());
        check_files(
            &object_store,
            &procedure_store,
            ctx.procedure_id,
            &["0000000000.step", "0000000001.commit"],
        )
        .await;
    }

    #[tokio::test]
    async fn test_execute_exceed_max_retry_later() {
        let exec_fn =
@@ -1304,7 +1405,7 @@ mod tests {
    async fn test_child_error() {
        let mut times = 0;
        let child_id = ProcedureId::random();

        common_telemetry::init_default_ut_logging();
        let exec_fn = move |ctx: Context| {
            times += 1;
            async move {
@@ -1529,7 +1630,7 @@ mod tests {

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_prepare_rollback(), "{state:?}");
        assert!(state.is_failed(), "{state:?}");

        let procedure_id = runner
            .manager_ctx
@@ -1596,11 +1697,6 @@ mod tests {
        let state = runner.meta.state();
        assert!(state.is_running(), "{state:?}");

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_prepare_rollback(), "{state:?}");
        assert!(meta.state().is_prepare_rollback());

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_failed(), "{state:?}");

@@ -46,6 +46,22 @@ pub enum OutputData {
    Stream(SendableRecordBatchStream),
}

impl OutputData {
    /// Consumes the data into a pretty-printed string.
    pub async fn pretty_print(self) -> String {
        match self {
            OutputData::AffectedRows(x) => {
                format!("Affected Rows: {x}")
            }
            OutputData::RecordBatches(x) => x.pretty_print().unwrap_or_else(|e| e.to_string()),
            OutputData::Stream(x) => common_recordbatch::util::collect_batches(x)
                .await
                .and_then(|x| x.pretty_print())
                .unwrap_or_else(|e| e.to_string()),
        }
    }
}

/// OutputMeta stores meta information produced/generated during the execution
#[derive(Debug, Default)]
pub struct OutputMeta {

@@ -36,6 +36,9 @@ pub const DEFAULT_BACKOFF_CONFIG: BackoffConfig = BackoffConfig {
    deadline: Some(Duration::from_secs(3)),
};

/// The default connect timeout for the Kafka client.
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Default interval for auto WAL pruning.
pub const DEFAULT_AUTO_PRUNE_INTERVAL: Duration = Duration::from_mins(30);
/// Default limit for concurrent auto pruning tasks.

@@ -22,6 +22,7 @@ use common_base::Plugins;
use common_error::ext::BoxedError;
use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
use common_meta::cache::{LayeredCacheRegistry, SchemaCacheRef, TableSchemaCacheRef};
use common_meta::cache_invalidator::CacheInvalidatorRef;
use common_meta::datanode::TopicStatsReporter;
use common_meta::key::runtime_switch::RuntimeSwitchManager;
use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef};
@@ -281,21 +282,11 @@ impl DatanodeBuilder {
            open_all_regions.await?;
        }

        let mut resource_stat = ResourceStatImpl::default();
        resource_stat.start_collect_cpu_usage();

        let heartbeat_task = if let Some(meta_client) = meta_client {
            Some(
                HeartbeatTask::try_new(
                    &self.opts,
                    region_server.clone(),
                    meta_client,
                    cache_registry,
                    self.plugins.clone(),
                    Arc::new(resource_stat),
                )
                .await?,
            )
            let task = self
                .create_heartbeat_task(&region_server, meta_client, cache_registry)
                .await?;
            Some(task)
        } else {
            None
        };
@@ -324,6 +315,29 @@ impl DatanodeBuilder {
        })
    }

    async fn create_heartbeat_task(
        &self,
        region_server: &RegionServer,
        meta_client: MetaClientRef,
        cache_invalidator: CacheInvalidatorRef,
    ) -> Result<HeartbeatTask> {
        let stat = {
            let mut stat = ResourceStatImpl::default();
            stat.start_collect_cpu_usage();
            Arc::new(stat)
        };

        HeartbeatTask::try_new(
            &self.opts,
            region_server.clone(),
            meta_client,
            cache_invalidator,
            self.plugins.clone(),
            stat,
        )
        .await
    }

    /// Builds [ObjectStoreManager] from [StorageConfig].
    pub async fn build_object_store_manager(cfg: &StorageConfig) -> Result<ObjectStoreManagerRef> {
        let object_store = store::new_object_store(cfg.store.clone(), &cfg.data_home).await?;

@@ -25,6 +25,7 @@ use common_meta::datanode::REGION_STATISTIC_KEY;
use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_meta::heartbeat::handler::suspend::SuspendHandler;
use common_meta::heartbeat::handler::{
    HandlerGroupExecutor, HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
};
@@ -91,6 +92,7 @@ impl HeartbeatTask {
        let resp_handler_executor = Arc::new(HandlerGroupExecutor::new(vec![
            region_alive_keeper.clone(),
            Arc::new(ParseMailboxMessageHandler),
            Arc::new(SuspendHandler::new(region_server.suspend_state())),
            Arc::new(
                RegionHeartbeatResponseHandler::new(region_server.clone())
                    .with_open_region_parallelism(opts.init_regions_parallelism),

@@ -99,26 +99,30 @@ impl RegionHeartbeatResponseHandler {
        self
    }

    fn build_handler(&self, instruction: &Instruction) -> MetaResult<Box<InstructionHandlers>> {
    fn build_handler(
        &self,
        instruction: &Instruction,
    ) -> MetaResult<Option<Box<InstructionHandlers>>> {
        match instruction {
            Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler.into())),
            Instruction::OpenRegions(_) => Ok(Box::new(
            Instruction::CloseRegions(_) => Ok(Some(Box::new(CloseRegionsHandler.into()))),
            Instruction::OpenRegions(_) => Ok(Some(Box::new(
                OpenRegionsHandler {
                    open_region_parallelism: self.open_region_parallelism,
                }
                .into(),
            )),
            Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())),
            Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())),
            Instruction::UpgradeRegions(_) => Ok(Box::new(
            ))),
            Instruction::FlushRegions(_) => Ok(Some(Box::new(FlushRegionsHandler.into()))),
            Instruction::DowngradeRegions(_) => Ok(Some(Box::new(DowngradeRegionsHandler.into()))),
            Instruction::UpgradeRegions(_) => Ok(Some(Box::new(
                UpgradeRegionsHandler {
                    upgrade_region_parallelism: self.open_region_parallelism,
                }
                .into(),
            )),
            Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())),
            Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())),
            ))),
            Instruction::GetFileRefs(_) => Ok(Some(Box::new(GetFileRefsHandler.into()))),
            Instruction::GcRegions(_) => Ok(Some(Box::new(GcRegionsHandler.into()))),
            Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
            Instruction::Suspend => Ok(None),
        }
    }
}
@@ -216,30 +220,24 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
            .context(InvalidHeartbeatResponseSnafu)?;

        let mailbox = ctx.mailbox.clone();
        let region_server = self.region_server.clone();
        let downgrade_tasks = self.downgrade_tasks.clone();
        let flush_tasks = self.flush_tasks.clone();
        let gc_tasks = self.gc_tasks.clone();
        let handler = self.build_handler(&instruction)?;
        let _handle = common_runtime::spawn_global(async move {
            let reply = handler
                .handle(
                    &HandlerContext {
                        region_server,
                        downgrade_tasks,
                        flush_tasks,
                        gc_tasks,
                    },
                    instruction,
                )
                .await;

            if let Some(reply) = reply
                && let Err(e) = mailbox.send((meta, reply)).await
            {
                error!(e; "Failed to send reply to mailbox");
            }
        });
        if let Some(handler) = self.build_handler(&instruction)? {
            let context = HandlerContext {
                region_server: self.region_server.clone(),
                downgrade_tasks: self.downgrade_tasks.clone(),
                flush_tasks: self.flush_tasks.clone(),
                gc_tasks: self.gc_tasks.clone(),
            };
            let _handle = common_runtime::spawn_global(async move {
                let reply = handler.handle(&context, instruction).await;
                if let Some(reply) = reply
                    && let Err(e) = mailbox.send((meta, reply)).await
                {
                    let error = e.to_string();
                    let (meta, reply) = e.0;
                    error!("Failed to send reply {reply} to {meta:?}: {error}");
                }
            });
        }

        Ok(HandleControl::Continue)
    }

@@ -17,6 +17,7 @@ mod catalog;
use std::collections::HashMap;
use std::fmt::Debug;
use std::ops::Deref;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use std::time::Duration;

@@ -52,7 +53,9 @@ pub use query::dummy_catalog::{
    DummyCatalogList, DummyTableProviderFactory, TableProviderFactoryRef,
};
use serde_json;
use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult};
use servers::error::{
    self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult, SuspendedSnafu,
};
use servers::grpc::FlightCompression;
use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream};
use servers::grpc::region_server::RegionServerHandler;
@@ -89,6 +92,7 @@ use crate::region_server::catalog::{NameAwareCatalogList, NameAwareDataSourceInj
pub struct RegionServer {
    inner: Arc<RegionServerInner>,
    flight_compression: FlightCompression,
    suspend: Arc<AtomicBool>,
}

pub struct RegionStat {
@@ -136,6 +140,7 @@ impl RegionServer {
                ),
            )),
            flight_compression,
            suspend: Arc::new(AtomicBool::new(false)),
        }
    }

@@ -595,6 +600,14 @@ impl RegionServer {
            .handle_sync_region(engine_with_status.engine(), region_id, manifest_info)
            .await
    }

    fn is_suspended(&self) -> bool {
        self.suspend.load(Ordering::Relaxed)
    }

    pub(crate) fn suspend_state(&self) -> Arc<AtomicBool> {
        self.suspend.clone()
    }
}

#[async_trait]
@@ -644,6 +657,8 @@ impl FlightCraft for RegionServer {
        &self,
        request: Request<Ticket>,
    ) -> TonicResult<Response<TonicStream<FlightData>>> {
        ensure!(!self.is_suspended(), SuspendedSnafu);

        let ticket = request.into_inner().ticket;
        let request = api::v1::region::QueryRequest::decode(ticket.as_ref())
            .context(servers_error::InvalidFlightTicketSnafu)?;

@@ -17,6 +17,7 @@ arc-swap = "1.0"
async-stream.workspace = true
async-trait.workspace = true
auth.workspace = true
axum.workspace = true
bytes.workspace = true
cache.workspace = true
catalog.workspace = true
@@ -85,6 +86,9 @@ common-test-util.workspace = true
datanode.workspace = true
datatypes.workspace = true
futures.workspace = true
hyper-util = { workspace = true, features = ["tokio"] }
meta-srv.workspace = true
reqwest.workspace = true
serde_json.workspace = true
strfmt = "0.2"
tower.workspace = true

@@ -364,6 +364,12 @@ pub enum Error {
        #[snafu(implicit)]
        location: Location,
    },

    #[snafu(display("Service suspended"))]
    Suspended {
        #[snafu(implicit)]
        location: Location,
    },
}

pub type Result<T> = std::result::Result<T, Error>;
@@ -444,6 +450,8 @@ impl ErrorExt for Error {
            Error::StatementTimeout { .. } => StatusCode::Cancelled,

            Error::AcquireLimiter { .. } => StatusCode::Internal,

            Error::Suspended { .. } => StatusCode::Suspended,
        }
    }

@@ -141,7 +141,43 @@ impl Frontend {

#[cfg(test)]
mod tests {
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::time::Duration;

    use api::v1::meta::heartbeat_server::HeartbeatServer;
    use api::v1::meta::mailbox_message::Payload;
    use api::v1::meta::{
        AskLeaderRequest, AskLeaderResponse, HeartbeatRequest, HeartbeatResponse, MailboxMessage,
        Peer, ResponseHeader, Role, heartbeat_server,
    };
    use async_trait::async_trait;
    use client::{Client, Database};
    use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
    use common_error::ext::ErrorExt;
    use common_error::from_header_to_err_code_msg;
    use common_error::status_code::StatusCode;
    use common_grpc::channel_manager::ChannelManager;
    use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS;
    use common_meta::heartbeat::handler::HandlerGroupExecutor;
    use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
    use common_meta::heartbeat::handler::suspend::SuspendHandler;
    use common_meta::instruction::Instruction;
    use common_stat::ResourceStatImpl;
    use meta_client::MetaClientRef;
    use meta_client::client::MetaClientBuilder;
    use meta_srv::service::GrpcStream;
    use servers::grpc::{FlightCompression, GRPC_SERVER};
    use servers::http::HTTP_SERVER;
    use servers::http::result::greptime_result_v1::GreptimedbV1Response;
    use tokio::sync::mpsc;
    use tonic::codec::CompressionEncoding;
    use tonic::codegen::tokio_stream::StreamExt;
    use tonic::codegen::tokio_stream::wrappers::ReceiverStream;
    use tonic::{Request, Response, Status, Streaming};

    use super::*;
    use crate::instance::builder::FrontendBuilder;
    use crate::server::Services;

    #[test]
    fn test_toml() {
@@ -149,4 +185,277 @@ mod tests {
        let toml_string = toml::to_string(&opts).unwrap();
        let _parsed: FrontendOptions = toml::from_str(&toml_string).unwrap();
    }

    struct SuspendableHeartbeatServer {
        suspend: Arc<AtomicBool>,
    }

    #[async_trait]
    impl heartbeat_server::Heartbeat for SuspendableHeartbeatServer {
        type HeartbeatStream = GrpcStream<HeartbeatResponse>;

        async fn heartbeat(
            &self,
            request: Request<Streaming<HeartbeatRequest>>,
        ) -> std::result::Result<Response<Self::HeartbeatStream>, Status> {
            let (tx, rx) = mpsc::channel(4);

            common_runtime::spawn_global({
                let mut requests = request.into_inner();
                let suspend = self.suspend.clone();
                async move {
                    while let Some(request) = requests.next().await {
                        if let Err(e) = request {
                            let _ = tx.send(Err(e)).await;
                            return;
                        }

                        let mailbox_message =
                            suspend.load(Ordering::Relaxed).then(|| MailboxMessage {
                                payload: Some(Payload::Json(
                                    serde_json::to_string(&Instruction::Suspend).unwrap(),
                                )),
                                ..Default::default()
                            });
                        let response = HeartbeatResponse {
                            header: Some(ResponseHeader::success()),
                            mailbox_message,
                            ..Default::default()
                        };

                        let _ = tx.send(Ok(response)).await;
                    }
                }
            });

            Ok(Response::new(Box::pin(ReceiverStream::new(rx))))
        }

        async fn ask_leader(
            &self,
            _: Request<AskLeaderRequest>,
        ) -> std::result::Result<Response<AskLeaderResponse>, Status> {
            Ok(Response::new(AskLeaderResponse {
                header: Some(ResponseHeader::success()),
                leader: Some(Peer {
                    addr: "localhost:0".to_string(),
                    ..Default::default()
                }),
            }))
        }
    }

    async fn create_meta_client(
        options: &MetaClientOptions,
        heartbeat_server: Arc<SuspendableHeartbeatServer>,
    ) -> MetaClientRef {
        let (client, server) = tokio::io::duplex(1024);

        // create the heartbeat server:
        common_runtime::spawn_global(async move {
            let mut router = tonic::transport::Server::builder();
            let router = router.add_service(
                HeartbeatServer::from_arc(heartbeat_server)
                    .accept_compressed(CompressionEncoding::Zstd)
                    .send_compressed(CompressionEncoding::Zstd),
            );
            router
                .serve_with_incoming(futures::stream::iter([Ok::<_, std::io::Error>(server)]))
                .await
        });

        // Move client to an option so we can _move_ the inner value
        // on the first attempt to connect. All other attempts will fail.
        let mut client = Some(client);
        let connector = tower::service_fn(move |_| {
            let client = client.take();
            async move {
                if let Some(client) = client {
                    Ok(hyper_util::rt::TokioIo::new(client))
                } else {
                    Err(std::io::Error::other("client already taken"))
                }
            }
        });
        let manager = ChannelManager::new();
        manager
            .reset_with_connector("localhost:0", connector)
            .unwrap();

        // create the heartbeat client:
        let mut client = MetaClientBuilder::new(0, Role::Frontend)
            .enable_heartbeat()
            .heartbeat_channel_manager(manager)
            .build();
        client.start(&options.metasrv_addrs).await.unwrap();
        Arc::new(client)
    }

    async fn create_frontend(
        options: &FrontendOptions,
        meta_client: MetaClientRef,
    ) -> Result<Frontend> {
        let instance = Arc::new(
            FrontendBuilder::new_test(options, meta_client.clone())
                .try_build()
                .await?,
        );

        let servers =
            Services::new(options.clone(), instance.clone(), Default::default()).build()?;

        let executor = Arc::new(HandlerGroupExecutor::new(vec![
            Arc::new(ParseMailboxMessageHandler),
            Arc::new(SuspendHandler::new(instance.suspend_state())),
        ]));
        let heartbeat_task = Some(HeartbeatTask::new(
            options,
            meta_client,
            executor,
            Arc::new(ResourceStatImpl::default()),
        ));

        let mut frontend = Frontend {
            instance,
            servers,
            heartbeat_task,
        };
        frontend.start().await?;
        Ok(frontend)
    }

    async fn verify_suspend_state_by_http(
        frontend: &Frontend,
        expected: std::result::Result<&str, (StatusCode, &str)>,
    ) {
        let addr = frontend.server_handlers().addr(HTTP_SERVER).unwrap();
        let response = reqwest::get(format!("http://{}/v1/sql?sql=SELECT 1", addr))
            .await
            .unwrap();

        let headers = response.headers();
        let response = if let Some((code, error)) = from_header_to_err_code_msg(headers) {
            Err((code, error))
        } else {
            Ok(response.text().await.unwrap())
        };

        match (response, expected) {
            (Ok(response), Ok(expected)) => {
                let response: GreptimedbV1Response = serde_json::from_str(&response).unwrap();
                let response = serde_json::to_string(response.output()).unwrap();
                assert_eq!(&response, expected);
            }
            (Err(actual), Err(expected)) => assert_eq!(actual, expected),
            _ => unreachable!(),
        }
    }

    async fn verify_suspend_state_by_grpc(
        frontend: &Frontend,
        expected: std::result::Result<&str, (StatusCode, &str)>,
    ) {
        let addr = frontend.server_handlers().addr(GRPC_SERVER).unwrap();
        let client = Client::with_urls([addr.to_string()]);
        let client = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
        let response = client.sql("SELECT 1").await;

        match (response, expected) {
            (Ok(response), Ok(expected)) => {
                let response = response.data.pretty_print().await;
                assert_eq!(&response, expected.trim());
            }
            (Err(actual), Err(expected)) => {
                assert_eq!(actual.status_code(), expected.0);
                assert_eq!(actual.output_msg(), expected.1);
            }
            _ => unreachable!(),
        }
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
    async fn test_suspend_frontend() -> Result<()> {
        common_telemetry::init_default_ut_logging();

        let meta_client_options = MetaClientOptions {
            metasrv_addrs: vec!["localhost:0".to_string()],
            ..Default::default()
        };
        let options = FrontendOptions {
            http: HttpOptions {
                addr: "127.0.0.1:0".to_string(),
                ..Default::default()
            },
            grpc: GrpcOptions {
                bind_addr: "127.0.0.1:0".to_string(),
                flight_compression: FlightCompression::None,
                ..Default::default()
            },
            mysql: MysqlOptions {
                enable: false,
                ..Default::default()
            },
            postgres: PostgresOptions {
                enable: false,
                ..Default::default()
            },
            meta_client: Some(meta_client_options.clone()),
            ..Default::default()
        };

        let server = Arc::new(SuspendableHeartbeatServer {
            suspend: Arc::new(AtomicBool::new(false)),
        });
        let meta_client = create_meta_client(&meta_client_options, server.clone()).await;
        let frontend = create_frontend(&options, meta_client).await?;

        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
        // initial state: not suspended:
        assert!(!frontend.instance.is_suspended());
        verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
        verify_suspend_state_by_grpc(
            &frontend,
            Ok(r#"
+----------+
| Int64(1) |
+----------+
| 1 |
+----------+"#),
        )
        .await;

        // make the heartbeat server return the "suspend" instruction,
        server.suspend.store(true, Ordering::Relaxed);
        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
        // ... then the frontend is suspended:
        assert!(frontend.instance.is_suspended());
        verify_suspend_state_by_http(
            &frontend,
            Err((
                StatusCode::Suspended,
                "error: Service suspended, execution_time_ms: 0",
            )),
        )
        .await;
        verify_suspend_state_by_grpc(&frontend, Err((StatusCode::Suspended, "Service suspended")))
            .await;

        // make the heartbeat server NOT return the "suspend" instruction,
        server.suspend.store(false, Ordering::Relaxed);
        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
        // ... then the frontend's suspend state is cleared:
        assert!(!frontend.instance.is_suspended());
        verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
        verify_suspend_state_by_grpc(
            &frontend,
            Ok(r#"
+----------+
| Int64(1) |
+----------+
| 1 |
+----------+"#),
        )
        .await;
        Ok(())
    }
}

@@ -27,7 +27,6 @@ use common_stat::ResourceStatRef;
use common_telemetry::{debug, error, info, warn};
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
use servers::addrs;
use servers::heartbeat_options::HeartbeatOptions;
use snafu::ResultExt;
use tokio::sync::mpsc;
use tokio::sync::mpsc::Receiver;
@@ -54,7 +53,6 @@ impl HeartbeatTask {
    pub fn new(
        opts: &FrontendOptions,
        meta_client: Arc<MetaClient>,
        heartbeat_opts: HeartbeatOptions,
        resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
        resource_stat: ResourceStatRef,
    ) -> Self {
@@ -68,8 +66,8 @@ impl HeartbeatTask {
                addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
            },
            meta_client,
            report_interval: heartbeat_opts.interval,
            retry_interval: heartbeat_opts.retry_interval,
            report_interval: opts.heartbeat.interval,
            retry_interval: opts.heartbeat.retry_interval,
            resp_handler_executor,
            start_time_ms: common_time::util::current_time_millis() as u64,
            resource_stat,
@@ -196,7 +194,8 @@ impl HeartbeatTask {
        let report_interval = self.report_interval;
        let start_time_ms = self.start_time_ms;
        let self_peer = Some(Peer {
            // The peer id doesn't make sense for the frontend, so we just set it to 0.
            // The node id will actually be calculated from its address (by hashing the address
            // string) in the metasrv, so it can be set to 0 here as a placeholder.
            id: 0,
            addr: self.peer_addr.clone(),
        });

@@ -26,7 +26,8 @@ mod region_query;
pub mod standalone;

use std::pin::Pin;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, atomic};
use std::time::{Duration, SystemTime};

use async_stream::stream;
@@ -83,6 +84,7 @@ use snafu::prelude::*;
use sql::ast::ObjectNamePartExt;
use sql::dialect::Dialect;
use sql::parser::{ParseOptions, ParserContext};
use sql::statements::comment::CommentObject;
use sql::statements::copy::{CopyDatabase, CopyTable};
use sql::statements::statement::Statement;
use sql::statements::tql::Tql;
@@ -119,6 +121,7 @@ pub struct Instance {
    limiter: Option<LimiterRef>,
    process_manager: ProcessManagerRef,
    slow_query_options: SlowQueryOptions,
    suspend: Arc<AtomicBool>,

    // cache for otlp metrics
    // first layer key: db-string
@@ -171,6 +174,14 @@ impl Instance {
    pub fn procedure_executor(&self) -> &ProcedureExecutorRef {
        self.statement_executor.procedure_executor()
    }

    pub fn suspend_state(&self) -> Arc<AtomicBool> {
        self.suspend.clone()
    }

    pub(crate) fn is_suspended(&self) -> bool {
        self.suspend.load(atomic::Ordering::Relaxed)
    }
}

fn parse_stmt(sql: &str, dialect: &(dyn Dialect + Send + Sync)) -> Result<Vec<Statement>> {
@@ -513,6 +524,10 @@ impl SqlQueryHandler for Instance {

    #[tracing::instrument(skip_all)]
    async fn do_query(&self, query: &str, query_ctx: QueryContextRef) -> Vec<Result<Output>> {
        if self.is_suspended() {
            return vec![error::SuspendedSnafu {}.fail()];
        }

        let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
        let query_interceptor = query_interceptor_opt.as_ref();
        let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) {
@@ -580,6 +595,8 @@ impl SqlQueryHandler for Instance {
        plan: LogicalPlan,
        query_ctx: QueryContextRef,
    ) -> Result<Output> {
        ensure!(!self.is_suspended(), error::SuspendedSnafu);

        if should_capture_statement(stmt.as_ref()) {
            // It's safe to unwrap here because we've already checked the type.
            let stmt = stmt.unwrap();
@@ -641,6 +658,10 @@ impl SqlQueryHandler for Instance {
        query: &PromQuery,
        query_ctx: QueryContextRef,
    ) -> Vec<Result<Output>> {
        if self.is_suspended() {
            return vec![error::SuspendedSnafu {}.fail()];
        }

        // check will be done in prometheus handler's do_query
        let result = PrometheusHandler::do_query(self, query, query_ctx)
            .await
@@ -655,6 +676,8 @@ impl SqlQueryHandler for Instance {
        stmt: Statement,
        query_ctx: QueryContextRef,
    ) -> Result<Option<DescribeResult>> {
        ensure!(!self.is_suspended(), error::SuspendedSnafu);

        if matches!(
            stmt,
            Statement::Insert(_) | Statement::Query(_) | Statement::Delete(_)
@@ -875,7 +898,7 @@ pub fn check_permission(
            validate_param(&stmt.table_name, query_ctx)?;
        }
        Statement::ShowCreateFlow(stmt) => {
            validate_param(&stmt.flow_name, query_ctx)?;
            validate_flow(&stmt.flow_name, query_ctx)?;
        }
        #[cfg(feature = "enterprise")]
        Statement::ShowCreateTrigger(stmt) => {
@@ -908,6 +931,12 @@ pub fn check_permission(
        // show charset and show collation won't be checked
        Statement::ShowCharset(_) | Statement::ShowCollation(_) => {}

        Statement::Comment(comment) => match &comment.object {
            CommentObject::Table(table) => validate_param(table, query_ctx)?,
            CommentObject::Column { table, .. } => validate_param(table, query_ctx)?,
            CommentObject::Flow(flow) => validate_flow(flow, query_ctx)?,
        },

        Statement::Insert(insert) => {
            let name = insert.table_name().context(ParseSqlSnafu)?;
            validate_param(name, query_ctx)?;
@@ -993,6 +1022,27 @@ fn validate_param(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()>
        .context(SqlExecInterceptedSnafu)
}

fn validate_flow(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> {
    let catalog = match &name.0[..] {
        [_flow] => query_ctx.current_catalog().to_string(),
        [catalog, _flow] => catalog.to_string_unquoted(),
        _ => {
            return InvalidSqlSnafu {
                err_msg: format!(
                    "expect flow name to be <catalog>.<flow_name> or <flow_name>, actual: {name}",
                ),
            }
            .fail();
        }
    };

    let schema = query_ctx.current_schema();

    validate_catalog_and_schema(&catalog, &schema, query_ctx)
        .map_err(BoxedError::new)
        .context(SqlExecInterceptedSnafu)
}

fn validate_database(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> {
    let (catalog, schema) = match &name.0[..] {
        [schema] => (
@@ -1251,6 +1301,28 @@ mod tests {

        // test describe table
        let sql = "DESC TABLE {catalog}{schema}demo;";
        replace_test(sql, plugins, &query_ctx);
        replace_test(sql, plugins.clone(), &query_ctx);

        let comment_flow_cases = [
            ("COMMENT ON FLOW my_flow IS 'comment';", true),
            ("COMMENT ON FLOW greptime.my_flow IS 'comment';", true),
            ("COMMENT ON FLOW wrongcatalog.my_flow IS 'comment';", false),
        ];
        for (sql, is_ok) in comment_flow_cases {
            let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0];
            let result = check_permission(plugins.clone(), stmt, &query_ctx);
            assert_eq!(result.is_ok(), is_ok);
        }

        let show_flow_cases = [
            ("SHOW CREATE FLOW my_flow;", true),
            ("SHOW CREATE FLOW greptime.my_flow;", true),
            ("SHOW CREATE FLOW wrongcatalog.my_flow;", false),
        ];
        for (sql, is_ok) in show_flow_cases {
            let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0];
            let result = check_permission(plugins.clone(), stmt, &query_ctx);
            assert_eq!(result.is_ok(), is_ok);
        }
    }
}

@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use cache::{TABLE_FLOWNODE_SET_CACHE_NAME, TABLE_ROUTE_CACHE_NAME};
|
||||
use catalog::CatalogManagerRef;
|
||||
@@ -87,6 +88,33 @@ impl FrontendBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn new_test(
|
||||
options: &FrontendOptions,
|
||||
meta_client: meta_client::MetaClientRef,
|
||||
) -> Self {
|
||||
let kv_backend = Arc::new(common_meta::kv_backend::memory::MemoryKvBackend::new());
|
||||
|
||||
let layered_cache_registry = Arc::new(
|
||||
common_meta::cache::LayeredCacheRegistryBuilder::default()
|
||||
.add_cache_registry(cache::build_fundamental_cache_registry(kv_backend.clone()))
|
||||
.build(),
|
||||
);
|
||||
|
||||
Self::new(
|
||||
options.clone(),
|
||||
kv_backend,
|
||||
layered_cache_registry,
|
||||
catalog::memory::MemoryCatalogManager::with_default_setup(),
|
||||
Arc::new(client::client_manager::NodeClients::default()),
|
||||
meta_client,
|
||||
Arc::new(catalog::process_manager::ProcessManager::new(
|
||||
"".to_string(),
|
||||
None,
|
||||
)),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn with_local_cache_invalidator(self, cache_invalidator: CacheInvalidatorRef) -> Self {
|
||||
Self {
|
||||
local_cache_invalidator: Some(cache_invalidator),
|
||||
@@ -242,6 +270,7 @@ impl FrontendBuilder {
|
||||
process_manager,
|
||||
otlp_metrics_table_legacy_cache: DashMap::new(),
|
||||
slow_query_options: self.options.slow_query.clone(),
|
||||
suspend: Arc::new(AtomicBool::new(false)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -234,6 +234,11 @@ impl GrpcQueryHandler for Instance {
|
||||
DdlExpr::DropView(_) => {
|
||||
todo!("implemented in the following PR")
|
||||
}
|
||||
DdlExpr::CommentOn(expr) => {
|
||||
self.statement_executor
|
||||
.comment_by_expr(expr, ctx.clone())
|
||||
.await?
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -399,6 +404,9 @@ fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryConte
|
||||
Expr::DropView(expr) => {
|
||||
check_and_fill!(expr);
|
||||
}
|
||||
Expr::CommentOn(expr) => {
|
||||
check_and_fill!(expr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -65,8 +65,7 @@ impl JaegerQueryHandler for Instance {
|
||||
// It's equivalent to `SELECT DISTINCT(service_name) FROM {db}.{trace_table}`.
|
||||
Ok(query_trace_table(
|
||||
ctx,
|
||||
self.catalog_manager(),
|
||||
self.query_engine(),
|
||||
self,
|
||||
vec![SelectExpr::from(col(SERVICE_NAME_COLUMN))],
|
||||
vec![],
|
||||
vec![],
|
||||
@@ -107,8 +106,7 @@ impl JaegerQueryHandler for Instance {
|
||||
// ```.
|
||||
Ok(query_trace_table(
|
||||
ctx,
|
||||
self.catalog_manager(),
|
||||
self.query_engine(),
|
||||
self,
|
||||
vec![
|
||||
SelectExpr::from(col(SPAN_NAME_COLUMN)),
|
||||
SelectExpr::from(col(SPAN_KIND_COLUMN)),
|
||||
@@ -160,8 +158,7 @@ impl JaegerQueryHandler for Instance {
|
||||
|
||||
Ok(query_trace_table(
|
||||
ctx,
|
||||
self.catalog_manager(),
|
||||
self.query_engine(),
|
||||
self,
|
||||
selects,
|
||||
filters,
|
||||
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
|
||||
@@ -220,8 +217,7 @@ impl JaegerQueryHandler for Instance {
|
||||
// ```.
|
||||
let output = query_trace_table(
|
||||
ctx.clone(),
|
||||
self.catalog_manager(),
|
||||
self.query_engine(),
|
||||
self,
|
||||
vec![wildcard()],
|
||||
filters,
|
||||
vec![],
|
||||
@@ -285,8 +281,7 @@ impl JaegerQueryHandler for Instance {
|
||||
// query all spans
|
||||
Ok(query_trace_table(
|
||||
ctx,
|
||||
self.catalog_manager(),
|
||||
self.query_engine(),
|
||||
self,
|
||||
vec![wildcard()],
|
||||
filters,
|
||||
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
|
||||
@@ -303,8 +298,7 @@ impl JaegerQueryHandler for Instance {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn query_trace_table(
|
||||
ctx: QueryContextRef,
|
||||
catalog_manager: &CatalogManagerRef,
|
||||
query_engine: &QueryEngineRef,
|
||||
instance: &Instance,
|
||||
selects: Vec<SelectExpr>,
|
||||
filters: Vec<Expr>,
|
||||
sorts: Vec<SortExpr>,
|
||||
@@ -334,7 +328,8 @@ async fn query_trace_table(
|
||||
}
|
||||
};
|
||||
|
||||
let table = catalog_manager
|
||||
let table = instance
|
||||
.catalog_manager()
|
||||
.table(
|
||||
ctx.current_catalog(),
|
||||
&ctx.current_schema(),
|
||||
@@ -367,7 +362,7 @@ async fn query_trace_table(
|
||||
.map(|s| format!("\"{}\"", s))
|
||||
.collect::<HashSet<String>>();
|
||||
|
||||
let df_context = create_df_context(query_engine)?;
|
||||
let df_context = create_df_context(instance.query_engine())?;
|
||||
|
||||
let dataframe = df_context
|
||||
.read_table(Arc::new(DfTableProviderAdapter::new(table)))
|
||||
|
||||
@@ -16,6 +16,9 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use auth::UserProviderRef;
|
||||
use axum::extract::{Request, State};
|
||||
use axum::middleware::Next;
|
||||
use axum::response::IntoResponse;
|
||||
use common_base::Plugins;
|
||||
use common_config::Configurable;
|
||||
use common_telemetry::info;
|
||||
@@ -27,6 +30,7 @@ use servers::grpc::frontend_grpc_handler::FrontendGrpcHandler;
|
||||
use servers::grpc::greptime_handler::GreptimeRequestHandler;
|
||||
use servers::grpc::{GrpcOptions, GrpcServer};
|
||||
use servers::http::event::LogValidatorRef;
|
||||
use servers::http::result::error_result::ErrorResponse;
|
||||
use servers::http::utils::router::RouterConfigurator;
|
||||
use servers::http::{HttpServer, HttpServerBuilder};
|
||||
use servers::interceptor::LogIngestInterceptorRef;
|
||||
@@ -39,6 +43,7 @@ use servers::query_handler::sql::ServerSqlQueryHandlerAdapter;
|
||||
use servers::server::{Server, ServerHandlers};
|
||||
use servers::tls::{ReloadableTlsServerConfig, maybe_watch_server_tls_config};
|
||||
use snafu::ResultExt;
|
||||
use tonic::Status;
|
||||
|
||||
use crate::error::{self, Result, StartServerSnafu, TomlFormatSnafu};
|
||||
use crate::frontend::FrontendOptions;
|
||||
@@ -125,7 +130,16 @@ where
|
||||
builder = builder.with_extra_router(configurator.router());
|
||||
}
|
||||
|
||||
builder
|
||||
builder.add_layer(axum::middleware::from_fn_with_state(
|
||||
self.instance.clone(),
|
||||
async move |State(state): State<Arc<Instance>>, request: Request, next: Next| {
|
||||
if state.is_suspended() {
|
||||
return ErrorResponse::from_error(servers::error::SuspendedSnafu.build())
|
||||
.into_response();
|
||||
}
|
||||
next.run(request).await
|
||||
},
|
||||
))
|
||||
}
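The added HTTP layer short-circuits every request while the instance is suspended. A self-contained sketch of the same axum pattern, using a made-up `AppState` and a plain 503 response instead of the real `ErrorResponse`:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

use axum::Router;
use axum::extract::{Request, State};
use axum::http::StatusCode;
use axum::middleware::{self, Next};
use axum::response::{IntoResponse, Response};
use axum::routing::get;

/// Hypothetical shared state standing in for the frontend `Instance`.
#[derive(Default)]
struct AppState {
    suspended: AtomicBool,
}

/// Rejects every request with 503 while the suspend flag is set.
async fn reject_when_suspended(
    State(state): State<Arc<AppState>>,
    request: Request,
    next: Next,
) -> Response {
    if state.suspended.load(Ordering::Relaxed) {
        return (StatusCode::SERVICE_UNAVAILABLE, "instance is suspended").into_response();
    }
    next.run(request).await
}

fn router(state: Arc<AppState>) -> Router {
    Router::new()
        .route("/health", get(|| async { "ok" }))
        .layer(middleware::from_fn_with_state(state, reject_when_suspended))
}
```

The gRPC builder change that follows uses the same closure but converts the error into a `tonic::Status` before turning it into an HTTP response.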
|
||||
|
||||
pub fn with_grpc_server_builder(self, builder: GrpcServerBuilder) -> Self {
|
||||
@@ -197,7 +211,17 @@ where
|
||||
self.instance.clone(),
|
||||
user_provider.clone(),
|
||||
))
|
||||
.flight_handler(flight_handler);
|
||||
.flight_handler(flight_handler)
|
||||
.add_layer(axum::middleware::from_fn_with_state(
|
||||
self.instance.clone(),
|
||||
async move |State(state): State<Arc<Instance>>, request: Request, next: Next| {
|
||||
if state.is_suspended() {
|
||||
let status = Status::from(servers::error::SuspendedSnafu.build());
|
||||
return status.into_http();
|
||||
}
|
||||
next.run(request).await
|
||||
},
|
||||
));
|
||||
|
||||
let grpc_server = if !external {
|
||||
let frontend_grpc_handler =
|
||||
|
||||
@@ -16,7 +16,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_wal::config::kafka::DatanodeKafkaConfig;
|
||||
use common_wal::config::kafka::common::DEFAULT_BACKOFF_CONFIG;
|
||||
use common_wal::config::kafka::common::{DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT};
|
||||
use dashmap::DashMap;
|
||||
use rskafka::client::ClientBuilder;
|
||||
use rskafka::client::partition::{Compression, PartitionClient, UnknownTopicHandling};
|
||||
@@ -78,7 +78,8 @@ impl ClientManager {
|
||||
) -> Result<Self> {
|
||||
// Sets backoff config for the top-level kafka client and all clients constructed by it.
|
||||
let mut builder = ClientBuilder::new(config.connection.broker_endpoints.clone())
|
||||
.backoff_config(DEFAULT_BACKOFF_CONFIG);
|
||||
.backoff_config(DEFAULT_BACKOFF_CONFIG)
|
||||
.connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
|
||||
if let Some(sasl) = &config.connection.sasl {
|
||||
builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
|
||||
};
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use api::v1::meta::cluster_server::ClusterServer;
|
||||
use api::v1::meta::heartbeat_server::HeartbeatServer;
|
||||
@@ -49,16 +50,21 @@ use crate::metasrv::builder::MetasrvBuilder;
|
||||
use crate::metasrv::{
|
||||
BackendImpl, ElectionRef, Metasrv, MetasrvOptions, SelectTarget, SelectorRef,
|
||||
};
|
||||
use crate::selector::SelectorType;
|
||||
use crate::selector::lease_based::LeaseBasedSelector;
|
||||
use crate::selector::load_based::LoadBasedSelector;
|
||||
use crate::selector::round_robin::RoundRobinSelector;
|
||||
use crate::selector::weight_compute::RegionNumsBasedWeightCompute;
|
||||
use crate::selector::{Selector, SelectorType};
|
||||
use crate::service::admin;
|
||||
use crate::service::admin::admin_axum_router;
|
||||
use crate::utils::etcd::create_etcd_client_with_tls;
|
||||
use crate::{Result, error};
|
||||
|
||||
/// The default keep-alive interval for gRPC.
|
||||
const DEFAULT_GRPC_KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(10);
|
||||
/// The default keep-alive timeout for gRPC.
|
||||
const DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
pub struct MetasrvInstance {
|
||||
metasrv: Arc<Metasrv>,
|
||||
|
||||
@@ -245,7 +251,12 @@ macro_rules! add_compressed_service {
|
||||
}
|
||||
|
||||
pub fn router(metasrv: Arc<Metasrv>) -> Router {
|
||||
let mut router = tonic::transport::Server::builder().accept_http1(true); // for admin services
|
||||
let mut router = tonic::transport::Server::builder()
|
||||
// for admin services
|
||||
.accept_http1(true)
|
||||
// For quick network failures detection.
|
||||
.http2_keepalive_interval(Some(DEFAULT_GRPC_KEEP_ALIVE_INTERVAL))
|
||||
.http2_keepalive_timeout(Some(DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT));
|
||||
let router = add_compressed_service!(router, HeartbeatServer::from_arc(metasrv.clone()));
|
||||
let router = add_compressed_service!(router, StoreServer::from_arc(metasrv.clone()));
|
||||
let router = add_compressed_service!(router, ClusterServer::from_arc(metasrv.clone()));
|
||||
@@ -393,7 +404,12 @@ pub async fn metasrv_builder(
|
||||
info!("Using selector from plugins");
|
||||
selector
|
||||
} else {
|
||||
let selector = match opts.selector {
|
||||
let selector: Arc<
|
||||
dyn Selector<
|
||||
Context = crate::metasrv::SelectorContext,
|
||||
Output = Vec<common_meta::peer::Peer>,
|
||||
>,
|
||||
> = match opts.selector {
|
||||
SelectorType::LoadBased => Arc::new(LoadBasedSelector::new(
|
||||
RegionNumsBasedWeightCompute,
|
||||
meta_peer_client.clone(),
|
||||
|
||||
@@ -63,22 +63,6 @@ pub struct EtcdElection {
|
||||
}
|
||||
|
||||
impl EtcdElection {
|
||||
pub async fn with_endpoints<E, S>(
|
||||
leader_value: E,
|
||||
endpoints: S,
|
||||
store_key_prefix: String,
|
||||
) -> Result<ElectionRef>
|
||||
where
|
||||
E: AsRef<str>,
|
||||
S: AsRef<[E]>,
|
||||
{
|
||||
let client = Client::connect(endpoints, None)
|
||||
.await
|
||||
.context(error::ConnectEtcdSnafu)?;
|
||||
|
||||
Self::with_etcd_client(leader_value, client, store_key_prefix).await
|
||||
}
|
||||
|
||||
pub async fn with_etcd_client<E>(
|
||||
leader_value: E,
|
||||
client: Client,
|
||||
|
||||
@@ -88,7 +88,8 @@ impl GcScheduler {
|
||||
|
||||
// Skip regions that are in the cooldown period
|
||||
if let Some(gc_info) = tracker.get(®ion_stat.id)
|
||||
&& now.duration_since(gc_info.last_gc_time) < self.config.gc_cooldown_period
|
||||
&& now.saturating_duration_since(gc_info.last_gc_time)
|
||||
< self.config.gc_cooldown_period
|
||||
{
|
||||
debug!("Skipping region {} due to cooldown", region_stat.id);
|
||||
continue;
|
||||
|
||||
@@ -434,7 +434,7 @@ impl GcScheduler {
|
||||
if let Some(gc_info) = gc_tracker.get(®ion_id) {
|
||||
if let Some(last_full_listing) = gc_info.last_full_listing_time {
|
||||
// Check whether the cooldown interval has passed since the last full listing
|
||||
let elapsed = now.duration_since(last_full_listing);
|
||||
let elapsed = now.saturating_duration_since(last_full_listing);
|
||||
elapsed >= self.config.full_file_listing_interval
|
||||
} else {
|
||||
// Never did full listing for this region, do it now
|
||||
|
||||
@@ -92,7 +92,7 @@ impl GcScheduler {
|
||||
|
||||
if let Some(gc_info) = gc_tracker.get(®ion_id) {
|
||||
if let Some(last_full_listing) = gc_info.last_full_listing_time {
|
||||
let elapsed = now.duration_since(last_full_listing);
|
||||
let elapsed = now.saturating_duration_since(last_full_listing);
|
||||
elapsed >= self.config.full_file_listing_interval
|
||||
} else {
|
||||
// Never did full listing for this region, do it now
|
||||
|
||||
@@ -32,7 +32,7 @@ use collect_leader_region_handler::CollectLeaderRegionHandler;
|
||||
use collect_stats_handler::CollectStatsHandler;
|
||||
use common_base::Plugins;
|
||||
use common_meta::datanode::Stat;
|
||||
use common_meta::instruction::{Instruction, InstructionReply};
|
||||
use common_meta::instruction::InstructionReply;
|
||||
use common_meta::sequence::Sequence;
|
||||
use common_telemetry::{debug, info, warn};
|
||||
use dashmap::DashMap;
|
||||
@@ -114,16 +114,19 @@ pub enum HandleControl {
|
||||
#[derive(Debug, Default)]
|
||||
pub struct HeartbeatAccumulator {
|
||||
pub header: Option<ResponseHeader>,
|
||||
pub instructions: Vec<Instruction>,
|
||||
mailbox_message: Option<MailboxMessage>,
|
||||
pub stat: Option<Stat>,
|
||||
pub inactive_region_ids: HashSet<RegionId>,
|
||||
pub region_lease: Option<RegionLease>,
|
||||
}
|
||||
|
||||
impl HeartbeatAccumulator {
|
||||
pub fn into_mailbox_message(self) -> Option<MailboxMessage> {
|
||||
// TODO(jiachun): to HeartbeatResponse payload
|
||||
None
|
||||
pub(crate) fn take_mailbox_message(&mut self) -> Option<MailboxMessage> {
|
||||
self.mailbox_message.take()
|
||||
}
|
||||
|
||||
pub fn set_mailbox_message(&mut self, message: MailboxMessage) {
|
||||
let _ = self.mailbox_message.insert(message);
|
||||
}
|
||||
}
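With this change the accumulator hands the mailbox message over at most once: `set_mailbox_message` stores it and `take_mailbox_message` moves it out. A small hedged example, usable inside the metasrv crate and assuming `MailboxMessage::default()` is available (prost-generated messages usually derive `Default`):

```rust
fn mailbox_message_is_taken_once() {
    let mut acc = HeartbeatAccumulator::default();
    acc.set_mailbox_message(MailboxMessage::default());

    // The first take yields the message, the second yields None.
    assert!(acc.take_mailbox_message().is_some());
    assert!(acc.take_mailbox_message().is_none());
}
```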
|
||||
|
||||
@@ -275,6 +278,15 @@ impl Pushers {
|
||||
async fn remove(&self, pusher_id: &str) -> Option<Pusher> {
|
||||
self.0.write().await.remove(pusher_id)
|
||||
}
|
||||
|
||||
pub(crate) async fn clear(&self) -> Vec<String> {
|
||||
let mut pushers = self.0.write().await;
|
||||
let keys = pushers.keys().cloned().collect::<Vec<_>>();
|
||||
if !keys.is_empty() {
|
||||
pushers.clear();
|
||||
}
|
||||
keys
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -309,12 +321,11 @@ impl HeartbeatHandlerGroup {
|
||||
}
|
||||
|
||||
/// Deregisters the heartbeat response [`Pusher`] with the given key from the group.
|
||||
///
|
||||
/// Returns the [`Pusher`] if it exists.
|
||||
pub async fn deregister_push(&self, pusher_id: PusherId) -> Option<Pusher> {
|
||||
METRIC_META_HEARTBEAT_CONNECTION_NUM.dec();
|
||||
pub async fn deregister_push(&self, pusher_id: PusherId) {
|
||||
info!("Pusher unregister: {}", pusher_id);
|
||||
self.pushers.remove(&pusher_id.string_key()).await
|
||||
if self.pushers.remove(&pusher_id.string_key()).await.is_some() {
|
||||
METRIC_META_HEARTBEAT_CONNECTION_NUM.dec();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the [`Pushers`] of the group.
|
||||
@@ -351,10 +362,11 @@ impl HeartbeatHandlerGroup {
|
||||
}
|
||||
}
|
||||
let header = std::mem::take(&mut acc.header);
|
||||
let mailbox_message = acc.take_mailbox_message();
|
||||
let res = HeartbeatResponse {
|
||||
header,
|
||||
region_lease: acc.region_lease,
|
||||
..Default::default()
|
||||
mailbox_message,
|
||||
};
|
||||
Ok(res)
|
||||
}
|
||||
@@ -382,7 +394,9 @@ impl HeartbeatMailbox {
|
||||
|
||||
/// Parses the [Instruction] from [MailboxMessage].
|
||||
#[cfg(test)]
|
||||
pub fn json_instruction(msg: &MailboxMessage) -> Result<Instruction> {
|
||||
pub(crate) fn json_instruction(
|
||||
msg: &MailboxMessage,
|
||||
) -> Result<common_meta::instruction::Instruction> {
|
||||
let Payload::Json(payload) =
|
||||
msg.payload
|
||||
.as_ref()
|
||||
@@ -519,6 +533,14 @@ impl Mailbox for HeartbeatMailbox {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn reset(&self) {
|
||||
let keys = self.pushers.clear().await;
|
||||
if !keys.is_empty() {
|
||||
info!("Reset mailbox, deregister pushers: {:?}", keys);
|
||||
METRIC_META_HEARTBEAT_CONNECTION_NUM.sub(keys.len() as i64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The builder to build the group of heartbeat handlers.
|
||||
|
||||
@@ -452,6 +452,7 @@ pub struct MetaStateHandler {
|
||||
greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
|
||||
leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
|
||||
leadership_change_notifier: LeadershipChangeNotifier,
|
||||
mailbox: MailboxRef,
|
||||
state: StateRef,
|
||||
}
|
||||
|
||||
@@ -475,6 +476,9 @@ impl MetaStateHandler {
|
||||
pub async fn on_leader_stop(&self) {
|
||||
self.state.write().unwrap().next_state(become_follower());
|
||||
|
||||
// Forces the mailbox to clear all pushers.
|
||||
// The remaining heartbeat connections will be closed by the remote peer or keep-alive detection.
|
||||
self.mailbox.reset().await;
|
||||
self.leadership_change_notifier
|
||||
.notify_on_leader_stop()
|
||||
.await;
|
||||
@@ -602,6 +606,7 @@ impl Metasrv {
|
||||
state: self.state.clone(),
|
||||
leader_cached_kv_backend: leader_cached_kv_backend.clone(),
|
||||
leadership_change_notifier,
|
||||
mailbox: self.mailbox.clone(),
|
||||
};
|
||||
let _handle = common_runtime::spawn_global(async move {
|
||||
loop {
|
||||
|
||||
@@ -207,6 +207,9 @@ pub trait Mailbox: Send + Sync {
|
||||
async fn broadcast(&self, ch: &BroadcastChannel, msg: &MailboxMessage) -> Result<()>;
|
||||
|
||||
async fn on_recv(&self, id: MessageId, maybe_msg: Result<MailboxMessage>) -> Result<()>;
|
||||
|
||||
/// Reset all pushers of the mailbox.
|
||||
async fn reset(&self);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -12,8 +12,9 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::distributed_time_constants::default_etcd_client_options;
|
||||
use common_meta::kv_backend::etcd::create_etcd_tls_options;
|
||||
use etcd_client::{Client, ConnectOptions};
|
||||
use etcd_client::Client;
|
||||
use servers::tls::{TlsMode, TlsOption};
|
||||
use snafu::ResultExt;
|
||||
|
||||
@@ -30,14 +31,15 @@ pub async fn create_etcd_client_with_tls(
|
||||
.filter(|x| !x.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let connect_options = tls_config
|
||||
.map(|c| create_etcd_tls_options(&convert_tls_option(c)))
|
||||
.transpose()
|
||||
.context(BuildTlsOptionsSnafu)?
|
||||
.flatten()
|
||||
.map(|tls_options| ConnectOptions::new().with_tls(tls_options));
|
||||
let mut connect_options = default_etcd_client_options();
|
||||
if let Some(tls_config) = tls_config
|
||||
&& let Some(tls_options) = create_etcd_tls_options(&convert_tls_option(tls_config))
|
||||
.context(BuildTlsOptionsSnafu)?
|
||||
{
|
||||
connect_options = connect_options.with_tls(tls_options);
|
||||
}
|
||||
|
||||
Client::connect(&etcd_endpoints, connect_options)
|
||||
Client::connect(&etcd_endpoints, Some(connect_options))
|
||||
.await
|
||||
.context(error::ConnectEtcdSnafu)
|
||||
}
|
||||
|
||||
@@ -144,6 +144,7 @@ async fn flush(mem: &SimpleBulkMemtable) {
|
||||
let reader = Box::new(DedupReader::new(
|
||||
merge_reader,
|
||||
read::dedup::LastRow::new(true),
|
||||
None,
|
||||
));
|
||||
Source::Reader(reader)
|
||||
};
|
||||
|
||||
@@ -228,22 +228,31 @@ impl AccessLayer {
|
||||
|
||||
// Delete all versions of the index file.
|
||||
for version in 0..=index_file_id.version {
|
||||
let path = location::index_file_path(
|
||||
&self.table_dir,
|
||||
RegionIndexId::new(index_file_id.file_id, version),
|
||||
self.path_type,
|
||||
);
|
||||
self.object_store
|
||||
.delete(&path)
|
||||
.await
|
||||
.context(DeleteIndexSnafu {
|
||||
file_id: region_file_id.file_id(),
|
||||
})?;
|
||||
let index_id = RegionIndexId::new(*region_file_id, version);
|
||||
self.delete_index(index_id).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_index(
|
||||
&self,
|
||||
index_file_id: RegionIndexId,
|
||||
) -> Result<(), crate::error::Error> {
|
||||
let path = location::index_file_path(
|
||||
&self.table_dir,
|
||||
RegionIndexId::new(index_file_id.file_id, index_file_id.version),
|
||||
self.path_type,
|
||||
);
|
||||
self.object_store
|
||||
.delete(&path)
|
||||
.await
|
||||
.context(DeleteIndexSnafu {
|
||||
file_id: index_file_id.file_id(),
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the directory of the region in the table.
|
||||
pub fn build_region_dir(&self, region_id: RegionId) -> String {
|
||||
region_dir_from_table_dir(&self.table_dir, region_id, self.path_type)
|
||||
|
||||
@@ -21,11 +21,10 @@ use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use index::bloom_filter::error::Result;
|
||||
use index::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader};
|
||||
use store_api::storage::{ColumnId, FileId};
|
||||
use store_api::storage::{ColumnId, FileId, IndexVersion};
|
||||
|
||||
use crate::cache::index::{INDEX_METADATA_TYPE, IndexCache, PageKey};
|
||||
use crate::metrics::{CACHE_HIT, CACHE_MISS};
|
||||
use crate::sst::file::IndexVersion;
|
||||
|
||||
const INDEX_TYPE_BLOOM_FILTER_INDEX: &str = "bloom_filter_index";
|
||||
|
||||
|
||||
src/mito2/src/cache/index/inverted_index.rs
@@ -22,11 +22,10 @@ use bytes::Bytes;
|
||||
use index::inverted_index::error::Result;
|
||||
use index::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use prost::Message;
|
||||
use store_api::storage::FileId;
|
||||
use store_api::storage::{FileId, IndexVersion};
|
||||
|
||||
use crate::cache::index::{INDEX_METADATA_TYPE, IndexCache, PageKey};
|
||||
use crate::metrics::{CACHE_HIT, CACHE_MISS};
|
||||
use crate::sst::file::IndexVersion;
|
||||
|
||||
const INDEX_TYPE_INVERTED_INDEX: &str = "inverted_index";
|
||||
|
||||
|
||||
@@ -730,11 +730,13 @@ async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) ->
|
||||
// dedup according to merge mode
|
||||
match options.merge_mode.unwrap_or(MergeMode::LastRow) {
|
||||
MergeMode::LastRow => {
|
||||
Box::new(DedupReader::new(merge_reader, LastRow::new(false))) as _
|
||||
}
|
||||
MergeMode::LastNonNull => {
|
||||
Box::new(DedupReader::new(merge_reader, LastNonNull::new(false))) as _
|
||||
Box::new(DedupReader::new(merge_reader, LastRow::new(false), None)) as _
|
||||
}
|
||||
MergeMode::LastNonNull => Box::new(DedupReader::new(
|
||||
merge_reader,
|
||||
LastNonNull::new(false),
|
||||
None,
|
||||
)) as _,
|
||||
}
|
||||
};
|
||||
Source::Reader(maybe_dedup)
|
||||
|
||||
@@ -287,6 +287,14 @@ impl LocalGcWorker {
|
||||
let region_id = region.region_id();
|
||||
|
||||
debug!("Doing gc for region {}", region_id);
|
||||
// Do the time-consuming listing only when full_file_listing is true,
// and do it first to make sure we have the latest manifest, etc.
|
||||
let all_entries = if self.full_file_listing {
|
||||
self.list_from_object_store(®ion).await?
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
let manifest = region.manifest_ctx.manifest().await;
|
||||
let region_id = manifest.metadata.region_id;
|
||||
let current_files = &manifest.files;
|
||||
@@ -303,10 +311,6 @@ impl LocalGcWorker {
|
||||
.map(|s| s.len())
|
||||
.sum::<usize>();
|
||||
|
||||
let concurrency = (current_files.len() / Self::CONCURRENCY_LIST_PER_FILES)
|
||||
.max(1)
|
||||
.min(self.opt.max_concurrent_lister_per_gc_job);
|
||||
|
||||
let in_used: HashSet<FileId> = current_files
|
||||
.keys()
|
||||
.cloned()
|
||||
@@ -314,7 +318,7 @@ impl LocalGcWorker {
|
||||
.collect();
|
||||
|
||||
let unused_files = self
|
||||
.list_to_be_deleted_files(region_id, &in_used, recently_removed_files, concurrency)
|
||||
.list_to_be_deleted_files(region_id, &in_used, recently_removed_files, all_entries)
|
||||
.await?;
|
||||
|
||||
let unused_file_cnt = unused_files.len();
|
||||
@@ -442,6 +446,32 @@ impl LocalGcWorker {
|
||||
Ok(listers)
|
||||
}
|
||||
|
||||
/// List all files in the region directory.
|
||||
/// Returns a vector of all file entries found.
|
||||
/// This might take a long time if there are many files in the region directory.
|
||||
async fn list_from_object_store(&self, region: &MitoRegionRef) -> Result<Vec<Entry>> {
|
||||
let start = tokio::time::Instant::now();
|
||||
let region_id = region.region_id();
|
||||
let manifest = region.manifest_ctx.manifest().await;
|
||||
let current_files = &manifest.files;
|
||||
let concurrency = (current_files.len() / Self::CONCURRENCY_LIST_PER_FILES)
|
||||
.max(1)
|
||||
.min(self.opt.max_concurrent_lister_per_gc_job);
|
||||
|
||||
let listers = self.partition_region_files(region_id, concurrency).await?;
|
||||
let lister_cnt = listers.len();
|
||||
|
||||
// Step 2: Concurrently list all files in the region directory
|
||||
let all_entries = self.list_region_files_concurrent(listers).await?;
|
||||
let cnt = all_entries.len();
|
||||
info!(
|
||||
"gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}.",
|
||||
start.elapsed().as_secs_f64(),
|
||||
region_id
|
||||
);
|
||||
Ok(all_entries)
|
||||
}
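The lister concurrency used above is a simple clamp: roughly one lister per `CONCURRENCY_LIST_PER_FILES` files, never fewer than one and never more than the configured maximum. A small sketch of that arithmetic; the constant value 32 below is made up for illustration:

```rust
/// Mirrors `(current_files.len() / CONCURRENCY_LIST_PER_FILES).max(1).min(max)`.
fn lister_concurrency(num_files: usize, per_files: usize, max_listers: usize) -> usize {
    (num_files / per_files).max(1).min(max_listers)
}

fn example() {
    // With per_files = 32 and a cap of 4 listers:
    assert_eq!(lister_concurrency(10, 32, 4), 1); // small regions still get one lister
    assert_eq!(lister_concurrency(200, 32, 4), 4); // large regions are capped at the maximum
}
```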
|
||||
|
||||
/// Concurrently list all files in the region directory using the provided listers.
|
||||
/// Returns a vector of all file entries found across all partitions.
|
||||
async fn list_region_files_concurrent(
|
||||
@@ -572,9 +602,8 @@ impl LocalGcWorker {
|
||||
region_id: RegionId,
|
||||
in_used: &HashSet<FileId>,
|
||||
recently_removed_files: BTreeMap<Timestamp, HashSet<FileId>>,
|
||||
concurrency: usize,
|
||||
all_entries: Vec<Entry>,
|
||||
) -> Result<Vec<FileId>> {
|
||||
let start = tokio::time::Instant::now();
|
||||
let now = chrono::Utc::now();
|
||||
let may_linger_until = self
|
||||
.opt
|
||||
@@ -629,8 +658,7 @@ impl LocalGcWorker {
|
||||
.collect();
|
||||
|
||||
info!(
|
||||
"gc: fast mode (no full listing) cost {} secs for region {}, found {} files to delete from manifest",
|
||||
start.elapsed().as_secs_f64(),
|
||||
"gc: fast mode (no full listing) for region {}, found {} files to delete from manifest",
|
||||
region_id,
|
||||
files_to_delete.len()
|
||||
);
|
||||
@@ -638,15 +666,7 @@ impl LocalGcWorker {
|
||||
return Ok(files_to_delete);
|
||||
}
|
||||
|
||||
// Full file listing mode: perform expensive list operations to find orphan files
|
||||
// Step 1: Create partitioned listers for concurrent processing
|
||||
let listers = self.partition_region_files(region_id, concurrency).await?;
|
||||
let lister_cnt = listers.len();
|
||||
|
||||
// Step 2: Concurrently list all files in the region directory
|
||||
let all_entries = self.list_region_files_concurrent(listers).await?;
|
||||
|
||||
let cnt = all_entries.len();
|
||||
// Full file listing mode: get the full list of files from object store
|
||||
|
||||
// Step 3: Filter files to determine which ones can be deleted
|
||||
let (all_unused_files_ready_for_delete, all_in_exist_linger_files) = self
|
||||
@@ -658,12 +678,6 @@ impl LocalGcWorker {
|
||||
unknown_file_may_linger_until,
|
||||
);
|
||||
|
||||
info!(
|
||||
"gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}, found {} unused files to delete",
|
||||
start.elapsed().as_secs_f64(),
|
||||
region_id,
|
||||
all_unused_files_ready_for_delete.len()
|
||||
);
|
||||
debug!("All in exist linger files: {:?}", all_in_exist_linger_files);
|
||||
|
||||
Ok(all_unused_files_ready_for_delete)
|
||||
|
||||
@@ -57,6 +57,10 @@ pub(crate) mod version;
|
||||
|
||||
#[cfg(any(test, feature = "test"))]
|
||||
pub use bulk::part::BulkPart;
|
||||
pub use bulk::part::{
|
||||
BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
|
||||
sort_primary_key_record_batch,
|
||||
};
|
||||
#[cfg(any(test, feature = "test"))]
|
||||
pub use time_partition::filter_record_batch;
|
||||
|
||||
|
||||
@@ -464,7 +464,7 @@ impl UnorderedPart {
|
||||
}
|
||||
|
||||
/// More accurate estimation of the size of a record batch.
|
||||
pub(crate) fn record_batch_estimated_size(batch: &RecordBatch) -> usize {
|
||||
pub fn record_batch_estimated_size(batch: &RecordBatch) -> usize {
|
||||
batch
|
||||
.columns()
|
||||
.iter()
|
||||
@@ -715,7 +715,7 @@ fn new_primary_key_column_builders(
|
||||
}
|
||||
|
||||
/// Sorts the record batch with primary key format.
|
||||
fn sort_primary_key_record_batch(batch: &RecordBatch) -> Result<RecordBatch> {
|
||||
pub fn sort_primary_key_record_batch(batch: &RecordBatch) -> Result<RecordBatch> {
|
||||
let total_columns = batch.num_columns();
|
||||
let sort_columns = vec![
|
||||
// Primary key column (ascending)
|
||||
|
||||
@@ -627,7 +627,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false));
|
||||
let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false), None);
|
||||
let mut num_rows = 0;
|
||||
while let Some(b) = reader.next_batch().await.unwrap() {
|
||||
num_rows += b.num_rows();
|
||||
@@ -659,7 +659,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false));
|
||||
let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false), None);
|
||||
let mut num_rows = 0;
|
||||
while let Some(b) = reader.next_batch().await.unwrap() {
|
||||
num_rows += b.num_rows();
|
||||
|
||||
@@ -14,6 +14,10 @@
|
||||
|
||||
//! Utilities to remove duplicate rows from a sorted batch.
|
||||
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use api::v1::OpType;
|
||||
use async_trait::async_trait;
|
||||
use common_telemetry::debug;
|
||||
@@ -27,21 +31,34 @@ use crate::error::Result;
|
||||
use crate::metrics::MERGE_FILTER_ROWS_TOTAL;
|
||||
use crate::read::{Batch, BatchColumn, BatchReader};
|
||||
|
||||
/// Trait for reporting dedup metrics.
|
||||
pub trait DedupMetricsReport: Send + Sync {
|
||||
/// Reports and resets the metrics.
|
||||
fn report(&self, metrics: &mut DedupMetrics);
|
||||
}
|
||||
|
||||
/// A reader that dedups sorted batches from a source based on the
|
||||
/// dedup strategy.
|
||||
pub struct DedupReader<R, S> {
|
||||
source: R,
|
||||
strategy: S,
|
||||
metrics: DedupMetrics,
|
||||
/// Optional metrics reporter.
|
||||
metrics_reporter: Option<Arc<dyn DedupMetricsReport>>,
|
||||
}
|
||||
|
||||
impl<R, S> DedupReader<R, S> {
|
||||
/// Creates a new dedup reader.
|
||||
pub fn new(source: R, strategy: S) -> Self {
|
||||
pub fn new(
|
||||
source: R,
|
||||
strategy: S,
|
||||
metrics_reporter: Option<Arc<dyn DedupMetricsReport>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
source,
|
||||
strategy,
|
||||
metrics: DedupMetrics::default(),
|
||||
metrics_reporter,
|
||||
}
|
||||
}
|
||||
}
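Callers inside the crate can now pass an optional reporter to aggregate dedup metrics across readers. A hedged sketch of one possible implementation; `ScanDedupMetrics` is a made-up name, while `DedupMetricsReport`, `DedupMetrics::merge`, and the new `DedupReader::new` signature come from this diff:

```rust
use std::sync::{Arc, Mutex};

/// Folds per-reader metrics into a scan-wide total.
#[derive(Default)]
struct ScanDedupMetrics {
    total: Mutex<DedupMetrics>,
}

impl DedupMetricsReport for ScanDedupMetrics {
    fn report(&self, metrics: &mut DedupMetrics) {
        // Accumulate, then reset the local metrics as the trait contract requires.
        self.total.lock().unwrap().merge(metrics);
        *metrics = DedupMetrics::default();
    }
}

// Wiring it into a reader:
// let reporter: Arc<dyn DedupMetricsReport> = Arc::new(ScanDedupMetrics::default());
// let reader = DedupReader::new(source, LastRow::new(true), Some(reporter));
```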
|
||||
@@ -51,11 +68,14 @@ impl<R: BatchReader, S: DedupStrategy> DedupReader<R, S> {
|
||||
async fn fetch_next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
while let Some(batch) = self.source.next_batch().await? {
|
||||
if let Some(batch) = self.strategy.push_batch(batch, &mut self.metrics)? {
|
||||
self.metrics.maybe_report(&self.metrics_reporter);
|
||||
return Ok(Some(batch));
|
||||
}
|
||||
}
|
||||
|
||||
self.strategy.finish(&mut self.metrics)
|
||||
let result = self.strategy.finish(&mut self.metrics)?;
|
||||
self.metrics.maybe_report(&self.metrics_reporter);
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,6 +96,11 @@ impl<R, S> Drop for DedupReader<R, S> {
|
||||
MERGE_FILTER_ROWS_TOTAL
|
||||
.with_label_values(&["delete"])
|
||||
.inc_by(self.metrics.num_unselected_rows as u64);
|
||||
|
||||
// Report any remaining metrics.
|
||||
if let Some(reporter) = &self.metrics_reporter {
|
||||
reporter.report(&mut self.metrics);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,6 +163,8 @@ impl DedupStrategy for LastRow {
|
||||
mut batch: Batch,
|
||||
metrics: &mut DedupMetrics,
|
||||
) -> Result<Option<Batch>> {
|
||||
let start = Instant::now();
|
||||
|
||||
if batch.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -160,6 +187,7 @@ impl DedupStrategy for LastRow {
|
||||
if batch.num_rows() == 1 {
|
||||
// We don't need to update `prev_batch` because they have the same
|
||||
// key and timestamp.
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
return Ok(None);
|
||||
}
|
||||
// Skips the first row.
|
||||
@@ -189,6 +217,8 @@ impl DedupStrategy for LastRow {
|
||||
filter_deleted_from_batch(&mut batch, metrics)?;
|
||||
}
|
||||
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
|
||||
// The batch can become empty if all rows are deleted.
|
||||
if batch.is_empty() {
|
||||
Ok(None)
|
||||
@@ -215,12 +245,58 @@ fn filter_deleted_from_batch(batch: &mut Batch, metrics: &mut DedupMetrics) -> R
|
||||
}
|
||||
|
||||
/// Metrics for deduplication.
|
||||
#[derive(Debug, Default)]
|
||||
#[derive(Default)]
|
||||
pub struct DedupMetrics {
|
||||
/// Number of rows removed during deduplication.
|
||||
pub(crate) num_unselected_rows: usize,
|
||||
/// Number of deleted rows.
|
||||
pub(crate) num_deleted_rows: usize,
|
||||
/// Time spent on deduplication.
|
||||
pub(crate) dedup_cost: Duration,
|
||||
}
|
||||
|
||||
impl fmt::Debug for DedupMetrics {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
// Skip output if dedup_cost is zero
|
||||
if self.dedup_cost.is_zero() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
|
||||
write!(f, r#"{{"dedup_cost":"{:?}""#, self.dedup_cost)?;
|
||||
|
||||
if self.num_unselected_rows > 0 {
|
||||
write!(f, r#", "num_unselected_rows":{}"#, self.num_unselected_rows)?;
|
||||
}
|
||||
if self.num_deleted_rows > 0 {
|
||||
write!(f, r#", "num_deleted_rows":{}"#, self.num_deleted_rows)?;
|
||||
}
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
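The custom `Debug` impl prints a compact, JSON-like object and collapses to `{}` when no time was spent. An illustrative expectation, runnable inside the crate since the fields are `pub(crate)`:

```rust
use std::time::Duration;

fn debug_output_examples() {
    let mut m = DedupMetrics::default();
    // No dedup cost recorded yet: the whole output collapses to an empty object.
    assert_eq!(format!("{:?}", m), "{}");

    m.dedup_cost = Duration::from_millis(12);
    m.num_deleted_rows = 3;
    // Only non-zero counters are printed next to the cost.
    assert_eq!(format!("{:?}", m), r#"{"dedup_cost":"12ms", "num_deleted_rows":3}"#);
}
```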
|
||||
|
||||
impl DedupMetrics {
|
||||
/// Merges metrics from another DedupMetrics instance.
|
||||
pub(crate) fn merge(&mut self, other: &DedupMetrics) {
|
||||
let DedupMetrics {
|
||||
num_unselected_rows,
|
||||
num_deleted_rows,
|
||||
dedup_cost,
|
||||
} = other;
|
||||
|
||||
self.num_unselected_rows += *num_unselected_rows;
|
||||
self.num_deleted_rows += *num_deleted_rows;
|
||||
self.dedup_cost += *dedup_cost;
|
||||
}
|
||||
|
||||
/// Reports the metrics if dedup_cost exceeds 10ms and resets them.
|
||||
pub(crate) fn maybe_report(&mut self, reporter: &Option<Arc<dyn DedupMetricsReport>>) {
|
||||
if self.dedup_cost.as_millis() > 10
|
||||
&& let Some(r) = reporter
|
||||
{
|
||||
r.report(self);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Buffer to store fields in the last row to merge.
|
||||
@@ -427,6 +503,8 @@ impl LastNonNull {
|
||||
|
||||
impl DedupStrategy for LastNonNull {
|
||||
fn push_batch(&mut self, batch: Batch, metrics: &mut DedupMetrics) -> Result<Option<Batch>> {
|
||||
let start = Instant::now();
|
||||
|
||||
if batch.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -444,6 +522,7 @@ impl DedupStrategy for LastNonNull {
|
||||
// Next key is different.
|
||||
let buffer = std::mem::replace(buffer, batch);
|
||||
let merged = self.last_fields.merge_last_non_null(buffer, metrics)?;
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
return Ok(merged);
|
||||
}
|
||||
|
||||
@@ -451,6 +530,7 @@ impl DedupStrategy for LastNonNull {
|
||||
// The next batch has a different timestamp.
|
||||
let buffer = std::mem::replace(buffer, batch);
|
||||
let merged = self.last_fields.merge_last_non_null(buffer, metrics)?;
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
return Ok(merged);
|
||||
}
|
||||
|
||||
@@ -460,6 +540,7 @@ impl DedupStrategy for LastNonNull {
|
||||
// We assume each batch doesn't contain duplicate rows, so we only need to check the first row.
|
||||
if batch.num_rows() == 1 {
|
||||
self.last_fields.push_first_row(&batch);
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
@@ -472,10 +553,14 @@ impl DedupStrategy for LastNonNull {
|
||||
let buffer = std::mem::replace(buffer, batch);
|
||||
let merged = self.last_fields.merge_last_non_null(buffer, metrics)?;
|
||||
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
|
||||
Ok(merged)
|
||||
}
|
||||
|
||||
fn finish(&mut self, metrics: &mut DedupMetrics) -> Result<Option<Batch>> {
|
||||
let start = Instant::now();
|
||||
|
||||
let Some(buffer) = self.buffer.take() else {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -485,6 +570,8 @@ impl DedupStrategy for LastNonNull {
|
||||
|
||||
let merged = self.last_fields.merge_last_non_null(buffer, metrics)?;
|
||||
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
|
||||
Ok(merged)
|
||||
}
|
||||
}
|
||||
@@ -614,14 +701,14 @@ mod tests {
|
||||
|
||||
// Test last row.
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastRow::new(true));
|
||||
let mut reader = DedupReader::new(reader, LastRow::new(true), None);
|
||||
check_reader_result(&mut reader, &input).await;
|
||||
assert_eq!(0, reader.metrics().num_unselected_rows);
|
||||
assert_eq!(0, reader.metrics().num_deleted_rows);
|
||||
|
||||
// Test last non-null.
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true));
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true), None);
|
||||
check_reader_result(&mut reader, &input).await;
|
||||
assert_eq!(0, reader.metrics().num_unselected_rows);
|
||||
assert_eq!(0, reader.metrics().num_deleted_rows);
|
||||
@@ -662,7 +749,7 @@ mod tests {
|
||||
];
|
||||
// Filter deleted.
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastRow::new(true));
|
||||
let mut reader = DedupReader::new(reader, LastRow::new(true), None);
|
||||
check_reader_result(
|
||||
&mut reader,
|
||||
&[
|
||||
@@ -684,7 +771,7 @@ mod tests {
|
||||
|
||||
// Does not filter deleted.
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastRow::new(false));
|
||||
let mut reader = DedupReader::new(reader, LastRow::new(false), None);
|
||||
check_reader_result(
|
||||
&mut reader,
|
||||
&[
|
||||
@@ -801,7 +888,7 @@ mod tests {
|
||||
|
||||
// Filter deleted.
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true));
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true), None);
|
||||
check_reader_result(
|
||||
&mut reader,
|
||||
&[
|
||||
@@ -835,7 +922,7 @@ mod tests {
|
||||
|
||||
// Does not filter deleted.
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(false));
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(false), None);
|
||||
check_reader_result(
|
||||
&mut reader,
|
||||
&[
|
||||
@@ -885,7 +972,7 @@ mod tests {
|
||||
)];
|
||||
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true));
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true), None);
|
||||
check_reader_result(
|
||||
&mut reader,
|
||||
&[new_batch_multi_fields(
|
||||
@@ -901,7 +988,7 @@ mod tests {
|
||||
assert_eq!(1, reader.metrics().num_deleted_rows);
|
||||
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(false));
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(false), None);
|
||||
check_reader_result(&mut reader, &input).await;
|
||||
assert_eq!(0, reader.metrics().num_unselected_rows);
|
||||
assert_eq!(0, reader.metrics().num_deleted_rows);
|
||||
@@ -928,7 +1015,7 @@ mod tests {
|
||||
];
|
||||
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true));
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true), None);
|
||||
check_reader_result(
|
||||
&mut reader,
|
||||
&[
|
||||
@@ -962,7 +1049,7 @@ mod tests {
|
||||
];
|
||||
|
||||
let reader = VecBatchReader::new(&input);
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true));
|
||||
let mut reader = DedupReader::new(reader, LastNonNull::new(true), None);
|
||||
check_reader_result(
|
||||
&mut reader,
|
||||
&[
|
||||
|
||||
@@ -15,9 +15,12 @@
|
||||
//! Dedup implementation for flat format.
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use api::v1::OpType;
|
||||
use async_stream::try_stream;
|
||||
use common_telemetry::debug;
|
||||
use datatypes::arrow::array::{
|
||||
Array, ArrayRef, BinaryArray, BooleanArray, BooleanBufferBuilder, UInt8Array, UInt64Array,
|
||||
make_comparator,
|
||||
@@ -36,7 +39,8 @@ use snafu::ResultExt;
|
||||
|
||||
use crate::error::{ComputeArrowSnafu, NewRecordBatchSnafu, Result};
|
||||
use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice;
|
||||
use crate::read::dedup::DedupMetrics;
|
||||
use crate::metrics::MERGE_FILTER_ROWS_TOTAL;
|
||||
use crate::read::dedup::{DedupMetrics, DedupMetricsReport};
|
||||
use crate::sst::parquet::flat_format::{
|
||||
op_type_column_index, primary_key_column_index, time_index_column_index,
|
||||
};
|
||||
@@ -88,15 +92,22 @@ pub struct FlatDedupReader<I, S> {
|
||||
stream: I,
|
||||
strategy: S,
|
||||
metrics: DedupMetrics,
|
||||
/// Optional metrics reporter.
|
||||
metrics_reporter: Option<Arc<dyn DedupMetricsReport>>,
|
||||
}
|
||||
|
||||
impl<I, S> FlatDedupReader<I, S> {
|
||||
/// Creates a new dedup iterator.
|
||||
pub fn new(stream: I, strategy: S) -> Self {
|
||||
/// Creates a new dedup reader.
|
||||
pub fn new(
|
||||
stream: I,
|
||||
strategy: S,
|
||||
metrics_reporter: Option<Arc<dyn DedupMetricsReport>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
strategy,
|
||||
metrics: DedupMetrics::default(),
|
||||
metrics_reporter,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -108,11 +119,14 @@ impl<I: Stream<Item = Result<RecordBatch>> + Unpin, S: RecordBatchDedupStrategy>
|
||||
async fn fetch_next_batch(&mut self) -> Result<Option<RecordBatch>> {
|
||||
while let Some(batch) = self.stream.try_next().await? {
|
||||
if let Some(batch) = self.strategy.push_batch(batch, &mut self.metrics)? {
|
||||
self.metrics.maybe_report(&self.metrics_reporter);
|
||||
return Ok(Some(batch));
|
||||
}
|
||||
}
|
||||
|
||||
self.strategy.finish(&mut self.metrics)
|
||||
let result = self.strategy.finish(&mut self.metrics)?;
|
||||
self.metrics.maybe_report(&self.metrics_reporter);
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Converts the reader into a stream.
|
||||
@@ -125,6 +139,24 @@ impl<I: Stream<Item = Result<RecordBatch>> + Unpin, S: RecordBatchDedupStrategy>
|
||||
}
|
||||
}
|
||||
|
||||
impl<I, S> Drop for FlatDedupReader<I, S> {
|
||||
fn drop(&mut self) {
|
||||
debug!("Flat dedup reader finished, metrics: {:?}", self.metrics);
|
||||
|
||||
MERGE_FILTER_ROWS_TOTAL
|
||||
.with_label_values(&["dedup"])
|
||||
.inc_by(self.metrics.num_unselected_rows as u64);
|
||||
MERGE_FILTER_ROWS_TOTAL
|
||||
.with_label_values(&["delete"])
|
||||
.inc_by(self.metrics.num_deleted_rows as u64);
|
||||
|
||||
// Report any remaining metrics.
|
||||
if let Some(reporter) = &self.metrics_reporter {
|
||||
reporter.report(&mut self.metrics);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Strategy to remove duplicate rows from sorted record batches.
|
||||
pub trait RecordBatchDedupStrategy: Send {
|
||||
/// Pushes a batch to the dedup strategy.
|
||||
@@ -214,6 +246,8 @@ impl RecordBatchDedupStrategy for FlatLastRow {
|
||||
batch: RecordBatch,
|
||||
metrics: &mut DedupMetrics,
|
||||
) -> Result<Option<RecordBatch>> {
|
||||
let start = Instant::now();
|
||||
|
||||
if batch.num_rows() == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -235,6 +269,7 @@ impl RecordBatchDedupStrategy for FlatLastRow {
|
||||
// The batch after dedup is empty.
|
||||
// We don't need to update `prev_batch` because they have the same
|
||||
// key and timestamp.
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
@@ -246,7 +281,11 @@ impl RecordBatchDedupStrategy for FlatLastRow {
|
||||
self.prev_batch = Some(batch_last_row);
|
||||
|
||||
// Filters deleted rows at last.
|
||||
maybe_filter_deleted(batch, self.filter_deleted, metrics)
|
||||
let result = maybe_filter_deleted(batch, self.filter_deleted, metrics);
|
||||
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn finish(&mut self, _metrics: &mut DedupMetrics) -> Result<Option<RecordBatch>> {
|
||||
@@ -275,6 +314,8 @@ impl RecordBatchDedupStrategy for FlatLastNonNull {
|
||||
batch: RecordBatch,
|
||||
metrics: &mut DedupMetrics,
|
||||
) -> Result<Option<RecordBatch>> {
|
||||
let start = Instant::now();
|
||||
|
||||
if batch.num_rows() == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -290,6 +331,7 @@ impl RecordBatchDedupStrategy for FlatLastNonNull {
|
||||
self.buffer = BatchLastRow::try_new(record_batch);
|
||||
self.contains_delete = contains_delete;
|
||||
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
@@ -305,7 +347,9 @@ impl RecordBatchDedupStrategy for FlatLastNonNull {
|
||||
self.buffer = BatchLastRow::try_new(record_batch);
|
||||
self.contains_delete = contains_delete;
|
||||
|
||||
return maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics);
|
||||
let result = maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics);
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
return result;
|
||||
}
|
||||
|
||||
// The next batch has duplicated rows.
|
||||
@@ -332,6 +376,8 @@ impl RecordBatchDedupStrategy for FlatLastNonNull {
|
||||
self.buffer = BatchLastRow::try_new(record_batch);
|
||||
self.contains_delete = contains_delete;
|
||||
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
@@ -340,7 +386,13 @@ impl RecordBatchDedupStrategy for FlatLastNonNull {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics)
|
||||
let start = Instant::now();
|
||||
|
||||
let result = maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics);
|
||||
|
||||
metrics.dedup_cost += start.elapsed();
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,8 +15,10 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use async_stream::try_stream;
|
||||
use common_telemetry::debug;
|
||||
use datatypes::arrow::array::{Int64Array, UInt64Array};
|
||||
use datatypes::arrow::compute::interleave;
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
@@ -29,7 +31,9 @@ use store_api::storage::SequenceNumber;
|
||||
|
||||
use crate::error::{ComputeArrowSnafu, Result};
|
||||
use crate::memtable::BoxedRecordBatchIterator;
|
||||
use crate::metrics::READ_STAGE_ELAPSED;
|
||||
use crate::read::BoxedRecordBatchStream;
|
||||
use crate::read::merge::{MergeMetrics, MergeMetricsReport};
|
||||
use crate::sst::parquet::flat_format::{
|
||||
primary_key_column_index, sequence_column_index, time_index_column_index,
|
||||
};
|
||||
@@ -462,12 +466,14 @@ impl FlatMergeIterator {
|
||||
|
||||
let algo = MergeAlgo::new(nodes);
|
||||
|
||||
Ok(Self {
|
||||
let iter = Self {
|
||||
algo,
|
||||
in_progress,
|
||||
output_batch: None,
|
||||
batch_size,
|
||||
})
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Fetches next sorted batch.
|
||||
@@ -484,12 +490,7 @@ impl FlatMergeIterator {
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(batch) = self.output_batch.take() {
|
||||
Ok(Some(batch))
|
||||
} else {
|
||||
// No more batches.
|
||||
Ok(None)
|
||||
}
|
||||
Ok(self.output_batch.take())
|
||||
}
|
||||
|
||||
/// Fetches a batch from the hottest node.
|
||||
@@ -562,6 +563,10 @@ pub struct FlatMergeReader {
|
||||
/// This is not a hard limit; the iterator may return smaller batches to avoid concatenating
|
||||
/// rows.
|
||||
batch_size: usize,
|
||||
/// Local metrics.
|
||||
metrics: MergeMetrics,
|
||||
/// Optional metrics reporter.
|
||||
metrics_reporter: Option<Arc<dyn MergeMetricsReport>>,
|
||||
}
|
||||
|
||||
impl FlatMergeReader {
|
||||
@@ -570,7 +575,10 @@ impl FlatMergeReader {
|
||||
schema: SchemaRef,
|
||||
iters: Vec<BoxedRecordBatchStream>,
|
||||
batch_size: usize,
|
||||
metrics_reporter: Option<Arc<dyn MergeMetricsReport>>,
|
||||
) -> Result<Self> {
|
||||
let start = Instant::now();
|
||||
let metrics = MergeMetrics::default();
|
||||
let mut in_progress = BatchBuilder::new(schema, iters.len(), batch_size);
|
||||
let mut nodes = Vec::with_capacity(iters.len());
|
||||
// Initialize nodes and the buffer.
|
||||
@@ -588,16 +596,24 @@ impl FlatMergeReader {
|
||||
|
||||
let algo = MergeAlgo::new(nodes);
|
||||
|
||||
Ok(Self {
|
||||
let mut reader = Self {
|
||||
algo,
|
||||
in_progress,
|
||||
output_batch: None,
|
||||
batch_size,
|
||||
})
|
||||
metrics,
|
||||
metrics_reporter,
|
||||
};
|
||||
let elapsed = start.elapsed();
|
||||
reader.metrics.init_cost += elapsed;
|
||||
reader.metrics.scan_cost += elapsed;
|
||||
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
/// Fetches next sorted batch.
|
||||
pub async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
|
||||
let start = Instant::now();
|
||||
while self.algo.has_rows() && self.output_batch.is_none() {
|
||||
if self.algo.can_fetch_batch() && !self.in_progress.is_empty() {
|
||||
// Only one batch in the hot heap, but we have pending rows, output the pending rows first.
|
||||
@@ -605,15 +621,21 @@ impl FlatMergeReader {
|
||||
debug_assert!(self.output_batch.is_some());
|
||||
} else if self.algo.can_fetch_batch() {
|
||||
self.fetch_batch_from_hottest().await?;
|
||||
self.metrics.num_fetch_by_batches += 1;
|
||||
} else {
|
||||
self.fetch_row_from_hottest().await?;
|
||||
self.metrics.num_fetch_by_rows += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(batch) = self.output_batch.take() {
|
||||
self.metrics.scan_cost += start.elapsed();
|
||||
self.metrics.maybe_report(&self.metrics_reporter);
|
||||
Ok(Some(batch))
|
||||
} else {
|
||||
// No more batches.
|
||||
self.metrics.scan_cost += start.elapsed();
|
||||
self.metrics.maybe_report(&self.metrics_reporter);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@@ -634,7 +656,9 @@ impl FlatMergeReader {
|
||||
// Safety: next_batch() ensures the heap is not empty.
|
||||
let mut hottest = self.algo.pop_hot().unwrap();
|
||||
debug_assert!(!hottest.current_cursor().is_finished());
|
||||
let start = Instant::now();
|
||||
let next = hottest.advance_batch().await?;
|
||||
self.metrics.fetch_cost += start.elapsed();
|
||||
// The node in the heap is not empty, so it must have existing rows in the builder.
|
||||
let batch = self
|
||||
.in_progress
|
||||
@@ -658,8 +682,12 @@ impl FlatMergeReader {
|
||||
}
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
if let Some(next) = hottest.advance_row().await? {
|
||||
self.metrics.fetch_cost += start.elapsed();
|
||||
self.in_progress.push_batch(hottest.node_index, next);
|
||||
} else {
|
||||
self.metrics.fetch_cost += start.elapsed();
|
||||
}
|
||||
|
||||
self.algo.reheap(hottest);
|
||||
@@ -675,6 +703,24 @@ impl FlatMergeReader {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for FlatMergeReader {
|
||||
fn drop(&mut self) {
|
||||
debug!("Flat merge reader finished, metrics: {:?}", self.metrics);
|
||||
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["flat_merge"])
|
||||
.observe(self.metrics.scan_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["flat_merge_fetch"])
|
||||
.observe(self.metrics.fetch_cost.as_secs_f64());
|
||||
|
||||
// Report any remaining metrics.
|
||||
if let Some(reporter) = &self.metrics_reporter {
|
||||
reporter.report(&mut self.metrics);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A sync node in the merge iterator.
|
||||
struct GenericNode<T> {
|
||||
/// Index of the node.
|
||||
|
||||
@@ -16,8 +16,9 @@

use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::mem;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::{fmt, mem};

use async_trait::async_trait;
use common_telemetry::debug;
@@ -27,6 +28,12 @@ use crate::memtable::BoxedBatchIterator;
use crate::metrics::READ_STAGE_ELAPSED;
use crate::read::{Batch, BatchReader, BoxedBatchReader, Source};

/// Trait for reporting merge metrics.
pub trait MergeMetricsReport: Send + Sync {
/// Reports and resets the metrics.
fn report(&self, metrics: &mut MergeMetrics);
}

/// Reader to merge sorted batches.
///
/// The merge reader merges [Batch]es from multiple sources that yield sorted batches.
@@ -51,7 +58,9 @@ pub struct MergeReader {
/// Batch to output.
output_batch: Option<Batch>,
/// Local metrics.
metrics: Metrics,
metrics: MergeMetrics,
/// Optional metrics reporter.
metrics_reporter: Option<Arc<dyn MergeMetricsReport>>,
}

#[async_trait]
@@ -72,11 +81,12 @@ impl BatchReader for MergeReader {

if let Some(batch) = self.output_batch.take() {
self.metrics.scan_cost += start.elapsed();
self.metrics.num_output_rows += batch.num_rows();
self.metrics.maybe_report(&self.metrics_reporter);
Ok(Some(batch))
} else {
// Nothing fetched.
self.metrics.scan_cost += start.elapsed();
self.metrics.maybe_report(&self.metrics_reporter);
Ok(None)
}
}
@@ -92,14 +102,22 @@ impl Drop for MergeReader {
READ_STAGE_ELAPSED
.with_label_values(&["merge_fetch"])
.observe(self.metrics.fetch_cost.as_secs_f64());

// Report any remaining metrics.
if let Some(reporter) = &self.metrics_reporter {
reporter.report(&mut self.metrics);
}
}
}

impl MergeReader {
/// Creates and initializes a new [MergeReader].
pub async fn new(sources: Vec<Source>) -> Result<MergeReader> {
pub async fn new(
sources: Vec<Source>,
metrics_reporter: Option<Arc<dyn MergeMetricsReport>>,
) -> Result<MergeReader> {
let start = Instant::now();
let mut metrics = Metrics::default();
let mut metrics = MergeMetrics::default();

let mut cold = BinaryHeap::with_capacity(sources.len());
let hot = BinaryHeap::with_capacity(sources.len());
@@ -116,11 +134,14 @@ impl MergeReader {
cold,
output_batch: None,
metrics,
metrics_reporter,
};
// Initializes the reader.
reader.refill_hot();

reader.metrics.scan_cost += start.elapsed();
let elapsed = start.elapsed();
reader.metrics.init_cost += elapsed;
reader.metrics.scan_cost += elapsed;
Ok(reader)
}

@@ -250,6 +271,8 @@ pub struct MergeReaderBuilder {
///
/// All source must yield batches with the same schema.
sources: Vec<Source>,
/// Optional metrics reporter.
metrics_reporter: Option<Arc<dyn MergeMetricsReport>>,
}

impl MergeReaderBuilder {
@@ -260,7 +283,10 @@ impl MergeReaderBuilder {

/// Creates a builder from sources.
pub fn from_sources(sources: Vec<Source>) -> MergeReaderBuilder {
MergeReaderBuilder { sources }
MergeReaderBuilder {
sources,
metrics_reporter: None,
}
}

/// Pushes a batch reader to sources.
@@ -275,28 +301,94 @@ impl MergeReaderBuilder {
self
}

/// Sets the metrics reporter.
pub fn with_metrics_reporter(
&mut self,
reporter: Option<Arc<dyn MergeMetricsReport>>,
) -> &mut Self {
self.metrics_reporter = reporter;
self
}

/// Builds and initializes the reader, then resets the builder.
pub async fn build(&mut self) -> Result<MergeReader> {
let sources = mem::take(&mut self.sources);
MergeReader::new(sources).await
let metrics_reporter = self.metrics_reporter.take();
MergeReader::new(sources, metrics_reporter).await
}
}

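For context, a caller would typically wire a reporter through the builder rather than calling `MergeReader::new` directly. A minimal sketch, assuming `sources` and an `Arc<dyn MergeMetricsReport>` named `reporter` already exist (both are placeholders, not part of this diff):

// Sketch: attach an optional metrics reporter before building the reader.
let mut builder = MergeReaderBuilder::from_sources(sources);
builder.with_metrics_reporter(Some(reporter.clone()));
let reader = builder.build().await?; // the reader now reports via `reporter`
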
/// Metrics for the merge reader.
#[derive(Debug, Default)]
struct Metrics {
#[derive(Default)]
pub struct MergeMetrics {
/// Cost to initialize the reader.
pub(crate) init_cost: Duration,
/// Total scan cost of the reader.
scan_cost: Duration,
pub(crate) scan_cost: Duration,
/// Number of times to fetch batches.
num_fetch_by_batches: usize,
pub(crate) num_fetch_by_batches: usize,
/// Number of times to fetch rows.
num_fetch_by_rows: usize,
/// Number of input rows.
num_input_rows: usize,
/// Number of output rows.
num_output_rows: usize,
pub(crate) num_fetch_by_rows: usize,
/// Cost to fetch batches from sources.
fetch_cost: Duration,
pub(crate) fetch_cost: Duration,
}

impl fmt::Debug for MergeMetrics {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// Skip output if scan_cost is zero
if self.scan_cost.is_zero() {
return write!(f, "{{}}");
}

write!(f, r#"{{"scan_cost":"{:?}""#, self.scan_cost)?;

if !self.init_cost.is_zero() {
write!(f, r#", "init_cost":"{:?}""#, self.init_cost)?;
}
if self.num_fetch_by_batches > 0 {
write!(
f,
r#", "num_fetch_by_batches":{}"#,
self.num_fetch_by_batches
)?;
}
if self.num_fetch_by_rows > 0 {
write!(f, r#", "num_fetch_by_rows":{}"#, self.num_fetch_by_rows)?;
}
if !self.fetch_cost.is_zero() {
write!(f, r#", "fetch_cost":"{:?}""#, self.fetch_cost)?;
}

write!(f, "}}")
}
}

impl MergeMetrics {
/// Merges metrics from another MergeMetrics instance.
pub(crate) fn merge(&mut self, other: &MergeMetrics) {
let MergeMetrics {
init_cost,
scan_cost,
num_fetch_by_batches,
num_fetch_by_rows,
fetch_cost,
} = other;

self.init_cost += *init_cost;
self.scan_cost += *scan_cost;
self.num_fetch_by_batches += *num_fetch_by_batches;
self.num_fetch_by_rows += *num_fetch_by_rows;
self.fetch_cost += *fetch_cost;
}

/// Reports the metrics if scan_cost exceeds 10ms and resets them.
pub(crate) fn maybe_report(&mut self, reporter: &Option<Arc<dyn MergeMetricsReport>>) {
if self.scan_cost.as_millis() > 10
&& let Some(r) = reporter
{
r.report(self);
}
}
}

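To illustrate the reporting contract, here is a minimal sketch of a `MergeMetricsReport` implementor that folds partial metrics into a shared total and resets the caller's local copy. The real reporter in this changeset is `PartitionMetricsInner` (shown further down), so this is illustrative only:

use std::sync::Mutex;

// Hypothetical aggregator; lives in the same crate since merge() is pub(crate).
#[derive(Default)]
struct TotalMergeMetrics(Mutex<MergeMetrics>);

impl MergeMetricsReport for TotalMergeMetrics {
    fn report(&self, metrics: &mut MergeMetrics) {
        // Accumulate into the shared total, then reset the caller's local metrics.
        self.0.lock().unwrap().merge(metrics);
        *metrics = MergeMetrics::default();
    }
}
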
/// A `Node` represents an individual input data source to be merged.
|
||||
@@ -313,12 +405,11 @@ impl Node {
|
||||
/// Initialize a node.
|
||||
///
|
||||
/// It tries to fetch one batch from the `source`.
|
||||
async fn new(mut source: Source, metrics: &mut Metrics) -> Result<Node> {
|
||||
async fn new(mut source: Source, metrics: &mut MergeMetrics) -> Result<Node> {
|
||||
// Ensures batch is not empty.
|
||||
let start = Instant::now();
|
||||
let current_batch = source.next_batch().await?.map(CompareFirst);
|
||||
metrics.fetch_cost += start.elapsed();
|
||||
metrics.num_input_rows += current_batch.as_ref().map(|b| b.0.num_rows()).unwrap_or(0);
|
||||
|
||||
Ok(Node {
|
||||
source,
|
||||
@@ -352,17 +443,12 @@ impl Node {
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the node has reached EOF.
|
||||
async fn fetch_batch(&mut self, metrics: &mut Metrics) -> Result<Batch> {
|
||||
async fn fetch_batch(&mut self, metrics: &mut MergeMetrics) -> Result<Batch> {
|
||||
let current = self.current_batch.take().unwrap();
|
||||
let start = Instant::now();
|
||||
// Ensures batch is not empty.
|
||||
self.current_batch = self.source.next_batch().await?.map(CompareFirst);
|
||||
metrics.fetch_cost += start.elapsed();
|
||||
metrics.num_input_rows += self
|
||||
.current_batch
|
||||
.as_ref()
|
||||
.map(|b| b.0.num_rows())
|
||||
.unwrap_or(0);
|
||||
Ok(current.0)
|
||||
}
|
||||
|
||||
@@ -390,7 +476,7 @@ impl Node {
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the node is EOF.
|
||||
async fn skip_rows(&mut self, num_to_skip: usize, metrics: &mut Metrics) -> Result<()> {
|
||||
async fn skip_rows(&mut self, num_to_skip: usize, metrics: &mut MergeMetrics) -> Result<()> {
|
||||
let batch = self.current_batch();
|
||||
debug_assert!(batch.num_rows() >= num_to_skip);
|
||||
|
||||
@@ -547,9 +633,6 @@ mod tests {
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(8, reader.metrics.num_input_rows);
|
||||
assert_eq!(8, reader.metrics.num_output_rows);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -666,9 +749,6 @@ mod tests {
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(11, reader.metrics.num_input_rows);
|
||||
assert_eq!(11, reader.metrics.num_output_rows);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -37,6 +37,8 @@ use crate::metrics::{
|
||||
IN_PROGRESS_SCAN, PRECISE_FILTER_ROWS_TOTAL, READ_BATCHES_RETURN, READ_ROW_GROUPS_TOTAL,
|
||||
READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_RETURN, READ_STAGE_ELAPSED,
|
||||
};
|
||||
use crate::read::dedup::{DedupMetrics, DedupMetricsReport};
|
||||
use crate::read::merge::{MergeMetrics, MergeMetricsReport};
|
||||
use crate::read::range::{RangeBuilderList, RangeMeta, RowGroupIndex};
|
||||
use crate::read::scan_region::StreamContext;
|
||||
use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source};
|
||||
@@ -130,6 +132,11 @@ pub(crate) struct ScanMetricsSet {
|
||||
/// Duration of the series distributor to yield.
|
||||
distributor_yield_cost: Duration,
|
||||
|
||||
/// Merge metrics.
|
||||
merge_metrics: MergeMetrics,
|
||||
/// Dedup metrics.
|
||||
dedup_metrics: DedupMetrics,
|
||||
|
||||
/// The stream reached EOF
|
||||
stream_eof: bool,
|
||||
|
||||
@@ -180,6 +187,8 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
num_distributor_batches,
|
||||
distributor_scan_cost,
|
||||
distributor_yield_cost,
|
||||
merge_metrics,
|
||||
dedup_metrics,
|
||||
stream_eof,
|
||||
mem_scan_cost,
|
||||
mem_rows,
|
||||
@@ -307,6 +316,16 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
write!(f, ", \"metadata_cache_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
|
||||
// Write merge metrics if not empty
|
||||
if !merge_metrics.scan_cost.is_zero() {
|
||||
write!(f, ", \"merge_metrics\":{:?}", merge_metrics)?;
|
||||
}
|
||||
|
||||
// Write dedup metrics if not empty
|
||||
if !dedup_metrics.dedup_cost.is_zero() {
|
||||
write!(f, ", \"dedup_metrics\":{:?}", dedup_metrics)?;
|
||||
}
|
||||
|
||||
write!(f, ", \"stream_eof\":{stream_eof}}}")
|
||||
}
|
||||
}
|
||||
@@ -531,6 +550,28 @@ impl PartitionMetricsInner {
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeMetricsReport for PartitionMetricsInner {
|
||||
fn report(&self, metrics: &mut MergeMetrics) {
|
||||
let mut scan_metrics = self.metrics.lock().unwrap();
|
||||
// Merge the metrics into scan_metrics
|
||||
scan_metrics.merge_metrics.merge(metrics);
|
||||
|
||||
// Reset the input metrics
|
||||
*metrics = MergeMetrics::default();
|
||||
}
|
||||
}
|
||||
|
||||
impl DedupMetricsReport for PartitionMetricsInner {
|
||||
fn report(&self, metrics: &mut DedupMetrics) {
|
||||
let mut scan_metrics = self.metrics.lock().unwrap();
|
||||
// Merge the metrics into scan_metrics
|
||||
scan_metrics.dedup_metrics.merge(metrics);
|
||||
|
||||
// Reset the input metrics
|
||||
*metrics = DedupMetrics::default();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PartitionMetricsInner {
|
||||
fn drop(&mut self) {
|
||||
self.on_finish(false);
|
||||
@@ -703,6 +744,16 @@ impl PartitionMetrics {
|
||||
pub(crate) fn explain_verbose(&self) -> bool {
|
||||
self.0.explain_verbose
|
||||
}
|
||||
|
||||
/// Returns a MergeMetricsReport trait object for reporting merge metrics.
|
||||
pub(crate) fn merge_metrics_reporter(&self) -> Arc<dyn MergeMetricsReport> {
|
||||
self.0.clone()
|
||||
}
|
||||
|
||||
/// Returns a DedupMetricsReport trait object for reporting dedup metrics.
|
||||
pub(crate) fn dedup_metrics_reporter(&self) -> Arc<dyn DedupMetricsReport> {
|
||||
self.0.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PartitionMetrics {
|
||||
|
||||
@@ -189,7 +189,7 @@ impl SeqScan {
|
||||
partition_ranges.len(),
|
||||
sources.len()
|
||||
);
|
||||
Self::build_reader_from_sources(stream_ctx, sources, None).await
|
||||
Self::build_reader_from_sources(stream_ctx, sources, None, None).await
|
||||
}
|
||||
|
||||
/// Builds a merge reader that reads all flat ranges.
|
||||
@@ -223,7 +223,7 @@ impl SeqScan {
|
||||
partition_ranges.len(),
|
||||
sources.len()
|
||||
);
|
||||
Self::build_flat_reader_from_sources(stream_ctx, sources, None).await
|
||||
Self::build_flat_reader_from_sources(stream_ctx, sources, None, None).await
|
||||
}
|
||||
|
||||
/// Builds a reader to read sources. If `semaphore` is provided, reads sources in parallel
|
||||
@@ -233,6 +233,7 @@ impl SeqScan {
|
||||
stream_ctx: &StreamContext,
|
||||
mut sources: Vec<Source>,
|
||||
semaphore: Option<Arc<Semaphore>>,
|
||||
part_metrics: Option<&PartitionMetrics>,
|
||||
) -> Result<BoxedBatchReader> {
|
||||
if let Some(semaphore) = semaphore.as_ref() {
|
||||
// Read sources in parallel.
|
||||
@@ -244,18 +245,24 @@ impl SeqScan {
|
||||
}
|
||||
|
||||
let mut builder = MergeReaderBuilder::from_sources(sources);
|
||||
if let Some(metrics) = part_metrics {
|
||||
builder.with_metrics_reporter(Some(metrics.merge_metrics_reporter()));
|
||||
}
|
||||
let reader = builder.build().await?;
|
||||
|
||||
let dedup = !stream_ctx.input.append_mode;
|
||||
let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter());
|
||||
let reader = if dedup {
|
||||
match stream_ctx.input.merge_mode {
|
||||
MergeMode::LastRow => Box::new(DedupReader::new(
|
||||
reader,
|
||||
LastRow::new(stream_ctx.input.filter_deleted),
|
||||
dedup_metrics_reporter,
|
||||
)) as _,
|
||||
MergeMode::LastNonNull => Box::new(DedupReader::new(
|
||||
reader,
|
||||
LastNonNull::new(stream_ctx.input.filter_deleted),
|
||||
dedup_metrics_reporter,
|
||||
)) as _,
|
||||
}
|
||||
} else {
|
||||
@@ -277,6 +284,7 @@ impl SeqScan {
|
||||
stream_ctx: &StreamContext,
|
||||
mut sources: Vec<BoxedRecordBatchStream>,
|
||||
semaphore: Option<Arc<Semaphore>>,
|
||||
part_metrics: Option<&PartitionMetrics>,
|
||||
) -> Result<BoxedRecordBatchStream> {
|
||||
if let Some(semaphore) = semaphore.as_ref() {
|
||||
// Read sources in parallel.
|
||||
@@ -290,15 +298,20 @@ impl SeqScan {
|
||||
let mapper = stream_ctx.input.mapper.as_flat().unwrap();
|
||||
let schema = mapper.input_arrow_schema(stream_ctx.input.compaction);
|
||||
|
||||
let reader = FlatMergeReader::new(schema, sources, DEFAULT_READ_BATCH_SIZE).await?;
|
||||
let metrics_reporter = part_metrics.map(|m| m.merge_metrics_reporter());
|
||||
let reader =
|
||||
FlatMergeReader::new(schema, sources, DEFAULT_READ_BATCH_SIZE, metrics_reporter)
|
||||
.await?;
|
||||
|
||||
let dedup = !stream_ctx.input.append_mode;
|
||||
let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter());
|
||||
let reader = if dedup {
|
||||
match stream_ctx.input.merge_mode {
|
||||
MergeMode::LastRow => Box::pin(
|
||||
FlatDedupReader::new(
|
||||
reader.into_stream().boxed(),
|
||||
FlatLastRow::new(stream_ctx.input.filter_deleted),
|
||||
dedup_metrics_reporter,
|
||||
)
|
||||
.into_stream(),
|
||||
) as _,
|
||||
@@ -309,6 +322,7 @@ impl SeqScan {
|
||||
mapper.field_column_start(),
|
||||
stream_ctx.input.filter_deleted,
|
||||
),
|
||||
dedup_metrics_reporter,
|
||||
)
|
||||
.into_stream(),
|
||||
) as _,
|
||||
@@ -409,7 +423,7 @@ impl SeqScan {
|
||||
let mut metrics = ScannerMetrics::default();
|
||||
let mut fetch_start = Instant::now();
|
||||
let mut reader =
|
||||
Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone())
|
||||
Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics))
|
||||
.await?;
|
||||
#[cfg(debug_assertions)]
|
||||
let mut checker = crate::read::BatchChecker::default()
|
||||
@@ -505,7 +519,7 @@ impl SeqScan {
|
||||
let mut metrics = ScannerMetrics::default();
|
||||
let mut fetch_start = Instant::now();
|
||||
let mut reader =
|
||||
Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone())
|
||||
Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics))
|
||||
.await?;
|
||||
|
||||
while let Some(record_batch) = reader.try_next().await? {
|
||||
|
||||
@@ -438,6 +438,7 @@ impl SeriesDistributor {
|
||||
&self.stream_ctx,
|
||||
sources,
|
||||
self.semaphore.clone(),
|
||||
Some(&part_metrics),
|
||||
)
|
||||
.await?;
|
||||
let mut metrics = SeriesDistributorMetrics::default();
|
||||
@@ -519,9 +520,13 @@ impl SeriesDistributor {
|
||||
}
|
||||
|
||||
// Builds a reader that merge sources from all parts.
|
||||
let mut reader =
|
||||
SeqScan::build_reader_from_sources(&self.stream_ctx, sources, self.semaphore.clone())
|
||||
.await?;
|
||||
let mut reader = SeqScan::build_reader_from_sources(
|
||||
&self.stream_ctx,
|
||||
sources,
|
||||
self.semaphore.clone(),
|
||||
Some(&part_metrics),
|
||||
)
|
||||
.await?;
|
||||
let mut metrics = SeriesDistributorMetrics::default();
|
||||
let mut fetch_start = Instant::now();
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ use serde::{Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
use store_api::metadata::ColumnMetadata;
|
||||
use store_api::region_request::PathType;
|
||||
use store_api::storage::{ColumnId, FileId, RegionId};
|
||||
use store_api::storage::{ColumnId, FileId, IndexVersion, RegionId};
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
use crate::cache::CacheManagerRef;
|
||||
@@ -82,8 +82,6 @@ pub type Level = u8;
|
||||
pub const MAX_LEVEL: Level = 2;
|
||||
/// Type to store index types for a column.
|
||||
pub type IndexTypes = SmallVec<[IndexType; 4]>;
|
||||
/// Index version
|
||||
pub type IndexVersion = u64;
|
||||
|
||||
/// Cross-region file id.
|
||||
///
|
||||
@@ -308,6 +306,11 @@ impl FileMeta {
|
||||
!self.available_indexes.is_empty()
|
||||
}
|
||||
|
||||
/// Whether the index file is up-to-date comparing to another file meta.
|
||||
pub fn is_index_up_to_date(&self, other: &FileMeta) -> bool {
|
||||
self.exists_index() && other.exists_index() && self.index_version >= other.index_version
|
||||
}
|
||||
|
||||
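A small sketch of the intended semantics, assuming two `FileMeta` values that both carry an index and differ only in `index_version` (the concrete values are placeholders, not from this diff):

let mut older = FileMeta::default();
older.available_indexes = smallvec::smallvec![IndexType::InvertedIndex];
older.index_version = 1;
let mut newer = older.clone();
newer.index_version = 2;
assert!(newer.is_index_up_to_date(&older));  // same or newer index version
assert!(!older.is_index_up_to_date(&newer)); // an older index version is not up-to-date
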
/// Returns true if the file has an inverted index
|
||||
pub fn inverted_index_available(&self) -> bool {
|
||||
self.available_indexes.contains(&IndexType::InvertedIndex)
|
||||
@@ -434,6 +437,16 @@ impl FileHandle {
|
||||
self.inner.compacting.store(compacting, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn index_outdated(&self) -> bool {
|
||||
self.inner.index_outdated.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn set_index_outdated(&self, index_outdated: bool) {
|
||||
self.inner
|
||||
.index_outdated
|
||||
.store(index_outdated, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Returns a reference to the [FileMeta].
|
||||
pub fn meta_ref(&self) -> &FileMeta {
|
||||
&self.inner.meta
|
||||
@@ -471,23 +484,29 @@ struct FileHandleInner {
|
||||
meta: FileMeta,
|
||||
compacting: AtomicBool,
|
||||
deleted: AtomicBool,
|
||||
index_outdated: AtomicBool,
|
||||
file_purger: FilePurgerRef,
|
||||
}
|
||||
|
||||
impl Drop for FileHandleInner {
|
||||
fn drop(&mut self) {
|
||||
self.file_purger
|
||||
.remove_file(self.meta.clone(), self.deleted.load(Ordering::Relaxed));
|
||||
self.file_purger.remove_file(
|
||||
self.meta.clone(),
|
||||
self.deleted.load(Ordering::Acquire),
|
||||
self.index_outdated.load(Ordering::Acquire),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl FileHandleInner {
|
||||
/// There should only be one `FileHandleInner` for each file on a datanode
|
||||
fn new(meta: FileMeta, file_purger: FilePurgerRef) -> FileHandleInner {
|
||||
file_purger.new_file(&meta);
|
||||
FileHandleInner {
|
||||
meta,
|
||||
compacting: AtomicBool::new(false),
|
||||
deleted: AtomicBool::new(false),
|
||||
index_outdated: AtomicBool::new(false),
|
||||
file_purger,
|
||||
}
|
||||
}
|
||||
@@ -540,38 +559,77 @@ pub async fn delete_files(
|
||||
);
|
||||
|
||||
for (file_id, index_version) in file_ids {
|
||||
if let Some(write_cache) = cache_manager.as_ref().and_then(|cache| cache.write_cache()) {
|
||||
// Removes index file from the cache.
|
||||
if delete_index {
|
||||
write_cache
|
||||
.remove(IndexKey::new(
|
||||
region_id,
|
||||
*file_id,
|
||||
FileType::Puffin(*index_version),
|
||||
))
|
||||
.await;
|
||||
}
|
||||
purge_index_cache_stager(
|
||||
region_id,
|
||||
delete_index,
|
||||
access_layer,
|
||||
cache_manager,
|
||||
*file_id,
|
||||
*index_version,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Remove the SST file from the cache.
|
||||
pub async fn delete_index(
|
||||
region_index_id: RegionIndexId,
|
||||
access_layer: &AccessLayerRef,
|
||||
cache_manager: &Option<CacheManagerRef>,
|
||||
) -> crate::error::Result<()> {
|
||||
access_layer.delete_index(region_index_id).await?;
|
||||
|
||||
purge_index_cache_stager(
|
||||
region_index_id.region_id(),
|
||||
true,
|
||||
access_layer,
|
||||
cache_manager,
|
||||
region_index_id.file_id(),
|
||||
region_index_id.version,
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn purge_index_cache_stager(
|
||||
region_id: RegionId,
|
||||
delete_index: bool,
|
||||
access_layer: &AccessLayerRef,
|
||||
cache_manager: &Option<CacheManagerRef>,
|
||||
file_id: FileId,
|
||||
index_version: u64,
|
||||
) {
|
||||
if let Some(write_cache) = cache_manager.as_ref().and_then(|cache| cache.write_cache()) {
|
||||
// Removes index file from the cache.
|
||||
if delete_index {
|
||||
write_cache
|
||||
.remove(IndexKey::new(region_id, *file_id, FileType::Parquet))
|
||||
.remove(IndexKey::new(
|
||||
region_id,
|
||||
file_id,
|
||||
FileType::Puffin(index_version),
|
||||
))
|
||||
.await;
|
||||
}
|
||||
|
||||
// Purges index content in the stager.
|
||||
if let Err(e) = access_layer
|
||||
.puffin_manager_factory()
|
||||
.purge_stager(RegionIndexId::new(
|
||||
RegionFileId::new(region_id, *file_id),
|
||||
*index_version,
|
||||
))
|
||||
.await
|
||||
{
|
||||
error!(e; "Failed to purge stager with index file, file_id: {}, index_version: {}, region: {}",
|
||||
file_id, index_version, region_id);
|
||||
}
|
||||
// Remove the SST file from the cache.
|
||||
write_cache
|
||||
.remove(IndexKey::new(region_id, file_id, FileType::Parquet))
|
||||
.await;
|
||||
}
|
||||
|
||||
// Purges index content in the stager.
|
||||
if let Err(e) = access_layer
|
||||
.puffin_manager_factory()
|
||||
.purge_stager(RegionIndexId::new(
|
||||
RegionFileId::new(region_id, file_id),
|
||||
index_version,
|
||||
))
|
||||
.await
|
||||
{
|
||||
error!(e; "Failed to purge stager with index file, file_id: {}, index_version: {}, region: {}",
|
||||
file_id, index_version, region_id);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -21,7 +21,7 @@ use crate::access_layer::AccessLayerRef;
use crate::cache::CacheManagerRef;
use crate::error::Result;
use crate::schedule::scheduler::SchedulerRef;
use crate::sst::file::{FileMeta, delete_files};
use crate::sst::file::{FileMeta, delete_files, delete_index};
use crate::sst::file_ref::FileReferenceManagerRef;

/// A worker to delete files in background.
@@ -29,7 +29,8 @@ pub trait FilePurger: Send + Sync + fmt::Debug {
/// Send a request to remove the file.
/// If `is_delete` is true, the file will be deleted from the storage.
/// Otherwise, only the reference will be removed.
fn remove_file(&self, file_meta: FileMeta, is_delete: bool);
/// If `index_outdated` is true, the index file will be deleted regardless of `is_delete`.
fn remove_file(&self, file_meta: FileMeta, is_delete: bool, index_outdated: bool);

/// Notify the purger of a new file created.
/// This is useful for object store based storage, where we need to track the file references
@@ -46,7 +47,7 @@ pub type FilePurgerRef = Arc<dyn FilePurger>;
pub struct NoopFilePurger;

impl FilePurger for NoopFilePurger {
fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {
fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) {
// noop
}
}
@@ -142,12 +143,27 @@ impl LocalFilePurger {
error!(e; "Failed to schedule the file purge request");
}
}

fn delete_index(&self, file_meta: FileMeta) {
let sst_layer = self.sst_layer.clone();
let cache_manager = self.cache_manager.clone();
if let Err(e) = self.scheduler.schedule(Box::pin(async move {
let index_id = file_meta.index_id();
if let Err(e) = delete_index(index_id, &sst_layer, &cache_manager).await {
error!(e; "Failed to delete index for file {:?} from storage", file_meta);
}
})) {
error!(e; "Failed to schedule the index purge request");
}
}
}

impl FilePurger for LocalFilePurger {
fn remove_file(&self, file_meta: FileMeta, is_delete: bool) {
fn remove_file(&self, file_meta: FileMeta, is_delete: bool, index_outdated: bool) {
if is_delete {
self.delete_file(file_meta);
} else if index_outdated {
self.delete_index(file_meta);
}
}
}
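As a usage sketch of the extended signature (the `purger` and `meta` bindings are assumed, not part of this diff), the flag combinations dispatch as follows for `LocalFilePurger`:

purger.remove_file(meta.clone(), true, false);  // schedule deletion of the SST file from storage
purger.remove_file(meta.clone(), false, true);  // keep the SST, schedule deletion of the outdated index only
purger.remove_file(meta, false, false);         // neither: LocalFilePurger schedules nothing
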
@@ -158,7 +174,7 @@ pub struct ObjectStoreFilePurger {
|
||||
}
|
||||
|
||||
impl FilePurger for ObjectStoreFilePurger {
|
||||
fn remove_file(&self, file_meta: FileMeta, _is_delete: bool) {
|
||||
fn remove_file(&self, file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) {
|
||||
// if not on local file system, instead inform the global file purger to remove the file reference.
|
||||
// notice that no matter whether the file is deleted or not, we need to remove the reference
|
||||
// because the file is no longer in use nonetheless.
|
||||
|
||||
@@ -503,6 +503,8 @@ pub type ResultMpscSender = Sender<Result<IndexBuildOutcome>>;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct IndexBuildTask {
|
||||
/// The SST file handle to build index for.
|
||||
pub file: FileHandle,
|
||||
/// The file meta to build index for.
|
||||
pub file_meta: FileMeta,
|
||||
pub reason: IndexBuildType,
|
||||
@@ -651,10 +653,7 @@ impl IndexBuildTask {
|
||||
|
||||
let mut parquet_reader = self
|
||||
.access_layer
|
||||
.read_sst(FileHandle::new(
|
||||
self.file_meta.clone(),
|
||||
self.file_purger.clone(),
|
||||
))
|
||||
.read_sst(self.file.clone()) // use the latest file handle instead of creating a new one
|
||||
.build()
|
||||
.await?;
|
||||
|
||||
@@ -1498,14 +1497,19 @@ mod tests {
|
||||
let region_id = metadata.region_id;
|
||||
let indexer_builder = mock_indexer_builder(metadata, &env).await;
|
||||
|
||||
let file_meta = FileMeta {
|
||||
region_id,
|
||||
file_id: FileId::random(),
|
||||
file_size: 100,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let file = FileHandle::new(file_meta.clone(), file_purger.clone());
|
||||
|
||||
// Create mock task.
|
||||
let task = IndexBuildTask {
|
||||
file_meta: FileMeta {
|
||||
region_id,
|
||||
file_id: FileId::random(),
|
||||
file_size: 100,
|
||||
..Default::default()
|
||||
},
|
||||
file,
|
||||
file_meta,
|
||||
reason: IndexBuildType::Flush,
|
||||
access_layer: env.access_layer.clone(),
|
||||
listener: WorkerListener::default(),
|
||||
@@ -1555,10 +1559,13 @@ mod tests {
|
||||
mock_version_control(metadata.clone(), file_purger.clone(), files).await;
|
||||
let indexer_builder = mock_indexer_builder(metadata.clone(), &env).await;
|
||||
|
||||
let file = FileHandle::new(file_meta.clone(), file_purger.clone());
|
||||
|
||||
// Create mock task.
|
||||
let (tx, mut rx) = mpsc::channel(4);
|
||||
let (result_tx, mut result_rx) = mpsc::channel::<Result<IndexBuildOutcome>>(4);
|
||||
let task = IndexBuildTask {
|
||||
file,
|
||||
file_meta: file_meta.clone(),
|
||||
reason: IndexBuildType::Flush,
|
||||
access_layer: env.access_layer.clone(),
|
||||
@@ -1626,10 +1633,13 @@ mod tests {
|
||||
mock_version_control(metadata.clone(), file_purger.clone(), files).await;
|
||||
let indexer_builder = mock_indexer_builder(metadata.clone(), &env).await;
|
||||
|
||||
let file = FileHandle::new(file_meta.clone(), file_purger.clone());
|
||||
|
||||
// Create mock task.
|
||||
let (tx, _rx) = mpsc::channel(4);
|
||||
let (result_tx, mut result_rx) = mpsc::channel::<Result<IndexBuildOutcome>>(4);
|
||||
let task = IndexBuildTask {
|
||||
file,
|
||||
file_meta: file_meta.clone(),
|
||||
reason: IndexBuildType::Flush,
|
||||
access_layer: env.access_layer.clone(),
|
||||
@@ -1726,10 +1736,13 @@ mod tests {
|
||||
mock_version_control(metadata.clone(), file_purger.clone(), files).await;
|
||||
let indexer_builder = mock_indexer_builder(metadata.clone(), &env).await;
|
||||
|
||||
let file = FileHandle::new(file_meta.clone(), file_purger.clone());
|
||||
|
||||
// Create mock task.
|
||||
let (tx, mut rx) = mpsc::channel(4);
|
||||
let (result_tx, mut result_rx) = mpsc::channel::<Result<IndexBuildOutcome>>(4);
|
||||
let task = IndexBuildTask {
|
||||
file,
|
||||
file_meta: file_meta.clone(),
|
||||
reason: IndexBuildType::Flush,
|
||||
access_layer: env.access_layer.clone(),
|
||||
@@ -1813,10 +1826,13 @@ mod tests {
|
||||
let version_control =
|
||||
mock_version_control(metadata.clone(), file_purger.clone(), files).await;
|
||||
|
||||
let file = FileHandle::new(file_meta.clone(), file_purger.clone());
|
||||
|
||||
// Create mock task.
|
||||
let (tx, mut _rx) = mpsc::channel(4);
|
||||
let (result_tx, mut result_rx) = mpsc::channel::<Result<IndexBuildOutcome>>(4);
|
||||
let task = IndexBuildTask {
|
||||
file,
|
||||
file_meta: file_meta.clone(),
|
||||
reason: IndexBuildType::Flush,
|
||||
access_layer: env.access_layer.clone(),
|
||||
@@ -1864,13 +1880,18 @@ mod tests {
|
||||
let (tx, _rx) = mpsc::channel(4);
|
||||
let (result_tx, _result_rx) = mpsc::channel::<Result<IndexBuildOutcome>>(4);
|
||||
|
||||
let file_meta = FileMeta {
|
||||
region_id,
|
||||
file_id,
|
||||
file_size: 100,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let file = FileHandle::new(file_meta.clone(), file_purger.clone());
|
||||
|
||||
IndexBuildTask {
|
||||
file_meta: FileMeta {
|
||||
region_id,
|
||||
file_id,
|
||||
file_size: 100,
|
||||
..Default::default()
|
||||
},
|
||||
file,
|
||||
file_meta,
|
||||
reason,
|
||||
access_layer: env.access_layer.clone(),
|
||||
listener: WorkerListener::default(),
|
||||
|
||||
@@ -57,9 +57,28 @@ impl SstVersion {
|
||||
) {
|
||||
for file in files_to_add {
|
||||
let level = file.level;
|
||||
let new_index_version = file.index_version;
|
||||
// If the file already exists, then we should only replace the handle when the index is outdated.
|
||||
self.levels[level as usize]
|
||||
.files
|
||||
.insert(file.file_id, FileHandle::new(file, file_purger.clone()));
|
||||
.entry(file.file_id)
|
||||
.and_modify(|f| {
|
||||
if *f.meta_ref() == file || f.meta_ref().is_index_up_to_date(&file) {
|
||||
// same file meta or current file handle's index is up-to-date, skip adding
|
||||
if f.index_id().version > new_index_version {
|
||||
// what does it mean for us to see older index version?
|
||||
common_telemetry::warn!(
|
||||
"Adding file with older index version, existing: {:?}, new: {:?}, ignoring new file",
|
||||
f.meta_ref(),
|
||||
file
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// include case like old file have no index or index is outdated
|
||||
*f = FileHandle::new(file.clone(), file_purger.clone());
|
||||
}
|
||||
})
|
||||
.or_insert_with(|| FileHandle::new(file.clone(), file_purger.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -72,6 +72,7 @@ impl<S> RegionWorkerLoop<S> {
|
||||
});
|
||||
|
||||
IndexBuildTask {
|
||||
file: file.clone(),
|
||||
file_meta: file.meta_ref().clone(),
|
||||
reason: build_type,
|
||||
access_layer: access_layer.clone(),
|
||||
|
||||
@@ -117,6 +117,10 @@ pub struct S3Connection {
/// By default, opendal will send API to https://s3.us-east-1.amazonaws.com/bucket_name
/// Enabled, opendal will send API to https://bucket_name.s3.us-east-1.amazonaws.com
pub enable_virtual_host_style: bool,
/// Allow anonymous access (disable credential signing) - useful for local testing
pub allow_anonymous: bool,
/// Disable config load from environment and files - useful for local testing
pub disable_config_load: bool,
}

impl From<&S3Connection> for S3 {
@@ -139,6 +143,14 @@ impl From<&S3Connection> for S3 {
builder = builder.enable_virtual_host_style();
}

if connection.allow_anonymous {
builder = builder.allow_anonymous();
}

if connection.disable_config_load {
builder = builder.disable_config_load();
}

builder
}
}

@@ -117,8 +117,8 @@ pub async fn new_oss_object_store(oss_config: &OssConfig) -> Result<ObjectStore>
|
||||
pub async fn new_s3_object_store(s3_config: &S3Config) -> Result<ObjectStore> {
|
||||
let root = util::normalize_dir(&s3_config.connection.root);
|
||||
info!(
|
||||
"The s3 storage bucket is: {}, root is: {}",
|
||||
s3_config.connection.bucket, &root
|
||||
"The s3 storage bucket is: {}, root is: {}, endpoint: {:?}",
|
||||
s3_config.connection.bucket, &root, s3_config.connection.endpoint
|
||||
);
|
||||
|
||||
let client = build_http_client(&s3_config.http_client)?;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
mod admin;
|
||||
mod comment;
|
||||
mod copy_database;
|
||||
mod copy_query_to;
|
||||
mod copy_table_from;
|
||||
@@ -428,6 +429,7 @@ impl StatementExecutor {
|
||||
Statement::ShowCreateTrigger(show) => self.show_create_trigger(show, query_ctx).await,
|
||||
Statement::SetVariables(set_var) => self.set_variables(set_var, query_ctx),
|
||||
Statement::ShowVariables(show_variable) => self.show_variable(show_variable, query_ctx),
|
||||
Statement::Comment(stmt) => self.comment(stmt, query_ctx).await,
|
||||
Statement::ShowColumns(show_columns) => {
|
||||
self.show_columns(show_columns, query_ctx).await
|
||||
}
|
||||
|
||||
176 src/operator/src/statement/comment.rs Normal file
@@ -0,0 +1,176 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use api::v1::CommentOnExpr;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::procedure_executor::ExecutorContext;
|
||||
use common_meta::rpc::ddl::{CommentObjectType, CommentOnTask, DdlTask, SubmitDdlTaskRequest};
|
||||
use common_query::Output;
|
||||
use session::context::QueryContextRef;
|
||||
use session::table_name::table_idents_to_full_name;
|
||||
use snafu::ResultExt;
|
||||
use sql::ast::ObjectNamePartExt;
|
||||
use sql::statements::comment::{Comment, CommentObject};
|
||||
|
||||
use crate::error::{ExecuteDdlSnafu, ExternalSnafu, InvalidSqlSnafu, Result};
|
||||
use crate::statement::StatementExecutor;
|
||||
|
||||
impl StatementExecutor {
|
||||
/// Adds a comment to a database object (table, column, or flow).
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `stmt`: A `Comment` struct containing the object to comment on and the comment text.
|
||||
/// * `query_ctx`: A `QueryContextRef` providing contextual information for the query.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `Result` containing the `Output` of the operation, or an error if the operation fails.
|
||||
pub async fn comment(&self, stmt: Comment, query_ctx: QueryContextRef) -> Result<Output> {
|
||||
let comment_on_task = self.create_comment_on_task_from_stmt(stmt, &query_ctx)?;
|
||||
|
||||
let request = SubmitDdlTaskRequest {
|
||||
task: DdlTask::new_comment_on(comment_on_task),
|
||||
query_context: query_ctx,
|
||||
};
|
||||
|
||||
self.procedure_executor
|
||||
.submit_ddl_task(&ExecutorContext::default(), request)
|
||||
.await
|
||||
.context(ExecuteDdlSnafu)
|
||||
.map(|_| Output::new_with_affected_rows(0))
|
||||
}
|
||||
|
||||
pub async fn comment_by_expr(
|
||||
&self,
|
||||
expr: CommentOnExpr,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<Output> {
|
||||
let comment_on_task = self.create_comment_on_task_from_expr(expr)?;
|
||||
|
||||
let request = SubmitDdlTaskRequest {
|
||||
task: DdlTask::new_comment_on(comment_on_task),
|
||||
query_context: query_ctx,
|
||||
};
|
||||
|
||||
self.procedure_executor
|
||||
.submit_ddl_task(&ExecutorContext::default(), request)
|
||||
.await
|
||||
.context(ExecuteDdlSnafu)
|
||||
.map(|_| Output::new_with_affected_rows(0))
|
||||
}
|
||||
|
||||
fn create_comment_on_task_from_expr(&self, expr: CommentOnExpr) -> Result<CommentOnTask> {
|
||||
let object_type = match expr.object_type {
|
||||
0 => CommentObjectType::Table,
|
||||
1 => CommentObjectType::Column,
|
||||
2 => CommentObjectType::Flow,
|
||||
_ => {
|
||||
return InvalidSqlSnafu {
|
||||
err_msg: format!(
|
||||
"Invalid CommentObjectType value: {}. Valid values are: 0 (Table), 1 (Column), 2 (Flow)",
|
||||
expr.object_type
|
||||
),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
};
|
||||
|
||||
Ok(CommentOnTask {
|
||||
catalog_name: expr.catalog_name,
|
||||
schema_name: expr.schema_name,
|
||||
object_type,
|
||||
object_name: expr.object_name,
|
||||
column_name: if expr.column_name.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(expr.column_name)
|
||||
},
|
||||
object_id: None,
|
||||
comment: if expr.comment.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(expr.comment)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
fn create_comment_on_task_from_stmt(
|
||||
&self,
|
||||
stmt: Comment,
|
||||
query_ctx: &QueryContextRef,
|
||||
) -> Result<CommentOnTask> {
|
||||
match stmt.object {
|
||||
CommentObject::Table(table) => {
|
||||
let (catalog_name, schema_name, table_name) =
|
||||
table_idents_to_full_name(&table, query_ctx)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
Ok(CommentOnTask {
|
||||
catalog_name,
|
||||
schema_name,
|
||||
object_type: CommentObjectType::Table,
|
||||
object_name: table_name,
|
||||
column_name: None,
|
||||
object_id: None,
|
||||
comment: stmt.comment,
|
||||
})
|
||||
}
|
||||
CommentObject::Column { table, column } => {
|
||||
let (catalog_name, schema_name, table_name) =
|
||||
table_idents_to_full_name(&table, query_ctx)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
Ok(CommentOnTask {
|
||||
catalog_name,
|
||||
schema_name,
|
||||
object_type: CommentObjectType::Column,
|
||||
object_name: table_name,
|
||||
column_name: Some(column.value),
|
||||
object_id: None,
|
||||
comment: stmt.comment,
|
||||
})
|
||||
}
|
||||
CommentObject::Flow(flow_name) => {
|
||||
let (catalog_name, flow_name_str) = match &flow_name.0[..] {
|
||||
[flow] => (
|
||||
query_ctx.current_catalog().to_string(),
|
||||
flow.to_string_unquoted(),
|
||||
),
|
||||
[catalog, flow] => (catalog.to_string_unquoted(), flow.to_string_unquoted()),
|
||||
_ => {
|
||||
return InvalidSqlSnafu {
|
||||
err_msg: format!(
|
||||
"expect flow name to be <catalog>.<flow_name> or <flow_name>, actual: {flow_name}"
|
||||
),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
};
|
||||
|
||||
Ok(CommentOnTask {
|
||||
catalog_name,
|
||||
schema_name: String::new(), // Flow doesn't use schema
|
||||
object_type: CommentObjectType::Flow,
|
||||
object_name: flow_name_str,
|
||||
column_name: None,
|
||||
object_id: None,
|
||||
comment: stmt.comment,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -12,6 +12,8 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::key::schema_name::SchemaNameKey;
|
||||
use common_query::Output;
|
||||
@@ -120,7 +122,30 @@ impl StatementExecutor {
|
||||
table: TableRef,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<Output> {
|
||||
let table_info = table.table_info();
|
||||
let mut table_info = table.table_info();
|
||||
let partition_column_names: Vec<_> =
|
||||
table_info.meta.partition_column_names().cloned().collect();
|
||||
|
||||
if let Some(latest) = self
|
||||
.table_metadata_manager
|
||||
.table_info_manager()
|
||||
.get(table_info.table_id())
|
||||
.await
|
||||
.context(TableMetadataManagerSnafu)?
|
||||
{
|
||||
let mut latest_info = TableInfo::try_from(latest.into_inner().table_info)
|
||||
.context(error::CreateTableInfoSnafu)?;
|
||||
|
||||
if !partition_column_names.is_empty() {
|
||||
latest_info.meta.partition_key_indices = partition_column_names
|
||||
.iter()
|
||||
.filter_map(|name| latest_info.meta.schema.column_index_by_name(name.as_str()))
|
||||
.collect();
|
||||
}
|
||||
|
||||
table_info = Arc::new(latest_info);
|
||||
}
|
||||
|
||||
if table_info.table_type != TableType::Base {
|
||||
return error::ShowCreateTableBaseOnlySnafu {
|
||||
table_name: table_name.to_string(),
|
||||
@@ -150,7 +175,7 @@ impl StatementExecutor {
|
||||
|
||||
let partitions = create_partitions_stmt(&table_info, partitions)?;
|
||||
|
||||
query::sql::show_create_table(table, schema_options, partitions, query_ctx)
|
||||
query::sql::show_create_table(table_info, schema_options, partitions, query_ctx)
|
||||
.context(ExecuteStatementSnafu)
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ fn processor_mut(
|
||||
.exec_mut(v, pipeline_ctx, schema_info)?
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
result.push(r.0);
|
||||
result.extend(r.into_iter().map(|v| v.0));
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
|
||||
@@ -19,6 +19,7 @@ use common_error::status_code::StatusCode;
|
||||
use common_macro::stack_trace_debug;
|
||||
use datatypes::timestamp::TimestampNanosecond;
|
||||
use snafu::{Location, Snafu};
|
||||
use vrl::value::Kind;
|
||||
|
||||
#[derive(Snafu)]
|
||||
#[snafu(visibility(pub))]
|
||||
@@ -676,8 +677,12 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Vrl script should return `.` in the end"))]
|
||||
#[snafu(display(
|
||||
"Vrl script should return object or array in the end, got `{:?}`",
|
||||
result_kind
|
||||
))]
|
||||
VrlReturnValue {
|
||||
result_kind: Kind,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
@@ -695,6 +700,25 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Array element at index {index} must be an object for one-to-many transformation, got {actual_type}"
|
||||
))]
|
||||
ArrayElementMustBeObject {
|
||||
index: usize,
|
||||
actual_type: String,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to transform array element at index {index}: {source}"))]
|
||||
TransformArrayElement {
|
||||
index: usize,
|
||||
#[snafu(source)]
|
||||
source: Box<Error>,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to build DataFusion logical plan"))]
|
||||
BuildDfLogicalPlan {
|
||||
#[snafu(source)]
|
||||
@@ -792,7 +816,10 @@ impl ErrorExt for Error {
|
||||
| InvalidPipelineVersion { .. }
|
||||
| InvalidCustomTimeIndex { .. }
|
||||
| TimeIndexMustBeNonNull { .. } => StatusCode::InvalidArguments,
|
||||
MultiPipelineWithDiffSchema { .. } | ValueMustBeMap { .. } => StatusCode::IllegalState,
|
||||
MultiPipelineWithDiffSchema { .. }
|
||||
| ValueMustBeMap { .. }
|
||||
| ArrayElementMustBeObject { .. } => StatusCode::IllegalState,
|
||||
TransformArrayElement { source, .. } => source.status_code(),
|
||||
BuildDfLogicalPlan { .. } | RecordBatchLenNotMatch { .. } => StatusCode::Internal,
|
||||
ExecuteInternalStatement { source, .. } => source.status_code(),
|
||||
DataFrame { source, .. } => source.status_code(),
|
||||
|
||||
@@ -19,6 +19,8 @@ pub mod processor;
|
||||
pub mod transform;
|
||||
pub mod value;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use api::v1::Row;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use itertools::Itertools;
|
||||
@@ -30,13 +32,17 @@ use yaml_rust::{Yaml, YamlLoader};
|
||||
|
||||
use crate::dispatcher::{Dispatcher, Rule};
|
||||
use crate::error::{
|
||||
AutoTransformOneTimestampSnafu, Error, IntermediateKeyIndexSnafu, InvalidVersionNumberSnafu,
|
||||
Result, YamlLoadSnafu, YamlParseSnafu,
|
||||
ArrayElementMustBeObjectSnafu, AutoTransformOneTimestampSnafu, Error,
|
||||
IntermediateKeyIndexSnafu, InvalidVersionNumberSnafu, Result, TransformArrayElementSnafu,
|
||||
YamlLoadSnafu, YamlParseSnafu,
|
||||
};
|
||||
use crate::etl::processor::ProcessorKind;
|
||||
use crate::etl::transform::transformer::greptime::values_to_row;
|
||||
use crate::etl::transform::transformer::greptime::{RowWithTableSuffix, values_to_rows};
|
||||
use crate::tablesuffix::TableSuffixTemplate;
|
||||
use crate::{ContextOpt, GreptimeTransformer, IdentityTimeIndex, PipelineContext, SchemaInfo};
|
||||
use crate::{
|
||||
ContextOpt, GreptimeTransformer, IdentityTimeIndex, PipelineContext, SchemaInfo,
|
||||
unwrap_or_continue_if_err,
|
||||
};
|
||||
|
||||
const DESCRIPTION: &str = "description";
|
||||
const DOC_VERSION: &str = "version";
|
||||
@@ -230,21 +236,51 @@ pub enum PipelineExecOutput {
|
||||
Filtered,
|
||||
}
|
||||
|
||||
/// Output from a successful pipeline transformation.
|
||||
///
|
||||
/// Rows are grouped by their ContextOpt, with each row having its own optional
|
||||
/// table_suffix for routing to different tables when using one-to-many expansion.
|
||||
/// This enables true per-row configuration options where different rows can have
|
||||
/// different database settings (TTL, merge mode, etc.).
|
||||
#[derive(Debug)]
|
||||
pub struct TransformedOutput {
|
||||
pub opt: ContextOpt,
|
||||
pub row: Row,
|
||||
pub table_suffix: Option<String>,
|
||||
/// Rows grouped by their ContextOpt, each with optional table suffix
|
||||
pub rows_by_context: HashMap<ContextOpt, Vec<RowWithTableSuffix>>,
|
||||
}
|
||||
|
||||
impl PipelineExecOutput {
|
||||
// Note: This is a test only function, do not use it in production.
|
||||
pub fn into_transformed(self) -> Option<(Row, Option<String>)> {
|
||||
if let Self::Transformed(TransformedOutput {
|
||||
row, table_suffix, ..
|
||||
}) = self
|
||||
{
|
||||
Some((row, table_suffix))
|
||||
pub fn into_transformed(self) -> Option<Vec<RowWithTableSuffix>> {
|
||||
if let Self::Transformed(TransformedOutput { rows_by_context }) = self {
|
||||
// For backward compatibility, merge all rows with a default ContextOpt
|
||||
Some(rows_by_context.into_values().flatten().collect())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// New method for accessing the HashMap structure directly
|
||||
pub fn into_transformed_hashmap(self) -> Option<HashMap<ContextOpt, Vec<RowWithTableSuffix>>> {
|
||||
if let Self::Transformed(TransformedOutput { rows_by_context }) = self {
|
||||
Some(rows_by_context)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Backward compatibility helper that returns first ContextOpt with all its rows
|
||||
// or merges all rows with default ContextOpt for multi-context scenarios
|
||||
pub fn into_legacy_format(self) -> Option<(ContextOpt, Vec<RowWithTableSuffix>)> {
|
||||
if let Self::Transformed(TransformedOutput { rows_by_context }) = self {
|
||||
if rows_by_context.len() == 1 {
|
||||
let (opt, rows) = rows_by_context.into_iter().next().unwrap();
|
||||
Some((opt, rows))
|
||||
} else {
|
||||
// Multiple contexts: merge all rows with default ContextOpt for test compatibility
|
||||
let all_rows: Vec<RowWithTableSuffix> =
|
||||
rows_by_context.into_values().flatten().collect();
|
||||
Some((ContextOpt::default(), all_rows))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
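To make the new output shape concrete, a caller that needs per-row options might consume the map form directly. This is a sketch only, assuming `output` is the `PipelineExecOutput` returned by `exec_mut`; `route_row` is a hypothetical sink, not an API from this changeset:

if let Some(rows_by_context) = output.into_transformed_hashmap() {
    for (opt, rows) in rows_by_context {
        // Each ContextOpt groups rows that share the same per-row options (TTL, merge mode, ...).
        for (row, table_suffix) in rows {
            route_row(&opt, row, table_suffix); // hypothetical downstream writer
        }
    }
}
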
@@ -285,45 +321,43 @@ impl Pipeline {
|
||||
return Ok(PipelineExecOutput::DispatchedTo(rule.into(), val));
|
||||
}
|
||||
|
||||
// extract the options first
|
||||
// this might be a breaking change, for table_suffix is now right after the processors
|
||||
let mut opt = ContextOpt::from_pipeline_map_to_opt(&mut val)?;
|
||||
let table_suffix = opt.resolve_table_suffix(self.tablesuffix.as_ref(), &val);
|
||||
let mut val = if val.is_array() {
|
||||
val
|
||||
} else {
|
||||
VrlValue::Array(vec![val])
|
||||
};
|
||||
|
||||
let row = match self.transformer() {
|
||||
let rows_by_context = match self.transformer() {
|
||||
TransformerMode::GreptimeTransformer(greptime_transformer) => {
|
||||
let values = greptime_transformer.transform_mut(&mut val, self.is_v1())?;
|
||||
if self.is_v1() {
|
||||
// v1 dont combine with auto-transform
|
||||
// so return immediately
|
||||
return Ok(PipelineExecOutput::Transformed(TransformedOutput {
|
||||
opt,
|
||||
row: Row { values },
|
||||
table_suffix,
|
||||
}));
|
||||
}
|
||||
// continue v2 process, and set the rest fields with auto-transform
|
||||
// if transformer presents, then ts has been set
|
||||
values_to_row(schema_info, val, pipeline_ctx, Some(values), false)?
|
||||
transform_array_elements_by_ctx(
|
||||
// SAFETY: by line 326, val must be an array
|
||||
val.as_array_mut().unwrap(),
|
||||
greptime_transformer,
|
||||
self.is_v1(),
|
||||
schema_info,
|
||||
pipeline_ctx,
|
||||
self.tablesuffix.as_ref(),
|
||||
)?
|
||||
}
|
||||
TransformerMode::AutoTransform(ts_name, time_unit) => {
|
||||
// infer ts from the context
|
||||
// we've check that only one timestamp should exist
|
||||
|
||||
// Create pipeline context with the found timestamp
|
||||
let def = crate::PipelineDefinition::GreptimeIdentityPipeline(Some(
|
||||
IdentityTimeIndex::Epoch(ts_name.clone(), *time_unit, false),
|
||||
));
|
||||
let n_ctx =
|
||||
PipelineContext::new(&def, pipeline_ctx.pipeline_param, pipeline_ctx.channel);
|
||||
values_to_row(schema_info, val, &n_ctx, None, true)?
|
||||
values_to_rows(
|
||||
schema_info,
|
||||
val,
|
||||
&n_ctx,
|
||||
None,
|
||||
true,
|
||||
self.tablesuffix.as_ref(),
|
||||
)?
|
||||
}
|
||||
};
|
||||
|
||||
Ok(PipelineExecOutput::Transformed(TransformedOutput {
|
||||
opt,
|
||||
row,
|
||||
table_suffix,
|
||||
rows_by_context,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -350,6 +384,65 @@ impl Pipeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Transforms an array of VRL values into rows grouped by their ContextOpt.
|
||||
/// Each element can have its own ContextOpt for per-row configuration.
|
||||
fn transform_array_elements_by_ctx(
|
||||
arr: &mut [VrlValue],
|
||||
transformer: &GreptimeTransformer,
|
||||
is_v1: bool,
|
||||
schema_info: &mut SchemaInfo,
|
||||
pipeline_ctx: &PipelineContext<'_>,
|
||||
tablesuffix_template: Option<&TableSuffixTemplate>,
|
||||
) -> Result<HashMap<ContextOpt, Vec<RowWithTableSuffix>>> {
|
||||
let skip_error = pipeline_ctx.pipeline_param.skip_error();
|
||||
let mut rows_by_context = HashMap::new();
|
||||
|
||||
for (index, element) in arr.iter_mut().enumerate() {
|
||||
if !element.is_object() {
|
||||
unwrap_or_continue_if_err!(
|
||||
ArrayElementMustBeObjectSnafu {
|
||||
index,
|
||||
actual_type: element.kind_str().to_string(),
|
||||
}
|
||||
.fail(),
|
||||
skip_error
|
||||
);
|
||||
}
|
||||
|
||||
let values =
|
||||
unwrap_or_continue_if_err!(transformer.transform_mut(element, is_v1), skip_error);
|
||||
if is_v1 {
|
||||
// v1 mode: just use transformer output directly
|
||||
let mut opt = unwrap_or_continue_if_err!(
|
||||
ContextOpt::from_pipeline_map_to_opt(element),
|
||||
skip_error
|
||||
);
|
||||
let table_suffix = opt.resolve_table_suffix(tablesuffix_template, element);
|
||||
rows_by_context
|
||||
.entry(opt)
|
||||
.or_insert_with(Vec::new)
|
||||
.push((Row { values }, table_suffix));
|
||||
} else {
|
||||
// v2 mode: combine with auto-transform for remaining fields
|
||||
let element_rows_map = values_to_rows(
|
||||
schema_info,
|
||||
element.clone(),
|
||||
pipeline_ctx,
|
||||
Some(values),
|
||||
false,
|
||||
tablesuffix_template,
|
||||
)
|
||||
.map_err(Box::new)
|
||||
.context(TransformArrayElementSnafu { index })?;
|
||||
for (k, v) in element_rows_map {
|
||||
rows_by_context.entry(k).or_default().extend(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rows_by_context)
|
||||
}
|
||||
|
||||
pub(crate) fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str) -> Result<usize> {
|
||||
intermediate_keys
|
||||
.iter()
|
||||
@@ -361,7 +454,7 @@ pub(crate) fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str
|
||||
/// The schema_info cannot be used in auto-transform ts-infer mode for lacking the ts schema.
|
||||
///
|
||||
/// Usage:
|
||||
/// ```rust
|
||||
/// ```ignore
|
||||
/// let (pipeline, schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
/// let pipeline_ctx = PipelineContext::new(&pipeline_def, &pipeline_param, Channel::Unknown);
|
||||
/// ```
|
||||
@@ -382,6 +475,7 @@ macro_rules! setup_pipeline {
|
||||
(pipeline, schema_info, pipeline_def, pipeline_param)
|
||||
}};
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
@@ -433,15 +527,16 @@ transform:
|
||||
);
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
let mut result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(result.0.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.0.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.0.values[2].value_data {
|
||||
let (row, _table_suffix) = result.swap_remove(0);
|
||||
assert_eq!(row.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(row.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &row.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(v, &0);
|
||||
}
|
||||
@@ -504,7 +599,7 @@ transform:
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(schema_info.schema.len(), result.0.values.len());
|
||||
assert_eq!(schema_info.schema.len(), result[0].0.values.len());
|
||||
let test = [
|
||||
(
|
||||
ColumnDataType::String as i32,
|
||||
@@ -545,7 +640,7 @@ transform:
|
||||
let schema = pipeline.schemas().unwrap();
|
||||
for i in 0..schema.len() {
|
||||
let schema = &schema[i];
|
||||
let value = &result.0.values[i];
|
||||
let value = &result[0].0.values[i];
|
||||
assert_eq!(schema.datatype, test[i].0);
|
||||
assert_eq!(value.value_data, test[i].1);
|
||||
}
|
||||
@@ -595,9 +690,15 @@ transform:
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
assert_eq!(result.0.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.0.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.0.values[2].value_data {
|
||||
assert_eq!(
|
||||
result[0].0.values[0].value_data,
|
||||
Some(ValueData::U32Value(1))
|
||||
);
|
||||
assert_eq!(
|
||||
result[0].0.values[1].value_data,
|
||||
Some(ValueData::U32Value(2))
|
||||
);
|
||||
match &result[0].0.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(v, &0);
|
||||
}
|
||||
@@ -644,14 +745,14 @@ transform:
|
||||
let schema = pipeline.schemas().unwrap().clone();
|
||||
let result = input_value.into();
|
||||
|
||||
let row = pipeline
|
||||
let rows_with_suffix = pipeline
|
||||
.exec_mut(result, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
let output = Rows {
|
||||
schema,
|
||||
rows: vec![row.0],
|
||||
rows: rows_with_suffix.into_iter().map(|(r, _)| r).collect(),
|
||||
};
|
||||
let schemas = output.schema;
|
||||
|
||||
@@ -804,4 +905,566 @@ transform:
|
||||
let r: Result<Pipeline> = parse(&Content::Yaml(bad_yaml3));
|
||||
assert!(r.is_err());
|
||||
}
|
||||
|
||||
/// Test one-to-many VRL pipeline expansion.
|
||||
/// A VRL processor can return an array, which results in multiple output rows.
|
||||
#[test]
|
||||
fn test_one_to_many_vrl_expansion() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- epoch:
|
||||
field: timestamp
|
||||
resolution: ms
|
||||
- vrl:
|
||||
source: |
|
||||
events = del(.events)
|
||||
base_host = del(.host)
|
||||
base_ts = del(.timestamp)
|
||||
map_values(array!(events)) -> |event| {
|
||||
{
|
||||
"host": base_host,
|
||||
"event_type": event.type,
|
||||
"event_value": event.value,
|
||||
"timestamp": base_ts
|
||||
}
|
||||
}
|
||||
|
||||
transform:
|
||||
- field: host
|
||||
type: string
|
||||
- field: event_type
|
||||
type: string
|
||||
- field: event_value
|
||||
type: int32
|
||||
- field: timestamp
|
||||
type: timestamp, ms
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
// Input with 3 events
|
||||
let input_value: serde_json::Value = serde_json::from_str(
|
||||
r#"{
|
||||
"host": "server1",
|
||||
"timestamp": 1716668197217,
|
||||
"events": [
|
||||
{"type": "cpu", "value": 80},
|
||||
{"type": "memory", "value": 60},
|
||||
{"type": "disk", "value": 45}
|
||||
]
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
|
||||
// Should produce 3 rows from 1 input
|
||||
assert_eq!(result.len(), 3);
|
||||
|
||||
// Verify each row has correct structure
|
||||
for (row, _table_suffix) in &result {
|
||||
assert_eq!(row.values.len(), 4); // host, event_type, event_value, timestamp
|
||||
// First value should be "server1"
|
||||
assert_eq!(
|
||||
row.values[0].value_data,
|
||||
Some(ValueData::StringValue("server1".to_string()))
|
||||
);
|
||||
// Last value should be the timestamp
|
||||
assert_eq!(
|
||||
row.values[3].value_data,
|
||||
Some(ValueData::TimestampMillisecondValue(1716668197217))
|
||||
);
|
||||
}
|
||||
|
||||
// Verify event types
|
||||
let event_types: Vec<_> = result
|
||||
.iter()
|
||||
.map(|(r, _)| match &r.values[1].value_data {
|
||||
Some(ValueData::StringValue(s)) => s.clone(),
|
||||
_ => panic!("expected string"),
|
||||
})
|
||||
.collect();
|
||||
assert!(event_types.contains(&"cpu".to_string()));
|
||||
assert!(event_types.contains(&"memory".to_string()));
|
||||
assert!(event_types.contains(&"disk".to_string()));
|
||||
}
|
||||
|
||||
/// Test that single object output still works (backward compatibility)
|
||||
#[test]
|
||||
fn test_single_object_output_unchanged() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- epoch:
|
||||
field: ts
|
||||
resolution: ms
|
||||
- vrl:
|
||||
source: |
|
||||
.processed = true
|
||||
.
|
||||
|
||||
transform:
|
||||
- field: name
|
||||
type: string
|
||||
- field: processed
|
||||
type: boolean
|
||||
- field: ts
|
||||
type: timestamp, ms
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let input_value: serde_json::Value = serde_json::from_str(
|
||||
r#"{
|
||||
"name": "test",
|
||||
"ts": 1716668197217
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
|
||||
// Should produce exactly 1 row
|
||||
assert_eq!(result.len(), 1);
|
||||
assert_eq!(
|
||||
result[0].0.values[0].value_data,
|
||||
Some(ValueData::StringValue("test".to_string()))
|
||||
);
|
||||
assert_eq!(
|
||||
result[0].0.values[1].value_data,
|
||||
Some(ValueData::BoolValue(true))
|
||||
);
|
||||
}
|
||||
|
||||
/// Test that empty array produces zero rows
|
||||
#[test]
|
||||
fn test_empty_array_produces_zero_rows() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
.events
|
||||
|
||||
transform:
|
||||
- field: value
|
||||
type: int32
|
||||
- field: greptime_timestamp
|
||||
type: timestamp, ns
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let input_value: serde_json::Value = serde_json::from_str(r#"{"events": []}"#).unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
|
||||
// Empty array should produce zero rows
|
||||
assert_eq!(result.len(), 0);
|
||||
}
|
||||
|
||||
/// Test that array elements must be objects
|
||||
#[test]
|
||||
fn test_array_element_must_be_object() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
.items
|
||||
|
||||
transform:
|
||||
- field: value
|
||||
type: int32
|
||||
- field: greptime_timestamp
|
||||
type: timestamp, ns
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
// Array with non-object elements should fail
|
||||
let input_value: serde_json::Value =
|
||||
serde_json::from_str(r#"{"items": [1, 2, 3]}"#).unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline.exec_mut(payload, &pipeline_ctx, &mut schema_info);
|
||||
|
||||
assert!(result.is_err());
|
||||
let err_msg = result.unwrap_err().to_string();
|
||||
assert!(
|
||||
err_msg.contains("must be an object"),
|
||||
"Expected error about non-object element, got: {}",
|
||||
err_msg
|
||||
);
|
||||
}
|
||||
|
||||
/// Test one-to-many with table suffix from VRL hint
|
||||
#[test]
|
||||
fn test_one_to_many_with_table_suffix_hint() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- epoch:
|
||||
field: ts
|
||||
resolution: ms
|
||||
- vrl:
|
||||
source: |
|
||||
.greptime_table_suffix = "_" + string!(.category)
|
||||
.
|
||||
|
||||
transform:
|
||||
- field: name
|
||||
type: string
|
||||
- field: category
|
||||
type: string
|
||||
- field: ts
|
||||
type: timestamp, ms
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let input_value: serde_json::Value = serde_json::from_str(
|
||||
r#"{
|
||||
"name": "test",
|
||||
"category": "metrics",
|
||||
"ts": 1716668197217
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
|
||||
// Should have table suffix extracted per row
|
||||
assert_eq!(result.len(), 1);
|
||||
assert_eq!(result[0].1, Some("_metrics".to_string()));
|
||||
}
|
||||
|
||||
/// Test one-to-many with per-row table suffix
|
||||
#[test]
|
||||
fn test_one_to_many_per_row_table_suffix() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- epoch:
|
||||
field: timestamp
|
||||
resolution: ms
|
||||
- vrl:
|
||||
source: |
|
||||
events = del(.events)
|
||||
base_ts = del(.timestamp)
|
||||
|
||||
map_values(array!(events)) -> |event| {
|
||||
suffix = "_" + string!(event.category)
|
||||
{
|
||||
"name": event.name,
|
||||
"value": event.value,
|
||||
"timestamp": base_ts,
|
||||
"greptime_table_suffix": suffix
|
||||
}
|
||||
}
|
||||
|
||||
transform:
|
||||
- field: name
|
||||
type: string
|
||||
- field: value
|
||||
type: int32
|
||||
- field: timestamp
|
||||
type: timestamp, ms
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
// Input with events that should go to different tables
|
||||
let input_value: serde_json::Value = serde_json::from_str(
|
||||
r#"{
|
||||
"timestamp": 1716668197217,
|
||||
"events": [
|
||||
{"name": "cpu_usage", "value": 80, "category": "cpu"},
|
||||
{"name": "mem_usage", "value": 60, "category": "memory"},
|
||||
{"name": "cpu_temp", "value": 45, "category": "cpu"}
|
||||
]
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.unwrap();
|
||||
|
||||
// Should produce 3 rows
|
||||
assert_eq!(result.len(), 3);
|
||||
|
||||
// Collect table suffixes
|
||||
let table_suffixes: Vec<_> = result.iter().map(|(_, suffix)| suffix.clone()).collect();
|
||||
|
||||
// Should have different table suffixes per row
|
||||
assert!(table_suffixes.contains(&Some("_cpu".to_string())));
|
||||
assert!(table_suffixes.contains(&Some("_memory".to_string())));
|
||||
|
||||
// Count rows per table suffix
|
||||
let cpu_count = table_suffixes
|
||||
.iter()
|
||||
.filter(|s| *s == &Some("_cpu".to_string()))
|
||||
.count();
|
||||
let memory_count = table_suffixes
|
||||
.iter()
|
||||
.filter(|s| *s == &Some("_memory".to_string()))
|
||||
.count();
|
||||
assert_eq!(cpu_count, 2);
|
||||
assert_eq!(memory_count, 1);
|
||||
}
|
||||
|
||||
/// Test that one-to-many mapping preserves per-row ContextOpt in HashMap
|
||||
#[test]
|
||||
fn test_one_to_many_hashmap_contextopt_preservation() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- epoch:
|
||||
field: timestamp
|
||||
resolution: ms
|
||||
- vrl:
|
||||
source: |
|
||||
events = del(.events)
|
||||
base_ts = del(.timestamp)
|
||||
|
||||
map_values(array!(events)) -> |event| {
|
||||
# Set different TTL values per event type
|
||||
ttl = if event.type == "critical" {
|
||||
"1h"
|
||||
} else if event.type == "warning" {
|
||||
"24h"
|
||||
} else {
|
||||
"7d"
|
||||
}
|
||||
|
||||
{
|
||||
"host": del(.host),
|
||||
"event_type": event.type,
|
||||
"event_value": event.value,
|
||||
"timestamp": base_ts,
|
||||
"greptime_ttl": ttl
|
||||
}
|
||||
}
|
||||
|
||||
transform:
|
||||
- field: host
|
||||
type: string
|
||||
- field: event_type
|
||||
type: string
|
||||
- field: event_value
|
||||
type: int32
|
||||
- field: timestamp
|
||||
type: timestamp, ms
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
// Input with events that should have different ContextOpt values
|
||||
let input_value: serde_json::Value = serde_json::from_str(
|
||||
r#"{
|
||||
"host": "server1",
|
||||
"timestamp": 1716668197217,
|
||||
"events": [
|
||||
{"type": "critical", "value": 100},
|
||||
{"type": "warning", "value": 50},
|
||||
{"type": "info", "value": 25}
|
||||
]
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap();
|
||||
|
||||
// Extract the HashMap structure
|
||||
let rows_by_context = result.into_transformed_hashmap().unwrap();
|
||||
|
||||
// Should have 3 different ContextOpt groups due to different TTL values
|
||||
assert_eq!(rows_by_context.len(), 3);
|
||||
|
||||
// Verify each ContextOpt group has exactly 1 row and different configurations
|
||||
let mut context_opts = Vec::new();
|
||||
for (opt, rows) in &rows_by_context {
|
||||
assert_eq!(rows.len(), 1); // Each group should have exactly 1 row
|
||||
context_opts.push(opt.clone());
|
||||
}
|
||||
|
||||
// ContextOpts should be different due to different TTL values
|
||||
assert_ne!(context_opts[0], context_opts[1]);
|
||||
assert_ne!(context_opts[1], context_opts[2]);
|
||||
assert_ne!(context_opts[0], context_opts[2]);
|
||||
|
||||
// Verify the rows are correctly structured
|
||||
for rows in rows_by_context.values() {
|
||||
for (row, _table_suffix) in rows {
|
||||
assert_eq!(row.values.len(), 4); // host, event_type, event_value, timestamp
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that single object input still works with HashMap structure
|
||||
#[test]
|
||||
fn test_single_object_hashmap_compatibility() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- epoch:
|
||||
field: ts
|
||||
resolution: ms
|
||||
- vrl:
|
||||
source: |
|
||||
.processed = true
|
||||
.
|
||||
|
||||
transform:
|
||||
- field: name
|
||||
type: string
|
||||
- field: processed
|
||||
type: boolean
|
||||
- field: ts
|
||||
type: timestamp, ms
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let input_value: serde_json::Value = serde_json::from_str(
|
||||
r#"{
|
||||
"name": "test",
|
||||
"ts": 1716668197217
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap();
|
||||
|
||||
// Extract the HashMap structure
|
||||
let rows_by_context = result.into_transformed_hashmap().unwrap();
|
||||
|
||||
// Single object should produce exactly 1 ContextOpt group
|
||||
assert_eq!(rows_by_context.len(), 1);
|
||||
|
||||
let (_opt, rows) = rows_by_context.into_iter().next().unwrap();
|
||||
assert_eq!(rows.len(), 1);
|
||||
|
||||
// Verify the row structure
|
||||
let (row, _table_suffix) = &rows[0];
|
||||
assert_eq!(row.values.len(), 3); // name, processed, timestamp
|
||||
}
|
||||
|
||||
/// Test that empty arrays work correctly with HashMap structure
|
||||
#[test]
|
||||
fn test_empty_array_hashmap() {
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
.events
|
||||
|
||||
transform:
|
||||
- field: value
|
||||
type: int32
|
||||
- field: greptime_timestamp
|
||||
type: timestamp, ns
|
||||
index: time
|
||||
"#;
|
||||
|
||||
let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap();
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let input_value: serde_json::Value = serde_json::from_str(r#"{"events": []}"#).unwrap();
|
||||
|
||||
let payload = input_value.into();
|
||||
let result = pipeline
|
||||
.exec_mut(payload, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap();
|
||||
|
||||
// Extract the HashMap structure
|
||||
let rows_by_context = result.into_transformed_hashmap().unwrap();
|
||||
|
||||
// Empty array should produce empty HashMap
|
||||
assert_eq!(rows_by_context.len(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ const PIPELINE_HINT_PREFIX: &str = "greptime_";
///
/// The options are set in the format of hint keys. See [`PIPELINE_HINT_KEYS`].
/// It is used as the key in [`ContextReq`] for grouping the row insert requests.
#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
pub struct ContextOpt {
// table options that need to be set in the query context before making row insert requests
auto_create_table: Option<String>,
@@ -192,8 +192,15 @@ impl ContextReq {
Self { req: req_map }
}

pub fn add_row(&mut self, opt: ContextOpt, req: RowInsertRequest) {
self.req.entry(opt).or_default().push(req);
pub fn add_row(&mut self, opt: &ContextOpt, req: RowInsertRequest) {
match self.req.get_mut(opt) {
None => {
self.req.insert(opt.clone(), vec![req]);
}
Some(e) => {
e.push(req);
}
}
}

pub fn add_rows(&mut self, opt: ContextOpt, reqs: impl IntoIterator<Item = RowInsertRequest>) {

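The reworked `add_row` takes the grouping key by reference (hence the `Clone` derive added to `ContextOpt`) and clones it only when a new group has to be created; the earlier `entry(opt).or_default()` form forced every caller to hand over an owned key. A minimal self-contained sketch of this clone-on-miss grouping pattern, with plain `String` keys standing in for `ContextOpt` and `u64` standing in for a row request, purely for illustration and not the crate's code:

use std::collections::HashMap;

// Clone the key only when a new group is created; otherwise push into the
// existing Vec behind the mutable reference returned by get_mut.
fn add_row(groups: &mut HashMap<String, Vec<u64>>, key: &String, row: u64) {
    match groups.get_mut(key) {
        Some(rows) => rows.push(row),
        None => {
            groups.insert(key.clone(), vec![row]);
        }
    }
}

fn main() {
    let mut groups: HashMap<String, Vec<u64>> = HashMap::new();
    let key = "ttl=1h".to_string();
    add_row(&mut groups, &key, 1);
    add_row(&mut groups, &key, 2); // second call reuses the existing entry, no clone
    assert_eq!(groups[&key], vec![1, 2]);
}

The `&String` parameter is deliberate here: it mirrors the `&ContextOpt` signature, where avoiding an up-front clone per call is the point of the change.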
@@ -15,7 +15,7 @@
use std::collections::BTreeMap;

use chrono_tz::Tz;
use snafu::OptionExt;
use snafu::{OptionExt, ensure};
use vrl::compiler::runtime::Runtime;
use vrl::compiler::{Program, TargetValue, compile};
use vrl::diagnostic::Formatter;
@@ -53,9 +53,15 @@ impl VrlProcessor {
// check if the return value has a regex
let result_def = program.final_type_info().result;
let kind = result_def.kind();
if !kind.is_object() {
return VrlReturnValueSnafu.fail();
}
// Check if the return type could possibly be an object or array.
// We use contains_* methods since VRL type inference may return
// a Kind that represents multiple possible types.
ensure!(
kind.contains_object() || kind.contains_array(),
VrlReturnValueSnafu {
result_kind: kind.clone(),
}
);
check_regex_output(kind)?;

Ok(Self { source, program })
@@ -111,13 +117,7 @@ impl crate::etl::processor::Processor for VrlProcessor {
}

fn exec_mut(&self, val: VrlValue) -> Result<VrlValue> {
let val = self.resolve(val)?;

if let VrlValue::Object(_) = val {
Ok(val)
} else {
VrlRegexValueSnafu.fail()
}
self.resolve(val)
}
}


@@ -37,8 +37,8 @@ use vrl::prelude::{Bytes, VrlValueConvert};
use vrl::value::{KeyString, Value as VrlValue};

use crate::error::{
IdentifyPipelineColumnTypeMismatchSnafu, InvalidTimestampSnafu, Result,
TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu,
ArrayElementMustBeObjectSnafu, IdentifyPipelineColumnTypeMismatchSnafu, InvalidTimestampSnafu,
Result, TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu,
TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, ValueMustBeMapSnafu,
};
use crate::etl::PipelineDocVersion;
@@ -50,6 +50,9 @@ use crate::{PipelineContext, truthy, unwrap_or_continue_if_err};

const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10;

/// Row with potentially designated table suffix.
pub type RowWithTableSuffix = (Row, Option<String>);

/// fields not in the columns will be discarded
/// to prevent automatic column creation in GreptimeDB
#[derive(Debug, Clone)]
@@ -363,6 +366,73 @@ fn calc_ts(p_ctx: &PipelineContext, values: &VrlValue) -> Result<Option<ValueDat
}
}

/// Converts VRL values to Greptime rows grouped by their ContextOpt.
/// # Returns
/// A HashMap where keys are `ContextOpt` and values are vectors of (row, table_suffix) pairs.
/// Single object input produces one ContextOpt group with one row.
/// Array input groups rows by their per-element ContextOpt values.
///
/// # Errors
/// - `ArrayElementMustBeObject` if an array element is not an object
pub(crate) fn values_to_rows(
schema_info: &mut SchemaInfo,
mut values: VrlValue,
pipeline_ctx: &PipelineContext<'_>,
row: Option<Vec<GreptimeValue>>,
need_calc_ts: bool,
tablesuffix_template: Option<&crate::tablesuffix::TableSuffixTemplate>,
) -> Result<std::collections::HashMap<ContextOpt, Vec<RowWithTableSuffix>>> {
let skip_error = pipeline_ctx.pipeline_param.skip_error();
let VrlValue::Array(arr) = values else {
// Single object: extract ContextOpt and table_suffix
let mut result = std::collections::HashMap::new();

let mut opt = match ContextOpt::from_pipeline_map_to_opt(&mut values) {
Ok(r) => r,
Err(e) => return if skip_error { Ok(result) } else { Err(e) },
};

let table_suffix = opt.resolve_table_suffix(tablesuffix_template, &values);
let row = match values_to_row(schema_info, values, pipeline_ctx, row, need_calc_ts) {
Ok(r) => r,
Err(e) => return if skip_error { Ok(result) } else { Err(e) },
};
result.insert(opt, vec![(row, table_suffix)]);
return Ok(result);
};

let mut rows_by_context: std::collections::HashMap<ContextOpt, Vec<RowWithTableSuffix>> =
std::collections::HashMap::new();
for (index, mut value) in arr.into_iter().enumerate() {
if !value.is_object() {
unwrap_or_continue_if_err!(
ArrayElementMustBeObjectSnafu {
index,
actual_type: value.kind_str().to_string(),
}
.fail(),
skip_error
);
}

// Extract ContextOpt and table_suffix for this element
let mut opt = unwrap_or_continue_if_err!(
ContextOpt::from_pipeline_map_to_opt(&mut value),
skip_error
);
let table_suffix = opt.resolve_table_suffix(tablesuffix_template, &value);
let transformed_row = unwrap_or_continue_if_err!(
values_to_row(schema_info, value, pipeline_ctx, row.clone(), need_calc_ts),
skip_error
);
rows_by_context
.entry(opt)
.or_default()
.push((transformed_row, table_suffix));
}
Ok(rows_by_context)
}

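To make the grouping contract above concrete, here is a small self-contained analogue, illustrative only, with plain `String` keys standing in for `ContextOpt` and `&'static str` standing in for a row: array elements that resolve to the same options key land in the same group, so a three-element input with two distinct hint sets yields a map with two entries, while a single object would yield exactly one.

use std::collections::HashMap;

// Group per-element "rows" by the options key each element carries.
fn group_by_opt(elements: Vec<(String, &'static str)>) -> HashMap<String, Vec<&'static str>> {
    let mut grouped: HashMap<String, Vec<&'static str>> = HashMap::new();
    for (opt, row) in elements {
        grouped.entry(opt).or_default().push(row);
    }
    grouped
}

fn main() {
    let grouped = group_by_opt(vec![
        ("ttl=1h".to_string(), "alice"),
        ("ttl=1h".to_string(), "bob"),
        ("ttl=24h".to_string(), "charlie"),
    ]);
    assert_eq!(grouped.len(), 2);
    assert_eq!(grouped["ttl=1h"], vec!["alice", "bob"]);
    assert_eq!(grouped["ttl=24h"], vec!["charlie"]);
}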
/// `need_calc_ts` happens in two cases:
|
||||
/// 1. full greptime_identity
|
||||
/// 2. auto-transform without transformer
|
||||
@@ -992,4 +1062,139 @@ mod tests {
|
||||
assert_eq!(flattened_object, expected);
|
||||
}
|
||||
}
|
||||
|
||||
use ahash::HashMap as AHashMap;
|
||||
#[test]
|
||||
fn test_values_to_rows_skip_error_handling() {
|
||||
let table_suffix_template: Option<crate::tablesuffix::TableSuffixTemplate> = None;
|
||||
|
||||
// Case 1: skip_error=true, mixed valid/invalid elements
|
||||
{
|
||||
let schema_info = &mut SchemaInfo::default();
|
||||
let input_array = vec![
|
||||
// Valid object
|
||||
serde_json::json!({"name": "Alice", "age": 25}).into(),
|
||||
// Invalid element (string)
|
||||
VrlValue::Bytes("invalid_string".into()),
|
||||
// Valid object
|
||||
serde_json::json!({"name": "Bob", "age": 30}).into(),
|
||||
// Invalid element (number)
|
||||
VrlValue::Integer(42),
|
||||
// Valid object
|
||||
serde_json::json!({"name": "Charlie", "age": 35}).into(),
|
||||
];
|
||||
|
||||
let params = GreptimePipelineParams::from_map(AHashMap::from_iter([(
|
||||
"skip_error".to_string(),
|
||||
"true".to_string(),
|
||||
)]));
|
||||
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&PipelineDefinition::GreptimeIdentityPipeline(None),
|
||||
¶ms,
|
||||
Channel::Unknown,
|
||||
);
|
||||
|
||||
let result = values_to_rows(
|
||||
schema_info,
|
||||
VrlValue::Array(input_array),
|
||||
&pipeline_ctx,
|
||||
None,
|
||||
true,
|
||||
table_suffix_template.as_ref(),
|
||||
);
|
||||
|
||||
// Should succeed and only process valid objects
|
||||
assert!(result.is_ok());
|
||||
let rows_by_context = result.unwrap();
|
||||
// Count total rows across all ContextOpt groups
|
||||
let total_rows: usize = rows_by_context.values().map(|v| v.len()).sum();
|
||||
assert_eq!(total_rows, 3); // Only 3 valid objects
|
||||
}
|
||||
|
||||
// Case 2: skip_error=false, invalid elements present
|
||||
{
|
||||
let schema_info = &mut SchemaInfo::default();
|
||||
let input_array = vec![
|
||||
serde_json::json!({"name": "Alice", "age": 25}).into(),
|
||||
VrlValue::Bytes("invalid_string".into()), // This should cause error
|
||||
];
|
||||
|
||||
let params = GreptimePipelineParams::default(); // skip_error = false
|
||||
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&PipelineDefinition::GreptimeIdentityPipeline(None),
|
||||
¶ms,
|
||||
Channel::Unknown,
|
||||
);
|
||||
|
||||
let result = values_to_rows(
|
||||
schema_info,
|
||||
VrlValue::Array(input_array),
|
||||
&pipeline_ctx,
|
||||
None,
|
||||
true,
|
||||
table_suffix_template.as_ref(),
|
||||
);
|
||||
|
||||
// Should fail with ArrayElementMustBeObject error
|
||||
assert!(result.is_err());
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
assert!(error_msg.contains("Array element at index 1 must be an object for one-to-many transformation, got string"));
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that values_to_rows correctly groups rows by per-element ContextOpt
|
||||
#[test]
|
||||
fn test_values_to_rows_per_element_context_opt() {
|
||||
let table_suffix_template: Option<crate::tablesuffix::TableSuffixTemplate> = None;
|
||||
let schema_info = &mut SchemaInfo::default();
|
||||
|
||||
// Create array with elements having different TTL values (ContextOpt)
|
||||
let input_array = vec![
|
||||
serde_json::json!({"name": "Alice", "greptime_ttl": "1h"}).into(),
|
||||
serde_json::json!({"name": "Bob", "greptime_ttl": "1h"}).into(),
|
||||
serde_json::json!({"name": "Charlie", "greptime_ttl": "24h"}).into(),
|
||||
];
|
||||
|
||||
let params = GreptimePipelineParams::default();
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&PipelineDefinition::GreptimeIdentityPipeline(None),
|
||||
¶ms,
|
||||
Channel::Unknown,
|
||||
);
|
||||
|
||||
let result = values_to_rows(
|
||||
schema_info,
|
||||
VrlValue::Array(input_array),
|
||||
&pipeline_ctx,
|
||||
None,
|
||||
true,
|
||||
table_suffix_template.as_ref(),
|
||||
);
|
||||
|
||||
assert!(result.is_ok());
|
||||
let rows_by_context = result.unwrap();
|
||||
|
||||
// Should have 2 different ContextOpt groups (1h TTL and 24h TTL)
|
||||
assert_eq!(rows_by_context.len(), 2);
|
||||
|
||||
// Count rows per group
|
||||
let total_rows: usize = rows_by_context.values().map(|v| v.len()).sum();
|
||||
assert_eq!(total_rows, 3);
|
||||
|
||||
// Verify that rows are correctly grouped by TTL
|
||||
let mut ttl_1h_count = 0;
|
||||
let mut ttl_24h_count = 0;
|
||||
for rows in rows_by_context.values() {
|
||||
// ContextOpt doesn't expose ttl directly, but we can count by group size
|
||||
if rows.len() == 2 {
|
||||
ttl_1h_count = rows.len();
|
||||
} else if rows.len() == 1 {
|
||||
ttl_24h_count = rows.len();
|
||||
}
|
||||
}
|
||||
assert_eq!(ttl_1h_count, 2); // Alice and Bob with 1h TTL
|
||||
assert_eq!(ttl_24h_count, 1); // Charlie with 24h TTL
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,21 +35,25 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows {
|
||||
match input_value {
|
||||
VrlValue::Array(array) => {
|
||||
for value in array {
|
||||
let row = pipeline
|
||||
let rows_with_suffix = pipeline
|
||||
.exec_mut(value, &pipeline_ctx, &mut schema_info)
|
||||
.expect("failed to exec pipeline")
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
rows.push(row.0);
|
||||
for (r, _) in rows_with_suffix {
|
||||
rows.push(r);
|
||||
}
|
||||
}
|
||||
}
|
||||
VrlValue::Object(_) => {
|
||||
let row = pipeline
|
||||
let rows_with_suffix = pipeline
|
||||
.exec_mut(input_value, &pipeline_ctx, &mut schema_info)
|
||||
.expect("failed to exec pipeline")
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
rows.push(row.0);
|
||||
for (r, _) in rows_with_suffix {
|
||||
rows.push(r);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
panic!("invalid input value");
|
||||
|
||||
@@ -427,7 +427,7 @@ transform:
|
||||
);
|
||||
let stats = input_value.into();
|
||||
|
||||
let row = pipeline
|
||||
let rows_with_suffix = pipeline
|
||||
.exec_mut(stats, &pipeline_ctx, &mut schema_info)
|
||||
.expect("failed to exec pipeline")
|
||||
.into_transformed()
|
||||
@@ -435,7 +435,7 @@ transform:
|
||||
|
||||
let output = Rows {
|
||||
schema: pipeline.schemas().unwrap().clone(),
|
||||
rows: vec![row.0],
|
||||
rows: rows_with_suffix.into_iter().map(|(r, _)| r).collect(),
|
||||
};
|
||||
|
||||
assert_eq!(output.rows.len(), 1);
|
||||
@@ -501,13 +501,13 @@ transform:
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let row = pipeline
|
||||
let mut rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
let (row, _) = rows_with_suffix.swap_remove(0);
|
||||
let r = row
|
||||
.0
|
||||
.values
|
||||
.into_iter()
|
||||
.map(|v| v.value_data.unwrap())
|
||||
@@ -616,15 +616,16 @@ transform:
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let row = pipeline
|
||||
let mut rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
|
||||
let (row, _) = rows_with_suffix.swap_remove(0);
|
||||
let r = row
|
||||
.0
|
||||
.values
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(|v| v.value_data.unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
@@ -688,13 +689,13 @@ transform:
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let row = pipeline
|
||||
let mut rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
let (row, _) = rows_with_suffix.swap_remove(0);
|
||||
let r = row
|
||||
.0
|
||||
.values
|
||||
.into_iter()
|
||||
.map(|v| v.value_data.unwrap())
|
||||
@@ -734,14 +735,14 @@ transform:
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let row = pipeline
|
||||
let mut rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
|
||||
let (row, _) = rows_with_suffix.swap_remove(0);
|
||||
let r = row
|
||||
.0
|
||||
.values
|
||||
.into_iter()
|
||||
.map(|v| v.value_data.unwrap())
|
||||
@@ -799,14 +800,14 @@ transform:
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let row = pipeline
|
||||
let mut rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
|
||||
let (row, _) = rows_with_suffix.swap_remove(0);
|
||||
let mut r = row
|
||||
.0
|
||||
.values
|
||||
.into_iter()
|
||||
.map(|v| v.value_data.unwrap())
|
||||
@@ -846,13 +847,14 @@ transform:
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let row = pipeline
|
||||
let mut rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
|
||||
row.0.values.into_iter().for_each(|v| {
|
||||
let (row, _) = rows_with_suffix.swap_remove(0);
|
||||
row.values.into_iter().for_each(|v| {
|
||||
if let ValueData::TimestampNanosecondValue(v) = v.value_data.unwrap() {
|
||||
let now = chrono::Utc::now().timestamp_nanos_opt().unwrap();
|
||||
assert!(now - v < 5_000_000);
|
||||
@@ -923,13 +925,13 @@ transform:
|
||||
assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline");
|
||||
|
||||
let status = input_value2.into();
|
||||
let row = pipeline
|
||||
let mut rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap()
|
||||
.into_transformed()
|
||||
.expect("expect transformed result ");
|
||||
let (row, _) = rows_with_suffix.swap_remove(0);
|
||||
let r = row
|
||||
.0
|
||||
.values
|
||||
.into_iter()
|
||||
.map(|v| v.value_data.unwrap())
|
||||
@@ -988,8 +990,8 @@ table_suffix: _${logger}
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.unwrap();
|
||||
|
||||
let (row, table_name) = exec_re.into_transformed().unwrap();
|
||||
let values = row.values;
|
||||
let mut rows_with_suffix = exec_re.into_transformed().unwrap();
|
||||
let (row, table_suffix) = rows_with_suffix.swap_remove(0);
|
||||
let expected_values = vec![
|
||||
Value {
|
||||
value_data: Some(ValueData::StringValue("hello world".into())),
|
||||
@@ -998,6 +1000,234 @@ table_suffix: _${logger}
|
||||
value_data: Some(ValueData::TimestampNanosecondValue(1716668197217000000)),
|
||||
},
|
||||
];
|
||||
assert_eq!(expected_values, values);
|
||||
assert_eq!(table_name, Some("_http".to_string()));
|
||||
assert_eq!(expected_values, row.values);
|
||||
assert_eq!(table_suffix, Some("_http".to_string()));
|
||||
}
|
||||
|
||||
/// Test one-to-many pipeline expansion using VRL processor that returns an array
|
||||
#[test]
|
||||
fn test_one_to_many_pipeline() {
|
||||
// Input: single log entry with a list of events
|
||||
let input_value = serde_json::json!({
|
||||
"request_id": "req-123",
|
||||
"events": [
|
||||
{"type": "click", "value": 100},
|
||||
{"type": "scroll", "value": 200},
|
||||
{"type": "submit", "value": 300}
|
||||
]
|
||||
});
|
||||
|
||||
// VRL processor that expands events into separate rows using map
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
events = del(.events)
|
||||
request_id = del(.request_id)
|
||||
map_values(array!(events)) -> |event| {
|
||||
{
|
||||
"request_id": request_id,
|
||||
"event_type": event.type,
|
||||
"event_value": event.value
|
||||
}
|
||||
}
|
||||
|
||||
transform:
|
||||
- field: request_id
|
||||
type: string
|
||||
- field: event_type
|
||||
type: string
|
||||
- field: event_value
|
||||
type: uint64
|
||||
"#;
|
||||
|
||||
let yaml_content = Content::Yaml(pipeline_yaml);
|
||||
let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline");
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.expect("failed to exec pipeline")
|
||||
.into_transformed()
|
||||
.expect("expect transformed result");
|
||||
|
||||
// Should produce 3 rows from the single input
|
||||
assert_eq!(rows_with_suffix.len(), 3);
|
||||
|
||||
// Row 0: click event
|
||||
assert_eq!(
|
||||
rows_with_suffix[0].0.values[0].value_data,
|
||||
Some(StringValue("req-123".into()))
|
||||
);
|
||||
assert_eq!(
|
||||
rows_with_suffix[0].0.values[1].value_data,
|
||||
Some(StringValue("click".into()))
|
||||
);
|
||||
assert_eq!(
|
||||
rows_with_suffix[0].0.values[2].value_data,
|
||||
Some(U64Value(100))
|
||||
);
|
||||
|
||||
// Row 1: scroll event
|
||||
assert_eq!(
|
||||
rows_with_suffix[1].0.values[0].value_data,
|
||||
Some(StringValue("req-123".into()))
|
||||
);
|
||||
assert_eq!(
|
||||
rows_with_suffix[1].0.values[1].value_data,
|
||||
Some(StringValue("scroll".into()))
|
||||
);
|
||||
assert_eq!(
|
||||
rows_with_suffix[1].0.values[2].value_data,
|
||||
Some(U64Value(200))
|
||||
);
|
||||
|
||||
// Row 2: submit event
|
||||
assert_eq!(
|
||||
rows_with_suffix[2].0.values[0].value_data,
|
||||
Some(StringValue("req-123".into()))
|
||||
);
|
||||
assert_eq!(
|
||||
rows_with_suffix[2].0.values[1].value_data,
|
||||
Some(StringValue("submit".into()))
|
||||
);
|
||||
assert_eq!(
|
||||
rows_with_suffix[2].0.values[2].value_data,
|
||||
Some(U64Value(300))
|
||||
);
|
||||
}
|
||||
|
||||
/// Test that single object input still works correctly (backward compatibility)
|
||||
#[test]
|
||||
fn test_one_to_many_single_object_unchanged() {
|
||||
let input_value = serde_json::json!({
|
||||
"name": "Alice",
|
||||
"age": 30
|
||||
});
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
.processed = true
|
||||
.
|
||||
|
||||
transform:
|
||||
- field: name
|
||||
type: string
|
||||
- field: age
|
||||
type: uint32
|
||||
- field: processed
|
||||
type: boolean
|
||||
"#;
|
||||
|
||||
let yaml_content = Content::Yaml(pipeline_yaml);
|
||||
let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline");
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.expect("failed to exec pipeline")
|
||||
.into_transformed()
|
||||
.expect("expect transformed result");
|
||||
|
||||
// Should produce exactly 1 row
|
||||
assert_eq!(rows_with_suffix.len(), 1);
|
||||
|
||||
let (row, _) = &rows_with_suffix[0];
|
||||
assert_eq!(row.values[0].value_data, Some(StringValue("Alice".into())));
|
||||
assert_eq!(row.values[1].value_data, Some(U32Value(30)));
|
||||
assert_eq!(row.values[2].value_data, Some(BoolValue(true)));
|
||||
}
|
||||
|
||||
/// Test error handling when array contains non-object elements
|
||||
#[test]
|
||||
fn test_one_to_many_array_element_validation() {
|
||||
let input_value = serde_json::json!({
|
||||
"items": ["string", 123, true]
|
||||
});
|
||||
|
||||
// VRL that returns an array with non-object elements
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
.items
|
||||
|
||||
transform:
|
||||
- field: value
|
||||
type: string
|
||||
"#;
|
||||
|
||||
let yaml_content = Content::Yaml(pipeline_yaml);
|
||||
let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline");
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let result = pipeline.exec_mut(status, &pipeline_ctx, &mut schema_info);
|
||||
|
||||
// Should fail because array elements are not objects
|
||||
assert!(result.is_err());
|
||||
let err = result.unwrap_err();
|
||||
let err_msg = err.to_string();
|
||||
assert!(
|
||||
err_msg.contains("must be an object"),
|
||||
"Expected 'must be an object' error, got: {}",
|
||||
err_msg
|
||||
);
|
||||
}
|
||||
|
||||
/// Test that empty array produces zero rows
|
||||
#[test]
|
||||
fn test_one_to_many_empty_array() {
|
||||
let input_value = serde_json::json!({
|
||||
"events": []
|
||||
});
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
.events
|
||||
|
||||
transform:
|
||||
- field: value
|
||||
type: string
|
||||
"#;
|
||||
|
||||
let yaml_content = Content::Yaml(pipeline_yaml);
|
||||
let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline");
|
||||
let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline);
|
||||
let pipeline_ctx = PipelineContext::new(
|
||||
&pipeline_def,
|
||||
&pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let status = input_value.into();
|
||||
let rows_with_suffix = pipeline
|
||||
.exec_mut(status, &pipeline_ctx, &mut schema_info)
|
||||
.expect("failed to exec pipeline")
|
||||
.into_transformed()
|
||||
.expect("expect transformed result");
|
||||
|
||||
// Empty array should produce zero rows
|
||||
assert_eq!(rows_with_suffix.len(), 0);
|
||||
}
|
||||
|
||||
@@ -284,7 +284,6 @@ struct PartSortStream {
|
||||
buffer: PartSortBuffer,
|
||||
expression: PhysicalSortExpr,
|
||||
limit: Option<usize>,
|
||||
produced: usize,
|
||||
input: DfSendableRecordBatchStream,
|
||||
input_complete: bool,
|
||||
schema: SchemaRef,
|
||||
@@ -340,7 +339,6 @@ impl PartSortStream {
|
||||
buffer,
|
||||
expression: sort.expression.clone(),
|
||||
limit,
|
||||
produced: 0,
|
||||
input,
|
||||
input_complete: false,
|
||||
schema: sort.input.schema(),
|
||||
@@ -565,7 +563,6 @@ impl PartSortStream {
|
||||
)
|
||||
})?;
|
||||
|
||||
self.produced += sorted.num_rows();
|
||||
drop(full_input);
|
||||
// here remove both buffer and full_input memory
|
||||
self.reservation.shrink(2 * total_mem);
|
||||
@@ -666,6 +663,16 @@ impl PartSortStream {
|
||||
let sorted_batch = self.sort_buffer();
|
||||
// step to next proper PartitionRange
|
||||
self.cur_part_idx += 1;
|
||||
|
||||
// If we've processed all partitions, discard remaining data
|
||||
if self.cur_part_idx >= self.partition_ranges.len() {
|
||||
// assert there is no data beyond the last partition range (remaining is empty).
|
||||
// it would be acceptable even if it happens, because `remaining_range` will be discarded anyway.
|
||||
debug_assert!(remaining_range.num_rows() == 0);
|
||||
|
||||
return sorted_batch.map(|x| if x.num_rows() == 0 { None } else { Some(x) });
|
||||
}
|
||||
|
||||
let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx);
|
||||
if self.try_find_next_range(&next_sort_column)?.is_some() {
|
||||
// remaining batch still contains data that exceeds the current partition range
|
||||
@@ -687,6 +694,12 @@ impl PartSortStream {
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<datafusion_common::Result<DfRecordBatch>>> {
|
||||
loop {
|
||||
// Early termination: if we've already produced enough rows,
|
||||
// don't poll more input - just return
|
||||
if matches!(self.limit, Some(0)) {
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
|
||||
// no more input, sort the buffer and return
|
||||
if self.input_complete {
|
||||
if self.buffer.is_empty() {
|
||||
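The hunks above and below share one bookkeeping rule: `self.limit` is decremented with `saturating_sub` after each emitted batch, and `Some(0)` means "stop polling input". A tiny standalone sketch of that rule, illustrative only and not the stream implementation itself:

// Emit whole batches until the optional row limit is exhausted; None means unlimited.
fn emit_until_limit(batch_sizes: &[usize], mut limit: Option<usize>) -> usize {
    let mut produced = 0;
    for &rows in batch_sizes {
        // Early termination: a limit of Some(0) means enough rows were produced.
        if matches!(limit, Some(0)) {
            break;
        }
        produced += rows;
        limit = limit.map(|l| l.saturating_sub(rows));
    }
    produced
}

fn main() {
    // With a limit of 3, the second and third batches are never pulled.
    assert_eq!(emit_until_limit(&[3, 3, 3], Some(3)), 3);
    // Without a limit, everything flows through.
    assert_eq!(emit_until_limit(&[3, 3, 3], None), 9);
}

Like the real stream, the sketch emits whole batches and only stops pulling afterwards; it does not slice a batch down to the remaining limit.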
@@ -701,7 +714,24 @@ impl PartSortStream {
|
||||
if let Some(evaluating_batch) = self.evaluating_batch.take()
|
||||
&& evaluating_batch.num_rows() != 0
|
||||
{
|
||||
// Check if we've already processed all partitions
|
||||
if self.cur_part_idx >= self.partition_ranges.len() {
|
||||
// All partitions processed, discard remaining data
|
||||
if self.buffer.is_empty() {
|
||||
return Poll::Ready(None);
|
||||
} else {
|
||||
let sorted_batch = self.sort_buffer()?;
|
||||
self.limit = self
|
||||
.limit
|
||||
.map(|l| l.saturating_sub(sorted_batch.num_rows()));
|
||||
return Poll::Ready(Some(Ok(sorted_batch)));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(sorted_batch) = self.split_batch(evaluating_batch)? {
|
||||
self.limit = self
|
||||
.limit
|
||||
.map(|l| l.saturating_sub(sorted_batch.num_rows()));
|
||||
return Poll::Ready(Some(Ok(sorted_batch)));
|
||||
} else {
|
||||
continue;
|
||||
@@ -713,6 +743,9 @@ impl PartSortStream {
|
||||
match res {
|
||||
Poll::Ready(Some(Ok(batch))) => {
|
||||
if let Some(sorted_batch) = self.split_batch(batch)? {
|
||||
self.limit = self
|
||||
.limit
|
||||
.map(|l| l.saturating_sub(sorted_batch.num_rows()));
|
||||
return Poll::Ready(Some(Ok(sorted_batch)));
|
||||
} else {
|
||||
continue;
|
||||
@@ -896,22 +929,30 @@ mod test {
|
||||
output_data.push(cur_data);
|
||||
}
|
||||
|
||||
let expected_output = output_data
|
||||
let mut limit_remains = limit;
|
||||
let mut expected_output = output_data
|
||||
.into_iter()
|
||||
.map(|a| {
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, a)]).unwrap()
|
||||
})
|
||||
.map(|rb| {
|
||||
// trim expected output with limit
|
||||
if let Some(limit) = limit
|
||||
&& rb.num_rows() > limit
|
||||
{
|
||||
rb.slice(0, limit)
|
||||
if let Some(limit) = limit_remains.as_mut() {
|
||||
let rb = rb.slice(0, (*limit).min(rb.num_rows()));
|
||||
*limit = limit.saturating_sub(rb.num_rows());
|
||||
rb
|
||||
} else {
|
||||
rb
|
||||
}
|
||||
})
|
||||
.collect_vec();
|
||||
while let Some(rb) = expected_output.last() {
|
||||
if rb.num_rows() == 0 {
|
||||
expected_output.pop();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.push((
|
||||
case_id,
|
||||
@@ -932,13 +973,14 @@ mod test {
|
||||
opt,
|
||||
limit,
|
||||
expected_output,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn simple_case() {
|
||||
async fn simple_cases() {
|
||||
let testcases = vec![
|
||||
(
|
||||
TimeUnit::Millisecond,
|
||||
@@ -1027,7 +1069,7 @@ mod test {
|
||||
],
|
||||
true,
|
||||
Some(2),
|
||||
vec![vec![19, 17], vec![12, 11], vec![9, 8], vec![4, 3]],
|
||||
vec![vec![19, 17]],
|
||||
),
|
||||
];
|
||||
|
||||
@@ -1080,6 +1122,7 @@ mod test {
|
||||
opt,
|
||||
limit,
|
||||
expected_output,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
@@ -1093,6 +1136,7 @@ mod test {
|
||||
opt: SortOptions,
|
||||
limit: Option<usize>,
|
||||
expected_output: Vec<DfRecordBatch>,
|
||||
expected_polled_rows: Option<usize>,
|
||||
) {
|
||||
for rb in &expected_output {
|
||||
if let Some(limit) = limit {
|
||||
@@ -1104,16 +1148,15 @@ mod test {
|
||||
);
|
||||
}
|
||||
}
|
||||
let (ranges, batches): (Vec<_>, Vec<_>) = input_ranged_data.clone().into_iter().unzip();
|
||||
|
||||
let batches = batches
|
||||
.into_iter()
|
||||
.flat_map(|mut cols| {
|
||||
cols.push(DfRecordBatch::new_empty(schema.clone()));
|
||||
cols
|
||||
})
|
||||
.collect_vec();
|
||||
let mock_input = MockInputExec::new(batches, schema.clone());
|
||||
let mut data_partition = Vec::with_capacity(input_ranged_data.len());
|
||||
let mut ranges = Vec::with_capacity(input_ranged_data.len());
|
||||
for (part_range, batches) in input_ranged_data {
|
||||
data_partition.push(batches);
|
||||
ranges.push(part_range);
|
||||
}
|
||||
|
||||
let mock_input = Arc::new(MockInputExec::new(data_partition, schema.clone()));
|
||||
|
||||
let exec = PartSortExec::new(
|
||||
PhysicalSortExpr {
|
||||
@@ -1122,7 +1165,7 @@ mod test {
|
||||
},
|
||||
limit,
|
||||
vec![ranges.clone()],
|
||||
Arc::new(mock_input),
|
||||
mock_input.clone(),
|
||||
);
|
||||
|
||||
let exec_stream = exec.execute(0, Arc::new(TaskContext::default())).unwrap();
|
||||
@@ -1131,12 +1174,17 @@ mod test {
|
||||
// a makeshift solution for comparing large data
|
||||
if real_output != expected_output {
|
||||
let mut first_diff = 0;
|
||||
let mut is_diff_found = false;
|
||||
for (idx, (lhs, rhs)) in real_output.iter().zip(expected_output.iter()).enumerate() {
|
||||
if lhs != rhs {
|
||||
if lhs.slice(0, rhs.num_rows()) != *rhs {
|
||||
first_diff = idx;
|
||||
is_diff_found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !is_diff_found {
|
||||
return;
|
||||
}
|
||||
println!("first diff batch at {}", first_diff);
|
||||
println!(
|
||||
"ranges: {:?}",
|
||||
@@ -1175,8 +1223,14 @@ mod test {
|
||||
let buf = String::from_utf8_lossy(&buf);
|
||||
full_msg += &format!("case_id:{case_id}, expected_output \n{buf}");
|
||||
}
|
||||
|
||||
if let Some(expected_polled_rows) = expected_polled_rows {
|
||||
let input_pulled_rows = mock_input.metrics().unwrap().output_rows().unwrap();
|
||||
assert_eq!(input_pulled_rows, expected_polled_rows);
|
||||
}
|
||||
|
||||
panic!(
|
||||
"case_{} failed, opt: {:?},\n real output has {} batches, {} rows, expected has {} batches with {} rows\nfull msg: {}",
|
||||
"case_{} failed (limit {limit:?}), opt: {:?},\n real output has {} batches, {} rows, expected has {} batches with {} rows\nfull msg: {}",
|
||||
case_id,
|
||||
opt,
|
||||
real_output.len(),
|
||||
@@ -1187,4 +1241,249 @@ mod test {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that verifies the limit is correctly applied per partition when
|
||||
/// multiple batches are received for the same partition.
|
||||
#[tokio::test]
|
||||
async fn test_limit_with_multiple_batches_per_partition() {
|
||||
let unit = TimeUnit::Millisecond;
|
||||
let schema = Arc::new(Schema::new(vec![Field::new(
|
||||
"ts",
|
||||
DataType::Timestamp(unit, None),
|
||||
false,
|
||||
)]));
|
||||
|
||||
// Test case: Multiple batches in a single partition with limit=3
|
||||
// Input: 3 batches with [1,2,3], [4,5,6], [7,8,9] all in partition (0,10)
|
||||
// Expected: Only top 3 values [9,8,7] for descending sort
|
||||
let input_ranged_data = vec![(
|
||||
PartitionRange {
|
||||
start: Timestamp::new(0, unit.into()),
|
||||
end: Timestamp::new(10, unit.into()),
|
||||
num_rows: 9,
|
||||
identifier: 0,
|
||||
},
|
||||
vec![
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])])
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5, 6])])
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![7, 8, 9])])
|
||||
.unwrap(),
|
||||
],
|
||||
)];
|
||||
|
||||
let expected_output = vec![
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![9, 8, 7])])
|
||||
.unwrap(),
|
||||
];
|
||||
|
||||
run_test(
|
||||
1000,
|
||||
input_ranged_data,
|
||||
schema.clone(),
|
||||
SortOptions {
|
||||
descending: true,
|
||||
..Default::default()
|
||||
},
|
||||
Some(3),
|
||||
expected_output,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Test case: Multiple batches across multiple partitions with limit=2
|
||||
// Partition 0: batches [10,11,12], [13,14,15] -> top 2 descending = [15,14]
|
||||
// Partition 1: batches [1,2,3], [4,5] -> top 2 descending = [5,4]
|
||||
let input_ranged_data = vec![
|
||||
(
|
||||
PartitionRange {
|
||||
start: Timestamp::new(10, unit.into()),
|
||||
end: Timestamp::new(20, unit.into()),
|
||||
num_rows: 6,
|
||||
identifier: 0,
|
||||
},
|
||||
vec![
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![10, 11, 12])],
|
||||
)
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![13, 14, 15])],
|
||||
)
|
||||
.unwrap(),
|
||||
],
|
||||
),
|
||||
(
|
||||
PartitionRange {
|
||||
start: Timestamp::new(0, unit.into()),
|
||||
end: Timestamp::new(10, unit.into()),
|
||||
num_rows: 5,
|
||||
identifier: 1,
|
||||
},
|
||||
vec![
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])])
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5])])
|
||||
.unwrap(),
|
||||
],
|
||||
),
|
||||
];
|
||||
|
||||
let expected_output = vec![
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![15, 14])]).unwrap(),
|
||||
];
|
||||
|
||||
run_test(
|
||||
1001,
|
||||
input_ranged_data,
|
||||
schema.clone(),
|
||||
SortOptions {
|
||||
descending: true,
|
||||
..Default::default()
|
||||
},
|
||||
Some(2),
|
||||
expected_output,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Test case: Ascending sort with limit
|
||||
// Partition: batches [7,8,9], [4,5,6], [1,2,3] -> top 2 ascending = [1,2]
|
||||
let input_ranged_data = vec![(
|
||||
PartitionRange {
|
||||
start: Timestamp::new(0, unit.into()),
|
||||
end: Timestamp::new(10, unit.into()),
|
||||
num_rows: 9,
|
||||
identifier: 0,
|
||||
},
|
||||
vec![
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![7, 8, 9])])
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5, 6])])
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])])
|
||||
.unwrap(),
|
||||
],
|
||||
)];
|
||||
|
||||
let expected_output = vec![
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2])]).unwrap(),
|
||||
];
|
||||
|
||||
run_test(
|
||||
1002,
|
||||
input_ranged_data,
|
||||
schema.clone(),
|
||||
SortOptions {
|
||||
descending: false,
|
||||
..Default::default()
|
||||
},
|
||||
Some(2),
|
||||
expected_output,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Test that verifies early termination behavior.
|
||||
/// Once we've produced limit * num_partitions rows, we should stop
|
||||
/// pulling from input stream.
|
||||
#[tokio::test]
|
||||
async fn test_early_termination() {
|
||||
let unit = TimeUnit::Millisecond;
|
||||
let schema = Arc::new(Schema::new(vec![Field::new(
|
||||
"ts",
|
||||
DataType::Timestamp(unit, None),
|
||||
false,
|
||||
)]));
|
||||
|
||||
// Create 3 partitions, each with more data than the limit
|
||||
// limit=2 per partition, so total expected output = 6 rows
|
||||
// After producing 6 rows, early termination should kick in
|
||||
let input_ranged_data = vec![
|
||||
(
|
||||
PartitionRange {
|
||||
start: Timestamp::new(0, unit.into()),
|
||||
end: Timestamp::new(10, unit.into()),
|
||||
num_rows: 10,
|
||||
identifier: 0,
|
||||
},
|
||||
vec![
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![1, 2, 3, 4, 5])],
|
||||
)
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![6, 7, 8, 9, 10])],
|
||||
)
|
||||
.unwrap(),
|
||||
],
|
||||
),
|
||||
(
|
||||
PartitionRange {
|
||||
start: Timestamp::new(10, unit.into()),
|
||||
end: Timestamp::new(20, unit.into()),
|
||||
num_rows: 10,
|
||||
identifier: 1,
|
||||
},
|
||||
vec![
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![11, 12, 13, 14, 15])],
|
||||
)
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![16, 17, 18, 19, 20])],
|
||||
)
|
||||
.unwrap(),
|
||||
],
|
||||
),
|
||||
(
|
||||
PartitionRange {
|
||||
start: Timestamp::new(20, unit.into()),
|
||||
end: Timestamp::new(30, unit.into()),
|
||||
num_rows: 10,
|
||||
identifier: 2,
|
||||
},
|
||||
vec![
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![21, 22, 23, 24, 25])],
|
||||
)
|
||||
.unwrap(),
|
||||
DfRecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![new_ts_array(unit, vec![26, 27, 28, 29, 30])],
|
||||
)
|
||||
.unwrap(),
|
||||
],
|
||||
),
|
||||
];
|
||||
|
||||
// PartSort won't reorder `PartitionRange` (it assumes it's already ordered), so it will not read other partitions.
|
||||
// This case is just to verify that early termination works as expected.
|
||||
let expected_output = vec![
|
||||
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![9, 8])]).unwrap(),
|
||||
];
|
||||
|
||||
run_test(
|
||||
1003,
|
||||
input_ranged_data,
|
||||
schema.clone(),
|
||||
SortOptions {
|
||||
descending: true,
|
||||
..Default::default()
|
||||
},
|
||||
Some(2),
|
||||
expected_output,
|
||||
Some(10),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,6 +65,7 @@ use sql::statements::statement::Statement;
|
||||
use sqlparser::ast::ObjectName;
|
||||
use store_api::metric_engine_consts::{is_metric_engine, is_metric_engine_internal_column};
|
||||
use table::TableRef;
|
||||
use table::metadata::TableInfoRef;
|
||||
use table::requests::{FILE_TABLE_LOCATION_KEY, FILE_TABLE_PATTERN_KEY};
|
||||
|
||||
use crate::QueryEngineRef;
|
||||
@@ -821,13 +822,12 @@ pub fn show_create_database(database_name: &str, options: OptionMap) -> Result<O
|
||||
}
|
||||
|
||||
pub fn show_create_table(
|
||||
table: TableRef,
|
||||
table_info: TableInfoRef,
|
||||
schema_options: Option<SchemaOptions>,
|
||||
partitions: Option<Partitions>,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<Output> {
|
||||
let table_info = table.table_info();
|
||||
let table_name = &table_info.name;
|
||||
let table_name = table_info.name.clone();
|
||||
|
||||
let quote_style = query_ctx.quote_style();
|
||||
|
||||
@@ -838,7 +838,7 @@ pub fn show_create_table(
|
||||
});
|
||||
let sql = format!("{}", stmt);
|
||||
let columns = vec![
|
||||
Arc::new(StringVector::from(vec![table_name.clone()])) as _,
|
||||
Arc::new(StringVector::from(vec![table_name])) as _,
|
||||
Arc::new(StringVector::from(vec![sql])) as _,
|
||||
];
|
||||
let records = RecordBatches::try_from_columns(SHOW_CREATE_TABLE_OUTPUT_SCHEMA.clone(), columns)
|
||||
|
||||
@@ -34,7 +34,9 @@ use sql::statements::create::{Column, ColumnExtensions, CreateTable, TableConstr
|
||||
use sql::statements::{self, OptionMap};
|
||||
use store_api::metric_engine_consts::{is_metric_engine, is_metric_engine_internal_column};
|
||||
use table::metadata::{TableInfoRef, TableMeta};
|
||||
use table::requests::{FILE_TABLE_META_KEY, TTL_KEY, WRITE_BUFFER_SIZE_KEY};
|
||||
use table::requests::{
|
||||
COMMENT_KEY as TABLE_COMMENT_KEY, FILE_TABLE_META_KEY, TTL_KEY, WRITE_BUFFER_SIZE_KEY,
|
||||
};
|
||||
|
||||
use crate::error::{
|
||||
ConvertSqlTypeSnafu, ConvertSqlValueSnafu, GetFulltextOptionsSnafu,
|
||||
@@ -249,6 +251,13 @@ pub fn create_table_stmt(
|
||||
|
||||
let constraints = create_table_constraints(&table_meta.engine, schema, table_meta, quote_style);
|
||||
|
||||
let mut options = create_sql_options(table_meta, schema_options);
|
||||
if let Some(comment) = &table_info.desc
|
||||
&& options.get(TABLE_COMMENT_KEY).is_none()
|
||||
{
|
||||
options.insert(format!("'{TABLE_COMMENT_KEY}'"), comment.clone());
|
||||
}
|
||||
|
||||
Ok(CreateTable {
|
||||
if_not_exists: true,
|
||||
table_id: table_info.ident.table_id,
|
||||
@@ -256,7 +265,7 @@ pub fn create_table_stmt(
|
||||
columns,
|
||||
engine: table_meta.engine.clone(),
|
||||
constraints,
|
||||
options: create_sql_options(table_meta, schema_options),
|
||||
options,
|
||||
partitions: None,
|
||||
})
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.