Compare commits

...

40 Commits

Author SHA1 Message Date
evenyag
3ed33e1aeb feat: add slow log and configure by SLOW_FILE_SCAN_THRESHOLD
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-12 10:41:06 +08:00
evenyag
2fcaa5ebc3 feat: divide build_cost to build_part_cost and build_reader_cost
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-12 10:41:06 +08:00
evenyag
e2517dec80 feat: collect per file metrics
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-12 10:41:06 +08:00
Weny Xu
ba4eda40e5 refactor: optimize heartbeat channel and etcd client keepalive settings (#7390)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-11 13:32:11 +00:00
discord9
f06a64ff90 feat: mark index outdated (#7383)
* feat: mark index outdated

Signed-off-by: discord9 <discord9@163.com>

* refactor: move IndexVersion to store-api

Signed-off-by: discord9 <discord9@163.com>

* per review

Signed-off-by: discord9 <discord9@163.com>

* fix: condition for add files

Signed-off-by: discord9 <discord9@163.com>

* cleanup

Signed-off-by: discord9 <discord9@163.com>

* refactor(sst): extract index version check into method

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-12-11 12:08:45 +00:00
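
The change above attaches an index version to SST file metadata so readers can tell when a file's index was built against older index settings. A minimal sketch of that idea, with hypothetical type and method names (the real definitions live in store-api and the SST file metadata, not here):

```rust
/// Illustrative only: a monotonically increasing version of a region's index settings.
pub type IndexVersion = u64;

/// Hypothetical slice of an SST file's metadata.
pub struct FileMeta {
    /// Index settings version in effect when this file's index was built.
    pub index_version: IndexVersion,
}

impl FileMeta {
    /// True when the file's index predates the region's current index settings
    /// and should be treated as outdated (skipped for pruning, or rebuilt).
    pub fn is_index_outdated(&self, current: IndexVersion) -> bool {
        self.index_version < current
    }
}
```
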
fys
84b4777925 fix: parse "KEEP FIRING FOR" (#7386)
* fix: parse "KEEP FIRING FOR"

* fix: cargo fmt
2025-12-11 03:54:47 +00:00
discord9
a26dee0ca1 fix: gc listing op first (#7385)
Signed-off-by: discord9 <discord9@163.com>
2025-12-11 03:25:05 +00:00
Ning Sun
276f6bf026 feat: grafana postgresql data source query builder support (#7379)
* feat: grafana postgresql data source query builder support

* test: add sqlness test cases
2025-12-11 03:18:35 +00:00
Weny Xu
1d5291b06d fix(procedure): update procedure state correctly during execution and on failure (#7376)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-11 02:30:32 +00:00
Ruihang Xia
564cc0c750 feat: table/column/flow COMMENT (#7060)
* initial impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* simplify impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* avoid unimplemented panic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* validate flow

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix table column comment

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* table level comment

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* simplify table info serde

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* don't txn

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove empty trait

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* wip: procedure

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update proto

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* grpc support

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Apply suggestions from code review

Co-authored-by: dennis zhuang <killme2008@gmail.com>
Co-authored-by: LFC <990479+MichaelScofield@users.noreply.github.com>
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* try from pb struct

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* doc comment

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* check unchanged fast case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune errors

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix merge error

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* use try_as_raw_value

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: dennis zhuang <killme2008@gmail.com>
Co-authored-by: LFC <990479+MichaelScofield@users.noreply.github.com>
2025-12-10 15:08:47 +00:00
LFC
f1abe5d215 feat: suspend frontend and datanode (#7370)
Signed-off-by: luofucong <luofc@foxmail.com>
2025-12-10 12:18:24 +00:00
Ruihang Xia
ab426cbf89 refactor: remove duplication coverage and code from window sort tests (#7384)
* refactor: remove duplication coverage and code from window sort tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* allow clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-12-10 10:11:19 +00:00
Weny Xu
cb0f1afb01 fix: improve network failure detection (#7382)
* fix(meta): add default etcd client options with keep-alive settings (#7363)

* fix: improve network failure detection (#7367)

* Update src/meta-srv/src/handler.rs

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>

---------

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>
2025-12-10 09:48:36 +00:00
Yingwen
a22d08f1b1 feat: collect merge and dedup metrics (#7375)
* feat: collect FlatMergeReader metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add MergeMetricsReporter, rename Metrics to MergeMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: remove num_input_rows from MergeMetrics

The merge reader won't dedup, so there is no need to collect input rows

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: report merge metrics to PartitionMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add dedup cost to DedupMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect dedup metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove metrics from FlatMergeIterator

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: remove num_output_rows from MergeMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: implement merge() for merge and dedup metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: report metrics after observe metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-10 09:16:20 +00:00
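
The metrics work above gathers per-reader costs and folds them into partition-level metrics via merge(). A hedged sketch of that pattern; the struct name and fields are illustrative assumptions rather than the actual mito2 types:

```rust
use std::time::Duration;

/// Hypothetical merge-reader metrics set.
#[derive(Default, Debug, Clone)]
struct MergeMetrics {
    /// Time spent fetching batches from the input readers.
    fetch_cost: Duration,
    /// Time spent in the merge itself.
    merge_cost: Duration,
    /// Number of batches emitted by the merge reader.
    num_output_batches: usize,
}

impl MergeMetrics {
    /// Folds another reader's metrics into this one, so per-file or per-reader
    /// metrics can be aggregated into the partition-level report.
    fn merge(&mut self, other: &MergeMetrics) {
        self.fetch_cost += other.fetch_cost;
        self.merge_cost += other.merge_cost;
        self.num_output_batches += other.num_output_batches;
    }
}
```
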
Ruihang Xia
6817a376b5 fix: part sort behavior (#7374)
* fix: part sort behavior

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* debug assertion and remove produced count

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-12-10 07:44:44 +00:00
discord9
4d1a587079 chore: saturating duration since (#7380)
chore: sat duration since

Signed-off-by: discord9 <discord9@163.com>
2025-12-10 07:10:46 +00:00
Lei, HUANG
9f1aefe98f feat: allow one to many VRL pipeline (#7342)
* feat/allow-one-to-many-pipeline:
 ### Enhance Pipeline Processing for One-to-Many Transformations

 - **Support One-to-Many Transformations**:
   - Updated `processor.rs`, `etl.rs`, `vrl_processor.rs`, and `greptime.rs` to handle one-to-many transformations by allowing VRL processors to return arrays, expanding each element into separate rows.
   - Introduced `transform_array_elements` and `values_to_rows` functions to facilitate this transformation.

 - **Error Handling Enhancements**:
   - Added new error types in `error.rs` to handle cases where array elements are not objects and for transformation failures.

 - **Testing Enhancements**:
   - Added tests in `pipeline.rs` to verify one-to-many transformations, single object processing, and error handling for non-object array elements.

 - **Context Management**:
   - Modified `ctx_req.rs` to clone `ContextOpt` when adding rows, ensuring correct context management during transformations.

 - **Server Pipeline Adjustments**:
   - Updated `pipeline.rs` in `servers` to handle transformed outputs with one-to-many row expansions, ensuring correct row padding and request formation.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 Add one-to-many VRL pipeline test in `http.rs`

 - Introduced `test_pipeline_one_to_many_vrl` to verify VRL processor's ability to expand a single input row into multiple output rows.
 - Updated `http_tests!` macro to include the new test.
 - Implemented test scenarios for single and multiple input rows, ensuring correct data transformation and row count validation.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Add Tests for VRL Pipeline Transformations

 - **File:** `src/pipeline/src/etl.rs`
   - Added tests for one-to-many VRL pipeline expansion to ensure multiple output rows from a single input.
   - Introduced tests to verify backward compatibility for single object output.
   - Implemented tests to confirm zero rows are produced from empty arrays.
   - Added validation tests to ensure array elements must be objects.
   - Developed tests for one-to-many transformations with table suffix hints from VRL.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Enhance Pipeline Transformation with Per-Row Table Suffixes

 - **`src/pipeline/src/etl.rs`**: Updated `TransformedOutput` to include per-row table suffixes, allowing for more flexible routing of transformed data. Modified `PipelineExecOutput` and related methods to
 handle the new structure.
 - **`src/pipeline/src/etl/transform/transformer/greptime.rs`**: Enhanced `values_to_rows` to support per-row table suffix extraction and application.
 - **`src/pipeline/tests/common.rs`** and **`src/pipeline/tests/pipeline.rs`**: Adjusted tests to validate the new per-row table suffix functionality, ensuring backward compatibility and correct behavior in
 one-to-many transformations.
 - **`src/servers/src/pipeline.rs`**: Modified `run_custom_pipeline` to process transformed outputs with per-row table suffixes, grouping rows by `(opt, table_name)` for insertion.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Update VRL Processor Type Checks

 - **File:** `vrl_processor.rs`
 - **Changes:** Updated type checking logic to use `contains_object()` and `contains_array()` methods instead of `is_object()` and `is_array()`. This change ensures
 compatibility with VRL type inference that may return multiple possible types.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 - **Enhance Error Handling**: Added new error types `ArrayElementMustBeObjectSnafu` and `TransformArrayElementSnafu` to improve error handling in `etl.rs` and `greptime.rs`.
 - **Refactor Error Usage**: Moved error usage declarations in `transform_array_elements` and `values_to_rows` functions to the top of the file for better organization in `etl.rs` and `greptime.rs`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Update `greptime.rs` to Enhance Error Handling

 - **Error Handling**: Modified the `values_to_rows` function to handle invalid array elements based on the `skip_error` parameter. If `skip_error` is true, invalid elements are skipped; otherwise, an error is returned.
 - **Testing**: Added unit tests in `greptime.rs` to verify the behavior of `values_to_rows` with different `skip_error` settings, ensuring correct processing of valid objects and appropriate error handling for invalid elements.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Commit Summary

 - **Enhance `TransformedOutput` Structure**: Refactored `TransformedOutput` to use a `HashMap` for grouping rows by `ContextOpt`, allowing for per-row configuration options. Updated methods in `PipelineExecOutput` to support the new structure (`src/pipeline/src/etl.rs`).

 - **Add New Transformation Method**: Introduced `transform_array_elements_to_hashmap` to handle array inputs with per-row `ContextOpt` in `HashMap` format (`src/pipeline/src/etl.rs`).

 - **Update Pipeline Execution**: Modified `run_custom_pipeline` to process `TransformedOutput` using the new `HashMap` structure, ensuring rows are grouped by `ContextOpt` and table name (`src/servers/src/pipeline.rs`).

 - **Add Tests for New Structure**: Implemented tests to verify the functionality of the new `HashMap` structure in `TransformedOutput`, including scenarios for one-to-many mapping, single object input, and empty arrays (`src/pipeline/src/etl.rs`).

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Refactor `values_to_rows` to Return `HashMap` Grouped by `ContextOpt`

 - **`etl.rs`**:
   - Updated `values_to_rows` to return a `HashMap` grouped by `ContextOpt` instead of a vector.
   - Adjusted logic to handle single object and array inputs, ensuring rows are grouped by their `ContextOpt`.
   - Modified functions to extract rows from default `ContextOpt` and apply table suffixes accordingly.

 - **`greptime.rs`**:
   - Enhanced `values_to_rows` to handle errors gracefully with `skip_error` logic.
   - Added logic to group rows by `ContextOpt` for array inputs.

 - **Tests**:
   - Updated existing tests to validate the new `HashMap` return structure.
   - Added a new test to verify correct grouping of rows by per-element `ContextOpt`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Refactor and Enhance Error Handling in ETL Pipeline

 - **Refactored Functionality**:
   - Replaced `transform_array_elements` with `transform_array_elements_by_ctx` in `etl.rs` to streamline transformation logic and improve error handling.
   - Updated `values_to_rows` in `greptime.rs` to use `or_default` for cleaner code.

 - **Enhanced Error Handling**:
   - Introduced `unwrap_or_continue_if_err` macro in `etl.rs` to allow skipping errors based on pipeline context, improving robustness in data processing.

 These changes enhance the maintainability and error resilience of the ETL pipeline.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-one-to-many-pipeline:
 ### Update `Row` Handling in ETL Pipeline

 - **Refactor `Row` Type**: Introduced `RowWithTableSuffix` type alias to simplify handling of rows with optional table suffixes across the ETL pipeline.
 - **Modify Function Signatures**: Updated function signatures in `etl.rs` and `greptime.rs` to use `RowWithTableSuffix` for better clarity and consistency.
 - **Enhance Test Coverage**: Adjusted test logic in `greptime.rs` to align with the new `RowWithTableSuffix` type, ensuring correct grouping and processing of rows by TTL.

 Files affected: `etl.rs`, `greptime.rs`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-10 06:38:44 +00:00
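
The pipeline change above lets a VRL processor return an array, expanding each object element into its own output row; non-object elements are skipped or rejected depending on skip_error. A rough sketch of that expansion using serde_json values as a stand-in for the pipeline's value type, with an illustrative function name and error type:

```rust
use serde_json::{Map, Value};

/// Expands a processor output into row objects: a single object becomes one row,
/// an array becomes one row per object element. Non-object array elements are
/// skipped when `skip_error` is true, otherwise the whole call fails.
fn values_to_rows(output: Value, skip_error: bool) -> Result<Vec<Map<String, Value>>, String> {
    match output {
        Value::Object(obj) => Ok(vec![obj]),
        Value::Array(elems) => {
            let mut rows = Vec::with_capacity(elems.len());
            for elem in elems {
                match elem {
                    Value::Object(obj) => rows.push(obj),
                    _ if skip_error => continue,
                    other => return Err(format!("array element must be an object, got {other}")),
                }
            }
            Ok(rows)
        }
        other => Err(format!("unsupported pipeline output: {other}")),
    }
}
```

In the actual commit the resulting rows are further grouped by ContextOpt and optional per-row table suffix before insertion; the sketch only shows the one-to-many expansion itself.
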
Lei, HUANG
2f9130a2de chore(mito): expose some symbols (#7373)
chore/expose-symbols:
 ### Commit Summary

 - **Visibility Changes**: Updated visibility of functions in `bulk/part.rs`:
   - Made `record_batch_estimated_size` and `sort_primary_key_record_batch` functions public.
 - **Enhancements**: Enhanced functionality in `memtable.rs` by exposing additional components from `bulk::part`:
   - `BulkPartEncoder`, `BulkPartMeta`, `UnorderedPart`, `record_batch_estimated_size`, and `sort_primary_key_record_batch`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-09 14:33:14 +00:00
shuiyisong
fa2b4e5e63 refactor: extract file watcher to common-config (#7357)
* refactor: extract file watcher to common-config

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: add file check

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: watch dir instead of file

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: address CR issues

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-12-09 11:23:26 +00:00
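
The refactor above moves the notify-based watching out of auth into a reusable common-config helper (the WatchFileUserProvider diff near the end of this compare switches to FileWatcherBuilder). A minimal sketch of the underlying pattern with the notify crate, watching the parent directory and filtering events for one file; the builder API itself is project-specific and not reproduced here:

```rust
use std::path::Path;
use std::sync::mpsc::channel;

use notify::{EventKind, RecursiveMode, Watcher};

fn watch_file(filepath: &str, on_change: impl Fn() + Send + 'static) -> notify::Result<()> {
    let path = Path::new(filepath).to_path_buf();
    // Watch the parent directory, not the file, so the watch survives editors
    // and config tools that replace the file instead of modifying it in place.
    let dir = path.parent().expect("expected a file path").to_path_buf();
    let (tx, rx) = channel::<notify::Result<notify::Event>>();
    let mut watcher = notify::recommended_watcher(tx)?;
    watcher.watch(&dir, RecursiveMode::NonRecursive)?;

    std::thread::spawn(move || {
        let _hold = watcher; // keep the watcher alive as long as the thread runs
        let filename = path.file_name();
        while let Ok(Ok(event)) = rx.recv() {
            let is_this_file = event.paths.iter().any(|p| p.file_name() == filename);
            let is_relevant = matches!(
                event.kind,
                EventKind::Modify(_) | EventKind::Create(_) | EventKind::Remove(_)
            );
            if is_this_file && is_relevant {
                on_change();
            }
        }
    });
    Ok(())
}
```
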
discord9
9197e818ec refactor: use versioned index for index file (#7309)
* refactor: use versioned index for index file

Signed-off-by: discord9 <discord9@163.com>

* fix: sst entry table

Signed-off-by: discord9 <discord9@163.com>

* update sqlness

Signed-off-by: discord9 <discord9@163.com>

* chore: unit type

Signed-off-by: discord9 <discord9@163.com>

* fix: missing version

Signed-off-by: discord9 <discord9@163.com>

* more fix build index

Signed-off-by: discord9 <discord9@163.com>

* fix: use proper index id

Signed-off-by: discord9 <discord9@163.com>

* pcr

Signed-off-by: discord9 <discord9@163.com>

* test: update

Signed-off-by: discord9 <discord9@163.com>

* clippy

Signed-off-by: discord9 <discord9@163.com>

* test: test_list_ssts fixed

Signed-off-by: discord9 <discord9@163.com>

* test: fix test

Signed-off-by: discord9 <discord9@163.com>

* feat: stuff

Signed-off-by: discord9 <discord9@163.com>

* fix: clean temp index files on abort & delete all index versions when deleting a file

Signed-off-by: discord9 <discord9@163.com>

* docs: explain

Signed-off-by: discord9 <discord9@163.com>

* fix: actually clean up tmp dir

Signed-off-by: discord9 <discord9@163.com>

* clippy

Signed-off-by: discord9 <discord9@163.com>

* clean tmp dir only when write cache enabled

Signed-off-by: discord9 <discord9@163.com>

* refactor: add version to index cache

Signed-off-by: discord9 <discord9@163.com>

* per review

Signed-off-by: discord9 <discord9@163.com>

* test: update size

Signed-off-by: discord9 <discord9@163.com>

* per review

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-12-09 07:31:12 +00:00
discord9
36d89c3baf fix: use saturating in gc tracker (#7369)
chore: use saturating

Signed-off-by: discord9 <discord9@163.com>
2025-12-09 06:38:59 +00:00
Ruihang Xia
0ebfd161d8 feat: allow publishing new nightly release when some platforms are absent (#7354)
* feat: allow publishing new nightly release when some platforms are absent

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* unify linux platforms

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* always evaluate conditions

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-12-09 04:59:50 +00:00
ZonaHe
8b26a98c3b feat: update dashboard to v0.11.9 (#7364)
Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
2025-12-09 02:37:44 +00:00
discord9
7199823be9 chore: rename to avoid git reserved name (#7359)
rename to avoid reserved name

Signed-off-by: discord9 <discord9@163.com>
2025-12-08 04:01:25 +00:00
Ruihang Xia
60f752d306 feat: run histogram quantile in safe mode for incomplete data (#7297)
* initial impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sqlness test and fix

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* correct sqlness case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* simplification

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refine code and comment

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-12-05 09:19:21 +00:00
Ruihang Xia
edb1f6086f feat: decode pk eagerly (#7350)
* feat: decode pk eagerly

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* merge primary_key_codec and decode_primary_key_values

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-12-05 09:11:51 +00:00
discord9
1ebcef4794 chore: rm unnecessary warning (#7352)
Signed-off-by: discord9 <discord9@163.com>
2025-12-05 03:44:45 +00:00
Ning Sun
2147545c90 fix: regression with shortcutted statement on postgres extended query (#7340)
* fix: regression with shortcutted statement on postgres extended query

* chore: typo fix

* feat: also add more type support for parameters

* chore: remove dbg print
2025-12-05 02:08:23 +00:00
Yingwen
84e4e42ee7 feat: add more verbose metrics to scanners (#7336)
* feat: add inverted applier metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add metrics to bloom applier

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add metrics to fulltext index applier

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: implement BloomFilterReadMetrics for BloomFilterReader

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect read metrics for inverted index

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add metrics for range_read and metadata

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: rename elapsed to fetch_elapsed

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect metadata fetch metrics for inverted index

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect cache metrics for inverted and bloom index

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect read metrics in appliers

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect fulltext dir metrics for applier

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect parquet row group metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add parquet metadata metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add apply metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect more metrics for memory row group

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add fetch metrics to ReaderMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: init verbose metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: debug print metrics in ScanMetricsSet

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: implement debug for new metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix compiler errors

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: update parquet fetch metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: collect the whole fetch time

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add file_scan_cost

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: parquet fetch add cache_miss counter

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: print index read metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: use actual bytes to increase counter

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove provided implementations for index reader traits

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: change get_parquet_meta_data() method to receive metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: rename file_scan_cost to sst_scan_cost

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: refine ParquetFetchMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fmt code

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove useless inner method

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: collect page size actual needed

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify InvertedIndexReadMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify InvertedIndexApplyMetrics Debug

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify BloomFilterReadMetrics Debug

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify BloomFilterIndexApplyMetrics Debug

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify FulltextIndexApplyMetrics implementation

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify ParquetFetchMetrics Debug

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify MetadataCacheMetrics Debug

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: only print verbose metrics when they are not empty.

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: use mutex to protect ParquetFetchMetrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fmt code

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: use duration for elapsed in ParquetFetchMetricsData

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-04 13:40:18 +00:00
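
Most of the bullets above follow one instrumentation pattern: time an operation and fold the elapsed duration and counters into a per-scan metrics struct that is only printed in verbose mode. A generic sketch of that pattern; the field and function names are illustrative, not the actual ReaderMetrics or ScanMetricsSet members:

```rust
use std::time::{Duration, Instant};

/// Hypothetical per-scan read metrics.
#[derive(Default, Debug)]
struct ReadMetrics {
    fetch_elapsed: Duration,
    bytes_fetched: usize,
    cache_miss: usize,
}

/// Wraps a fetch (e.g. a range read that missed the cache) and records its cost.
fn timed_fetch(metrics: &mut ReadMetrics, fetch: impl FnOnce() -> Vec<u8>) -> Vec<u8> {
    let start = Instant::now();
    let bytes = fetch();
    metrics.fetch_elapsed += start.elapsed();
    metrics.bytes_fetched += bytes.len();
    metrics.cache_miss += 1;
    bytes
}
```
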
Yingwen
d5c616a9ff feat: implement a cache for manifest files (#7326)
* feat: use cache in manifest store

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: use ManifestCache

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: clean empty manifest dir

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: get last checkpoint from cache

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add hit/miss counter for manifest cache

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: add logs

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: pass cache to ManifestObjectStore::new

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix compiler errors

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: cache checkpoint

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: cache checkpoint in write

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix compiler warnings

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: update config comment

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: manifest store cache for staging

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: move recover_inner to FileCacheInner

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: remove manifest cache config from MitoConfig

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: reduce clone when cache is enabled

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: do not cache staging manifests

We clean staging manifests with remove_all, which makes it hard to clean the cache in the same way.

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: fix paths in manifest cache

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: don't clean dir if it is too new

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: reuse write cache ttl as manifest cache ttl

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: clean all empty subdirectories

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-04 12:51:09 +00:00
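
The cache above keeps manifest files (including the last checkpoint) close to the engine, counts hits and misses, and reuses the write cache TTL. A rough sketch of a content cache with TTL and byte-based capacity using the moka crate; the key/value types and sizing here are assumptions, not the actual ManifestCache:

```rust
use std::sync::Arc;
use std::time::Duration;

use moka::sync::Cache;

/// Illustrative manifest cache: manifest file path -> file content.
struct ManifestCache {
    inner: Cache<String, Arc<Vec<u8>>>,
}

impl ManifestCache {
    fn new(capacity_bytes: u64, ttl: Duration) -> Self {
        let inner = Cache::builder()
            .max_capacity(capacity_bytes)
            // Weigh entries by content size so the capacity is expressed in bytes.
            .weigher(|key: &String, value: &Arc<Vec<u8>>| (key.len() + value.len()) as u32)
            .time_to_live(ttl)
            .build();
        Self { inner }
    }

    /// Returns cached content, falling back to `load` (e.g. the object store) on a miss.
    fn get_or_load(
        &self,
        path: &str,
        load: impl FnOnce() -> std::io::Result<Vec<u8>>,
    ) -> std::io::Result<Arc<Vec<u8>>> {
        if let Some(content) = self.inner.get(path) {
            return Ok(content); // hit
        }
        let content = Arc::new(load()?); // miss: read from the backing store
        self.inner.insert(path.to_string(), content.clone());
        Ok(content)
    }
}
```
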
discord9
f02bdf5428 test: gc worker scheduler mock test (#7292)
* feat: gc worker only on local region

Signed-off-by: discord9 <discord9@163.com>

feat: gc scheduler

wip: gc trigger

Signed-off-by: discord9 <discord9@163.com>

feat: dn file removal rate

Signed-off-by: discord9 <discord9@163.com>

feat: trigger gc with stats(WIP)

Signed-off-by: discord9 <discord9@163.com>

chore

Signed-off-by: discord9 <discord9@163.com>

also move files ref manifest to store-api

Signed-off-by: discord9 <discord9@163.com>

feat: basic gc trigger impl

Signed-off-by: discord9 <discord9@163.com>

wip: handle file ref change

Signed-off-by: discord9 <discord9@163.com>

refactor: use region ids

Signed-off-by: discord9 <discord9@163.com>

fix: retry using related regions

Signed-off-by: discord9 <discord9@163.com>

chore: rm unused

Signed-off-by: discord9 <discord9@163.com>

fix: update file reference type in GC worker

Signed-off-by: discord9 <discord9@163.com>

feat: dn gc limiter

Signed-off-by: discord9 <discord9@163.com>

rename

Signed-off-by: discord9 <discord9@163.com>

feat: gc scheduler retry with outdated regions

Signed-off-by: discord9 <discord9@163.com>

feat: use real object store purger

Signed-off-by: discord9 <discord9@163.com>

wip: add to metasrv

Signed-off-by: discord9 <discord9@163.com>

feat: add to metasrv

Signed-off-by: discord9 <discord9@163.com>

feat: datanode gc worker handler

Signed-off-by: discord9 <discord9@163.com>

fix: no partition col fix

Signed-off-by: discord9 <discord9@163.com>

fix: RegionId json deser workaround

Signed-off-by: discord9 <discord9@163.com>

fix: find access layer

Signed-off-by: discord9 <discord9@163.com>

fix: on host dn

Signed-off-by: discord9 <discord9@163.com>

fix: stat dedup

Signed-off-by: discord9 <discord9@163.com>

refactor: rm load-based

Signed-off-by: discord9 <discord9@163.com>

chore: after rebase fix

Signed-off-by: discord9 <discord9@163.com>

feat: not full scan

Signed-off-by: discord9 <discord9@163.com>

chore: after rebase fix

Signed-off-by: discord9 <discord9@163.com>

feat: clean tracker

Signed-off-by: discord9 <discord9@163.com>

after rebase fix

Signed-off-by: discord9 <discord9@163.com>

clippy

Signed-off-by: discord9 <discord9@163.com>

refactor: split gc scheduler

Signed-off-by: discord9 <discord9@163.com>

feat: smaller linger time

Signed-off-by: discord9 <discord9@163.com>

feat: parallel region gc instr

Signed-off-by: discord9 <discord9@163.com>

chore: rename

Signed-off-by: discord9 <discord9@163.com>

chore: rename

Signed-off-by: discord9 <discord9@163.com>

enable is false

Signed-off-by: discord9 <discord9@163.com>

feat: update removed files precisely

Signed-off-by: discord9 <discord9@163.com>

all default to false&use local file purger

Signed-off-by: discord9 <discord9@163.com>

feat: not evict if gc enabled

Signed-off-by: discord9 <discord9@163.com>

per review

Signed-off-by: discord9 <discord9@163.com>

fix: pass gc config in mito & test: after truncate gc

Signed-off-by: discord9 <discord9@163.com>

WIP: one more test

Signed-off-by: discord9 <discord9@163.com>

test: basic compact

Signed-off-by: discord9 <discord9@163.com>

test: compact with ref

Signed-off-by: discord9 <discord9@163.com>

refactor: for easier mock

Signed-off-by: discord9 <discord9@163.com>

docs: explain race condition

Signed-off-by: discord9 <discord9@163.com>

feat: gc region procedure

Signed-off-by: discord9 <discord9@163.com>

refactor: ctx send gc/ref instr with procedure

Signed-off-by: discord9 <discord9@163.com>

fix: config deser to default

Signed-off-by: discord9 <discord9@163.com>

refactor: gc report

Signed-off-by: discord9 <discord9@163.com>

wip: async index file rm

Signed-off-by: discord9 <discord9@163.com>

fixme?

Signed-off-by: discord9 <discord9@163.com>

typo

Signed-off-by: discord9 <discord9@163.com>

more ut

Signed-off-by: discord9 <discord9@163.com>

test: more mock test

Signed-off-by: discord9 <discord9@163.com>

more

Signed-off-by: discord9 <discord9@163.com>

refactor: split mock test

Signed-off-by: discord9 <discord9@163.com>

clippy

Signed-off-by: discord9 <discord9@163.com>

refactor: rm stuff

Signed-off-by: discord9 <discord9@163.com>

test: mock add gc report per region

Signed-off-by: discord9 <discord9@163.com>

fix: stricter table failure condition

Signed-off-by: discord9 <discord9@163.com>

stuff

Signed-off-by: discord9 <discord9@163.com>

feat: can run gc for different tables at the same time & more todos

Signed-off-by: discord9 <discord9@163.com>

after rebase check

Signed-off-by: discord9 <discord9@163.com>

* chore

Signed-off-by: discord9 <discord9@163.com>

* chore

Signed-off-by: discord9 <discord9@163.com>

* wip: refactoring test

Signed-off-by: discord9 <discord9@163.com>

* fix: also get from follower peer

Signed-off-by: discord9 <discord9@163.com>

* test: update mock test

Signed-off-by: discord9 <discord9@163.com>

* revert some change&clean up

Signed-off-by: discord9 <discord9@163.com>

* typo

Signed-off-by: discord9 <discord9@163.com>

* chore: after rebase fix

Signed-off-by: discord9 <discord9@163.com>

* chore: more fixes

Signed-off-by: discord9 <discord9@163.com>

* revert

Signed-off-by: discord9 <discord9@163.com>

* revert change to handler.rs

Signed-off-by: discord9 <discord9@163.com>

* test: fix mock test

Signed-off-by: discord9 <discord9@163.com>

* chore: rm retry

Signed-off-by: discord9 <discord9@163.com>

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: discord9 <discord9@163.com>

* after rebase fix

Signed-off-by: discord9 <discord9@163.com>

* pcr

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-12-04 12:00:25 +00:00
Ruihang Xia
f2288a86b0 perf: treat DISTINCT as comm/part-comm (#7348)
* perf: treat DISTINCT as comm/part-comm

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sqlness tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-12-04 11:14:00 +00:00
Ruihang Xia
9d35b8cad4 refactor: remove datafusion data frame wrapper (#7347)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-12-04 09:38:41 +00:00
Weny Xu
cc99f9d65b fix: configure HTTP/2 keep-alive for heartbeat client to detect network failures faster (#7344)
* fix: configure HTTP/2 keep-alive for heartbeat client to detect network failures faster

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-04 08:07:45 +00:00
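
A hedged sketch of what HTTP/2 keep-alive looks like on a tonic client endpoint; the actual heartbeat client wiring and the chosen intervals are project-specific, so the durations below are placeholders:

```rust
use std::time::Duration;

use tonic::transport::{Channel, Endpoint};

async fn connect_heartbeat_channel(addr: String) -> Result<Channel, tonic::transport::Error> {
    Endpoint::from_shared(addr)?
        // Send HTTP/2 PING frames periodically so a dead peer is detected
        // even when no heartbeat request is currently in flight.
        .http2_keep_alive_interval(Duration::from_secs(10)) // placeholder
        .keep_alive_timeout(Duration::from_secs(3)) // placeholder
        .keep_alive_while_idle(true)
        .connect_timeout(Duration::from_secs(1)) // placeholder
        .connect()
        .await
}
```
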
Lei, HUANG
11ecb7a28a refactor(servers): bulk insert service (#7329)
* refactor/bulk-insert-service:
 refactor: decode FlightData early in put_record_batch pipeline

 - Move FlightDecoder usage from Inserter up to PutRecordBatchRequestStream,
   passing decoded RecordBatch and schema bytes instead of raw FlightData.
 - Eliminate redundant per-request decoding/encoding in Inserter; encode
   once and reuse for all region requests.
 - Streamline GrpcQueryHandler trait and implementations to accept
   PutRecordBatchRequest containing pre-decoded data.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 feat: stream-based bulk insert with per-batch responses

 - Introduce handle_put_record_batch_stream() to process Flight DoPut streams
 - Resolve table & permissions once, yield (request_id, AffectedRows) per batch
 - Replace loop-over-request with async-stream in frontend & server
 - Make PutRecordBatchRequestStream public for cross-crate usage

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 fix: propagate request_id with errors in bulk insert stream

 Changes the bulk-insert stream item type from
 Result<(i64, AffectedRows), E> to (i64, Result<AffectedRows, E>)
 so every emitted tuple carries the request_id even on failure,
 letting callers correlate errors with the originating request.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 refactor: unify DoPut response stream to return DoPutResponse

 Replace the tuple (i64, Result<AffectedRows>) with Result<DoPutResponse>
 throughout the gRPC bulk-insert path so the handler, adapter and server
 all speak the same type.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 feat: add elapsed_secs to DoPutResponse for bulk-insert timing

 - DoPutResponse now carries elapsed_secs field
 - Frontend measures and attaches insert duration
 - Server observes GRPC_BULK_INSERT_ELAPSED metric from response

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 refactor: unify Bytes import in flight module

 - Replace `bytes::Bytes` with `Bytes` alias for consistency
 - Remove redundant `ProstBytes` alias

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 fix: terminate gRPC stream on error and optimize FlightData handling

 - Stop retrying on stream errors in gRPC handler
 - Replace Vec1 indexing with into_iter().next() for FlightData
 - Remove redundant clones in bulk_insert and flight modules

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 Improve permission check placement in `grpc.rs`

 - Moved the permission check for `BulkInsert` to occur before resolving the table reference in `GrpcQueryHandler` implementation.
 - Ensures permission validation is performed earlier in the process, potentially avoiding unnecessary operations if permission is denied.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 **Refactor Bulk Insert Handling in gRPC**

 - **`grpc.rs`**:
   - Switched from `async_stream::stream` to `async_stream::try_stream` for error handling.
   - Removed `body_size` parameter and added `flight_data` to `handle_bulk_insert`.
   - Simplified error handling and permission checks in `GrpcQueryHandler`.

 - **`bulk_insert.rs`**:
   - Added `raw_flight_data` parameter to `handle_bulk_insert`.
   - Calculated `body_size` from `raw_flight_data` and removed redundant encoding logic.

 - **`flight.rs`**:
   - Replaced `body_size` with `flight_data` in `PutRecordBatchRequest`.
   - Updated memory usage calculation to include `flight_data` components.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/bulk-insert-service:
 perf(bulk_insert): encode record batch once per datanode

 Move FlightData encoding outside the per-region loop so the same
 encoded bytes are reused when mask.select_all(), eliminating redundant
 serialisation work.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-04 07:08:02 +00:00
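
One step above flips the bulk-insert stream item from Result<(i64, AffectedRows), E> to (i64, Result<AffectedRows, E>), before a later step unifies everything on Result<DoPutResponse>. A small illustration of why the second shape matters, with simplified stand-in types:

```rust
type RequestId = i64;
type AffectedRows = usize;

/// With `Result<(RequestId, AffectedRows), E>`, a failure drops the request id.
/// With `(RequestId, Result<AffectedRows, E>)`, every emitted item still names its
/// request, so callers can correlate errors with the originating DoPut batch.
fn report(item: (RequestId, Result<AffectedRows, String>)) {
    match item {
        (id, Ok(rows)) => println!("request {id}: inserted {rows} rows"),
        (id, Err(err)) => println!("request {id}: failed: {err}"),
    }
}
```
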
jeremyhi
2a760f010f chore: members and committers update (#7341)
Signed-off-by: jeremyhi <fengjiachun@gmail.com>
2025-12-04 04:08:43 +00:00
jeremyhi
63dd37dca3 fix: reset cached channel on errors with VIP (#7335)
Signed-off-by: jeremyhi <fengjiachun@gmail.com>
2025-12-03 08:56:15 +00:00
Lei, HUANG
68fff3b1aa refactor(servers): allow custom flight service (#7333)
* refactor/allow-custom-flight-service:
 ### Add Custom Flight Handler Support

 - **`server.rs`**:
   - Introduced a new field `flight_handler` in the `Services` struct to allow optional custom flight handler configuration.
   - Added a method `with_flight_handler` to set the custom flight handler.
   - Modified `build_grpc_server` to use the custom flight handler if provided, defaulting to `GreptimeRequestHandler` otherwise.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/allow-custom-flight-service:
 ### Make structs and enums public in `flight.rs`

 - Changed visibility of `PutRecordBatchRequest` and `PutRecordBatchRequestStream` structs to public.
 - Made `PutRecordBatchRequestStreamState` enum public.
 - Updated fields within `PutRecordBatchRequest` and `PutRecordBatchRequestStream` to be public.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-03 03:01:59 +00:00
Weny Xu
0177f244e9 fix: fix write stall that never recovers due to flush logic issues (#7322)
* fix: fix write stall that never recovers due to flush logic issues

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit test

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: flush multiple regions when engine is full

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: refine fn name

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor: simplify flush scheduler by removing flushing state

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-02 12:48:41 +00:00
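
The "flush multiple regions when engine is full" step above changes stall recovery from flushing a single region to flushing enough regions to bring the engine back under its write buffer limit. A sketch of one plausible selection policy; the largest-first ordering and the thresholds are assumptions, not necessarily what the flush scheduler does:

```rust
use std::cmp::Reverse;

/// Picks regions (id, memtable bytes) to flush until projected usage drops
/// back under the engine-wide limit. Largest-first ordering is an assumption.
fn pick_regions_to_flush(mut regions: Vec<(u64, usize)>, used: usize, limit: usize) -> Vec<u64> {
    regions.sort_by_key(|(_, bytes)| Reverse(*bytes));
    let mut remaining = used;
    let mut picked = Vec::new();
    for (region_id, bytes) in regions {
        if remaining <= limit {
            break;
        }
        remaining = remaining.saturating_sub(bytes);
        picked.push(region_id);
    }
    picked
}
```
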
Lei, HUANG
931556dbd3 perf(metric-engine)!: Replace mur3 with fxhash for faster TSID generation (#7316)
* feat/change-tsid-gen:
 perf(metric-engine): replace mur3 with fxhash for faster TSID generation

 - Switches from mur3::Hasher128 to fxhash::FxHasher for TSID hashing
 - Pre-computes label-name hash when no nulls are present, avoiding redundant work
 - Adds fast-path for rows without nulls; falls back to slow path otherwise
 - Updates Cargo.toml and lockfile to reflect dependency change

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/change-tsid-gen:
 fix: only check primary-key labels for null when re-using cached hash

 - Rename has_null() → has_null_labels() and restrict the check to the
   primary-key columns so that non-label NULLs do not force a full
   TSID re-computation.
 - Update expected hashes in tests to match the new logic.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/change-tsid-gen:
 test: add comprehensive TSID generation tests for label ordering and null handling

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/change-tsid-gen:
 bench: add criterion benchmark for TSID generator

 - Compare original mur3 vs current fxhash fast/slow paths
 - Test 2, 5, 10 label sets plus null-value slow path
 - Add mur3 & criterion dev-deps; register bench target

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/change-tsid-gen:
 test: stabilize metric-engine tests by fixing non-deterministic row order

 - Add ORDER BY to SELECTs in TTL tests to ensure consistent output
 - Update expected __tsid values after hash function change
 - Swap expected OTLP metric rows to match new ordering

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/change-tsid-gen:
 refactor: simplify Default impls and remove redundant code

 - Replace manual Default for TsidGenerator with derive
 - Remove unnecessary into_iter() call
 - Simplify Option::unwrap_or_else to unwrap_or

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-02 08:38:29 +00:00
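
The TSID change above replaces mur3 with fxhash and pre-computes the label-name hash for rows without null labels. A sketch of that fast path with the fxhash crate; the hashing order and the way metric-engine folds names and values together are simplified assumptions here:

```rust
use std::hash::Hasher;

use fxhash::FxHasher;

/// Hashes label names once; the result is reusable for every row without null labels.
fn hash_label_names(names: &[&str]) -> u64 {
    let mut hasher = FxHasher::default();
    for name in names {
        hasher.write(name.as_bytes());
    }
    hasher.finish()
}

/// Fast path: combine the pre-computed name hash with this row's label values.
fn tsid_for_row(name_hash: u64, values: &[&str]) -> u64 {
    let mut hasher = FxHasher::default();
    hasher.write_u64(name_hash);
    for value in values {
        hasher.write(value.as_bytes());
    }
    hasher.finish()
}
```

Rows that do contain null primary-key labels would fall back to the slow path that re-hashes names and values together, as described in the commit.
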
254 changed files with 14978 additions and 3242 deletions

View File

@@ -49,14 +49,9 @@ on:
description: Do not run integration tests during the build
type: boolean
default: true
build_linux_amd64_artifacts:
build_linux_artifacts:
type: boolean
description: Build linux-amd64 artifacts
required: false
default: false
build_linux_arm64_artifacts:
type: boolean
description: Build linux-arm64 artifacts
description: Build linux artifacts (both amd64 and arm64)
required: false
default: false
build_macos_artifacts:
@@ -144,7 +139,7 @@ jobs:
./.github/scripts/check-version.sh "${{ steps.create-version.outputs.version }}"
- name: Allocate linux-amd64 runner
if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
uses: ./.github/actions/start-runner
id: start-linux-amd64-runner
with:
@@ -158,7 +153,7 @@ jobs:
subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }}
- name: Allocate linux-arm64 runner
if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
uses: ./.github/actions/start-runner
id: start-linux-arm64-runner
with:
@@ -173,7 +168,7 @@ jobs:
build-linux-amd64-artifacts:
name: Build linux-amd64 artifacts
if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
allocate-runners,
]
@@ -195,7 +190,7 @@ jobs:
build-linux-arm64-artifacts:
name: Build linux-arm64 artifacts
if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
allocate-runners,
]
@@ -217,7 +212,7 @@ jobs:
run-multi-lang-tests:
name: Run Multi-language SDK Tests
if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
allocate-runners,
build-linux-amd64-artifacts,
@@ -386,7 +381,18 @@ jobs:
publish-github-release:
name: Create GitHub release and upload artifacts
if: ${{ inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule' }}
# Use always() to run even when optional jobs (macos, windows) are skipped.
# Then check that required jobs succeeded and optional jobs didn't fail.
if: |
always() &&
(inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule') &&
needs.allocate-runners.result == 'success' &&
(needs.build-linux-amd64-artifacts.result == 'success' || needs.build-linux-amd64-artifacts.result == 'skipped') &&
(needs.build-linux-arm64-artifacts.result == 'success' || needs.build-linux-arm64-artifacts.result == 'skipped') &&
(needs.build-macos-artifacts.result == 'success' || needs.build-macos-artifacts.result == 'skipped') &&
(needs.build-windows-artifacts.result == 'success' || needs.build-windows-artifacts.result == 'skipped') &&
(needs.release-images-to-dockerhub.result == 'success' || needs.release-images-to-dockerhub.result == 'skipped') &&
(needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped')
needs: [ # The job have to wait for all the artifacts are built.
allocate-runners,
build-linux-amd64-artifacts,

View File

@@ -2,41 +2,41 @@
## Individual Committers (in alphabetical order)
* [CookiePieWw](https://github.com/CookiePieWw)
* [etolbakov](https://github.com/etolbakov)
* [irenjj](https://github.com/irenjj)
* [KKould](https://github.com/KKould)
* [Lanqing Yang](https://github.com/lyang24)
* [NiwakaDev](https://github.com/NiwakaDev)
* [tisonkun](https://github.com/tisonkun)
- [apdong2022](https://github.com/apdong2022)
- [beryl678](https://github.com/beryl678)
- [CookiePieWw](https://github.com/CookiePieWw)
- [etolbakov](https://github.com/etolbakov)
- [irenjj](https://github.com/irenjj)
- [KKould](https://github.com/KKould)
- [Lanqing Yang](https://github.com/lyang24)
- [nicecui](https://github.com/nicecui)
- [NiwakaDev](https://github.com/NiwakaDev)
- [paomian](https://github.com/paomian)
- [tisonkun](https://github.com/tisonkun)
- [Wenjie0329](https://github.com/Wenjie0329)
- [zhaoyingnan01](https://github.com/zhaoyingnan01)
- [zhongzc](https://github.com/zhongzc)
- [ZonaHex](https://github.com/ZonaHex)
- [zyy17](https://github.com/zyy17)
## Team Members (in alphabetical order)
* [apdong2022](https://github.com/apdong2022)
* [beryl678](https://github.com/beryl678)
* [daviderli614](https://github.com/daviderli614)
* [discord9](https://github.com/discord9)
* [evenyag](https://github.com/evenyag)
* [fengjiachun](https://github.com/fengjiachun)
* [fengys1996](https://github.com/fengys1996)
* [GrepTime](https://github.com/GrepTime)
* [holalengyu](https://github.com/holalengyu)
* [killme2008](https://github.com/killme2008)
* [MichaelScofield](https://github.com/MichaelScofield)
* [nicecui](https://github.com/nicecui)
* [paomian](https://github.com/paomian)
* [shuiyisong](https://github.com/shuiyisong)
* [sunchanglong](https://github.com/sunchanglong)
* [sunng87](https://github.com/sunng87)
* [v0y4g3r](https://github.com/v0y4g3r)
* [waynexia](https://github.com/waynexia)
* [Wenjie0329](https://github.com/Wenjie0329)
* [WenyXu](https://github.com/WenyXu)
* [xtang](https://github.com/xtang)
* [zhaoyingnan01](https://github.com/zhaoyingnan01)
* [zhongzc](https://github.com/zhongzc)
* [ZonaHex](https://github.com/ZonaHex)
* [zyy17](https://github.com/zyy17)
- [daviderli614](https://github.com/daviderli614)
- [discord9](https://github.com/discord9)
- [evenyag](https://github.com/evenyag)
- [fengjiachun](https://github.com/fengjiachun)
- [fengys1996](https://github.com/fengys1996)
- [GrepTime](https://github.com/GrepTime)
- [holalengyu](https://github.com/holalengyu)
- [killme2008](https://github.com/killme2008)
- [MichaelScofield](https://github.com/MichaelScofield)
- [shuiyisong](https://github.com/shuiyisong)
- [sunchanglong](https://github.com/sunchanglong)
- [sunng87](https://github.com/sunng87)
- [v0y4g3r](https://github.com/v0y4g3r)
- [waynexia](https://github.com/waynexia)
- [WenyXu](https://github.com/WenyXu)
- [xtang](https://github.com/xtang)
## All Contributors

Cargo.lock (generated, 42 changed lines)
View File

@@ -738,12 +738,12 @@ dependencies = [
"api",
"async-trait",
"common-base",
"common-config",
"common-error",
"common-macro",
"common-telemetry",
"common-test-util",
"digest",
"notify",
"sha1",
"snafu 0.8.6",
"sql",
@@ -2055,6 +2055,7 @@ dependencies = [
"datanode",
"humantime-serde",
"meta-client",
"notify",
"object-store",
"serde",
"serde_json",
@@ -2253,6 +2254,7 @@ dependencies = [
"arrow-flight",
"bytes",
"common-base",
"common-config",
"common-error",
"common-macro",
"common-recordbatch",
@@ -2266,7 +2268,6 @@ dependencies = [
"hyper 1.6.0",
"hyper-util",
"lazy_static",
"notify",
"prost 0.13.5",
"rand 0.9.1",
"serde",
@@ -2845,6 +2846,15 @@ dependencies = [
"unicode-segmentation",
]
[[package]]
name = "convert_case"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -3741,9 +3751,9 @@ dependencies = [
[[package]]
name = "datafusion-pg-catalog"
version = "0.12.2"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "755393864c0c2dd95575ceed4b25e348686028e1b83d06f8f39914209999f821"
checksum = "09bfd1feed7ed335227af0b65955ed825e467cf67fad6ecd089123202024cfd1"
dependencies = [
"async-trait",
"datafusion",
@@ -4184,21 +4194,23 @@ dependencies = [
[[package]]
name = "derive_more"
version = "1.0.0"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618"
dependencies = [
"derive_more-impl",
]
[[package]]
name = "derive_more-impl"
version = "1.0.0"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b"
dependencies = [
"convert_case 0.10.0",
"proc-macro2",
"quote",
"rustc_version",
"syn 2.0.106",
"unicode-xid",
]
@@ -4915,6 +4927,7 @@ dependencies = [
"async-stream",
"async-trait",
"auth",
"axum 0.8.4",
"bytes",
"cache",
"catalog",
@@ -4949,9 +4962,11 @@ dependencies = [
"hostname 0.4.1",
"humantime",
"humantime-serde",
"hyper-util",
"lazy_static",
"log-query",
"meta-client",
"meta-srv",
"num_cpus",
"opentelemetry-proto",
"operator",
@@ -4963,6 +4978,7 @@ dependencies = [
"prost 0.13.5",
"query",
"rand 0.9.1",
"reqwest",
"serde",
"serde_json",
"servers",
@@ -5351,7 +5367,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0df99f09f1d6785055b2d9da96fc4ecc2bdf6803#0df99f09f1d6785055b2d9da96fc4ecc2bdf6803"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0423fa30203187c75e2937a668df1da699c8b96c#0423fa30203187c75e2937a668df1da699c8b96c"
dependencies = [
"prost 0.13.5",
"prost-types 0.13.5",
@@ -7514,9 +7530,11 @@ dependencies = [
"common-test-util",
"common-time",
"common-wal",
"criterion 0.4.0",
"datafusion",
"datatypes",
"futures-util",
"fxhash",
"humantime-serde",
"itertools 0.14.0",
"lazy_static",
@@ -9201,9 +9219,9 @@ dependencies = [
[[package]]
name = "pgwire"
version = "0.36.1"
version = "0.36.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d331bb0eef5bc83a221c0a85b1f205bccf094d4f72a26ae1d68a1b1c535123b7"
checksum = "70a2bcdcc4b20a88e0648778ecf00415bbd5b447742275439c22176835056f99"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -10835,7 +10853,7 @@ dependencies = [
[[package]]
name = "rskafka"
version = "0.6.0"
source = "git+https://github.com/WenyXu/rskafka.git?rev=7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76#7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76"
source = "git+https://github.com/GreptimeTeam/rskafka.git?rev=f5688f83e7da591cda3f2674c2408b4c0ed4ed50#f5688f83e7da591cda3f2674c2408b4c0ed4ed50"
dependencies = [
"bytes",
"chrono",

View File

@@ -131,7 +131,7 @@ datafusion-functions = "50"
datafusion-functions-aggregate-common = "50"
datafusion-optimizer = "50"
datafusion-orc = "0.5"
datafusion-pg-catalog = "0.12.2"
datafusion-pg-catalog = "0.12.3"
datafusion-physical-expr = "50"
datafusion-physical-plan = "50"
datafusion-sql = "50"
@@ -139,6 +139,7 @@ datafusion-substrait = "50"
deadpool = "0.12"
deadpool-postgres = "0.14"
derive_builder = "0.20"
derive_more = { version = "2.1", features = ["full"] }
dotenv = "0.15"
either = "1.15"
etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62df834f0cffda355eba96691fe1a9a332b75a7", features = [
@@ -148,7 +149,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0df99f09f1d6785055b2d9da96fc4ecc2bdf6803" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0423fa30203187c75e2937a668df1da699c8b96c" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -200,7 +201,8 @@ reqwest = { version = "0.12", default-features = false, features = [
"stream",
"multipart",
] }
rskafka = { git = "https://github.com/WenyXu/rskafka.git", rev = "7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76", features = [
# Branch: feat/request-timeout
rskafka = { git = "https://github.com/GreptimeTeam/rskafka.git", rev = "f5688f83e7da591cda3f2674c2408b4c0ed4ed50", features = [
"transport-tls",
] }
rstest = "0.25"

View File

@@ -294,7 +294,6 @@
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -457,7 +456,6 @@
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -629,7 +627,6 @@
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |

View File

@@ -99,9 +99,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout.
timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout.
ddl_timeout = "10s"

View File

@@ -78,9 +78,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout.
timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout.
ddl_timeout = "10s"

View File

@@ -226,9 +226,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout.
timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout.
ddl_timeout = "10s"

flake.lock (generated, 20 changed lines)
View File

@@ -8,11 +8,11 @@
"rust-analyzer-src": "rust-analyzer-src"
},
"locked": {
"lastModified": 1760078406,
"narHash": "sha256-JeJK0ZA845PtkCHkfo4KjeI1mYrsr2s3cxBYKhF4BoE=",
"lastModified": 1765252472,
"narHash": "sha256-byMt/uMi7DJ8tRniFopDFZMO3leSjGp6GS4zWOFT+uQ=",
"owner": "nix-community",
"repo": "fenix",
"rev": "351277c60d104944122ee389cdf581c5ce2c6732",
"rev": "8456b985f6652e3eef0632ee9992b439735c5544",
"type": "github"
},
"original": {
@@ -41,16 +41,16 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1759994382,
"narHash": "sha256-wSK+3UkalDZRVHGCRikZ//CyZUJWDJkBDTQX1+G77Ow=",
"lastModified": 1764983851,
"narHash": "sha256-y7RPKl/jJ/KAP/VKLMghMgXTlvNIJMHKskl8/Uuar7o=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "5da4a26309e796daa7ffca72df93dbe53b8164c7",
"rev": "d9bc5c7dceb30d8d6fafa10aeb6aa8a48c218454",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.05",
"ref": "nixos-25.11",
"repo": "nixpkgs",
"type": "github"
}
@@ -65,11 +65,11 @@
"rust-analyzer-src": {
"flake": false,
"locked": {
"lastModified": 1760014945,
"narHash": "sha256-ySdl7F9+oeWNHVrg3QL/brazqmJvYFEdpGnF3pyoDH8=",
"lastModified": 1765120009,
"narHash": "sha256-nG76b87rkaDzibWbnB5bYDm6a52b78A+fpm+03pqYIw=",
"owner": "rust-lang",
"repo": "rust-analyzer",
"rev": "90d2e1ce4dfe7dc49250a8b88a0f08ffdb9cb23f",
"rev": "5e3e9c4e61bba8a5e72134b9ffefbef8f531d008",
"type": "github"
},
"original": {

View File

@@ -2,7 +2,7 @@
description = "Development environment flake";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
fenix = {
url = "github:nix-community/fenix";
inputs.nixpkgs.follows = "nixpkgs";
@@ -48,7 +48,7 @@
gnuplot ## for cargo bench
];
LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath buildInputs;
buildInputs = buildInputs;
NIX_HARDENING_ENABLE = "";
};
});

View File

@@ -708,6 +708,7 @@ fn ddl_request_type(request: &DdlRequest) -> &'static str {
Some(Expr::CreateView(_)) => "ddl.create_view",
Some(Expr::DropView(_)) => "ddl.drop_view",
Some(Expr::AlterDatabase(_)) => "ddl.alter_database",
Some(Expr::CommentOn(_)) => "ddl.comment_on",
None => "ddl.empty",
}
}

View File

@@ -15,11 +15,11 @@ workspace = true
api.workspace = true
async-trait.workspace = true
common-base.workspace = true
common-config.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-telemetry.workspace = true
digest = "0.10"
notify.workspace = true
sha1 = "0.10"
snafu.workspace = true
sql.workspace = true

View File

@@ -75,11 +75,12 @@ pub enum Error {
username: String,
},
#[snafu(display("Failed to initialize a watcher for file {}", path))]
#[snafu(display("Failed to initialize a file watcher"))]
FileWatch {
path: String,
#[snafu(source)]
error: notify::Error,
source: common_config::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("User is not authorized to perform this action"))]

View File

@@ -12,16 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::Path;
use std::sync::mpsc::channel;
use std::sync::{Arc, Mutex};
use async_trait::async_trait;
use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};
use common_telemetry::{info, warn};
use notify::{EventKind, RecursiveMode, Watcher};
use snafu::{ResultExt, ensure};
use snafu::ResultExt;
use crate::error::{FileWatchSnafu, InvalidConfigSnafu, Result};
use crate::error::{FileWatchSnafu, Result};
use crate::user_provider::{UserInfoMap, authenticate_with_credential, load_credential_from_file};
use crate::{Identity, Password, UserInfoRef, UserProvider};
@@ -41,61 +39,36 @@ impl WatchFileUserProvider {
pub fn new(filepath: &str) -> Result<Self> {
let credential = load_credential_from_file(filepath)?;
let users = Arc::new(Mutex::new(credential));
let this = WatchFileUserProvider {
users: users.clone(),
};
let (tx, rx) = channel::<notify::Result<notify::Event>>();
let mut debouncer =
notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;
let mut dir = Path::new(filepath).to_path_buf();
ensure!(
dir.pop(),
InvalidConfigSnafu {
value: filepath,
msg: "UserProvider path must be a file path",
}
);
debouncer
.watch(&dir, RecursiveMode::NonRecursive)
.context(FileWatchSnafu { path: filepath })?;
let users_clone = users.clone();
let filepath_owned = filepath.to_string();
let filepath = filepath.to_string();
std::thread::spawn(move || {
let filename = Path::new(&filepath).file_name();
let _hold = debouncer;
while let Ok(res) = rx.recv() {
if let Ok(event) = res {
let is_this_file = event.paths.iter().any(|p| p.file_name() == filename);
let is_relevant_event = matches!(
event.kind,
EventKind::Modify(_) | EventKind::Create(_) | EventKind::Remove(_)
FileWatcherBuilder::new()
.watch_path(filepath)
.context(FileWatchSnafu)?
.config(FileWatcherConfig::new())
.spawn(move || match load_credential_from_file(&filepath_owned) {
Ok(credential) => {
let mut users = users_clone.lock().expect("users credential must be valid");
#[cfg(not(test))]
info!("User provider file {} reloaded", &filepath_owned);
#[cfg(test)]
info!(
"User provider file {} reloaded: {:?}",
&filepath_owned, credential
);
if is_this_file && is_relevant_event {
info!(?event.kind, "User provider file {} changed", &filepath);
match load_credential_from_file(&filepath) {
Ok(credential) => {
let mut users =
users.lock().expect("users credential must be valid");
#[cfg(not(test))]
info!("User provider file {filepath} reloaded");
#[cfg(test)]
info!("User provider file {filepath} reloaded: {credential:?}");
*users = credential;
}
Err(err) => {
warn!(
?err,
"Fail to load credential from file {filepath}; keep the old one",
)
}
}
}
*users = credential;
}
}
});
Err(err) => {
warn!(
?err,
"Fail to load credential from file {}; keep the old one", &filepath_owned
)
}
})
.context(FileWatchSnafu)?;
Ok(this)
Ok(WatchFileUserProvider { users })
}
}

View File

@@ -163,7 +163,7 @@ impl ObjbenchCommand {
available_indexes: Default::default(),
indexes: Default::default(),
index_file_size: 0,
index_file_id: None,
index_version: 0,
num_rows,
num_row_groups,
sequence: None,
@@ -564,7 +564,7 @@ fn new_noop_file_purger() -> FilePurgerRef {
#[derive(Debug)]
struct Noop;
impl FilePurger for Noop {
fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {}
fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) {}
}
Arc::new(Noop)
}

View File

@@ -35,6 +35,7 @@ use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder};
use common_meta::heartbeat::handler::HandlerGroupExecutor;
use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_meta::heartbeat::handler::suspend::SuspendHandler;
use common_query::prelude::set_default_prefix;
use common_stat::ResourceStatImpl;
use common_telemetry::info;
@@ -45,7 +46,7 @@ use frontend::frontend::Frontend;
use frontend::heartbeat::HeartbeatTask;
use frontend::instance::builder::FrontendBuilder;
use frontend::server::Services;
use meta_client::{MetaClientOptions, MetaClientType};
use meta_client::{MetaClientOptions, MetaClientRef, MetaClientType};
use plugins::frontend::context::{
CatalogManagerConfigureContext, DistributedCatalogManagerConfigureContext,
};
@@ -440,30 +441,13 @@ impl StartCommand {
};
let catalog_manager = builder.build();
let executor = HandlerGroupExecutor::new(vec![
Arc::new(ParseMailboxMessageHandler),
Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())),
]);
let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();
let heartbeat_task = HeartbeatTask::new(
&opts,
meta_client.clone(),
opts.heartbeat.clone(),
Arc::new(executor),
Arc::new(resource_stat),
);
let heartbeat_task = Some(heartbeat_task);
let instance = FrontendBuilder::new(
opts.clone(),
cached_meta_backend.clone(),
layered_cache_registry.clone(),
catalog_manager,
client,
meta_client,
meta_client.clone(),
process_manager,
)
.with_plugin(plugins.clone())
@@ -471,6 +455,9 @@ impl StartCommand {
.try_build()
.await
.context(error::StartFrontendSnafu)?;
let heartbeat_task = Some(create_heartbeat_task(&opts, meta_client, &instance));
let instance = Arc::new(instance);
let servers = Services::new(opts, instance.clone(), plugins)
@@ -487,6 +474,28 @@ impl StartCommand {
}
}
pub fn create_heartbeat_task(
options: &frontend::frontend::FrontendOptions,
meta_client: MetaClientRef,
instance: &frontend::instance::Instance,
) -> HeartbeatTask {
let executor = Arc::new(HandlerGroupExecutor::new(vec![
Arc::new(ParseMailboxMessageHandler),
Arc::new(SuspendHandler::new(instance.suspend_state())),
Arc::new(InvalidateCacheHandler::new(
instance.cache_invalidator().clone(),
)),
]));
let stat = {
let mut stat = ResourceStatImpl::default();
stat.start_collect_cpu_usage();
Arc::new(stat)
};
HeartbeatTask::new(options, meta_client, executor, stat)
}
#[cfg(test)]
mod tests {
use std::io::Write;

View File

@@ -52,7 +52,6 @@ fn test_load_datanode_example_config() {
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1),
tcp_nodelay: true,
@@ -118,7 +117,6 @@ fn test_load_frontend_example_config() {
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1),
tcp_nodelay: true,
@@ -241,7 +239,6 @@ fn test_load_flownode_example_config() {
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1),
tcp_nodelay: true,

View File

@@ -11,8 +11,10 @@ workspace = true
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-telemetry.workspace = true
config.workspace = true
humantime-serde.workspace = true
notify.workspace = true
object-store.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -49,14 +49,41 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to watch file: {}", path))]
FileWatch {
path: String,
#[snafu(source)]
error: notify::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to canonicalize path: {}", path))]
CanonicalizePath {
path: String,
#[snafu(source)]
error: std::io::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Invalid path '{}': expected a file, not a directory", path))]
InvalidPath {
path: String,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Error::TomlFormat { .. } | Error::LoadLayeredConfig { .. } => {
StatusCode::InvalidArguments
}
Error::TomlFormat { .. }
| Error::LoadLayeredConfig { .. }
| Error::FileWatch { .. }
| Error::InvalidPath { .. }
| Error::CanonicalizePath { .. } => StatusCode::InvalidArguments,
Error::SerdeJson { .. } => StatusCode::Unexpected,
}
}

View File

@@ -0,0 +1,355 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Common file watching utilities for configuration hot-reloading.
//!
//! This module provides a generic file watcher that can be used to watch
//! files for changes and trigger callbacks when changes occur.
//!
//! The watcher monitors the parent directory of each file rather than the
//! file itself. This ensures that file deletions and recreations are properly
//! tracked, which is common with editors that use atomic saves or when
//! configuration files are replaced.
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::mpsc::channel;
use common_telemetry::{error, info, warn};
use notify::{EventKind, RecursiveMode, Watcher};
use snafu::ResultExt;
use crate::error::{CanonicalizePathSnafu, FileWatchSnafu, InvalidPathSnafu, Result};
/// Configuration for the file watcher behavior.
#[derive(Debug, Clone, Default)]
pub struct FileWatcherConfig {
/// Whether to include Remove events in addition to Modify and Create.
pub include_remove_events: bool,
}
impl FileWatcherConfig {
pub fn new() -> Self {
Self::default()
}
pub fn with_modify_and_create(mut self) -> Self {
self.include_remove_events = false;
self
}
pub fn with_remove_events(mut self) -> Self {
self.include_remove_events = true;
self
}
}
/// A builder for creating file watchers with flexible configuration.
///
/// The watcher monitors the parent directory of each file to handle file
/// deletion and recreation properly. Events are filtered to only trigger
/// callbacks for the specific files being watched.
pub struct FileWatcherBuilder {
config: FileWatcherConfig,
/// Canonicalized paths of files to watch.
file_paths: Vec<PathBuf>,
}
impl FileWatcherBuilder {
/// Create a new builder with default configuration.
pub fn new() -> Self {
Self {
config: FileWatcherConfig::default(),
file_paths: Vec::new(),
}
}
/// Set the watcher configuration.
pub fn config(mut self, config: FileWatcherConfig) -> Self {
self.config = config;
self
}
/// Add a file path to watch.
///
/// Returns an error if the path is a directory.
/// The path is canonicalized for reliable comparison with events.
pub fn watch_path<P: AsRef<Path>>(mut self, path: P) -> Result<Self> {
let path = path.as_ref();
snafu::ensure!(
path.is_file(),
InvalidPathSnafu {
path: path.display().to_string(),
}
);
// Canonicalize the path for reliable comparison with event paths
let canonical = path.canonicalize().context(CanonicalizePathSnafu {
path: path.display().to_string(),
})?;
self.file_paths.push(canonical);
Ok(self)
}
/// Add multiple file paths to watch.
///
/// Returns an error if any path is a directory.
pub fn watch_paths<P: AsRef<Path>, I: IntoIterator<Item = P>>(
mut self,
paths: I,
) -> Result<Self> {
for path in paths {
self = self.watch_path(path)?;
}
Ok(self)
}
/// Build and spawn the file watcher with the given callback.
///
/// The callback is invoked when relevant file events are detected for
/// the watched files. The watcher monitors the parent directories to
/// handle file deletion and recreation properly.
///
/// The spawned watcher thread runs for the lifetime of the process.
pub fn spawn<F>(self, callback: F) -> Result<()>
where
F: Fn() + Send + 'static,
{
let (tx, rx) = channel::<notify::Result<notify::Event>>();
let mut watcher =
notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;
// Collect unique parent directories to watch
let mut watched_dirs: HashSet<PathBuf> = HashSet::new();
for file_path in &self.file_paths {
if let Some(parent) = file_path.parent()
&& watched_dirs.insert(parent.to_path_buf())
{
watcher
.watch(parent, RecursiveMode::NonRecursive)
.context(FileWatchSnafu {
path: parent.display().to_string(),
})?;
}
}
let config = self.config;
let watched_files: HashSet<PathBuf> = self.file_paths.iter().cloned().collect();
info!(
"Spawning file watcher for paths: {:?} (watching parent directories)",
self.file_paths
.iter()
.map(|p| p.display().to_string())
.collect::<Vec<_>>()
);
std::thread::spawn(move || {
// Keep watcher alive in the thread
let _watcher = watcher;
while let Ok(res) = rx.recv() {
match res {
Ok(event) => {
if !is_relevant_event(&event.kind, &config) {
continue;
}
// Check if any of the event paths match our watched files
let is_watched_file = event.paths.iter().any(|event_path| {
// Try to canonicalize the event path for comparison
// If the file was deleted, canonicalize will fail, so we also
// compare the raw path
if let Ok(canonical) = event_path.canonicalize()
&& watched_files.contains(&canonical)
{
return true;
}
// For deleted files, compare using the raw path
watched_files.contains(event_path)
});
if !is_watched_file {
continue;
}
info!(?event.kind, ?event.paths, "Detected file change");
callback();
}
Err(err) => {
warn!("File watcher error: {}", err);
}
}
}
error!("File watcher channel closed unexpectedly");
});
Ok(())
}
}
impl Default for FileWatcherBuilder {
fn default() -> Self {
Self::new()
}
}
/// Check if an event kind is relevant based on the configuration.
fn is_relevant_event(kind: &EventKind, config: &FileWatcherConfig) -> bool {
match kind {
EventKind::Modify(_) | EventKind::Create(_) => true,
EventKind::Remove(_) => config.include_remove_events,
_ => false,
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Duration;
use common_test_util::temp_dir::create_temp_dir;
use super::*;
#[test]
fn test_file_watcher_detects_changes() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("test_file_watcher");
let file_path = dir.path().join("test_file.txt");
// Create initial file
std::fs::write(&file_path, "initial content").unwrap();
let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();
FileWatcherBuilder::new()
.watch_path(&file_path)
.unwrap()
.config(FileWatcherConfig::new())
.spawn(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
})
.unwrap();
// Give watcher time to start
std::thread::sleep(Duration::from_millis(100));
// Modify the file
std::fs::write(&file_path, "modified content").unwrap();
// Wait for the event to be processed
std::thread::sleep(Duration::from_millis(500));
assert!(
counter.load(Ordering::SeqCst) >= 1,
"Watcher should have detected at least one change"
);
}
#[test]
fn test_file_watcher_detects_delete_and_recreate() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("test_file_watcher_recreate");
let file_path = dir.path().join("test_file.txt");
// Create initial file
std::fs::write(&file_path, "initial content").unwrap();
let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();
FileWatcherBuilder::new()
.watch_path(&file_path)
.unwrap()
.config(FileWatcherConfig::new())
.spawn(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
})
.unwrap();
// Give watcher time to start
std::thread::sleep(Duration::from_millis(100));
// Delete the file
std::fs::remove_file(&file_path).unwrap();
std::thread::sleep(Duration::from_millis(100));
// Recreate the file - this should still be detected because we watch the directory
std::fs::write(&file_path, "recreated content").unwrap();
// Wait for the event to be processed
std::thread::sleep(Duration::from_millis(500));
assert!(
counter.load(Ordering::SeqCst) >= 1,
"Watcher should have detected file recreation"
);
}
#[test]
fn test_file_watcher_ignores_other_files() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("test_file_watcher_other");
let watched_file = dir.path().join("watched.txt");
let other_file = dir.path().join("other.txt");
// Create both files
std::fs::write(&watched_file, "watched content").unwrap();
std::fs::write(&other_file, "other content").unwrap();
let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();
FileWatcherBuilder::new()
.watch_path(&watched_file)
.unwrap()
.config(FileWatcherConfig::new())
.spawn(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
})
.unwrap();
// Give watcher time to start
std::thread::sleep(Duration::from_millis(100));
// Modify the other file - should NOT trigger callback
std::fs::write(&other_file, "modified other content").unwrap();
// Wait for potential event
std::thread::sleep(Duration::from_millis(500));
assert_eq!(
counter.load(Ordering::SeqCst),
0,
"Watcher should not have detected changes to other files"
);
// Now modify the watched file - SHOULD trigger callback
std::fs::write(&watched_file, "modified watched content").unwrap();
// Wait for the event to be processed
std::thread::sleep(Duration::from_millis(500));
assert!(
counter.load(Ordering::SeqCst) >= 1,
"Watcher should have detected change to watched file"
);
}
}
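Below is a hedged usage sketch of the builder added in this file; it is not part of the diff, and the limits.toml path and the shared string state are purely illustrative:

fn watch_limits_file() -> common_config::error::Result<()> {
    use std::sync::{Arc, RwLock};

    use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};

    // Shared state refreshed by the watcher callback; readers pick up the new value
    // on their next access.
    let contents = Arc::new(RwLock::new(
        std::fs::read_to_string("limits.toml").unwrap_or_default(),
    ));
    let contents_in_cb = contents.clone();

    FileWatcherBuilder::new()
        // Must point at an existing file; directories are rejected and the path is canonicalized.
        .watch_path("limits.toml")?
        // React to Remove events as well as Modify/Create, so delete-and-recreate is noticed.
        .config(FileWatcherConfig::new().with_remove_events())
        .spawn(move || {
            // Re-read the file; keep the old contents if the read fails.
            if let Ok(new_contents) = std::fs::read_to_string("limits.toml") {
                *contents_in_cb.write().unwrap() = new_contents;
            }
        })
}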

View File

@@ -14,6 +14,7 @@
pub mod config;
pub mod error;
pub mod file_watcher;
use std::time::Duration;

View File

@@ -21,6 +21,8 @@ pub mod status_code;
use http::{HeaderMap, HeaderValue};
pub use snafu;
use crate::status_code::StatusCode;
// HACK - these headers are defined here so they can be shared by gRPC services. For common HTTP headers,
// please define in `src/servers/src/http/header.rs`.
pub const GREPTIME_DB_HEADER_ERROR_CODE: &str = "x-greptime-err-code";
@@ -46,6 +48,29 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap {
header
}
/// Extract [StatusCode] and error message from [HeaderMap], if any.
///
/// Note that if the [StatusCode] is illegal, for example, a random number that is not pre-defined
/// as a [StatusCode], the result is still `None`.
pub fn from_header_to_err_code_msg(headers: &HeaderMap) -> Option<(StatusCode, &str)> {
let code = headers
.get(GREPTIME_DB_HEADER_ERROR_CODE)
.and_then(|value| {
value
.to_str()
.ok()
.and_then(|x| x.parse::<u32>().ok())
.and_then(StatusCode::from_u32)
});
let msg = headers
.get(GREPTIME_DB_HEADER_ERROR_MSG)
.and_then(|x| x.to_str().ok());
match (code, msg) {
(Some(code), Some(msg)) => Some((code, msg)),
_ => None,
}
}
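For orientation, a small round-trip sketch combining this helper with from_err_code_msg_to_header defined earlier in the same file; module paths are assumed and the code and message are only examples:

fn header_round_trip() {
    use common_error::status_code::StatusCode;
    use common_error::{from_err_code_msg_to_header, from_header_to_err_code_msg};

    // Encode a status code and message into the error headers.
    let headers =
        from_err_code_msg_to_header(StatusCode::DeadlineExceeded as u32, "query timed out");

    // Decode them again; an unknown numeric code (or a missing header) yields None instead.
    if let Some((code, msg)) = from_header_to_err_code_msg(&headers) {
        assert_eq!(code, StatusCode::DeadlineExceeded);
        println!("error {:?}: {}", code, msg);
    }
}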
/// Returns the external root cause of the source error (exclude the current error).
pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> {
// There is some divergence in the behavior of the `sources()` API

View File

@@ -42,6 +42,8 @@ pub enum StatusCode {
External = 1007,
/// The request is deadline exceeded (typically server-side).
DeadlineExceeded = 1008,
/// The service is suspended for some reason, e.g. resource usage exceeds a limit.
Suspended = 1009,
// ====== End of common status code ================
// ====== Begin of SQL related status code =========
@@ -175,7 +177,8 @@ impl StatusCode {
| StatusCode::AccessDenied
| StatusCode::PermissionDenied
| StatusCode::RequestOutdated
| StatusCode::External => false,
| StatusCode::External
| StatusCode::Suspended => false,
}
}
@@ -223,7 +226,8 @@ impl StatusCode {
| StatusCode::InvalidAuthHeader
| StatusCode::AccessDenied
| StatusCode::PermissionDenied
| StatusCode::RequestOutdated => false,
| StatusCode::RequestOutdated
| StatusCode::Suspended => false,
}
}
@@ -347,7 +351,8 @@ pub fn status_to_tonic_code(status_code: StatusCode) -> Code {
| StatusCode::RegionNotReady => Code::Unavailable,
StatusCode::RuntimeResourcesExhausted
| StatusCode::RateLimited
| StatusCode::RegionBusy => Code::ResourceExhausted,
| StatusCode::RegionBusy
| StatusCode::Suspended => Code::ResourceExhausted,
StatusCode::UnsupportedPasswordType
| StatusCode::UserPasswordMismatch
| StatusCode::AuthHeaderNotFound
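A quick sketch of where the new Suspended code lands in these classifications; the crate paths are assumed, and tonic's Code is the gRPC status enum used above:

fn classify_suspended() {
    use common_error::status_code::{StatusCode, status_to_tonic_code};
    use tonic::Code;

    // Per the match arm above, a suspended service is surfaced to gRPC clients
    // as resource exhaustion rather than an internal error.
    assert_eq!(
        status_to_tonic_code(StatusCode::Suspended),
        Code::ResourceExhausted
    );
}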

View File

@@ -39,7 +39,7 @@ datafusion-functions-aggregate-common.workspace = true
datafusion-pg-catalog.workspace = true
datafusion-physical-expr.workspace = true
datatypes.workspace = true
derive_more = { version = "1", default-features = false, features = ["display"] }
derive_more.workspace = true
geo = { version = "0.29", optional = true }
geo-types = { version = "0.7", optional = true }
geohash = { version = "0.13", optional = true }

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::Display;
use std::sync::Arc;
use datafusion_common::arrow::array::{Array, AsArray, BooleanBuilder};

View File

@@ -387,6 +387,8 @@ impl PGCatalogFunction {
registry.register(pg_catalog::create_pg_stat_get_numscans());
registry.register(pg_catalog::create_pg_get_constraintdef());
registry.register(pg_catalog::create_pg_get_partition_ancestors_udf());
registry.register(pg_catalog::quote_ident_udf::create_quote_ident_udf());
registry.register(pg_catalog::quote_ident_udf::create_parse_ident_udf());
registry.register_scalar(ObjDescriptionFunction::new());
registry.register_scalar(ColDescriptionFunction::new());
registry.register_scalar(ShobjDescriptionFunction::new());

View File

@@ -12,6 +12,7 @@ api.workspace = true
arrow-flight.workspace = true
bytes.workspace = true
common-base.workspace = true
common-config.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-recordbatch.workspace = true
@@ -23,7 +24,6 @@ datatypes.workspace = true
flatbuffers = "25.2"
hyper.workspace = true
lazy_static.workspace = true
notify.workspace = true
prost.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -38,11 +38,10 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to watch config file path: {}", path))]
#[snafu(display("Failed to watch config file"))]
FileWatch {
path: String,
#[snafu(source)]
error: notify::Error,
source: common_config::error::Error,
#[snafu(implicit)]
location: Location,
},

View File

@@ -46,13 +46,16 @@ pub struct DoPutResponse {
request_id: i64,
/// The successfully ingested rows number.
affected_rows: AffectedRows,
/// The elapsed time in seconds for handling the bulk insert.
elapsed_secs: f64,
}
impl DoPutResponse {
pub fn new(request_id: i64, affected_rows: AffectedRows) -> Self {
pub fn new(request_id: i64, affected_rows: AffectedRows, elapsed_secs: f64) -> Self {
Self {
request_id,
affected_rows,
elapsed_secs,
}
}
@@ -63,6 +66,10 @@ impl DoPutResponse {
pub fn affected_rows(&self) -> AffectedRows {
self.affected_rows
}
pub fn elapsed_secs(&self) -> f64 {
self.elapsed_secs
}
}
impl TryFrom<PutResult> for DoPutResponse {
@@ -86,8 +93,11 @@ mod tests {
#[test]
fn test_serde_do_put_response() {
let x = DoPutResponse::new(42, 88);
let x = DoPutResponse::new(42, 88, 0.123);
let serialized = serde_json::to_string(&x).unwrap();
assert_eq!(serialized, r#"{"request_id":42,"affected_rows":88}"#);
assert_eq!(
serialized,
r#"{"request_id":42,"affected_rows":88,"elapsed_secs":0.123}"#
);
}
}
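One way a client might use the new field; a sketch only, assuming AffectedRows is a plain integer alias (imports omitted, DoPutResponse is the type shown above):

fn ingest_rate(resp: &DoPutResponse) -> Option<f64> {
    // Rows per second for this bulk insert; None when the reported elapsed time is not positive.
    let secs = resp.elapsed_secs();
    (secs > 0.0).then(|| resp.affected_rows() as f64 / secs)
}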

View File

@@ -15,11 +15,10 @@
use std::path::Path;
use std::result::Result as StdResult;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::mpsc::channel;
use std::sync::{Arc, RwLock};
use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};
use common_telemetry::{error, info};
use notify::{EventKind, RecursiveMode, Watcher};
use snafu::ResultExt;
use crate::error::{FileWatchSnafu, Result};
@@ -119,45 +118,28 @@ where
return Ok(());
}
let watch_paths: Vec<_> = tls_config
.get_tls_option()
.watch_paths()
.iter()
.map(|p| p.to_path_buf())
.collect();
let tls_config_for_watcher = tls_config.clone();
let (tx, rx) = channel::<notify::Result<notify::Event>>();
let mut watcher = notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;
// Watch all paths returned by the TlsConfigLoader
for path in tls_config.get_tls_option().watch_paths() {
watcher
.watch(path, RecursiveMode::NonRecursive)
.with_context(|_| FileWatchSnafu {
path: path.display().to_string(),
})?;
}
info!("Spawning background task for watching TLS cert/key file changes");
std::thread::spawn(move || {
let _watcher = watcher;
loop {
match rx.recv() {
Ok(Ok(event)) => {
if let EventKind::Modify(_) | EventKind::Create(_) = event.kind {
info!("Detected TLS cert/key file change: {:?}", event);
if let Err(err) = tls_config_for_watcher.reload() {
error!("Failed to reload TLS config: {}", err);
} else {
info!("Reloaded TLS cert/key file successfully.");
on_reload();
}
}
}
Ok(Err(err)) => {
error!("Failed to watch TLS cert/key file: {}", err);
}
Err(err) => {
error!("TLS cert/key file watcher channel closed: {}", err);
}
FileWatcherBuilder::new()
.watch_paths(&watch_paths)
.context(FileWatchSnafu)?
.config(FileWatcherConfig::new())
.spawn(move || {
if let Err(err) = tls_config_for_watcher.reload() {
error!("Failed to reload TLS config: {}", err);
} else {
info!("Reloaded TLS cert/key file successfully.");
on_reload();
}
}
});
})
.context(FileWatchSnafu)?;
Ok(())
}

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{Display, Formatter};
use std::hash::{DefaultHasher, Hash, Hasher};
use std::str::FromStr;
@@ -60,7 +61,7 @@ pub trait ClusterInfo {
}
/// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-0-{role}-{node_id}`.
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)]
pub struct NodeInfoKey {
/// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
pub role: Role,
@@ -135,7 +136,7 @@ pub struct NodeInfo {
pub hostname: String,
}
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Role {
Datanode,
Frontend,
@@ -241,6 +242,12 @@ impl From<&NodeInfoKey> for Vec<u8> {
}
}
impl Display for NodeInfoKey {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}-{}", self.role, self.node_id)
}
}
impl FromStr for NodeInfo {
type Err = Error;
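A small sketch of what the added Display and Ord derives enable; the common_meta::cluster path and the visibility of node_id are assumptions, since only role is shown as pub in this hunk:

fn print_sorted(mut keys: Vec<common_meta::cluster::NodeInfoKey>) {
    // Ord is derived field by field, so keys order by role first and node id second.
    keys.sort();
    for key in &keys {
        // Display renders "{role:?}-{node_id}", e.g. "Frontend-2".
        println!("{key}");
    }
}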

View File

@@ -31,6 +31,7 @@ use crate::region_registry::LeaderRegionRegistryRef;
pub mod alter_database;
pub mod alter_logical_tables;
pub mod alter_table;
pub mod comment_on;
pub mod create_database;
pub mod create_flow;
pub mod create_logical_tables;

View File

@@ -301,8 +301,8 @@ fn build_new_table_info(
| AlterKind::UnsetTableOptions { .. }
| AlterKind::SetIndexes { .. }
| AlterKind::UnsetIndexes { .. }
| AlterKind::DropDefaults { .. } => {}
AlterKind::SetDefaults { .. } => {}
| AlterKind::DropDefaults { .. }
| AlterKind::SetDefaults { .. } => {}
}
info!(

View File

@@ -0,0 +1,509 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use async_trait::async_trait;
use chrono::Utc;
use common_catalog::format_full_table_name;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status};
use common_telemetry::tracing::info;
use datatypes::schema::COMMENT_KEY as COLUMN_COMMENT_KEY;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt, ensure};
use store_api::storage::TableId;
use strum::AsRefStr;
use table::metadata::RawTableInfo;
use table::requests::COMMENT_KEY as TABLE_COMMENT_KEY;
use table::table_name::TableName;
use crate::cache_invalidator::Context;
use crate::ddl::DdlContext;
use crate::ddl::utils::map_to_procedure_error;
use crate::error::{ColumnNotFoundSnafu, FlowNotFoundSnafu, Result, TableNotFoundSnafu};
use crate::instruction::CacheIdent;
use crate::key::flow::flow_info::{FlowInfoKey, FlowInfoValue};
use crate::key::table_info::{TableInfoKey, TableInfoValue};
use crate::key::table_name::TableNameKey;
use crate::key::{DeserializedValueWithBytes, FlowId, MetadataKey, MetadataValue};
use crate::lock_key::{CatalogLock, FlowNameLock, SchemaLock, TableNameLock};
use crate::rpc::ddl::{CommentObjectType, CommentOnTask};
use crate::rpc::store::PutRequest;
pub struct CommentOnProcedure {
pub context: DdlContext,
pub data: CommentOnData,
}
impl CommentOnProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::CommentOn";
pub fn new(task: CommentOnTask, context: DdlContext) -> Self {
Self {
context,
data: CommentOnData::new(task),
}
}
pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
let data = serde_json::from_str(json).context(FromJsonSnafu)?;
Ok(Self { context, data })
}
pub async fn on_prepare(&mut self) -> Result<Status> {
match self.data.object_type {
CommentObjectType::Table | CommentObjectType::Column => {
self.prepare_table_or_column().await?;
}
CommentObjectType::Flow => {
self.prepare_flow().await?;
}
}
// Fast path: if comment is unchanged, skip update
if self.data.is_unchanged {
let object_desc = match self.data.object_type {
CommentObjectType::Table => format!(
"table {}",
format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
)
),
CommentObjectType::Column => format!(
"column {}.{}",
format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
),
self.data.column_name.as_ref().unwrap()
),
CommentObjectType::Flow => {
format!("flow {}.{}", self.data.catalog_name, self.data.object_name)
}
};
info!("Comment unchanged for {}, skipping update", object_desc);
return Ok(Status::done());
}
self.data.state = CommentOnState::UpdateMetadata;
Ok(Status::executing(true))
}
async fn prepare_table_or_column(&mut self) -> Result<()> {
let table_name_key = TableNameKey::new(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
);
let table_id = self
.context
.table_metadata_manager
.table_name_manager()
.get(table_name_key)
.await?
.with_context(|| TableNotFoundSnafu {
table_name: format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
),
})?
.table_id();
let table_info = self
.context
.table_metadata_manager
.table_info_manager()
.get(table_id)
.await?
.with_context(|| TableNotFoundSnafu {
table_name: format_full_table_name(
&self.data.catalog_name,
&self.data.schema_name,
&self.data.object_name,
),
})?;
// For column comments, validate the column exists
if self.data.object_type == CommentObjectType::Column {
let column_name = self.data.column_name.as_ref().unwrap();
let column_exists = table_info
.table_info
.meta
.schema
.column_schemas
.iter()
.any(|col| &col.name == column_name);
ensure!(
column_exists,
ColumnNotFoundSnafu {
column_name,
column_id: 0u32, // column_id is not known here
}
);
}
self.data.table_id = Some(table_id);
// Check if comment is unchanged for early exit optimization
match self.data.object_type {
CommentObjectType::Table => {
let current_comment = &table_info.table_info.desc;
if &self.data.comment == current_comment {
self.data.is_unchanged = true;
}
}
CommentObjectType::Column => {
let column_name = self.data.column_name.as_ref().unwrap();
let column_schema = table_info
.table_info
.meta
.schema
.column_schemas
.iter()
.find(|col| &col.name == column_name)
.unwrap(); // Safe: validated above
let current_comment = column_schema.metadata().get(COLUMN_COMMENT_KEY);
if self.data.comment.as_deref() == current_comment.map(String::as_str) {
self.data.is_unchanged = true;
}
}
CommentObjectType::Flow => {
// this branch is handled in `prepare_flow`
}
}
self.data.table_info = Some(table_info);
Ok(())
}
async fn prepare_flow(&mut self) -> Result<()> {
let flow_name_value = self
.context
.flow_metadata_manager
.flow_name_manager()
.get(&self.data.catalog_name, &self.data.object_name)
.await?
.with_context(|| FlowNotFoundSnafu {
flow_name: &self.data.object_name,
})?;
let flow_id = flow_name_value.flow_id();
let flow_info = self
.context
.flow_metadata_manager
.flow_info_manager()
.get_raw(flow_id)
.await?
.with_context(|| FlowNotFoundSnafu {
flow_name: &self.data.object_name,
})?;
self.data.flow_id = Some(flow_id);
// Check if comment is unchanged for early exit optimization
let current_comment = &flow_info.get_inner_ref().comment;
let new_comment = self.data.comment.as_deref().unwrap_or("");
if new_comment == current_comment.as_str() {
self.data.is_unchanged = true;
}
self.data.flow_info = Some(flow_info);
Ok(())
}
pub async fn on_update_metadata(&mut self) -> Result<Status> {
match self.data.object_type {
CommentObjectType::Table => {
self.update_table_comment().await?;
}
CommentObjectType::Column => {
self.update_column_comment().await?;
}
CommentObjectType::Flow => {
self.update_flow_comment().await?;
}
}
self.data.state = CommentOnState::InvalidateCache;
Ok(Status::executing(true))
}
async fn update_table_comment(&mut self) -> Result<()> {
let table_info_value = self.data.table_info.as_ref().unwrap();
let mut new_table_info = table_info_value.table_info.clone();
new_table_info.desc = self.data.comment.clone();
// Sync comment to table options
sync_table_comment_option(
&mut new_table_info.meta.options,
new_table_info.desc.as_deref(),
);
self.update_table_info(table_info_value, new_table_info)
.await?;
info!(
"Updated comment for table {}.{}.{}",
self.data.catalog_name, self.data.schema_name, self.data.object_name
);
Ok(())
}
async fn update_column_comment(&mut self) -> Result<()> {
let table_info_value = self.data.table_info.as_ref().unwrap();
let mut new_table_info = table_info_value.table_info.clone();
let column_name = self.data.column_name.as_ref().unwrap();
let column_schema = new_table_info
.meta
.schema
.column_schemas
.iter_mut()
.find(|col| &col.name == column_name)
.unwrap(); // Safe: validated in prepare
update_column_comment_metadata(column_schema, self.data.comment.clone());
self.update_table_info(table_info_value, new_table_info)
.await?;
info!(
"Updated comment for column {}.{}.{}.{}",
self.data.catalog_name, self.data.schema_name, self.data.object_name, column_name
);
Ok(())
}
async fn update_flow_comment(&mut self) -> Result<()> {
let flow_id = self.data.flow_id.unwrap();
let flow_info_value = self.data.flow_info.as_ref().unwrap();
let mut new_flow_info = flow_info_value.get_inner_ref().clone();
new_flow_info.comment = self.data.comment.clone().unwrap_or_default();
new_flow_info.updated_time = Utc::now();
let raw_value = new_flow_info.try_as_raw_value()?;
self.context
.table_metadata_manager
.kv_backend()
.put(
PutRequest::new()
.with_key(FlowInfoKey::new(flow_id).to_bytes())
.with_value(raw_value),
)
.await?;
info!(
"Updated comment for flow {}.{}",
self.data.catalog_name, self.data.object_name
);
Ok(())
}
async fn update_table_info(
&self,
current_table_info: &DeserializedValueWithBytes<TableInfoValue>,
new_table_info: RawTableInfo,
) -> Result<()> {
let table_id = current_table_info.table_info.ident.table_id;
let new_table_info_value = current_table_info.update(new_table_info);
let raw_value = new_table_info_value.try_as_raw_value()?;
self.context
.table_metadata_manager
.kv_backend()
.put(
PutRequest::new()
.with_key(TableInfoKey::new(table_id).to_bytes())
.with_value(raw_value),
)
.await?;
Ok(())
}
pub async fn on_invalidate_cache(&mut self) -> Result<Status> {
let cache_invalidator = &self.context.cache_invalidator;
match self.data.object_type {
CommentObjectType::Table | CommentObjectType::Column => {
let table_id = self.data.table_id.unwrap();
let table_name = TableName::new(
self.data.catalog_name.clone(),
self.data.schema_name.clone(),
self.data.object_name.clone(),
);
let cache_ident = vec![
CacheIdent::TableId(table_id),
CacheIdent::TableName(table_name),
];
cache_invalidator
.invalidate(&Context::default(), &cache_ident)
.await?;
}
CommentObjectType::Flow => {
let flow_id = self.data.flow_id.unwrap();
let cache_ident = vec![CacheIdent::FlowId(flow_id)];
cache_invalidator
.invalidate(&Context::default(), &cache_ident)
.await?;
}
}
Ok(Status::done())
}
}
#[async_trait]
impl Procedure for CommentOnProcedure {
fn type_name(&self) -> &str {
Self::TYPE_NAME
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
match self.data.state {
CommentOnState::Prepare => self.on_prepare().await,
CommentOnState::UpdateMetadata => self.on_update_metadata().await,
CommentOnState::InvalidateCache => self.on_invalidate_cache().await,
}
.map_err(map_to_procedure_error)
}
fn dump(&self) -> ProcedureResult<String> {
serde_json::to_string(&self.data).context(ToJsonSnafu)
}
fn lock_key(&self) -> LockKey {
let catalog = &self.data.catalog_name;
let schema = &self.data.schema_name;
let lock_key = match self.data.object_type {
CommentObjectType::Table | CommentObjectType::Column => {
vec![
CatalogLock::Read(catalog).into(),
SchemaLock::read(catalog, schema).into(),
TableNameLock::new(catalog, schema, &self.data.object_name).into(),
]
}
CommentObjectType::Flow => {
vec![
CatalogLock::Read(catalog).into(),
FlowNameLock::new(catalog, &self.data.object_name).into(),
]
}
};
LockKey::new(lock_key)
}
}
#[derive(Debug, Serialize, Deserialize, AsRefStr)]
enum CommentOnState {
Prepare,
UpdateMetadata,
InvalidateCache,
}
/// The data of comment on procedure.
#[derive(Debug, Serialize, Deserialize)]
pub struct CommentOnData {
state: CommentOnState,
catalog_name: String,
schema_name: String,
object_type: CommentObjectType,
object_name: String,
/// Column name (only for Column comments)
column_name: Option<String>,
comment: Option<String>,
/// Cached table ID (for Table/Column)
#[serde(skip_serializing_if = "Option::is_none")]
table_id: Option<TableId>,
/// Cached table info (for Table/Column)
#[serde(skip)]
table_info: Option<DeserializedValueWithBytes<TableInfoValue>>,
/// Cached flow ID (for Flow)
#[serde(skip_serializing_if = "Option::is_none")]
flow_id: Option<FlowId>,
/// Cached flow info (for Flow)
#[serde(skip)]
flow_info: Option<DeserializedValueWithBytes<FlowInfoValue>>,
/// Whether the comment is unchanged (optimization for early exit)
#[serde(skip)]
is_unchanged: bool,
}
impl CommentOnData {
pub fn new(task: CommentOnTask) -> Self {
Self {
state: CommentOnState::Prepare,
catalog_name: task.catalog_name,
schema_name: task.schema_name,
object_type: task.object_type,
object_name: task.object_name,
column_name: task.column_name,
comment: task.comment,
table_id: None,
table_info: None,
flow_id: None,
flow_info: None,
is_unchanged: false,
}
}
}
fn update_column_comment_metadata(
column_schema: &mut datatypes::schema::ColumnSchema,
comment: Option<String>,
) {
match comment {
Some(value) => {
column_schema
.mut_metadata()
.insert(COLUMN_COMMENT_KEY.to_string(), value);
}
None => {
column_schema.mut_metadata().remove(COLUMN_COMMENT_KEY);
}
}
}
fn sync_table_comment_option(options: &mut table::requests::TableOptions, comment: Option<&str>) {
match comment {
Some(value) => {
options
.extra_options
.insert(TABLE_COMMENT_KEY.to_string(), value.to_string());
}
None => {
options.extra_options.remove(TABLE_COMMENT_KEY);
}
}
}
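Written as if it were a unit test inside this module (both helpers are private), a sketch of how the column-comment helper behaves; ColumnSchema::new and ConcreteDataType are used here on the assumption of their usual signatures:

fn column_comment_round_trip() {
    use datatypes::prelude::ConcreteDataType;
    use datatypes::schema::{COMMENT_KEY, ColumnSchema};

    let mut column = ColumnSchema::new("host", ConcreteDataType::string_datatype(), true);

    // Setting a comment stores it in the column metadata under COMMENT_KEY...
    update_column_comment_metadata(&mut column, Some("host name".to_string()));
    assert_eq!(
        column.metadata().get(COMMENT_KEY).map(String::as_str),
        Some("host name")
    );

    // ...and a None comment removes the entry again, i.e. the comment is cleared.
    update_column_comment_metadata(&mut column, None);
    assert!(column.metadata().get(COMMENT_KEY).is_none());
}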

View File

@@ -27,6 +27,7 @@ use store_api::storage::TableId;
use crate::ddl::alter_database::AlterDatabaseProcedure;
use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::ddl::comment_on::CommentOnProcedure;
use crate::ddl::create_database::CreateDatabaseProcedure;
use crate::ddl::create_flow::CreateFlowProcedure;
use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
@@ -52,18 +53,18 @@ use crate::rpc::ddl::DdlTask::CreateTrigger;
#[cfg(feature = "enterprise")]
use crate::rpc::ddl::DdlTask::DropTrigger;
use crate::rpc::ddl::DdlTask::{
AlterDatabase, AlterLogicalTables, AlterTable, CreateDatabase, CreateFlow, CreateLogicalTables,
CreateTable, CreateView, DropDatabase, DropFlow, DropLogicalTables, DropTable, DropView,
TruncateTable,
AlterDatabase, AlterLogicalTables, AlterTable, CommentOn, CreateDatabase, CreateFlow,
CreateLogicalTables, CreateTable, CreateView, DropDatabase, DropFlow, DropLogicalTables,
DropTable, DropView, TruncateTable,
};
#[cfg(feature = "enterprise")]
use crate::rpc::ddl::trigger::CreateTriggerTask;
#[cfg(feature = "enterprise")]
use crate::rpc::ddl::trigger::DropTriggerTask;
use crate::rpc::ddl::{
AlterDatabaseTask, AlterTableTask, CreateDatabaseTask, CreateFlowTask, CreateTableTask,
CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask, QueryContext,
SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask,
AlterDatabaseTask, AlterTableTask, CommentOnTask, CreateDatabaseTask, CreateFlowTask,
CreateTableTask, CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask,
QueryContext, SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask,
};
use crate::rpc::router::RegionRoute;
@@ -192,7 +193,8 @@ impl DdlManager {
TruncateTableProcedure,
CreateDatabaseProcedure,
DropDatabaseProcedure,
DropViewProcedure
DropViewProcedure,
CommentOnProcedure
);
for (type_name, loader_factory) in loaders {
@@ -408,6 +410,19 @@ impl DdlManager {
self.submit_procedure(procedure_with_id).await
}
/// Submits and executes a comment on task.
#[tracing::instrument(skip_all)]
pub async fn submit_comment_on_task(
&self,
comment_on_task: CommentOnTask,
) -> Result<(ProcedureId, Option<Output>)> {
let context = self.create_context();
let procedure = CommentOnProcedure::new(comment_on_task, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
self.submit_procedure(procedure_with_id).await
}
async fn submit_procedure(
&self,
procedure_with_id: ProcedureWithId,
@@ -476,6 +491,7 @@ impl DdlManager {
handle_create_view_task(self, create_view_task).await
}
DropView(drop_view_task) => handle_drop_view_task(self, drop_view_task).await,
CommentOn(comment_on_task) => handle_comment_on_task(self, comment_on_task).await,
#[cfg(feature = "enterprise")]
CreateTrigger(create_trigger_task) => {
handle_create_trigger_task(
@@ -907,6 +923,26 @@ async fn handle_create_view_task(
})
}
async fn handle_comment_on_task(
ddl_manager: &DdlManager,
comment_on_task: CommentOnTask,
) -> Result<SubmitDdlTaskResponse> {
let (id, _) = ddl_manager
.submit_comment_on_task(comment_on_task.clone())
.await?;
let procedure_id = id.to_string();
info!(
"Comment on {}.{}.{} is updated via procedure_id {id:?}",
comment_on_task.catalog_name, comment_on_task.schema_name, comment_on_task.object_name
);
Ok(SubmitDdlTaskResponse {
key: procedure_id.into(),
..Default::default()
})
}
#[cfg(test)]
mod tests {
use std::sync::Arc;

View File

@@ -14,6 +14,8 @@
use std::time::Duration;
use etcd_client::ConnectOptions;
/// Heartbeat interval (the basic unit for the other time constants in this module).
pub const HEARTBEAT_INTERVAL_MILLIS: u64 = 3000;
@@ -41,6 +43,23 @@ pub const POSTGRES_KEEP_ALIVE_SECS: u64 = 30;
/// In a lease, there are two opportunities for renewal.
pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2;
/// The timeout of the heartbeat request.
pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
/// The keep-alive interval of the heartbeat channel.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration = Duration::from_secs(15);
/// The keep-alive timeout of the heartbeat channel.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration = Duration::from_secs(5);
/// The default options for the etcd client.
pub fn default_etcd_client_options() -> ConnectOptions {
ConnectOptions::new()
.with_keep_alive_while_idle(true)
.with_keep_alive(Duration::from_secs(15), Duration::from_secs(5))
.with_connect_timeout(Duration::from_secs(10))
}
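A usage sketch for the helper above, assuming callers hand the options to the etcd-client crate's Client::connect; that call site is not part of this diff:

async fn connect_etcd(endpoints: &[String]) -> etcd_client::Client {
    // 15s keep-alive pings with a 5s timeout, plus a 10s connect timeout,
    // exactly as configured by default_etcd_client_options().
    etcd_client::Client::connect(endpoints, Some(default_etcd_client_options()))
        .await
        .expect("failed to connect to etcd")
}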
/// The default mailbox round-trip timeout.
pub const MAILBOX_RTT_SECS: u64 = 1;

View File

@@ -272,13 +272,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to send message: {err_msg}"))]
SendMessage {
err_msg: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to serde json"))]
SerdeJson {
#[snafu(source)]
@@ -1118,7 +1111,7 @@ impl ErrorExt for Error {
| DeserializeFlexbuffers { .. }
| ConvertTimeRanges { .. } => StatusCode::Unexpected,
SendMessage { .. } | GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,
GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,
SchemaAlreadyExists { .. } => StatusCode::DatabaseAlreadyExists,

View File

@@ -23,6 +23,7 @@ use crate::heartbeat::mailbox::{IncomingMessage, MailboxRef};
pub mod invalidate_table_cache;
pub mod parse_mailbox_message;
pub mod suspend;
#[cfg(test)]
mod tests;

View File

@@ -0,0 +1,69 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use async_trait::async_trait;
use common_telemetry::{info, warn};
use crate::error::Result;
use crate::heartbeat::handler::{
HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext,
};
use crate::instruction::Instruction;
/// A heartbeat response handler that handles the special "suspend" instruction.
/// It simply sets or clears (if previously set) the inner suspend atomic state.
pub struct SuspendHandler {
suspend: Arc<AtomicBool>,
}
impl SuspendHandler {
pub fn new(suspend: Arc<AtomicBool>) -> Self {
Self { suspend }
}
}
#[async_trait]
impl HeartbeatResponseHandler for SuspendHandler {
fn is_acceptable(&self, context: &HeartbeatResponseHandlerContext) -> bool {
matches!(
context.incoming_message,
Some((_, Instruction::Suspend)) | None
)
}
async fn handle(&self, context: &mut HeartbeatResponseHandlerContext) -> Result<HandleControl> {
let flip_state = |expect: bool| {
self.suspend
.compare_exchange(expect, !expect, Ordering::Relaxed, Ordering::Relaxed)
.is_ok()
};
if let Some((_, Instruction::Suspend)) = context.incoming_message.take() {
if flip_state(false) {
warn!("Suspend instruction received from meta, entering suspension state");
}
} else {
// Suspended components should always try to get out of this state on their own; we don't
// want an explicit "un-suspend" instruction to resume them, as that can be error-prone.
// So if the "suspend" instruction is absent from the heartbeat, simply clear the state.
if flip_state(true) {
info!("clear suspend state");
}
}
Ok(HandleControl::Continue)
}
}
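The toggle above relies on compare_exchange so that each transition is observed exactly once; a standalone sketch of the same pattern, detached from the heartbeat plumbing:

use std::sync::atomic::{AtomicBool, Ordering};

/// Flip the flag only when it currently holds `expect`; returns true when a transition
/// actually happened, which is why the handler logs enter/leave at most once per change.
fn flip_state(suspend: &AtomicBool, expect: bool) -> bool {
    suspend
        .compare_exchange(expect, !expect, Ordering::Relaxed, Ordering::Relaxed)
        .is_ok()
}

fn demo() {
    let suspend = AtomicBool::new(false);
    assert!(flip_state(&suspend, false)); // Suspend instruction arrives: false -> true.
    assert!(!flip_state(&suspend, false)); // Already suspended: no second transition, no extra log.
    assert!(flip_state(&suspend, true)); // Heartbeat without Suspend: clear the state.
}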

View File

@@ -15,8 +15,8 @@
use std::sync::Arc;
use tokio::sync::mpsc::Sender;
use tokio::sync::mpsc::error::SendError;
use crate::error::{self, Result};
use crate::instruction::{Instruction, InstructionReply};
pub type IncomingMessage = (MessageMeta, Instruction);
@@ -51,13 +51,8 @@ impl HeartbeatMailbox {
Self { sender }
}
pub async fn send(&self, message: OutgoingMessage) -> Result<()> {
self.sender.send(message).await.map_err(|e| {
error::SendMessageSnafu {
err_msg: e.to_string(),
}
.build()
})
pub async fn send(&self, message: OutgoingMessage) -> Result<(), SendError<OutgoingMessage>> {
self.sender.send(message).await
}
}
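With the new signature the caller gets the undeliverable message back instead of a stringified error; a caller-side sketch written as if it lived in this module, assuming OutgoingMessage implements Debug:

async fn send_reply(mailbox: &HeartbeatMailbox, message: OutgoingMessage) {
    // tokio's SendError wraps the original value, so it can be logged or retried.
    if let Err(tokio::sync::mpsc::error::SendError(unsent)) = mailbox.send(message).await {
        common_telemetry::warn!("Heartbeat mailbox closed; dropping outgoing message: {:?}", unsent);
    }
}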

View File

@@ -539,6 +539,8 @@ pub enum Instruction {
GetFileRefs(GetFileRefs),
/// Triggers garbage collection for a region.
GcRegions(GcRegions),
/// Temporarily suspend serving reads or writes.
Suspend,
}
impl Instruction {

View File

@@ -94,7 +94,7 @@ impl TableInfoValue {
}
}
pub(crate) fn update(&self, new_table_info: RawTableInfo) -> Self {
pub fn update(&self, new_table_info: RawTableInfo) -> Self {
Self {
table_info: new_table_info,
version: self.version + 1,

View File

@@ -23,19 +23,20 @@ use api::v1::alter_database_expr::Kind as PbAlterDatabaseKind;
use api::v1::meta::ddl_task_request::Task;
use api::v1::meta::{
AlterDatabaseTask as PbAlterDatabaseTask, AlterTableTask as PbAlterTableTask,
AlterTableTasks as PbAlterTableTasks, CreateDatabaseTask as PbCreateDatabaseTask,
CreateFlowTask as PbCreateFlowTask, CreateTableTask as PbCreateTableTask,
CreateTableTasks as PbCreateTableTasks, CreateViewTask as PbCreateViewTask,
DdlTaskRequest as PbDdlTaskRequest, DdlTaskResponse as PbDdlTaskResponse,
DropDatabaseTask as PbDropDatabaseTask, DropFlowTask as PbDropFlowTask,
DropTableTask as PbDropTableTask, DropTableTasks as PbDropTableTasks,
DropViewTask as PbDropViewTask, Partition, ProcedureId,
AlterTableTasks as PbAlterTableTasks, CommentOnTask as PbCommentOnTask,
CreateDatabaseTask as PbCreateDatabaseTask, CreateFlowTask as PbCreateFlowTask,
CreateTableTask as PbCreateTableTask, CreateTableTasks as PbCreateTableTasks,
CreateViewTask as PbCreateViewTask, DdlTaskRequest as PbDdlTaskRequest,
DdlTaskResponse as PbDdlTaskResponse, DropDatabaseTask as PbDropDatabaseTask,
DropFlowTask as PbDropFlowTask, DropTableTask as PbDropTableTask,
DropTableTasks as PbDropTableTasks, DropViewTask as PbDropViewTask, Partition, ProcedureId,
TruncateTableTask as PbTruncateTableTask,
};
use api::v1::{
AlterDatabaseExpr, AlterTableExpr, CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr,
CreateViewExpr, DropDatabaseExpr, DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval,
ExpireAfter, Option as PbOption, QueryContext as PbQueryContext, TruncateTableExpr,
AlterDatabaseExpr, AlterTableExpr, CommentObjectType as PbCommentObjectType, CommentOnExpr,
CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr, CreateViewExpr, DropDatabaseExpr,
DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval, ExpireAfter, Option as PbOption,
QueryContext as PbQueryContext, TruncateTableExpr,
};
use base64::Engine as _;
use base64::engine::general_purpose;
@@ -78,6 +79,7 @@ pub enum DdlTask {
DropView(DropViewTask),
#[cfg(feature = "enterprise")]
CreateTrigger(trigger::CreateTriggerTask),
CommentOn(CommentOnTask),
}
impl DdlTask {
@@ -200,6 +202,11 @@ impl DdlTask {
view_info,
})
}
/// Creates a [`DdlTask`] to comment on a table, column, or flow.
pub fn new_comment_on(task: CommentOnTask) -> Self {
DdlTask::CommentOn(task)
}
}
impl TryFrom<Task> for DdlTask {
@@ -278,6 +285,7 @@ impl TryFrom<Task> for DdlTask {
.fail()
}
}
Task::CommentOnTask(comment_on) => Ok(DdlTask::CommentOn(comment_on.try_into()?)),
}
}
}
@@ -332,6 +340,7 @@ impl TryFrom<SubmitDdlTaskRequest> for PbDdlTaskRequest {
DdlTask::CreateTrigger(task) => Task::CreateTriggerTask(task.try_into()?),
#[cfg(feature = "enterprise")]
DdlTask::DropTrigger(task) => Task::DropTriggerTask(task.into()),
DdlTask::CommentOn(task) => Task::CommentOnTask(task.into()),
};
Ok(Self {
@@ -1277,6 +1286,119 @@ impl From<DropFlowTask> for PbDropFlowTask {
}
}
/// Represents the ID of the object being commented on (Table or Flow).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CommentObjectId {
Table(TableId),
Flow(FlowId),
}
/// Comment on table, column, or flow
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct CommentOnTask {
pub catalog_name: String,
pub schema_name: String,
pub object_type: CommentObjectType,
pub object_name: String,
/// Column name (only for Column comments)
pub column_name: Option<String>,
/// Object ID (Table or Flow) for validation and cache invalidation
pub object_id: Option<CommentObjectId>,
pub comment: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CommentObjectType {
Table,
Column,
Flow,
}
impl CommentOnTask {
pub fn table_ref(&self) -> TableReference<'_> {
TableReference {
catalog: &self.catalog_name,
schema: &self.schema_name,
table: &self.object_name,
}
}
}
// Proto conversions for CommentObjectType
impl From<CommentObjectType> for PbCommentObjectType {
fn from(object_type: CommentObjectType) -> Self {
match object_type {
CommentObjectType::Table => PbCommentObjectType::Table,
CommentObjectType::Column => PbCommentObjectType::Column,
CommentObjectType::Flow => PbCommentObjectType::Flow,
}
}
}
impl TryFrom<i32> for CommentObjectType {
type Error = error::Error;
fn try_from(value: i32) -> Result<Self> {
match value {
0 => Ok(CommentObjectType::Table),
1 => Ok(CommentObjectType::Column),
2 => Ok(CommentObjectType::Flow),
_ => error::InvalidProtoMsgSnafu {
err_msg: format!(
"Invalid CommentObjectType value: {}. Valid values are: 0 (Table), 1 (Column), 2 (Flow)",
value
),
}
.fail(),
}
}
}
// Proto conversions for CommentOnTask
impl TryFrom<PbCommentOnTask> for CommentOnTask {
type Error = error::Error;
fn try_from(pb: PbCommentOnTask) -> Result<Self> {
let comment_on = pb.comment_on.context(error::InvalidProtoMsgSnafu {
err_msg: "expected comment_on",
})?;
Ok(CommentOnTask {
catalog_name: comment_on.catalog_name,
schema_name: comment_on.schema_name,
object_type: comment_on.object_type.try_into()?,
object_name: comment_on.object_name,
column_name: if comment_on.column_name.is_empty() {
None
} else {
Some(comment_on.column_name)
},
comment: if comment_on.comment.is_empty() {
None
} else {
Some(comment_on.comment)
},
object_id: None,
})
}
}
impl From<CommentOnTask> for PbCommentOnTask {
fn from(task: CommentOnTask) -> Self {
let pb_object_type: PbCommentObjectType = task.object_type.into();
PbCommentOnTask {
comment_on: Some(CommentOnExpr {
catalog_name: task.catalog_name,
schema_name: task.schema_name,
object_type: pb_object_type as i32,
object_name: task.object_name,
column_name: task.column_name.unwrap_or_default(),
comment: task.comment.unwrap_or_default(),
}),
}
}
}
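A short construction sketch for the new task type and its protobuf mapping, written as if inside this module; all values are illustrative:

fn comment_on_column_task() -> PbCommentOnTask {
    let task = CommentOnTask {
        catalog_name: "greptime".to_string(),
        schema_name: "public".to_string(),
        object_type: CommentObjectType::Column,
        object_name: "monitor".to_string(),
        column_name: Some("host".to_string()),
        object_id: None,
        comment: Some("host name".to_string()),
    };
    // None comments and column names become empty strings on the wire, mirroring the
    // TryFrom<PbCommentOnTask> impl above that maps empty strings back to None.
    PbCommentOnTask::from(task)
}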
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct QueryContext {
pub(crate) current_catalog: String,

View File

@@ -14,7 +14,7 @@
use common_telemetry::{debug, error, info};
use common_wal::config::kafka::common::{
DEFAULT_BACKOFF_CONFIG, KafkaConnectionConfig, KafkaTopicConfig,
DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT, KafkaConnectionConfig, KafkaTopicConfig,
};
use rskafka::client::error::Error as RsKafkaError;
use rskafka::client::error::ProtocolError::TopicAlreadyExists;
@@ -205,11 +205,13 @@ impl KafkaTopicCreator {
self.partition_client(topic).await.unwrap()
}
}
/// Builds a kafka [Client](rskafka::client::Client).
pub async fn build_kafka_client(connection: &KafkaConnectionConfig) -> Result<Client> {
// Builds a Kafka controller client for creating topics.
let mut builder = ClientBuilder::new(connection.broker_endpoints.clone())
.backoff_config(DEFAULT_BACKOFF_CONFIG);
.backoff_config(DEFAULT_BACKOFF_CONFIG)
.connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
if let Some(sasl) = &connection.sasl {
builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
};
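// Illustrative usage (not part of this changeset): the new connect timeout is
// applied inside `build_kafka_client`, so callers only supply the connection
// config. The endpoint below is a placeholder, and this assumes
// `KafkaConnectionConfig` implements `Default`.
let connection = KafkaConnectionConfig {
    broker_endpoints: vec!["127.0.0.1:9092".to_string()],
    ..Default::default()
};
let client = build_kafka_client(&connection).await?;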

View File

@@ -331,8 +331,29 @@ impl Runner {
}
match status {
Status::Executing { .. } => {}
Status::Executing { .. } => {
let prev_state = self.meta.state();
if !matches!(prev_state, ProcedureState::Running) {
info!(
"Set Procedure {}-{} state to running, prev_state: {:?}",
self.procedure.type_name(),
self.meta.id,
prev_state
);
self.meta.set_state(ProcedureState::Running);
}
}
Status::Suspended { subprocedures, .. } => {
let prev_state = self.meta.state();
if !matches!(prev_state, ProcedureState::Running) {
info!(
"Set Procedure {}-{} state to running, prev_state: {:?}",
self.procedure.type_name(),
self.meta.id,
prev_state
);
self.meta.set_state(ProcedureState::Running);
}
self.on_suspended(subprocedures).await;
}
Status::Done { output } => {
@@ -393,8 +414,12 @@ impl Runner {
return;
}
self.meta
.set_state(ProcedureState::prepare_rollback(Arc::new(e)));
if self.procedure.rollback_supported() {
self.meta
.set_state(ProcedureState::prepare_rollback(Arc::new(e)));
} else {
self.meta.set_state(ProcedureState::failed(Arc::new(e)));
}
}
}
}
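// Net effect (not part of the diff itself): a failing procedure whose
// `rollback_supported()` returns false now moves straight to the `failed`
// state in a single `execute_once`, so no "*.rollback" step file is persisted
// for it; the test changes below reflect this.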
@@ -1080,20 +1105,10 @@ mod tests {
let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
runner.manager_ctx.start();
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_prepare_rollback(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_failed(), "{state:?}");
check_files(
&object_store,
&procedure_store,
ctx.procedure_id,
&["0000000000.rollback"],
)
.await;
check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
}
#[tokio::test]
@@ -1146,6 +1161,8 @@ mod tests {
async move {
if times == 1 {
Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
} else if times == 2 {
Ok(Status::executing(false))
} else {
Ok(Status::done())
}
@@ -1172,6 +1189,10 @@ mod tests {
let state = runner.meta.state();
assert!(state.is_retrying(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_running(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_done(), "{state:?}");
@@ -1185,6 +1206,86 @@ mod tests {
.await;
}
#[tokio::test(flavor = "multi_thread")]
async fn test_execute_on_retry_later_error_with_child() {
common_telemetry::init_default_ut_logging();
let mut times = 0;
let child_id = ProcedureId::random();
let exec_fn = move |_| {
times += 1;
async move {
debug!("times: {}", times);
if times == 1 {
Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
} else if times == 2 {
let exec_fn = |_| {
async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }
.boxed()
};
let fail = ProcedureAdapter {
data: "fail".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
poison_keys: PoisonKeys::default(),
exec_fn,
rollback_fn: None,
};
Ok(Status::Suspended {
subprocedures: vec![ProcedureWithId {
id: child_id,
procedure: Box::new(fail),
}],
persist: true,
})
} else {
Ok(Status::done())
}
}
.boxed()
};
let retry_later = ProcedureAdapter {
data: "retry_later".to_string(),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
poison_keys: PoisonKeys::default(),
exec_fn,
rollback_fn: None,
};
let dir = create_temp_dir("retry_later");
let meta = retry_later.new_meta(ROOT_ID);
let ctx = context_without_provider(meta.id);
let object_store = test_util::new_object_store(&dir);
let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
runner.manager_ctx.start();
debug!("execute_once 1");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_retrying(), "{state:?}");
let moved_meta = meta.clone();
tokio::spawn(async move {
moved_meta.child_notify.notify_one();
});
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_running(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_done(), "{state:?}");
assert!(meta.state().is_done());
check_files(
&object_store,
&procedure_store,
ctx.procedure_id,
&["0000000000.step", "0000000001.commit"],
)
.await;
}
#[tokio::test]
async fn test_execute_exceed_max_retry_later() {
let exec_fn =
@@ -1304,7 +1405,7 @@ mod tests {
async fn test_child_error() {
let mut times = 0;
let child_id = ProcedureId::random();
common_telemetry::init_default_ut_logging();
let exec_fn = move |ctx: Context| {
times += 1;
async move {
@@ -1529,7 +1630,7 @@ mod tests {
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_prepare_rollback(), "{state:?}");
assert!(state.is_failed(), "{state:?}");
let procedure_id = runner
.manager_ctx
@@ -1596,11 +1697,6 @@ mod tests {
let state = runner.meta.state();
assert!(state.is_running(), "{state:?}");
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_prepare_rollback(), "{state:?}");
assert!(meta.state().is_prepare_rollback());
runner.execute_once(&ctx).await;
let state = runner.meta.state();
assert!(state.is_failed(), "{state:?}");

View File

@@ -46,6 +46,22 @@ pub enum OutputData {
Stream(SendableRecordBatchStream),
}
impl OutputData {
/// Consume the data to pretty printed string.
pub async fn pretty_print(self) -> String {
match self {
OutputData::AffectedRows(x) => {
format!("Affected Rows: {x}")
}
OutputData::RecordBatches(x) => x.pretty_print().unwrap_or_else(|e| e.to_string()),
OutputData::Stream(x) => common_recordbatch::util::collect_batches(x)
.await
.and_then(|x| x.pretty_print())
.unwrap_or_else(|e| e.to_string()),
}
}
}
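// Illustrative usage (not part of this changeset): the affected-rows variant
// renders a one-line summary, while the record-batch and stream variants
// render an ASCII table (or the error message if collection/printing fails).
let output = OutputData::AffectedRows(3);
assert_eq!(output.pretty_print().await, "Affected Rows: 3");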
/// OutputMeta stores meta information produced/generated during the execution
#[derive(Debug, Default)]
pub struct OutputMeta {

View File

@@ -36,6 +36,9 @@ pub const DEFAULT_BACKOFF_CONFIG: BackoffConfig = BackoffConfig {
deadline: Some(Duration::from_secs(3)),
};
/// The default connect timeout for the Kafka client.
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
/// Default interval for auto WAL pruning.
pub const DEFAULT_AUTO_PRUNE_INTERVAL: Duration = Duration::from_mins(30);
/// Default limit for concurrent auto pruning tasks.

View File

@@ -22,6 +22,7 @@ use common_base::Plugins;
use common_error::ext::BoxedError;
use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
use common_meta::cache::{LayeredCacheRegistry, SchemaCacheRef, TableSchemaCacheRef};
use common_meta::cache_invalidator::CacheInvalidatorRef;
use common_meta::datanode::TopicStatsReporter;
use common_meta::key::runtime_switch::RuntimeSwitchManager;
use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef};
@@ -281,21 +282,11 @@ impl DatanodeBuilder {
open_all_regions.await?;
}
let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();
let heartbeat_task = if let Some(meta_client) = meta_client {
Some(
HeartbeatTask::try_new(
&self.opts,
region_server.clone(),
meta_client,
cache_registry,
self.plugins.clone(),
Arc::new(resource_stat),
)
.await?,
)
let task = self
.create_heartbeat_task(&region_server, meta_client, cache_registry)
.await?;
Some(task)
} else {
None
};
@@ -324,6 +315,29 @@ impl DatanodeBuilder {
})
}
async fn create_heartbeat_task(
&self,
region_server: &RegionServer,
meta_client: MetaClientRef,
cache_invalidator: CacheInvalidatorRef,
) -> Result<HeartbeatTask> {
let stat = {
let mut stat = ResourceStatImpl::default();
stat.start_collect_cpu_usage();
Arc::new(stat)
};
HeartbeatTask::try_new(
&self.opts,
region_server.clone(),
meta_client,
cache_invalidator,
self.plugins.clone(),
stat,
)
.await
}
/// Builds [ObjectStoreManager] from [StorageConfig].
pub async fn build_object_store_manager(cfg: &StorageConfig) -> Result<ObjectStoreManagerRef> {
let object_store = store::new_object_store(cfg.store.clone(), &cfg.data_home).await?;

View File

@@ -25,6 +25,7 @@ use common_meta::datanode::REGION_STATISTIC_KEY;
use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_meta::heartbeat::handler::suspend::SuspendHandler;
use common_meta::heartbeat::handler::{
HandlerGroupExecutor, HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
};
@@ -91,6 +92,7 @@ impl HeartbeatTask {
let resp_handler_executor = Arc::new(HandlerGroupExecutor::new(vec![
region_alive_keeper.clone(),
Arc::new(ParseMailboxMessageHandler),
Arc::new(SuspendHandler::new(region_server.suspend_state())),
Arc::new(
RegionHeartbeatResponseHandler::new(region_server.clone())
.with_open_region_parallelism(opts.init_regions_parallelism),

View File

@@ -99,26 +99,30 @@ impl RegionHeartbeatResponseHandler {
self
}
fn build_handler(&self, instruction: &Instruction) -> MetaResult<Box<InstructionHandlers>> {
fn build_handler(
&self,
instruction: &Instruction,
) -> MetaResult<Option<Box<InstructionHandlers>>> {
match instruction {
Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler.into())),
Instruction::OpenRegions(_) => Ok(Box::new(
Instruction::CloseRegions(_) => Ok(Some(Box::new(CloseRegionsHandler.into()))),
Instruction::OpenRegions(_) => Ok(Some(Box::new(
OpenRegionsHandler {
open_region_parallelism: self.open_region_parallelism,
}
.into(),
)),
Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())),
Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())),
Instruction::UpgradeRegions(_) => Ok(Box::new(
))),
Instruction::FlushRegions(_) => Ok(Some(Box::new(FlushRegionsHandler.into()))),
Instruction::DowngradeRegions(_) => Ok(Some(Box::new(DowngradeRegionsHandler.into()))),
Instruction::UpgradeRegions(_) => Ok(Some(Box::new(
UpgradeRegionsHandler {
upgrade_region_parallelism: self.open_region_parallelism,
}
.into(),
)),
Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())),
Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())),
))),
Instruction::GetFileRefs(_) => Ok(Some(Box::new(GetFileRefsHandler.into()))),
Instruction::GcRegions(_) => Ok(Some(Box::new(GcRegionsHandler.into()))),
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
Instruction::Suspend => Ok(None),
}
}
}
@@ -216,30 +220,24 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
.context(InvalidHeartbeatResponseSnafu)?;
let mailbox = ctx.mailbox.clone();
let region_server = self.region_server.clone();
let downgrade_tasks = self.downgrade_tasks.clone();
let flush_tasks = self.flush_tasks.clone();
let gc_tasks = self.gc_tasks.clone();
let handler = self.build_handler(&instruction)?;
let _handle = common_runtime::spawn_global(async move {
let reply = handler
.handle(
&HandlerContext {
region_server,
downgrade_tasks,
flush_tasks,
gc_tasks,
},
instruction,
)
.await;
if let Some(reply) = reply
&& let Err(e) = mailbox.send((meta, reply)).await
{
error!(e; "Failed to send reply to mailbox");
}
});
if let Some(handler) = self.build_handler(&instruction)? {
let context = HandlerContext {
region_server: self.region_server.clone(),
downgrade_tasks: self.downgrade_tasks.clone(),
flush_tasks: self.flush_tasks.clone(),
gc_tasks: self.gc_tasks.clone(),
};
let _handle = common_runtime::spawn_global(async move {
let reply = handler.handle(&context, instruction).await;
if let Some(reply) = reply
&& let Err(e) = mailbox.send((meta, reply)).await
{
let error = e.to_string();
let (meta, reply) = e.0;
error!("Failed to send reply {reply} to {meta:?}: {error}");
}
});
}
Ok(HandleControl::Continue)
}
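// Note (not part of the diff itself): `Instruction::Suspend` yields `Ok(None)`
// from `build_handler`, so no region task is spawned here; the instruction is
// handled by the dedicated `SuspendHandler` registered on the heartbeat
// executor instead.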

View File

@@ -17,6 +17,7 @@ mod catalog;
use std::collections::HashMap;
use std::fmt::Debug;
use std::ops::Deref;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use std::time::Duration;
@@ -52,7 +53,9 @@ pub use query::dummy_catalog::{
DummyCatalogList, DummyTableProviderFactory, TableProviderFactoryRef,
};
use serde_json;
use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult};
use servers::error::{
self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult, SuspendedSnafu,
};
use servers::grpc::FlightCompression;
use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream};
use servers::grpc::region_server::RegionServerHandler;
@@ -89,6 +92,7 @@ use crate::region_server::catalog::{NameAwareCatalogList, NameAwareDataSourceInj
pub struct RegionServer {
inner: Arc<RegionServerInner>,
flight_compression: FlightCompression,
suspend: Arc<AtomicBool>,
}
pub struct RegionStat {
@@ -136,6 +140,7 @@ impl RegionServer {
),
)),
flight_compression,
suspend: Arc::new(AtomicBool::new(false)),
}
}
@@ -595,6 +600,14 @@ impl RegionServer {
.handle_sync_region(engine_with_status.engine(), region_id, manifest_info)
.await
}
fn is_suspended(&self) -> bool {
self.suspend.load(Ordering::Relaxed)
}
pub(crate) fn suspend_state(&self) -> Arc<AtomicBool> {
self.suspend.clone()
}
}
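// Illustrative sketch (not part of this changeset): the flag returned by
// `suspend_state()` is shared with the heartbeat `SuspendHandler`; once it is
// set, the Flight `do_get` entry point below rejects requests with
// `SuspendedSnafu`. `region_server` is a placeholder value here.
let flag = region_server.suspend_state();
flag.store(true, Ordering::Relaxed); // what SuspendHandler does on Instruction::Suspend
assert!(region_server.is_suspended());
flag.store(false, Ordering::Relaxed); // presumably cleared when heartbeats stop carrying the instruction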
#[async_trait]
@@ -644,6 +657,8 @@ impl FlightCraft for RegionServer {
&self,
request: Request<Ticket>,
) -> TonicResult<Response<TonicStream<FlightData>>> {
ensure!(!self.is_suspended(), SuspendedSnafu);
let ticket = request.into_inner().ticket;
let request = api::v1::region::QueryRequest::decode(ticket.as_ref())
.context(servers_error::InvalidFlightTicketSnafu)?;
@@ -1261,7 +1276,6 @@ impl RegionServerInner {
.with_context(|_| HandleRegionRequestSnafu { region_id })?
.new_opened_logical_region_ids()
else {
warn!("No new opened logical regions");
return Ok(());
};

View File

@@ -24,8 +24,8 @@ use common_query::Output;
use common_runtime::Runtime;
use common_runtime::runtime::{BuilderBuild, RuntimeTrait};
use datafusion::catalog::TableFunction;
use datafusion::dataframe::DataFrame;
use datafusion_expr::{AggregateUDF, LogicalPlan};
use query::dataframe::DataFrame;
use query::planner::LogicalPlanner;
use query::query_engine::{DescribeResult, QueryEngineState};
use query::{QueryEngine, QueryEngineContext};

View File

@@ -17,6 +17,7 @@ arc-swap = "1.0"
async-stream.workspace = true
async-trait.workspace = true
auth.workspace = true
axum.workspace = true
bytes.workspace = true
cache.workspace = true
catalog.workspace = true
@@ -85,6 +86,9 @@ common-test-util.workspace = true
datanode.workspace = true
datatypes.workspace = true
futures.workspace = true
hyper-util = { workspace = true, features = ["tokio"] }
meta-srv.workspace = true
reqwest.workspace = true
serde_json.workspace = true
strfmt = "0.2"
tower.workspace = true

View File

@@ -364,6 +364,12 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Service suspended"))]
Suspended {
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -444,6 +450,8 @@ impl ErrorExt for Error {
Error::StatementTimeout { .. } => StatusCode::Cancelled,
Error::AcquireLimiter { .. } => StatusCode::Internal,
Error::Suspended { .. } => StatusCode::Suspended,
}
}

View File

@@ -141,7 +141,43 @@ impl Frontend {
#[cfg(test)]
mod tests {
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use api::v1::meta::heartbeat_server::HeartbeatServer;
use api::v1::meta::mailbox_message::Payload;
use api::v1::meta::{
AskLeaderRequest, AskLeaderResponse, HeartbeatRequest, HeartbeatResponse, MailboxMessage,
Peer, ResponseHeader, Role, heartbeat_server,
};
use async_trait::async_trait;
use client::{Client, Database};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::ErrorExt;
use common_error::from_header_to_err_code_msg;
use common_error::status_code::StatusCode;
use common_grpc::channel_manager::ChannelManager;
use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS;
use common_meta::heartbeat::handler::HandlerGroupExecutor;
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_meta::heartbeat::handler::suspend::SuspendHandler;
use common_meta::instruction::Instruction;
use common_stat::ResourceStatImpl;
use meta_client::MetaClientRef;
use meta_client::client::MetaClientBuilder;
use meta_srv::service::GrpcStream;
use servers::grpc::{FlightCompression, GRPC_SERVER};
use servers::http::HTTP_SERVER;
use servers::http::result::greptime_result_v1::GreptimedbV1Response;
use tokio::sync::mpsc;
use tonic::codec::CompressionEncoding;
use tonic::codegen::tokio_stream::StreamExt;
use tonic::codegen::tokio_stream::wrappers::ReceiverStream;
use tonic::{Request, Response, Status, Streaming};
use super::*;
use crate::instance::builder::FrontendBuilder;
use crate::server::Services;
#[test]
fn test_toml() {
@@ -149,4 +185,277 @@ mod tests {
let toml_string = toml::to_string(&opts).unwrap();
let _parsed: FrontendOptions = toml::from_str(&toml_string).unwrap();
}
struct SuspendableHeartbeatServer {
suspend: Arc<AtomicBool>,
}
#[async_trait]
impl heartbeat_server::Heartbeat for SuspendableHeartbeatServer {
type HeartbeatStream = GrpcStream<HeartbeatResponse>;
async fn heartbeat(
&self,
request: Request<Streaming<HeartbeatRequest>>,
) -> std::result::Result<Response<Self::HeartbeatStream>, Status> {
let (tx, rx) = mpsc::channel(4);
common_runtime::spawn_global({
let mut requests = request.into_inner();
let suspend = self.suspend.clone();
async move {
while let Some(request) = requests.next().await {
if let Err(e) = request {
let _ = tx.send(Err(e)).await;
return;
}
let mailbox_message =
suspend.load(Ordering::Relaxed).then(|| MailboxMessage {
payload: Some(Payload::Json(
serde_json::to_string(&Instruction::Suspend).unwrap(),
)),
..Default::default()
});
let response = HeartbeatResponse {
header: Some(ResponseHeader::success()),
mailbox_message,
..Default::default()
};
let _ = tx.send(Ok(response)).await;
}
}
});
Ok(Response::new(Box::pin(ReceiverStream::new(rx))))
}
async fn ask_leader(
&self,
_: Request<AskLeaderRequest>,
) -> std::result::Result<Response<AskLeaderResponse>, Status> {
Ok(Response::new(AskLeaderResponse {
header: Some(ResponseHeader::success()),
leader: Some(Peer {
addr: "localhost:0".to_string(),
..Default::default()
}),
}))
}
}
async fn create_meta_client(
options: &MetaClientOptions,
heartbeat_server: Arc<SuspendableHeartbeatServer>,
) -> MetaClientRef {
let (client, server) = tokio::io::duplex(1024);
// create the heartbeat server:
common_runtime::spawn_global(async move {
let mut router = tonic::transport::Server::builder();
let router = router.add_service(
HeartbeatServer::from_arc(heartbeat_server)
.accept_compressed(CompressionEncoding::Zstd)
.send_compressed(CompressionEncoding::Zstd),
);
router
.serve_with_incoming(futures::stream::iter([Ok::<_, std::io::Error>(server)]))
.await
});
// Move client to an option so we can _move_ the inner value
// on the first attempt to connect. All other attempts will fail.
let mut client = Some(client);
let connector = tower::service_fn(move |_| {
let client = client.take();
async move {
if let Some(client) = client {
Ok(hyper_util::rt::TokioIo::new(client))
} else {
Err(std::io::Error::other("client already taken"))
}
}
});
let manager = ChannelManager::new();
manager
.reset_with_connector("localhost:0", connector)
.unwrap();
// create the heartbeat client:
let mut client = MetaClientBuilder::new(0, Role::Frontend)
.enable_heartbeat()
.heartbeat_channel_manager(manager)
.build();
client.start(&options.metasrv_addrs).await.unwrap();
Arc::new(client)
}
async fn create_frontend(
options: &FrontendOptions,
meta_client: MetaClientRef,
) -> Result<Frontend> {
let instance = Arc::new(
FrontendBuilder::new_test(options, meta_client.clone())
.try_build()
.await?,
);
let servers =
Services::new(options.clone(), instance.clone(), Default::default()).build()?;
let executor = Arc::new(HandlerGroupExecutor::new(vec![
Arc::new(ParseMailboxMessageHandler),
Arc::new(SuspendHandler::new(instance.suspend_state())),
]));
let heartbeat_task = Some(HeartbeatTask::new(
options,
meta_client,
executor,
Arc::new(ResourceStatImpl::default()),
));
let mut frontend = Frontend {
instance,
servers,
heartbeat_task,
};
frontend.start().await?;
Ok(frontend)
}
async fn verify_suspend_state_by_http(
frontend: &Frontend,
expected: std::result::Result<&str, (StatusCode, &str)>,
) {
let addr = frontend.server_handlers().addr(HTTP_SERVER).unwrap();
let response = reqwest::get(format!("http://{}/v1/sql?sql=SELECT 1", addr))
.await
.unwrap();
let headers = response.headers();
let response = if let Some((code, error)) = from_header_to_err_code_msg(headers) {
Err((code, error))
} else {
Ok(response.text().await.unwrap())
};
match (response, expected) {
(Ok(response), Ok(expected)) => {
let response: GreptimedbV1Response = serde_json::from_str(&response).unwrap();
let response = serde_json::to_string(response.output()).unwrap();
assert_eq!(&response, expected);
}
(Err(actual), Err(expected)) => assert_eq!(actual, expected),
_ => unreachable!(),
}
}
async fn verify_suspend_state_by_grpc(
frontend: &Frontend,
expected: std::result::Result<&str, (StatusCode, &str)>,
) {
let addr = frontend.server_handlers().addr(GRPC_SERVER).unwrap();
let client = Client::with_urls([addr.to_string()]);
let client = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
let response = client.sql("SELECT 1").await;
match (response, expected) {
(Ok(response), Ok(expected)) => {
let response = response.data.pretty_print().await;
assert_eq!(&response, expected.trim());
}
(Err(actual), Err(expected)) => {
assert_eq!(actual.status_code(), expected.0);
assert_eq!(actual.output_msg(), expected.1);
}
_ => unreachable!(),
}
}
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn test_suspend_frontend() -> Result<()> {
common_telemetry::init_default_ut_logging();
let meta_client_options = MetaClientOptions {
metasrv_addrs: vec!["localhost:0".to_string()],
..Default::default()
};
let options = FrontendOptions {
http: HttpOptions {
addr: "127.0.0.1:0".to_string(),
..Default::default()
},
grpc: GrpcOptions {
bind_addr: "127.0.0.1:0".to_string(),
flight_compression: FlightCompression::None,
..Default::default()
},
mysql: MysqlOptions {
enable: false,
..Default::default()
},
postgres: PostgresOptions {
enable: false,
..Default::default()
},
meta_client: Some(meta_client_options.clone()),
..Default::default()
};
let server = Arc::new(SuspendableHeartbeatServer {
suspend: Arc::new(AtomicBool::new(false)),
});
let meta_client = create_meta_client(&meta_client_options, server.clone()).await;
let frontend = create_frontend(&options, meta_client).await?;
tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
// initial state: not suspended:
assert!(!frontend.instance.is_suspended());
verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
verify_suspend_state_by_grpc(
&frontend,
Ok(r#"
+----------+
| Int64(1) |
+----------+
| 1 |
+----------+"#),
)
.await;
// make the heartbeat server return the "suspend" instruction,
server.suspend.store(true, Ordering::Relaxed);
tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
// ... then the frontend is suspended:
assert!(frontend.instance.is_suspended());
verify_suspend_state_by_http(
&frontend,
Err((
StatusCode::Suspended,
"error: Service suspended, execution_time_ms: 0",
)),
)
.await;
verify_suspend_state_by_grpc(&frontend, Err((StatusCode::Suspended, "Service suspended")))
.await;
// make the heartbeat server NOT return the "suspend" instruction,
server.suspend.store(false, Ordering::Relaxed);
tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
// ... then the frontend's suspend state is cleared:
assert!(!frontend.instance.is_suspended());
verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
verify_suspend_state_by_grpc(
&frontend,
Ok(r#"
+----------+
| Int64(1) |
+----------+
| 1 |
+----------+"#),
)
.await;
Ok(())
}
}

View File

@@ -27,7 +27,6 @@ use common_stat::ResourceStatRef;
use common_telemetry::{debug, error, info, warn};
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
use servers::addrs;
use servers::heartbeat_options::HeartbeatOptions;
use snafu::ResultExt;
use tokio::sync::mpsc;
use tokio::sync::mpsc::Receiver;
@@ -54,7 +53,6 @@ impl HeartbeatTask {
pub fn new(
opts: &FrontendOptions,
meta_client: Arc<MetaClient>,
heartbeat_opts: HeartbeatOptions,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
resource_stat: ResourceStatRef,
) -> Self {
@@ -68,8 +66,8 @@ impl HeartbeatTask {
addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
},
meta_client,
report_interval: heartbeat_opts.interval,
retry_interval: heartbeat_opts.retry_interval,
report_interval: opts.heartbeat.interval,
retry_interval: opts.heartbeat.retry_interval,
resp_handler_executor,
start_time_ms: common_time::util::current_time_millis() as u64,
resource_stat,
@@ -196,7 +194,8 @@ impl HeartbeatTask {
let report_interval = self.report_interval;
let start_time_ms = self.start_time_ms;
let self_peer = Some(Peer {
// The peer id doesn't make sense for frontend, so we just set it 0.
// The node id is actually calculated from its address (by hashing the address
// string) in the metasrv, so it can be set to 0 here as a placeholder.
id: 0,
addr: self.peer_addr.clone(),
});

View File

@@ -26,7 +26,8 @@ mod region_query;
pub mod standalone;
use std::pin::Pin;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, atomic};
use std::time::{Duration, SystemTime};
use async_stream::stream;
@@ -83,6 +84,7 @@ use snafu::prelude::*;
use sql::ast::ObjectNamePartExt;
use sql::dialect::Dialect;
use sql::parser::{ParseOptions, ParserContext};
use sql::statements::comment::CommentObject;
use sql::statements::copy::{CopyDatabase, CopyTable};
use sql::statements::statement::Statement;
use sql::statements::tql::Tql;
@@ -119,6 +121,7 @@ pub struct Instance {
limiter: Option<LimiterRef>,
process_manager: ProcessManagerRef,
slow_query_options: SlowQueryOptions,
suspend: Arc<AtomicBool>,
// cache for otlp metrics
// first layer key: db-string
@@ -171,6 +174,14 @@ impl Instance {
pub fn procedure_executor(&self) -> &ProcedureExecutorRef {
self.statement_executor.procedure_executor()
}
pub fn suspend_state(&self) -> Arc<AtomicBool> {
self.suspend.clone()
}
pub(crate) fn is_suspended(&self) -> bool {
self.suspend.load(atomic::Ordering::Relaxed)
}
}
fn parse_stmt(sql: &str, dialect: &(dyn Dialect + Send + Sync)) -> Result<Vec<Statement>> {
@@ -513,6 +524,10 @@ impl SqlQueryHandler for Instance {
#[tracing::instrument(skip_all)]
async fn do_query(&self, query: &str, query_ctx: QueryContextRef) -> Vec<Result<Output>> {
if self.is_suspended() {
return vec![error::SuspendedSnafu {}.fail()];
}
let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
let query_interceptor = query_interceptor_opt.as_ref();
let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) {
@@ -580,6 +595,8 @@ impl SqlQueryHandler for Instance {
plan: LogicalPlan,
query_ctx: QueryContextRef,
) -> Result<Output> {
ensure!(!self.is_suspended(), error::SuspendedSnafu);
if should_capture_statement(stmt.as_ref()) {
// It's safe to unwrap here because we've already checked the type.
let stmt = stmt.unwrap();
@@ -641,6 +658,10 @@ impl SqlQueryHandler for Instance {
query: &PromQuery,
query_ctx: QueryContextRef,
) -> Vec<Result<Output>> {
if self.is_suspended() {
return vec![error::SuspendedSnafu {}.fail()];
}
// check will be done in prometheus handler's do_query
let result = PrometheusHandler::do_query(self, query, query_ctx)
.await
@@ -655,6 +676,8 @@ impl SqlQueryHandler for Instance {
stmt: Statement,
query_ctx: QueryContextRef,
) -> Result<Option<DescribeResult>> {
ensure!(!self.is_suspended(), error::SuspendedSnafu);
if matches!(
stmt,
Statement::Insert(_) | Statement::Query(_) | Statement::Delete(_)
@@ -875,7 +898,7 @@ pub fn check_permission(
validate_param(&stmt.table_name, query_ctx)?;
}
Statement::ShowCreateFlow(stmt) => {
validate_param(&stmt.flow_name, query_ctx)?;
validate_flow(&stmt.flow_name, query_ctx)?;
}
#[cfg(feature = "enterprise")]
Statement::ShowCreateTrigger(stmt) => {
@@ -908,6 +931,12 @@ pub fn check_permission(
// show charset and show collation won't be checked
Statement::ShowCharset(_) | Statement::ShowCollation(_) => {}
Statement::Comment(comment) => match &comment.object {
CommentObject::Table(table) => validate_param(table, query_ctx)?,
CommentObject::Column { table, .. } => validate_param(table, query_ctx)?,
CommentObject::Flow(flow) => validate_flow(flow, query_ctx)?,
},
Statement::Insert(insert) => {
let name = insert.table_name().context(ParseSqlSnafu)?;
validate_param(name, query_ctx)?;
@@ -993,6 +1022,27 @@ fn validate_param(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()>
.context(SqlExecInterceptedSnafu)
}
fn validate_flow(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> {
let catalog = match &name.0[..] {
[_flow] => query_ctx.current_catalog().to_string(),
[catalog, _flow] => catalog.to_string_unquoted(),
_ => {
return InvalidSqlSnafu {
err_msg: format!(
"expect flow name to be <catalog>.<flow_name> or <flow_name>, actual: {name}",
),
}
.fail();
}
};
let schema = query_ctx.current_schema();
validate_catalog_and_schema(&catalog, &schema, query_ctx)
.map_err(BoxedError::new)
.context(SqlExecInterceptedSnafu)
}
fn validate_database(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> {
let (catalog, schema) = match &name.0[..] {
[schema] => (
@@ -1251,6 +1301,28 @@ mod tests {
// test describe table
let sql = "DESC TABLE {catalog}{schema}demo;";
replace_test(sql, plugins, &query_ctx);
replace_test(sql, plugins.clone(), &query_ctx);
let comment_flow_cases = [
("COMMENT ON FLOW my_flow IS 'comment';", true),
("COMMENT ON FLOW greptime.my_flow IS 'comment';", true),
("COMMENT ON FLOW wrongcatalog.my_flow IS 'comment';", false),
];
for (sql, is_ok) in comment_flow_cases {
let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0];
let result = check_permission(plugins.clone(), stmt, &query_ctx);
assert_eq!(result.is_ok(), is_ok);
}
let show_flow_cases = [
("SHOW CREATE FLOW my_flow;", true),
("SHOW CREATE FLOW greptime.my_flow;", true),
("SHOW CREATE FLOW wrongcatalog.my_flow;", false),
];
for (sql, is_ok) in show_flow_cases {
let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0];
let result = check_permission(plugins.clone(), stmt, &query_ctx);
assert_eq!(result.is_ok(), is_ok);
}
}
}

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use cache::{TABLE_FLOWNODE_SET_CACHE_NAME, TABLE_ROUTE_CACHE_NAME};
use catalog::CatalogManagerRef;
@@ -87,6 +88,33 @@ impl FrontendBuilder {
}
}
#[cfg(test)]
pub(crate) fn new_test(
options: &FrontendOptions,
meta_client: meta_client::MetaClientRef,
) -> Self {
let kv_backend = Arc::new(common_meta::kv_backend::memory::MemoryKvBackend::new());
let layered_cache_registry = Arc::new(
common_meta::cache::LayeredCacheRegistryBuilder::default()
.add_cache_registry(cache::build_fundamental_cache_registry(kv_backend.clone()))
.build(),
);
Self::new(
options.clone(),
kv_backend,
layered_cache_registry,
catalog::memory::MemoryCatalogManager::with_default_setup(),
Arc::new(client::client_manager::NodeClients::default()),
meta_client,
Arc::new(catalog::process_manager::ProcessManager::new(
"".to_string(),
None,
)),
)
}
pub fn with_local_cache_invalidator(self, cache_invalidator: CacheInvalidatorRef) -> Self {
Self {
local_cache_invalidator: Some(cache_invalidator),
@@ -242,6 +270,7 @@ impl FrontendBuilder {
process_manager,
otlp_metrics_table_legacy_cache: DashMap::new(),
slow_query_options: self.options.slow_query.clone(),
suspend: Arc::new(AtomicBool::new(false)),
})
}
}

View File

@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::pin::Pin;
use std::sync::Arc;
use std::time::Instant;
use api::helper::from_pb_time_ranges;
use api::v1::ddl_request::{Expr as DdlExpr, Expr};
@@ -22,16 +24,18 @@ use api::v1::{
DeleteRequests, DropFlowExpr, InsertIntoPlan, InsertRequests, RowDeleteRequests,
RowInsertRequests,
};
use async_stream::try_stream;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_base::AffectedRows;
use common_error::ext::BoxedError;
use common_grpc::FlightData;
use common_grpc::flight::FlightDecoder;
use common_grpc::flight::do_put::DoPutResponse;
use common_query::Output;
use common_query::logical_plan::add_insert_to_logical_plan;
use common_telemetry::tracing::{self};
use datafusion::datasource::DefaultTableSource;
use futures::Stream;
use futures::stream::StreamExt;
use query::parser::PromQuery;
use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef};
use servers::query_handler::grpc::GrpcQueryHandler;
@@ -230,6 +234,11 @@ impl GrpcQueryHandler for Instance {
DdlExpr::DropView(_) => {
todo!("implemented in the following PR")
}
DdlExpr::CommentOn(expr) => {
self.statement_executor
.comment_by_expr(expr, ctx.clone())
.await?
}
}
}
};
@@ -240,10 +249,8 @@ impl GrpcQueryHandler for Instance {
async fn put_record_batch(
&self,
table_name: &TableName,
request: servers::grpc::flight::PutRecordBatchRequest,
table_ref: &mut Option<TableRef>,
decoder: &mut FlightDecoder,
data: FlightData,
ctx: QueryContextRef,
) -> Result<AffectedRows> {
let table = if let Some(table) = table_ref {
@@ -252,15 +259,15 @@ impl GrpcQueryHandler for Instance {
let table = self
.catalog_manager()
.table(
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
&request.table_name.catalog_name,
&request.table_name.schema_name,
&request.table_name.table_name,
None,
)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
table_name: table_name.to_string(),
table_name: request.table_name.to_string(),
})?;
*table_ref = Some(table.clone());
table
@@ -279,10 +286,77 @@ impl GrpcQueryHandler for Instance {
// do we check limit for bulk insert?
self.inserter
.handle_bulk_insert(table, decoder, data)
.handle_bulk_insert(
table,
request.flight_data,
request.record_batch,
request.schema_bytes,
)
.await
.context(TableOperationSnafu)
}
fn handle_put_record_batch_stream(
&self,
mut stream: servers::grpc::flight::PutRecordBatchRequestStream,
ctx: QueryContextRef,
) -> Pin<Box<dyn Stream<Item = Result<DoPutResponse>> + Send>> {
// Resolve table once for the stream
// Clone all necessary data to make it 'static
let catalog_manager = self.catalog_manager().clone();
let plugins = self.plugins.clone();
let inserter = self.inserter.clone();
let table_name = stream.table_name().clone();
let ctx = ctx.clone();
Box::pin(try_stream! {
plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::BulkInsert)
.context(PermissionSnafu)?;
// Cache for resolved table reference - resolve once and reuse
let table_ref = catalog_manager
.table(
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
None,
)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
table_name: table_name.to_string(),
})?;
// Check permissions once for the stream
let interceptor_ref = plugins.get::<GrpcQueryInterceptorRef<Error>>();
let interceptor = interceptor_ref.as_ref();
interceptor.pre_bulk_insert(table_ref.clone(), ctx.clone())?;
// Process each request in the stream
while let Some(request_result) = stream.next().await {
let request = request_result.map_err(|e| {
let error_msg = format!("Stream error: {:?}", e);
IncompleteGrpcRequestSnafu { err_msg: error_msg }.build()
})?;
let request_id = request.request_id;
let start = Instant::now();
let rows = inserter
.handle_bulk_insert(
table_ref.clone(),
request.flight_data,
request.record_batch,
request.schema_bytes,
)
.await
.context(TableOperationSnafu)?;
let elapsed_secs = start.elapsed().as_secs_f64();
yield DoPutResponse::new(request_id, rows, elapsed_secs);
}
})
}
}
fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryContextRef) {
@@ -330,6 +404,9 @@ fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryConte
Expr::DropView(expr) => {
check_and_fill!(expr);
}
Expr::CommentOn(expr) => {
check_and_fill!(expr);
}
}
}

View File

@@ -65,8 +65,7 @@ impl JaegerQueryHandler for Instance {
// It's equivalent to `SELECT DISTINCT(service_name) FROM {db}.{trace_table}`.
Ok(query_trace_table(
ctx,
self.catalog_manager(),
self.query_engine(),
self,
vec![SelectExpr::from(col(SERVICE_NAME_COLUMN))],
vec![],
vec![],
@@ -107,8 +106,7 @@ impl JaegerQueryHandler for Instance {
// ```.
Ok(query_trace_table(
ctx,
self.catalog_manager(),
self.query_engine(),
self,
vec![
SelectExpr::from(col(SPAN_NAME_COLUMN)),
SelectExpr::from(col(SPAN_KIND_COLUMN)),
@@ -160,8 +158,7 @@ impl JaegerQueryHandler for Instance {
Ok(query_trace_table(
ctx,
self.catalog_manager(),
self.query_engine(),
self,
selects,
filters,
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
@@ -220,8 +217,7 @@ impl JaegerQueryHandler for Instance {
// ```.
let output = query_trace_table(
ctx.clone(),
self.catalog_manager(),
self.query_engine(),
self,
vec![wildcard()],
filters,
vec![],
@@ -285,8 +281,7 @@ impl JaegerQueryHandler for Instance {
// query all spans
Ok(query_trace_table(
ctx,
self.catalog_manager(),
self.query_engine(),
self,
vec![wildcard()],
filters,
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
@@ -303,8 +298,7 @@ impl JaegerQueryHandler for Instance {
#[allow(clippy::too_many_arguments)]
async fn query_trace_table(
ctx: QueryContextRef,
catalog_manager: &CatalogManagerRef,
query_engine: &QueryEngineRef,
instance: &Instance,
selects: Vec<SelectExpr>,
filters: Vec<Expr>,
sorts: Vec<SortExpr>,
@@ -334,7 +328,8 @@ async fn query_trace_table(
}
};
let table = catalog_manager
let table = instance
.catalog_manager()
.table(
ctx.current_catalog(),
&ctx.current_schema(),
@@ -367,7 +362,7 @@ async fn query_trace_table(
.map(|s| format!("\"{}\"", s))
.collect::<HashSet<String>>();
let df_context = create_df_context(query_engine)?;
let df_context = create_df_context(instance.query_engine())?;
let dataframe = df_context
.read_table(Arc::new(DfTableProviderAdapter::new(table)))

View File

@@ -136,7 +136,7 @@ impl Instance {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;
let scan_plan = dataframe.into_logical_plan();
let scan_plan = dataframe.into_unoptimized_plan();
let filter_conditions =
PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
.context(PrometheusLabelValuesQueryPlanSnafu)?;

View File

@@ -16,16 +16,21 @@ use std::net::SocketAddr;
use std::sync::Arc;
use auth::UserProviderRef;
use axum::extract::{Request, State};
use axum::middleware::Next;
use axum::response::IntoResponse;
use common_base::Plugins;
use common_config::Configurable;
use common_telemetry::info;
use meta_client::MetaClientOptions;
use servers::error::Error as ServerError;
use servers::grpc::builder::GrpcServerBuilder;
use servers::grpc::flight::FlightCraftRef;
use servers::grpc::frontend_grpc_handler::FrontendGrpcHandler;
use servers::grpc::greptime_handler::GreptimeRequestHandler;
use servers::grpc::{GrpcOptions, GrpcServer};
use servers::http::event::LogValidatorRef;
use servers::http::result::error_result::ErrorResponse;
use servers::http::utils::router::RouterConfigurator;
use servers::http::{HttpServer, HttpServerBuilder};
use servers::interceptor::LogIngestInterceptorRef;
@@ -38,6 +43,7 @@ use servers::query_handler::sql::ServerSqlQueryHandlerAdapter;
use servers::server::{Server, ServerHandlers};
use servers::tls::{ReloadableTlsServerConfig, maybe_watch_server_tls_config};
use snafu::ResultExt;
use tonic::Status;
use crate::error::{self, Result, StartServerSnafu, TomlFormatSnafu};
use crate::frontend::FrontendOptions;
@@ -52,6 +58,7 @@ where
grpc_server_builder: Option<GrpcServerBuilder>,
http_server_builder: Option<HttpServerBuilder>,
plugins: Plugins,
flight_handler: Option<FlightCraftRef>,
}
impl<T> Services<T>
@@ -65,6 +72,7 @@ where
grpc_server_builder: None,
http_server_builder: None,
plugins,
flight_handler: None,
}
}
@@ -122,7 +130,16 @@ where
builder = builder.with_extra_router(configurator.router());
}
builder
builder.add_layer(axum::middleware::from_fn_with_state(
self.instance.clone(),
async move |State(state): State<Arc<Instance>>, request: Request, next: Next| {
if state.is_suspended() {
return ErrorResponse::from_error(servers::error::SuspendedSnafu.build())
.into_response();
}
next.run(request).await
},
))
}
pub fn with_grpc_server_builder(self, builder: GrpcServerBuilder) -> Self {
@@ -139,6 +156,13 @@ where
}
}
pub fn with_flight_handler(self, flight_handler: FlightCraftRef) -> Self {
Self {
flight_handler: Some(flight_handler),
..self
}
}
fn build_grpc_server(
&mut self,
grpc: &GrpcOptions,
@@ -173,6 +197,12 @@ where
grpc.flight_compression,
);
// Use custom flight handler if provided, otherwise use the default GreptimeRequestHandler
let flight_handler = self
.flight_handler
.clone()
.unwrap_or_else(|| Arc::new(greptime_request_handler.clone()) as FlightCraftRef);
let grpc_server = builder
.name(name)
.database_handler(greptime_request_handler.clone())
@@ -181,7 +211,17 @@ where
self.instance.clone(),
user_provider.clone(),
))
.flight_handler(Arc::new(greptime_request_handler));
.flight_handler(flight_handler)
.add_layer(axum::middleware::from_fn_with_state(
self.instance.clone(),
async move |State(state): State<Arc<Instance>>, request: Request, next: Next| {
if state.is_suspended() {
let status = Status::from(servers::error::SuspendedSnafu.build());
return status.into_http();
}
next.run(request).await
},
));
let grpc_server = if !external {
let frontend_grpc_handler =

View File

@@ -21,7 +21,7 @@ use itertools::Itertools;
use crate::Bytes;
use crate::bloom_filter::error::Result;
use crate::bloom_filter::reader::BloomFilterReader;
use crate::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader};
/// `InListPredicate` contains a list of acceptable values. A value needs to match at least
/// one of the elements (logical OR semantic) for the predicate to be satisfied.
@@ -38,7 +38,7 @@ pub struct BloomFilterApplier {
impl BloomFilterApplier {
pub async fn new(reader: Box<dyn BloomFilterReader + Send>) -> Result<Self> {
let meta = reader.metadata().await?;
let meta = reader.metadata(None).await?;
Ok(Self { reader, meta })
}
@@ -50,6 +50,7 @@ impl BloomFilterApplier {
&mut self,
predicates: &[InListPredicate],
search_ranges: &[Range<usize>],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<Range<usize>>> {
if predicates.is_empty() {
// If no predicates, return empty result
@@ -57,7 +58,7 @@ impl BloomFilterApplier {
}
let segments = self.row_ranges_to_segments(search_ranges);
let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments).await?;
let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments, metrics).await?;
let matching_row_ranges = self.find_matching_rows(seg_locations, bloom_filters, predicates);
Ok(intersect_ranges(search_ranges, &matching_row_ranges))
}
@@ -95,6 +96,7 @@ impl BloomFilterApplier {
async fn load_bloom_filters(
&mut self,
segments: &[usize],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<(Vec<(u64, usize)>, Vec<BloomFilter>)> {
let segment_locations = segments
.iter()
@@ -108,7 +110,10 @@ impl BloomFilterApplier {
.map(|i| self.meta.bloom_filter_locs[i as usize])
.collect::<Vec<_>>();
let bloom_filters = self.reader.bloom_filter_vec(&bloom_filter_locs).await?;
let bloom_filters = self
.reader
.bloom_filter_vec(&bloom_filter_locs, metrics)
.await?;
Ok((segment_locations, bloom_filters))
}
@@ -422,7 +427,10 @@ mod tests {
];
for (predicates, search_range, expected) in cases {
let result = applier.search(&predicates, &[search_range]).await.unwrap();
let result = applier
.search(&predicates, &[search_range], None)
.await
.unwrap();
assert_eq!(
result, expected,
"Expected {:?}, got {:?}",

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use std::ops::{Range, Rem};
use std::time::{Duration, Instant};
use async_trait::async_trait;
use bytemuck::try_cast_slice;
@@ -34,6 +35,72 @@ const BLOOM_META_LEN_SIZE: u64 = 4;
/// Default prefetch size of bloom filter meta.
pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB
/// Metrics for bloom filter read operations.
#[derive(Default, Clone)]
pub struct BloomFilterReadMetrics {
/// Total byte size to read.
pub total_bytes: u64,
/// Total number of ranges to read.
pub total_ranges: usize,
/// Elapsed time to fetch data.
pub fetch_elapsed: Duration,
/// Number of cache hits.
pub cache_hit: usize,
/// Number of cache misses.
pub cache_miss: usize,
}
impl std::fmt::Debug for BloomFilterReadMetrics {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let Self {
total_bytes,
total_ranges,
fetch_elapsed,
cache_hit,
cache_miss,
} = self;
// If both total_bytes and cache_hit are 0, we didn't read anything.
if *total_bytes == 0 && *cache_hit == 0 {
return write!(f, "{{}}");
}
write!(f, "{{")?;
if *total_bytes > 0 {
write!(f, "\"total_bytes\":{}", total_bytes)?;
}
if *cache_hit > 0 {
if *total_bytes > 0 {
write!(f, ", ")?;
}
write!(f, "\"cache_hit\":{}", cache_hit)?;
}
if *total_ranges > 0 {
write!(f, ", \"total_ranges\":{}", total_ranges)?;
}
if !fetch_elapsed.is_zero() {
write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
}
if *cache_miss > 0 {
write!(f, ", \"cache_miss\":{}", cache_miss)?;
}
write!(f, "}}")
}
}
impl BloomFilterReadMetrics {
/// Merges another metrics into this one.
pub fn merge_from(&mut self, other: &Self) {
self.total_bytes += other.total_bytes;
self.total_ranges += other.total_ranges;
self.fetch_elapsed += other.fetch_elapsed;
self.cache_hit += other.cache_hit;
self.cache_miss += other.cache_miss;
}
}
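// Illustrative usage (not part of this changeset): metrics from individual
// reads can be folded into a per-scan total; the Debug impl above prints a
// compact JSON-like object and omits zero-valued fields.
let mut total = BloomFilterReadMetrics::default();
let per_read = BloomFilterReadMetrics {
    total_bytes: 4096,
    total_ranges: 2,
    fetch_elapsed: Duration::from_millis(3),
    cache_hit: 1,
    cache_miss: 1,
};
total.merge_from(&per_read);
// Prints e.g. {"total_bytes":4096, "cache_hit":1, "total_ranges":2, "fetch_elapsed":"3ms", "cache_miss":1}
println!("{:?}", total);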
/// Safely converts bytes to Vec<u64> using bytemuck for optimal performance.
/// Faster than chunking and converting each piece individually.
///
@@ -79,25 +146,33 @@ pub fn bytes_to_u64_vec(bytes: &Bytes) -> Vec<u64> {
#[async_trait]
pub trait BloomFilterReader: Sync {
/// Reads a range of bytes from the file.
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes>;
async fn range_read(
&self,
offset: u64,
size: u32,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Bytes>;
/// Reads a bunch of ranges from the file.
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
let mut results = Vec::with_capacity(ranges.len());
for range in ranges {
let size = (range.end - range.start) as u32;
let data = self.range_read(range.start, size).await?;
results.push(data);
}
Ok(results)
}
async fn read_vec(
&self,
ranges: &[Range<u64>],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<Bytes>>;
/// Reads the meta information of the bloom filter.
async fn metadata(&self) -> Result<BloomFilterMeta>;
async fn metadata(
&self,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilterMeta>;
/// Reads a bloom filter with the given location.
async fn bloom_filter(&self, loc: &BloomFilterLoc) -> Result<BloomFilter> {
let bytes = self.range_read(loc.offset, loc.size as _).await?;
async fn bloom_filter(
&self,
loc: &BloomFilterLoc,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilter> {
let bytes = self.range_read(loc.offset, loc.size as _, metrics).await?;
let vec = bytes_to_u64_vec(&bytes);
let bm = BloomFilter::from_vec(vec)
.seed(&SEED)
@@ -105,12 +180,16 @@ pub trait BloomFilterReader: Sync {
Ok(bm)
}
async fn bloom_filter_vec(&self, locs: &[BloomFilterLoc]) -> Result<Vec<BloomFilter>> {
async fn bloom_filter_vec(
&self,
locs: &[BloomFilterLoc],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<BloomFilter>> {
let ranges = locs
.iter()
.map(|l| l.offset..l.offset + l.size)
.collect::<Vec<_>>();
let bss = self.read_vec(&ranges).await?;
let bss = self.read_vec(&ranges, metrics).await?;
let mut result = Vec::with_capacity(bss.len());
for (bs, loc) in bss.into_iter().zip(locs.iter()) {
@@ -140,24 +219,59 @@ impl<R: RangeReader> BloomFilterReaderImpl<R> {
#[async_trait]
impl<R: RangeReader> BloomFilterReader for BloomFilterReaderImpl<R> {
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes> {
self.reader
async fn range_read(
&self,
offset: u64,
size: u32,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Bytes> {
let start = metrics.as_ref().map(|_| Instant::now());
let result = self
.reader
.read(offset..offset + size as u64)
.await
.context(IoSnafu)
.context(IoSnafu)?;
if let Some(m) = metrics {
m.total_ranges += 1;
m.total_bytes += size as u64;
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}
Ok(result)
}
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
self.reader.read_vec(ranges).await.context(IoSnafu)
async fn read_vec(
&self,
ranges: &[Range<u64>],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<Bytes>> {
let start = metrics.as_ref().map(|_| Instant::now());
let result = self.reader.read_vec(ranges).await.context(IoSnafu)?;
if let Some(m) = metrics {
m.total_ranges += ranges.len();
m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}
Ok(result)
}
async fn metadata(&self) -> Result<BloomFilterMeta> {
async fn metadata(
&self,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilterMeta> {
let metadata = self.reader.metadata().await.context(IoSnafu)?;
let file_size = metadata.content_length;
let mut meta_reader =
BloomFilterMetaReader::new(&self.reader, file_size, Some(DEFAULT_PREFETCH_SIZE));
meta_reader.metadata().await
meta_reader.metadata(metrics).await
}
}
@@ -183,7 +297,10 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
///
/// It will first prefetch some bytes from the end of the file,
/// then parse the metadata from the prefetch bytes.
pub async fn metadata(&mut self) -> Result<BloomFilterMeta> {
pub async fn metadata(
&mut self,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilterMeta> {
ensure!(
self.file_size >= BLOOM_META_LEN_SIZE,
FileSizeTooSmallSnafu {
@@ -191,6 +308,7 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
}
);
let start = metrics.as_ref().map(|_| Instant::now());
let meta_start = self.file_size.saturating_sub(self.prefetch_size);
let suffix = self
.reader
@@ -208,8 +326,28 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
.read(metadata_start..self.file_size - BLOOM_META_LEN_SIZE)
.await
.context(IoSnafu)?;
if let Some(m) = metrics {
// suffix read + meta read
m.total_ranges += 2;
// Ignores the meta length size to simplify the calculation.
m.total_bytes += self.file_size.min(self.prefetch_size) + length;
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}
BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
} else {
if let Some(m) = metrics {
// suffix read only
m.total_ranges += 1;
m.total_bytes += self.file_size.min(self.prefetch_size);
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}
let metadata_start = self.file_size - length - BLOOM_META_LEN_SIZE - meta_start;
let meta = &suffix[metadata_start as usize..suffix_len - BLOOM_META_LEN_SIZE as usize];
BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
@@ -290,7 +428,7 @@ mod tests {
for prefetch in [0u64, file_size / 2, file_size, file_size + 10] {
let mut reader =
BloomFilterMetaReader::new(bytes.clone(), file_size as _, Some(prefetch));
let meta = reader.metadata().await.unwrap();
let meta = reader.metadata(None).await.unwrap();
assert_eq!(meta.rows_per_segment, 2);
assert_eq!(meta.segment_count, 2);
@@ -312,11 +450,11 @@ mod tests {
let bytes = mock_bloom_filter_bytes().await;
let reader = BloomFilterReaderImpl::new(bytes);
let meta = reader.metadata().await.unwrap();
let meta = reader.metadata(None).await.unwrap();
assert_eq!(meta.bloom_filter_locs.len(), 2);
let bf = reader
.bloom_filter(&meta.bloom_filter_locs[0])
.bloom_filter(&meta.bloom_filter_locs[0], None)
.await
.unwrap();
assert!(bf.contains(&b"a"));
@@ -325,7 +463,7 @@ mod tests {
assert!(bf.contains(&b"d"));
let bf = reader
.bloom_filter(&meta.bloom_filter_locs[1])
.bloom_filter(&meta.bloom_filter_locs[1], None)
.await
.unwrap();
assert!(bf.contains(&b"e"));

View File

@@ -74,7 +74,7 @@ async fn test_search(
writer.finish().await.unwrap();
let reader = puffin_manager.reader(&file_name).await.unwrap();
let index_dir = reader.dir(&blob_key).await.unwrap();
let (index_dir, _metrics) = reader.dir(&blob_key).await.unwrap();
let searcher = TantivyFulltextIndexSearcher::new(index_dir.path(), config).unwrap();
for (query, expected) in query_expected {
let results = searcher.search(query).await.unwrap();

View File

@@ -15,6 +15,7 @@
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use bytes::Bytes;
@@ -29,37 +30,115 @@ pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
mod blob;
mod footer;
/// Metrics for inverted index read operations.
#[derive(Default, Clone)]
pub struct InvertedIndexReadMetrics {
/// Total byte size to read.
pub total_bytes: u64,
/// Total number of ranges to read.
pub total_ranges: usize,
/// Elapsed time to fetch data.
pub fetch_elapsed: Duration,
/// Number of cache hits.
pub cache_hit: usize,
/// Number of cache misses.
pub cache_miss: usize,
}
impl std::fmt::Debug for InvertedIndexReadMetrics {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let Self {
total_bytes,
total_ranges,
fetch_elapsed,
cache_hit,
cache_miss,
} = self;
// If both total_bytes and cache_hit are 0, we didn't read anything.
if *total_bytes == 0 && *cache_hit == 0 {
return write!(f, "{{}}");
}
write!(f, "{{")?;
if *total_bytes > 0 {
write!(f, "\"total_bytes\":{}", total_bytes)?;
}
if *cache_hit > 0 {
if *total_bytes > 0 {
write!(f, ", ")?;
}
write!(f, "\"cache_hit\":{}", cache_hit)?;
}
if *total_ranges > 0 {
write!(f, ", \"total_ranges\":{}", total_ranges)?;
}
if !fetch_elapsed.is_zero() {
write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
}
if *cache_miss > 0 {
write!(f, ", \"cache_miss\":{}", cache_miss)?;
}
write!(f, "}}")
}
}
impl InvertedIndexReadMetrics {
/// Merges another metrics into this one.
pub fn merge_from(&mut self, other: &Self) {
self.total_bytes += other.total_bytes;
self.total_ranges += other.total_ranges;
self.fetch_elapsed += other.fetch_elapsed;
self.cache_hit += other.cache_hit;
self.cache_miss += other.cache_miss;
}
}
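// Illustrative usage (not part of this changeset): callers opt in to metrics
// collection by passing `Some(&mut metrics)` to the reader methods below;
// passing `None` skips all accounting. `reader`, `offset`, and `size` are
// placeholders.
let mut metrics = InvertedIndexReadMetrics::default();
let metas = reader.metadata(Some(&mut metrics)).await?;
let fst = reader.fst(offset, size, Some(&mut metrics)).await?;
println!("inverted index reads: {:?}", metrics);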
/// InvertedIndexReader defines an asynchronous reader of inverted index data
#[mockall::automock]
#[async_trait]
pub trait InvertedIndexReader: Send + Sync {
/// Seeks to given offset and reads data with exact size as provided.
async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>>;
async fn range_read<'a>(
&self,
offset: u64,
size: u32,
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Vec<u8>>;
/// Reads the bytes in the given ranges.
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
let mut result = Vec::with_capacity(ranges.len());
for range in ranges {
let data = self
.range_read(range.start, (range.end - range.start) as u32)
.await?;
result.push(Bytes::from(data));
}
Ok(result)
}
async fn read_vec<'a>(
&self,
ranges: &[Range<u64>],
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Vec<Bytes>>;
/// Retrieves metadata of all inverted indices stored within the blob.
async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>>;
async fn metadata<'a>(
&self,
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Arc<InvertedIndexMetas>>;
/// Retrieves the finite state transducer (FST) map from the given offset and size.
async fn fst(&self, offset: u64, size: u32) -> Result<FstMap> {
let fst_data = self.range_read(offset, size).await?;
async fn fst<'a>(
&self,
offset: u64,
size: u32,
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<FstMap> {
let fst_data = self.range_read(offset, size, metrics).await?;
FstMap::new(fst_data).context(DecodeFstSnafu)
}
/// Retrieves multiple finite state transducer (FST) maps from the given ranges.
async fn fst_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<FstMap>> {
self.read_vec(ranges)
async fn fst_vec<'a>(
&mut self,
ranges: &[Range<u64>],
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Vec<FstMap>> {
self.read_vec(ranges, metrics)
.await?
.into_iter()
.map(|bytes| FstMap::new(bytes.to_vec()).context(DecodeFstSnafu))
@@ -67,19 +146,28 @@ pub trait InvertedIndexReader: Send + Sync {
}
/// Retrieves the bitmap from the given offset and size.
async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
self.range_read(offset, size).await.and_then(|bytes| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
async fn bitmap<'a>(
&self,
offset: u64,
size: u32,
bitmap_type: BitmapType,
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Bitmap> {
self.range_read(offset, size, metrics)
.await
.and_then(|bytes| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
}
/// Retrieves the multiple bitmaps from the given ranges.
async fn bitmap_deque(
async fn bitmap_deque<'a>(
&mut self,
ranges: &[(Range<u64>, BitmapType)],
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<VecDeque<Bitmap>> {
let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
let bytes = self.read_vec(&ranges).await?;
let bytes = self.read_vec(&ranges, metrics).await?;
bytes
.into_iter()
.zip(types)
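
A caller-side sketch, not taken from the PR, of how the Option<&mut InvertedIndexReadMetrics> parameter and merge_from are meant to compose; the reader, offset, and size are placeholders, and the in-crate types are assumed to be in scope.

async fn scan_with_metrics(reader: &dyn InvertedIndexReader) -> Result<InvertedIndexReadMetrics> {
    // Each read API records into the metrics only when Some(_) is passed.
    let mut call_metrics = InvertedIndexReadMetrics::default();
    let _metas = reader.metadata(Some(&mut call_metrics)).await?;
    let _bytes = reader.range_read(0, 128, Some(&mut call_metrics)).await?;

    // Aggregate per-call metrics into a per-scan total.
    let mut scan_metrics = InvertedIndexReadMetrics::default();
    scan_metrics.merge_from(&call_metrics);
    Ok(scan_metrics)
}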

View File

@@ -14,6 +14,7 @@
use std::ops::Range;
use std::sync::Arc;
use std::time::Instant;
use async_trait::async_trait;
use bytes::Bytes;
@@ -23,10 +24,10 @@ use snafu::{ResultExt, ensure};
use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu};
use crate::inverted_index::format::MIN_BLOB_SIZE;
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::format::reader::footer::{
DEFAULT_PREFETCH_SIZE, InvertedIndexFooterReader,
};
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
/// Inverted index blob reader, implements [`InvertedIndexReader`]
pub struct InvertedIndexBlobReader<R> {
@@ -53,27 +54,58 @@ impl<R> InvertedIndexBlobReader<R> {
#[async_trait]
impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>> {
async fn range_read<'a>(
&self,
offset: u64,
size: u32,
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Vec<u8>> {
let start = metrics.as_ref().map(|_| Instant::now());
let buf = self
.source
.read(offset..offset + size as u64)
.await
.context(CommonIoSnafu)?;
if let Some(m) = metrics {
m.total_bytes += size as u64;
m.total_ranges += 1;
m.fetch_elapsed += start.unwrap().elapsed();
}
Ok(buf.into())
}
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
self.source.read_vec(ranges).await.context(CommonIoSnafu)
async fn read_vec<'a>(
&self,
ranges: &[Range<u64>],
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Vec<Bytes>> {
let start = metrics.as_ref().map(|_| Instant::now());
let result = self.source.read_vec(ranges).await.context(CommonIoSnafu)?;
if let Some(m) = metrics {
m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
m.total_ranges += ranges.len();
m.fetch_elapsed += start.unwrap().elapsed();
}
Ok(result)
}
async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>> {
async fn metadata<'a>(
&self,
metrics: Option<&'a mut InvertedIndexReadMetrics>,
) -> Result<Arc<InvertedIndexMetas>> {
let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
let blob_size = metadata.content_length;
Self::validate_blob_size(blob_size)?;
let mut footer_reader = InvertedIndexFooterReader::new(&self.source, blob_size)
.with_prefetch_size(DEFAULT_PREFETCH_SIZE);
footer_reader.metadata().await.map(Arc::new)
footer_reader.metadata(metrics).await.map(Arc::new)
}
}
@@ -173,7 +205,7 @@ mod tests {
let blob = create_inverted_index_blob();
let blob_reader = InvertedIndexBlobReader::new(blob);
let metas = blob_reader.metadata().await.unwrap();
let metas = blob_reader.metadata(None).await.unwrap();
assert_eq!(metas.metas.len(), 2);
let meta0 = metas.metas.get("tag0").unwrap();
@@ -200,13 +232,14 @@ mod tests {
let blob = create_inverted_index_blob();
let blob_reader = InvertedIndexBlobReader::new(blob);
let metas = blob_reader.metadata().await.unwrap();
let metas = blob_reader.metadata(None).await.unwrap();
let meta = metas.metas.get("tag0").unwrap();
let fst_map = blob_reader
.fst(
meta.base_offset + meta.relative_fst_offset as u64,
meta.fst_size,
None,
)
.await
.unwrap();
@@ -219,6 +252,7 @@ mod tests {
.fst(
meta.base_offset + meta.relative_fst_offset as u64,
meta.fst_size,
None,
)
.await
.unwrap();
@@ -232,30 +266,30 @@ mod tests {
let blob = create_inverted_index_blob();
let blob_reader = InvertedIndexBlobReader::new(blob);
let metas = blob_reader.metadata().await.unwrap();
let metas = blob_reader.metadata(None).await.unwrap();
let meta = metas.metas.get("tag0").unwrap();
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let metas = blob_reader.metadata().await.unwrap();
let metas = blob_reader.metadata(None).await.unwrap();
let meta = metas.metas.get("tag1").unwrap();
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
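
A standalone sketch, not from the PR, of the "time only when asked" pattern used by the blob reader above: the Instant is created if and only if a metrics sink was supplied, so the later unwrap on start is guarded by the same Option.

use std::time::{Duration, Instant};

fn timed(metrics: Option<&mut Duration>) -> u64 {
    // `start` is Some exactly when `metrics` is Some.
    let start = metrics.as_ref().map(|_| Instant::now());
    let result = 42; // stand-in for the actual read
    if let Some(m) = metrics {
        *m += start.unwrap().elapsed();
    }
    result
}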

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Instant;
use common_base::range_read::RangeReader;
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
use prost::Message;
@@ -23,6 +25,7 @@ use crate::inverted_index::error::{
UnexpectedZeroSegmentRowCountSnafu,
};
use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE;
use crate::inverted_index::format::reader::InvertedIndexReadMetrics;
pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB
@@ -54,12 +57,17 @@ impl<R> InvertedIndexFooterReader<R> {
}
impl<R: RangeReader> InvertedIndexFooterReader<R> {
pub async fn metadata(&mut self) -> Result<InvertedIndexMetas> {
pub async fn metadata(
&mut self,
mut metrics: Option<&mut InvertedIndexReadMetrics>,
) -> Result<InvertedIndexMetas> {
ensure!(
self.blob_size >= FOOTER_PAYLOAD_SIZE_SIZE,
BlobSizeTooSmallSnafu
);
let start = metrics.as_ref().map(|_| Instant::now());
let footer_start = self.blob_size.saturating_sub(self.prefetch_size());
let suffix = self
.source
@@ -73,19 +81,36 @@ impl<R: RangeReader> InvertedIndexFooterReader<R> {
let footer_size = FOOTER_PAYLOAD_SIZE_SIZE;
// The initial read did not fetch the entire metadata payload, so a second request is needed.
if length > suffix_len as u64 - footer_size {
let result = if length > suffix_len as u64 - footer_size {
let metadata_start = self.blob_size - length - footer_size;
let meta = self
.source
.read(metadata_start..self.blob_size - footer_size)
.await
.context(CommonIoSnafu)?;
if let Some(m) = metrics.as_deref_mut() {
m.total_bytes += self.blob_size.min(self.prefetch_size()) + length;
m.total_ranges += 2;
}
self.parse_payload(&meta, length)
} else {
if let Some(m) = metrics.as_deref_mut() {
m.total_bytes += self.blob_size.min(self.prefetch_size());
m.total_ranges += 1;
}
let metadata_start = self.blob_size - length - footer_size - footer_start;
let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize];
self.parse_payload(meta, length)
};
if let Some(m) = metrics {
m.fetch_elapsed += start.unwrap().elapsed();
}
result
}
fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> {
@@ -186,7 +211,7 @@ mod tests {
reader = reader.with_prefetch_size(prefetch);
}
let metas = reader.metadata().await.unwrap();
let metas = reader.metadata(None).await.unwrap();
assert_eq!(metas.metas.len(), 1);
let index_meta = &metas.metas.get("test").unwrap();
assert_eq!(index_meta.name, "test");
@@ -210,7 +235,7 @@ mod tests {
reader = reader.with_prefetch_size(prefetch);
}
let result = reader.metadata().await;
let result = reader.metadata(None).await;
assert_matches!(result, Err(Error::UnexpectedFooterPayloadSize { .. }));
}
}
@@ -233,7 +258,7 @@ mod tests {
reader = reader.with_prefetch_size(prefetch);
}
let result = reader.metadata().await;
let result = reader.metadata(None).await;
assert_matches!(result, Err(Error::UnexpectedOffsetSize { .. }));
}
}
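
A small sketch, not from the PR, of the byte and range accounting done by the footer reader above. The 4-byte footer-size field and the 8 KiB prefetch are assumptions mirroring FOOTER_PAYLOAD_SIZE_SIZE and DEFAULT_PREFETCH_SIZE; the real reader validates blob_size before this point.

/// Returns the (total_bytes, total_ranges) recorded for one metadata() call.
fn footer_read_accounting(blob_size: u64, payload_len: u64) -> (u64, usize) {
    const PREFETCH: u64 = 8192; // assumed DEFAULT_PREFETCH_SIZE
    const FOOTER_SIZE: u64 = 4; // assumed FOOTER_PAYLOAD_SIZE_SIZE

    let prefetched = blob_size.min(PREFETCH);
    if payload_len > prefetched - FOOTER_SIZE {
        // The prefetched suffix did not contain the whole payload: one extra request.
        (prefetched + payload_len, 2)
    } else {
        // The payload was already inside the prefetched suffix.
        (prefetched, 1)
    }
}

For example, a 10_000-byte payload in a 100_000-byte blob yields (18_192, 2), while a 1_000-byte payload yields (8_192, 1).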

View File

@@ -122,7 +122,7 @@ mod tests {
.unwrap();
let reader = InvertedIndexBlobReader::new(blob);
let metadata = reader.metadata().await.unwrap();
let metadata = reader.metadata(None).await.unwrap();
assert_eq!(metadata.total_row_count, 8);
assert_eq!(metadata.segment_row_count, 1);
assert_eq!(metadata.metas.len(), 0);
@@ -182,7 +182,7 @@ mod tests {
.unwrap();
let reader = InvertedIndexBlobReader::new(blob);
let metadata = reader.metadata().await.unwrap();
let metadata = reader.metadata(None).await.unwrap();
assert_eq!(metadata.total_row_count, 8);
assert_eq!(metadata.segment_row_count, 1);
assert_eq!(metadata.metas.len(), 2);
@@ -198,13 +198,19 @@ mod tests {
.fst(
tag0.base_offset + tag0.relative_fst_offset as u64,
tag0.fst_size,
None,
)
.await
.unwrap();
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(
tag0.base_offset + offset as u64,
size,
BitmapType::Roaring,
None,
)
.await
.unwrap();
assert_eq!(
@@ -213,7 +219,12 @@ mod tests {
);
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(
tag0.base_offset + offset as u64,
size,
BitmapType::Roaring,
None,
)
.await
.unwrap();
assert_eq!(
@@ -222,7 +233,12 @@ mod tests {
);
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(
tag0.base_offset + offset as u64,
size,
BitmapType::Roaring,
None,
)
.await
.unwrap();
assert_eq!(
@@ -241,13 +257,19 @@ mod tests {
.fst(
tag1.base_offset + tag1.relative_fst_offset as u64,
tag1.fst_size,
None,
)
.await
.unwrap();
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(
tag1.base_offset + offset as u64,
size,
BitmapType::Roaring,
None,
)
.await
.unwrap();
assert_eq!(
@@ -256,7 +278,12 @@ mod tests {
);
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(
tag1.base_offset + offset as u64,
size,
BitmapType::Roaring,
None,
)
.await
.unwrap();
assert_eq!(
@@ -265,7 +292,12 @@ mod tests {
);
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(
tag1.base_offset + offset as u64,
size,
BitmapType::Roaring,
None,
)
.await
.unwrap();
assert_eq!(

View File

@@ -16,7 +16,7 @@ use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
/// `ParallelFstValuesMapper` enables parallel mapping of multiple FST value groups to their
/// corresponding bitmaps within an inverted index.
@@ -35,7 +35,8 @@ impl<'a> ParallelFstValuesMapper<'a> {
pub async fn map_values_vec(
&mut self,
value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
value_and_meta_vec: &[(Vec<u64>, &InvertedIndexMeta)],
metrics: Option<&mut InvertedIndexReadMetrics>,
) -> Result<Vec<Bitmap>> {
let groups = value_and_meta_vec
.iter()
@@ -64,7 +65,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
}
common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges).await?;
let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges, metrics).await?;
let mut output = Vec::with_capacity(groups.len());
for counter in groups {
@@ -95,23 +96,25 @@ mod tests {
#[tokio::test]
async fn test_map_values_vec() {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
mock_reader
.expect_bitmap_deque()
.returning(|ranges, _metrics| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
}
_ => unreachable!(),
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
}
_ => unreachable!(),
}
}
Ok(output)
});
Ok(output)
});
let meta = InvertedIndexMeta {
bitmap_type: BitmapType::Roaring.into(),
@@ -120,13 +123,13 @@ mod tests {
let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);
let result = values_mapper
.map_values_vec(&[(vec![], &meta)])
.map_values_vec(&[(vec![], &meta)], None)
.await
.unwrap();
assert_eq!(result[0].count_ones(), 0);
let result = values_mapper
.map_values_vec(&[(vec![value(1, 1)], &meta)])
.map_values_vec(&[(vec![value(1, 1)], &meta)], None)
.await
.unwrap();
assert_eq!(
@@ -135,7 +138,7 @@ mod tests {
);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta)])
.map_values_vec(&[(vec![value(2, 1)], &meta)], None)
.await
.unwrap();
assert_eq!(
@@ -144,7 +147,7 @@ mod tests {
);
let result = values_mapper
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)], None)
.await
.unwrap();
assert_eq!(
@@ -153,7 +156,7 @@ mod tests {
);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)], None)
.await
.unwrap();
assert_eq!(
@@ -162,7 +165,10 @@ mod tests {
);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
.map_values_vec(
&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)],
None,
)
.await
.unwrap();
assert_eq!(
@@ -174,10 +180,13 @@ mod tests {
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
let result = values_mapper
.map_values_vec(&[
(vec![value(2, 1), value(1, 1)], &meta),
(vec![value(1, 1)], &meta),
])
.map_values_vec(
&[
(vec![value(2, 1), value(1, 1)], &meta),
(vec![value(1, 1)], &meta),
],
None,
)
.await
.unwrap();
assert_eq!(

View File

@@ -19,7 +19,7 @@ pub use predicates_apply::PredicatesIndexApplier;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
/// The output of an apply operation.
#[derive(Clone, Debug, PartialEq)]
@@ -44,10 +44,11 @@ pub trait IndexApplier: Send + Sync {
/// Applies the predefined predicates to the data read by the given index reader, returning
/// a list of relevant indices (e.g., post IDs, group IDs, row IDs).
#[allow(unused_parens)]
async fn apply<'a>(
async fn apply<'a, 'b>(
&self,
context: SearchContext,
reader: &mut (dyn InvertedIndexReader + 'a),
metrics: Option<&'b mut InvertedIndexReadMetrics>,
) -> Result<ApplyOutput>;
/// Returns the memory usage of the applier.

View File

@@ -19,7 +19,7 @@ use greptime_proto::v1::index::InvertedIndexMetas;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
use crate::inverted_index::search::fst_apply::{
FstApplier, IntersectionFstApplier, KeysFstApplier,
};
@@ -43,12 +43,14 @@ pub struct PredicatesIndexApplier {
impl IndexApplier for PredicatesIndexApplier {
/// Applies all `FstApplier`s to the data in the inverted index reader, intersecting the individual
/// bitmaps obtained for each index to result in a final set of indices.
async fn apply<'a>(
async fn apply<'a, 'b>(
&self,
context: SearchContext,
reader: &mut (dyn InvertedIndexReader + 'a),
metrics: Option<&'b mut InvertedIndexReadMetrics>,
) -> Result<ApplyOutput> {
let metadata = reader.metadata().await?;
let mut metrics = metrics;
let metadata = reader.metadata(metrics.as_deref_mut()).await?;
let mut output = ApplyOutput {
matched_segment_ids: Bitmap::new_bitvec(),
total_row_count: metadata.total_row_count as _,
@@ -84,7 +86,7 @@ impl IndexApplier for PredicatesIndexApplier {
return Ok(output);
}
let fsts = reader.fst_vec(&fst_ranges).await?;
let fsts = reader.fst_vec(&fst_ranges, metrics.as_deref_mut()).await?;
let value_and_meta_vec = fsts
.into_iter()
.zip(appliers)
@@ -92,7 +94,7 @@ impl IndexApplier for PredicatesIndexApplier {
.collect::<Vec<_>>();
let mut mapper = ParallelFstValuesMapper::new(reader);
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec, metrics).await?;
let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
for bm in bm_vec {
@@ -221,26 +223,28 @@ mod tests {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader
.expect_metadata()
.returning(|| Ok(mock_metas([("tag-0", 0)])));
mock_reader.expect_fst_vec().returning(|_ranges| {
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
mock_reader.expect_fst_vec().returning(|_ranges, _metrics| {
Ok(vec![
FstMap::from_iter([(b"tag-0_value-0", fst_value(2, 1))]).unwrap(),
])
});
mock_reader.expect_bitmap_deque().returning(|arg| {
assert_eq!(arg.len(), 1);
let range = &arg[0].0;
let bitmap_type = arg[0].1;
assert_eq!(*range, 2..3);
assert_eq!(bitmap_type, BitmapType::Roaring);
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
&[0b10101010],
bitmap_type,
)]))
});
mock_reader
.expect_bitmap_deque()
.returning(|arg, _metrics| {
assert_eq!(arg.len(), 1);
let range = &arg[0].0;
let bitmap_type = arg[0].1;
assert_eq!(*range, 2..3);
assert_eq!(bitmap_type, BitmapType::Roaring);
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
&[0b10101010],
bitmap_type,
)]))
});
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
.apply(SearchContext::default(), &mut mock_reader, None)
.await
.unwrap();
assert_eq!(
@@ -252,14 +256,14 @@ mod tests {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader
.expect_metadata()
.returning(|| Ok(mock_metas([("tag-0", 0)])));
mock_reader.expect_fst_vec().returning(|_range| {
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
mock_reader.expect_fst_vec().returning(|_range, _metrics| {
Ok(vec![
FstMap::from_iter([(b"tag-0_value-1", fst_value(2, 1))]).unwrap(),
])
});
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
.apply(SearchContext::default(), &mut mock_reader, None)
.await
.unwrap();
assert_eq!(output.matched_segment_ids.count_ones(), 0);
@@ -279,8 +283,8 @@ mod tests {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader
.expect_metadata()
.returning(|| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
mock_reader.expect_fst_vec().returning(|ranges| {
.returning(|_| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
mock_reader.expect_fst_vec().returning(|ranges, _metrics| {
let mut output = vec![];
for range in ranges {
match range.start {
@@ -293,27 +297,29 @@ mod tests {
}
Ok(output)
});
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
mock_reader
.expect_bitmap_deque()
.returning(|ranges, _metrics| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
}
_ => unreachable!(),
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
}
_ => unreachable!(),
}
}
Ok(output)
});
Ok(output)
});
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
.apply(SearchContext::default(), &mut mock_reader, None)
.await
.unwrap();
assert_eq!(
@@ -331,10 +337,10 @@ mod tests {
let mut mock_reader: MockInvertedIndexReader = MockInvertedIndexReader::new();
mock_reader
.expect_metadata()
.returning(|| Ok(mock_metas([("tag-0", 0)])));
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
.apply(SearchContext::default(), &mut mock_reader, None)
.await
.unwrap();
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
@@ -343,7 +349,7 @@ mod tests {
#[tokio::test]
async fn test_index_applier_with_empty_index() {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader.expect_metadata().returning(move || {
mock_reader.expect_metadata().returning(move |_| {
Ok(Arc::new(InvertedIndexMetas {
total_row_count: 0, // No rows
segment_row_count: 1,
@@ -359,7 +365,7 @@ mod tests {
};
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
.apply(SearchContext::default(), &mut mock_reader, None)
.await
.unwrap();
assert!(output.matched_segment_ids.is_empty());
@@ -370,7 +376,7 @@ mod tests {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader
.expect_metadata()
.returning(|| Ok(mock_metas(vec![])));
.returning(|_| Ok(mock_metas(vec![])));
let mut mock_fst_applier = MockFstApplier::new();
mock_fst_applier.expect_apply().never();
@@ -385,6 +391,7 @@ mod tests {
index_not_found_strategy: IndexNotFoundStrategy::ThrowError,
},
&mut mock_reader,
None,
)
.await;
assert!(matches!(result, Err(Error::IndexNotFound { .. })));
@@ -395,6 +402,7 @@ mod tests {
index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty,
},
&mut mock_reader,
None,
)
.await
.unwrap();
@@ -406,6 +414,7 @@ mod tests {
index_not_found_strategy: IndexNotFoundStrategy::Ignore,
},
&mut mock_reader,
None,
)
.await
.unwrap();
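
A hedged sketch, not part of the diff, of how a caller could thread read metrics through IndexApplier::apply; the function name is illustrative and imports of the in-crate types are omitted.

async fn apply_with_metrics(
    applier: &PredicatesIndexApplier,
    reader: &mut dyn InvertedIndexReader,
) -> Result<(ApplyOutput, InvertedIndexReadMetrics)> {
    let mut metrics = InvertedIndexReadMetrics::default();
    let output = applier
        .apply(SearchContext::default(), reader, Some(&mut metrics))
        .await?;
    // `metrics` now covers the metadata, FST, and bitmap reads performed by apply().
    Ok((output, metrics))
}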

View File

@@ -16,7 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use common_wal::config::kafka::DatanodeKafkaConfig;
use common_wal::config::kafka::common::DEFAULT_BACKOFF_CONFIG;
use common_wal::config::kafka::common::{DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT};
use dashmap::DashMap;
use rskafka::client::ClientBuilder;
use rskafka::client::partition::{Compression, PartitionClient, UnknownTopicHandling};
@@ -78,7 +78,8 @@ impl ClientManager {
) -> Result<Self> {
// Sets backoff config and connect timeout for the top-level kafka client and all clients constructed by it.
let mut builder = ClientBuilder::new(config.connection.broker_endpoints.clone())
.backoff_config(DEFAULT_BACKOFF_CONFIG);
.backoff_config(DEFAULT_BACKOFF_CONFIG)
.connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
if let Some(sasl) = &config.connection.sasl {
builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
};

View File

@@ -189,6 +189,9 @@ impl MetaClientBuilder {
let mgr = client.channel_manager.clone();
if self.enable_heartbeat {
if self.heartbeat_channel_manager.is_some() {
info!("Enable heartbeat channel using the heartbeat channel manager.");
}
let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone());
client.heartbeat = Some(HeartbeatClient::new(
self.id,

View File

@@ -24,7 +24,7 @@ use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::warn;
use rand::seq::SliceRandom;
use snafu::{OptionExt, ResultExt};
use snafu::ResultExt;
use tokio::time::timeout;
use tonic::transport::Channel;
@@ -101,12 +101,14 @@ impl AskLeader {
};
let (tx, mut rx) = tokio::sync::mpsc::channel(peers.len());
let channel_manager = self.channel_manager.clone();
for addr in &peers {
let mut client = self.create_asker(addr)?;
let tx_clone = tx.clone();
let req = req.clone();
let addr = addr.clone();
let channel_manager = channel_manager.clone();
tokio::spawn(async move {
match client.ask_leader(req).await {
Ok(res) => {
@@ -117,13 +119,19 @@ impl AskLeader {
};
}
Err(status) => {
// Reset cached channel even on generic errors: the VIP may keep us on a dead
// backend, so forcing a reconnect gives us a chance to hit a healthy peer.
Self::reset_channels_with_manager(
&channel_manager,
std::slice::from_ref(&addr),
);
warn!("Failed to ask leader from: {addr}, {status}");
}
}
});
}
let leader = timeout(
let leader = match timeout(
self.channel_manager
.config()
.timeout
@@ -131,8 +139,16 @@ impl AskLeader {
rx.recv(),
)
.await
.context(error::AskLeaderTimeoutSnafu)?
.context(error::NoLeaderSnafu)?;
{
Ok(Some(leader)) => leader,
Ok(None) => return error::NoLeaderSnafu.fail(),
Err(e) => {
// All peers timed out. Reset channels to force reconnection,
// which may help escape dead backends in VIP/LB scenarios.
Self::reset_channels_with_manager(&self.channel_manager, &peers);
return Err(e).context(error::AskLeaderTimeoutSnafu);
}
};
let mut leadership_group = self.leadership_group.write().unwrap();
leadership_group.leader = Some(leader.clone());
@@ -169,6 +185,15 @@ impl AskLeader {
.context(error::CreateChannelSnafu)?,
))
}
/// Drop cached channels for the given peers so a fresh connection is used next time.
fn reset_channels_with_manager(channel_manager: &ChannelManager, peers: &[String]) {
if peers.is_empty() {
return;
}
channel_manager.retain_channel(|addr, _| !peers.iter().any(|peer| peer == addr));
}
}
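
A self-contained sketch, not from the PR, of the timeout-plus-channel match used in ask_leader above: Ok(Some(_)) is a timely answer, Ok(None) means every sender dropped without answering, and Err(_) means the overall deadline expired.

use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::timeout;

async fn first_reply(mut rx: mpsc::Receiver<String>) -> Result<String, &'static str> {
    match timeout(Duration::from_secs(3), rx.recv()).await {
        Ok(Some(reply)) => Ok(reply),         // a peer answered in time
        Ok(None) => Err("no sender replied"), // all senders were dropped
        Err(_elapsed) => Err("deadline expired"),
    }
}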
#[async_trait]

View File

@@ -18,6 +18,10 @@ use std::time::Duration;
use client::RegionFollowerClientRef;
use common_base::Plugins;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::distributed_time_constants::{
HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS, HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS,
HEARTBEAT_TIMEOUT,
};
use common_telemetry::{debug, info};
use serde::{Deserialize, Serialize};
@@ -34,8 +38,6 @@ pub struct MetaClientOptions {
#[serde(with = "humantime_serde")]
pub timeout: Duration,
#[serde(with = "humantime_serde")]
pub heartbeat_timeout: Duration,
#[serde(with = "humantime_serde")]
pub ddl_timeout: Duration,
#[serde(with = "humantime_serde")]
pub connect_timeout: Duration,
@@ -52,7 +54,6 @@ impl Default for MetaClientOptions {
Self {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_millis(3_000u64),
heartbeat_timeout: Duration::from_millis(500u64),
ddl_timeout: Duration::from_millis(10_000u64),
connect_timeout: Duration::from_millis(1_000u64),
tcp_nodelay: true,
@@ -97,7 +98,11 @@ pub async fn create_meta_client(
.timeout(meta_client_options.timeout)
.connect_timeout(meta_client_options.connect_timeout)
.tcp_nodelay(meta_client_options.tcp_nodelay);
let heartbeat_config = base_config.clone();
let heartbeat_config = base_config
.clone()
.timeout(HEARTBEAT_TIMEOUT)
.http2_keep_alive_interval(HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS)
.http2_keep_alive_timeout(HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS);
if let MetaClientType::Frontend = client_type {
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);

View File

@@ -14,6 +14,7 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use api::v1::meta::cluster_server::ClusterServer;
use api::v1::meta::heartbeat_server::HeartbeatServer;
@@ -49,16 +50,21 @@ use crate::metasrv::builder::MetasrvBuilder;
use crate::metasrv::{
BackendImpl, ElectionRef, Metasrv, MetasrvOptions, SelectTarget, SelectorRef,
};
use crate::selector::SelectorType;
use crate::selector::lease_based::LeaseBasedSelector;
use crate::selector::load_based::LoadBasedSelector;
use crate::selector::round_robin::RoundRobinSelector;
use crate::selector::weight_compute::RegionNumsBasedWeightCompute;
use crate::selector::{Selector, SelectorType};
use crate::service::admin;
use crate::service::admin::admin_axum_router;
use crate::utils::etcd::create_etcd_client_with_tls;
use crate::{Result, error};
/// The default keep-alive interval for gRPC.
const DEFAULT_GRPC_KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(10);
/// The default keep-alive timeout for gRPC.
const DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(10);
pub struct MetasrvInstance {
metasrv: Arc<Metasrv>,
@@ -245,7 +251,12 @@ macro_rules! add_compressed_service {
}
pub fn router(metasrv: Arc<Metasrv>) -> Router {
let mut router = tonic::transport::Server::builder().accept_http1(true); // for admin services
let mut router = tonic::transport::Server::builder()
// for admin services
.accept_http1(true)
// For quick detection of network failures.
.http2_keepalive_interval(Some(DEFAULT_GRPC_KEEP_ALIVE_INTERVAL))
.http2_keepalive_timeout(Some(DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT));
let router = add_compressed_service!(router, HeartbeatServer::from_arc(metasrv.clone()));
let router = add_compressed_service!(router, StoreServer::from_arc(metasrv.clone()));
let router = add_compressed_service!(router, ClusterServer::from_arc(metasrv.clone()));
@@ -393,7 +404,12 @@ pub async fn metasrv_builder(
info!("Using selector from plugins");
selector
} else {
let selector = match opts.selector {
let selector: Arc<
dyn Selector<
Context = crate::metasrv::SelectorContext,
Output = Vec<common_meta::peer::Peer>,
>,
> = match opts.selector {
SelectorType::LoadBased => Arc::new(LoadBasedSelector::new(
RegionNumsBasedWeightCompute,
meta_peer_client.clone(),

View File

@@ -63,22 +63,6 @@ pub struct EtcdElection {
}
impl EtcdElection {
pub async fn with_endpoints<E, S>(
leader_value: E,
endpoints: S,
store_key_prefix: String,
) -> Result<ElectionRef>
where
E: AsRef<str>,
S: AsRef<[E]>,
{
let client = Client::connect(endpoints, None)
.await
.context(error::ConnectEtcdSnafu)?;
Self::with_etcd_client(leader_value, client, store_key_prefix).await
}
pub async fn with_etcd_client<E>(
leader_value: E,
client: Client,

View File

@@ -23,6 +23,8 @@ use store_api::storage::RegionId;
mod candidate;
mod ctx;
mod handler;
#[cfg(test)]
mod mock;
mod options;
mod procedure;
mod scheduler;

View File

@@ -88,7 +88,8 @@ impl GcScheduler {
// Skip regions that are in cooldown period
if let Some(gc_info) = tracker.get(&region_stat.id)
&& now.duration_since(gc_info.last_gc_time) < self.config.gc_cooldown_period
&& now.saturating_duration_since(gc_info.last_gc_time)
< self.config.gc_cooldown_period
{
debug!("Skipping region {} due to cooldown", region_stat.id);
continue;
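
A minimal sketch, not from the PR, of why saturating_duration_since is preferred here: if the stored timestamp happens to be ahead of now, it yields Duration::ZERO instead of relying on duration_since, which panicked in that case on older toolchains.

use std::time::{Duration, Instant};

fn still_cooling_down(now: Instant, last_gc: Instant, cooldown: Duration) -> bool {
    // Never underflows: a future `last_gc` simply reads as "zero time elapsed".
    now.saturating_duration_since(last_gc) < cooldown
}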

View File

@@ -434,7 +434,7 @@ impl GcScheduler {
if let Some(gc_info) = gc_tracker.get(&region_id) {
if let Some(last_full_listing) = gc_info.last_full_listing_time {
// Check whether the cooldown interval has passed since the last full listing.
let elapsed = now.duration_since(last_full_listing);
let elapsed = now.saturating_duration_since(last_full_listing);
elapsed >= self.config.full_file_listing_interval
} else {
// Never did full listing for this region, do it now

src/meta-srv/src/gc/mock.rs (new file, 458 lines)
View File

@@ -0,0 +1,458 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
mod basic;
mod candidate_select;
mod concurrent;
mod config;
mod err_handle;
mod full_list;
mod integration;
mod misc;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use common_meta::datanode::{RegionManifestInfo, RegionStat};
use common_meta::key::table_route::PhysicalTableRouteValue;
use common_meta::peer::Peer;
use common_meta::rpc::router::{Region, RegionRoute};
use common_telemetry::debug;
use ordered_float::OrderedFloat;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileRefsManifest, GcReport, RegionId};
use table::metadata::TableId;
use tokio::sync::mpsc::Sender;
use crate::error::{Result, UnexpectedSnafu};
use crate::gc::candidate::GcCandidate;
use crate::gc::ctx::SchedulerCtx;
use crate::gc::handler::Region2Peers;
use crate::gc::options::GcSchedulerOptions;
use crate::gc::scheduler::{Event, GcScheduler};
pub const TEST_REGION_SIZE_200MB: u64 = 200_000_000;
/// Helper function to create an empty GcReport for the given region IDs
pub fn new_empty_report_with(region_ids: impl IntoIterator<Item = RegionId>) -> GcReport {
let mut deleted_files = HashMap::new();
for region_id in region_ids {
deleted_files.insert(region_id, vec![]);
}
GcReport {
deleted_files,
need_retry_regions: HashSet::new(),
}
}
#[allow(clippy::type_complexity)]
#[derive(Debug, Default)]
pub struct MockSchedulerCtx {
pub table_to_region_stats: Arc<Mutex<Option<HashMap<TableId, Vec<RegionStat>>>>>,
pub table_routes: Arc<Mutex<HashMap<TableId, (TableId, PhysicalTableRouteValue)>>>,
pub file_refs: Arc<Mutex<Option<FileRefsManifest>>>,
pub gc_reports: Arc<Mutex<HashMap<RegionId, GcReport>>>,
pub candidates: Arc<Mutex<Option<HashMap<TableId, Vec<GcCandidate>>>>>,
pub get_table_to_region_stats_calls: Arc<Mutex<usize>>,
pub get_file_references_calls: Arc<Mutex<usize>>,
pub gc_regions_calls: Arc<Mutex<usize>>,
// Error injection fields for testing
pub get_table_to_region_stats_error: Arc<Mutex<Option<crate::error::Error>>>,
pub get_table_route_error: Arc<Mutex<Option<crate::error::Error>>>,
pub get_file_references_error: Arc<Mutex<Option<crate::error::Error>>>,
pub gc_regions_error: Arc<Mutex<Option<crate::error::Error>>>,
// Retry testing fields
pub gc_regions_retry_count: Arc<Mutex<HashMap<RegionId, usize>>>,
pub gc_regions_error_sequence: Arc<Mutex<Vec<crate::error::Error>>>,
pub gc_regions_success_after_retries: Arc<Mutex<HashMap<RegionId, usize>>>,
// Per-region error injection
pub gc_regions_per_region_errors: Arc<Mutex<HashMap<RegionId, crate::error::Error>>>,
}
impl MockSchedulerCtx {
pub fn with_table_routes(
self,
table_routes: HashMap<TableId, (TableId, Vec<(RegionId, Peer)>)>,
) -> Self {
*self.table_routes.lock().unwrap() = table_routes
.into_iter()
.map(|(k, (phy_id, region2peer))| {
let phy = PhysicalTableRouteValue::new(
region2peer
.into_iter()
.map(|(region_id, peer)| RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(peer),
..Default::default()
})
.collect(),
);
(k, (phy_id, phy))
})
.collect();
self
}
/// Set an error to be returned by `get_table_to_region_stats`
#[allow(dead_code)]
pub fn with_get_table_to_region_stats_error(self, error: crate::error::Error) -> Self {
*self.get_table_to_region_stats_error.lock().unwrap() = Some(error);
self
}
/// Set an error to be returned by `get_table_route`
pub fn set_table_route_error(&self, error: crate::error::Error) {
*self.get_table_route_error.lock().unwrap() = Some(error);
}
/// Set an error to be returned by `get_file_references`
#[allow(dead_code)]
pub fn with_get_file_references_error(self, error: crate::error::Error) -> Self {
*self.get_file_references_error.lock().unwrap() = Some(error);
self
}
/// Set an error to be returned by `gc_regions`
pub fn with_gc_regions_error(self, error: crate::error::Error) -> Self {
*self.gc_regions_error.lock().unwrap() = Some(error);
self
}
/// Set a sequence of errors to be returned by `gc_regions` for retry testing
pub fn set_gc_regions_error_sequence(&self, errors: Vec<crate::error::Error>) {
*self.gc_regions_error_sequence.lock().unwrap() = errors;
}
/// Set success after a specific number of retries for a region
pub fn set_gc_regions_success_after_retries(&self, region_id: RegionId, retries: usize) {
self.gc_regions_success_after_retries
.lock()
.unwrap()
.insert(region_id, retries);
}
/// Get the retry count for a specific region
pub fn get_retry_count(&self, region_id: RegionId) -> usize {
self.gc_regions_retry_count
.lock()
.unwrap()
.get(&region_id)
.copied()
.unwrap_or(0)
}
/// Reset all retry tracking
pub fn reset_retry_tracking(&self) {
*self.gc_regions_retry_count.lock().unwrap() = HashMap::new();
*self.gc_regions_error_sequence.lock().unwrap() = Vec::new();
*self.gc_regions_success_after_retries.lock().unwrap() = HashMap::new();
}
/// Set an error to be returned for a specific region
pub fn set_gc_regions_error_for_region(&self, region_id: RegionId, error: crate::error::Error) {
self.gc_regions_per_region_errors
.lock()
.unwrap()
.insert(region_id, error);
}
/// Clear per-region errors
#[allow(unused)]
pub fn clear_gc_regions_per_region_errors(&self) {
self.gc_regions_per_region_errors.lock().unwrap().clear();
}
}
#[async_trait::async_trait]
impl SchedulerCtx for MockSchedulerCtx {
async fn get_table_to_region_stats(&self) -> Result<HashMap<TableId, Vec<RegionStat>>> {
*self.get_table_to_region_stats_calls.lock().unwrap() += 1;
// Check if we should return an injected error
if let Some(error) = self.get_table_to_region_stats_error.lock().unwrap().take() {
return Err(error);
}
Ok(self
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default())
}
async fn get_table_route(
&self,
table_id: TableId,
) -> Result<(TableId, PhysicalTableRouteValue)> {
// Check if we should return an injected error
if let Some(error) = self.get_table_route_error.lock().unwrap().take() {
return Err(error);
}
Ok(self
.table_routes
.lock()
.unwrap()
.get(&table_id)
.cloned()
.unwrap_or_else(|| (table_id, PhysicalTableRouteValue::default())))
}
async fn get_file_references(
&self,
query_regions: &[RegionId],
_related_regions: HashMap<RegionId, Vec<RegionId>>,
region_to_peer: &Region2Peers,
_timeout: Duration,
) -> Result<FileRefsManifest> {
*self.get_file_references_calls.lock().unwrap() += 1;
// Check if we should return an injected error
if let Some(error) = self.get_file_references_error.lock().unwrap().take() {
return Err(error);
}
if query_regions
.iter()
.any(|region_id| !region_to_peer.contains_key(region_id))
{
UnexpectedSnafu {
violated: format!(
"region_to_peer{region_to_peer:?} does not contain all region_ids requested: {:?}",
query_regions
),
}.fail()?;
}
Ok(self.file_refs.lock().unwrap().clone().unwrap_or_default())
}
async fn gc_regions(
&self,
_peer: Peer,
region_ids: &[RegionId],
_file_refs_manifest: &FileRefsManifest,
_full_file_listing: bool,
_timeout: Duration,
) -> Result<GcReport> {
*self.gc_regions_calls.lock().unwrap() += 1;
// Check per-region error injection first (for any region)
for &region_id in region_ids {
if let Some(error) = self
.gc_regions_per_region_errors
.lock()
.unwrap()
.remove(&region_id)
{
*self
.gc_regions_retry_count
.lock()
.unwrap()
.entry(region_id)
.or_insert(0) += 1;
return Err(error);
}
}
// Check if we should return an injected error
if let Some(error) = self.gc_regions_error.lock().unwrap().take() {
for region_id in region_ids {
*self
.gc_regions_retry_count
.lock()
.unwrap()
.entry(*region_id)
.or_insert(0) += 1;
}
return Err(error);
}
// Handle error sequence for retry testing
{
let mut error_sequence = self.gc_regions_error_sequence.lock().unwrap();
if !error_sequence.is_empty() {
let error = error_sequence.remove(0);
for region_id in region_ids {
*self
.gc_regions_retry_count
.lock()
.unwrap()
.entry(*region_id)
.or_insert(0) += 1;
}
return Err(error);
}
}
// Build the final report by processing each region individually
let mut final_report = GcReport::default();
let gc_reports = self.gc_reports.lock().unwrap();
let success_after_retries = self.gc_regions_success_after_retries.lock().unwrap();
for &region_id in region_ids {
// Get current retry count for this region
let retry_count = self
.gc_regions_retry_count
.lock()
.unwrap()
.get(&region_id)
.copied()
.unwrap_or(0);
// Check if this region should succeed or need retry
if let Some(&required_retries) = success_after_retries.get(&region_id) {
if retry_count < required_retries {
debug!(
"Region {} needs retry (attempt {}/{})",
region_id,
retry_count + 1,
required_retries
);
// This region needs more retries - add to need_retry_regions
final_report.need_retry_regions.insert(region_id);
// Track the retry attempt
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
*retry_count_map.entry(region_id).or_insert(0) += 1;
} else {
debug!(
"Region {} has completed retries - succeeding now",
region_id
);
// This region has completed all required retries - succeed
if let Some(report) = gc_reports.get(&region_id) {
final_report.merge(report.clone());
}
// Track the success attempt
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
*retry_count_map.entry(region_id).or_insert(0) += 1;
}
} else {
// No retry requirement - check if we have a GC report for this region
if let Some(report) = gc_reports.get(&region_id) {
// We have a GC report - succeed immediately
final_report.merge(report.clone());
// Track the success attempt
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
*retry_count_map.entry(region_id).or_insert(0) += 1;
} else {
// No GC report available - this region should be marked for retry
final_report.need_retry_regions.insert(region_id);
// Track the attempt
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
*retry_count_map.entry(region_id).or_insert(0) += 1;
}
}
}
// Return the report with need_retry_regions populated - let the caller handle retry logic
Ok(final_report)
}
}
pub struct TestEnv {
pub scheduler: GcScheduler,
pub ctx: Arc<MockSchedulerCtx>,
#[allow(dead_code)]
tx: Sender<Event>,
}
#[allow(unused)]
impl TestEnv {
pub fn new() -> Self {
let ctx = Arc::new(MockSchedulerCtx::default());
let (tx, rx) = GcScheduler::channel();
let config = GcSchedulerOptions::default();
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: rx,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
Self { scheduler, ctx, tx }
}
pub fn with_candidates(self, candidates: HashMap<TableId, Vec<GcCandidate>>) -> Self {
*self.ctx.candidates.lock().unwrap() = Some(candidates);
self
}
#[allow(dead_code)]
pub async fn run_scheduler(mut self) {
self.scheduler.run().await;
}
#[allow(dead_code)]
pub async fn tick(&self) {
self.tx.send(Event::Tick).await.unwrap();
}
}
/// Helper function to create a mock GC candidate that will pass the GC threshold
fn new_candidate(region_id: RegionId, score: f64) -> GcCandidate {
// will pass threshold for gc
let region_stat = mock_region_stat(region_id, RegionRole::Leader, 10_000, 10);
GcCandidate {
region_id,
score: OrderedFloat(score),
region_stat,
}
}
/// Helper function to create a mock GC candidate
fn mock_candidate(region_id: RegionId) -> GcCandidate {
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10);
GcCandidate {
region_id,
score: ordered_float::OrderedFloat(1.0),
region_stat,
}
}
/// Helper function to create a mock RegionStat
fn mock_region_stat(
id: RegionId,
role: RegionRole,
approximate_bytes: u64,
sst_num: u64,
) -> RegionStat {
RegionStat {
id,
role,
approximate_bytes,
sst_num,
region_manifest: RegionManifestInfo::Mito {
manifest_version: 0,
flushed_entry_id: 0,
file_removed_cnt: 0,
},
rcus: 0,
wcus: 0,
engine: "mito".to_string(),
num_rows: 0,
memtable_size: 0,
manifest_size: 0,
sst_size: 0,
index_size: 0,
data_topic_latest_entry_id: 0,
metadata_topic_latest_entry_id: 0,
written_bytes: 0,
}
}
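
A usage sketch, not part of the new file, of how the retry-injection hooks above might be combined in a test; it assumes it lives in the same test module, and error construction is elided because the crate::error::Error variants are not shown here.

#[tokio::test]
async fn gc_report_retries_then_succeeds() {
    let region_id = RegionId::new(1, 1);
    let ctx = MockSchedulerCtx::default();

    // The first two gc_regions calls put the region into need_retry_regions;
    // the third call succeeds with whatever GcReport was registered for it.
    ctx.set_gc_regions_success_after_retries(region_id, 2);

    // ... drive the scheduler here, then inspect the bookkeeping:
    assert_eq!(ctx.get_retry_count(region_id), 0); // no gc_regions call issued yet in this sketch
}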

View File

@@ -0,0 +1,164 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
use crate::gc::mock::{
MockSchedulerCtx, TEST_REGION_SIZE_200MB, TestEnv, mock_region_stat, new_candidate,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};
#[tokio::test]
async fn test_parallel_process_datanodes_empty() {
let env = TestEnv::new();
let report = env
.scheduler
.parallel_process_datanodes(HashMap::new())
.await;
assert_eq!(report.per_datanode_reports.len(), 0);
assert_eq!(report.failed_datanodes.len(), 0);
}
#[tokio::test]
async fn test_parallel_process_datanodes_with_candidates() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);
let mut gc_reports = HashMap::new();
let deleted_files = vec![FileId::random()];
gc_reports.insert(
region_id,
GcReport {
deleted_files: HashMap::from([(region_id, deleted_files.clone())]),
..Default::default()
},
);
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
..Default::default()
};
let ctx = MockSchedulerCtx {
gc_reports: Arc::new(Mutex::new(gc_reports)),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer.clone())]),
)]));
let env = TestEnv::new();
// Replace the default ctx with one that carries the prepared gc_reports.
let mut scheduler = env.scheduler;
scheduler.ctx = Arc::new(ctx);
// Convert table-based candidates to datanode-based candidates
let datanode_to_candidates = HashMap::from([(
peer,
candidates
.into_iter()
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
.collect(),
)]);
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
assert_eq!(report.per_datanode_reports.len(), 1);
assert_eq!(report.failed_datanodes.len(), 0);
}
#[tokio::test]
async fn test_handle_tick() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);
let mut gc_reports = HashMap::new();
gc_reports.insert(region_id, GcReport::default());
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(HashMap::from([(
table_id,
vec![mock_region_stat(
region_id,
RegionRole::Leader,
TEST_REGION_SIZE_200MB,
10,
)],
)])))),
gc_reports: Arc::new(Mutex::new(gc_reports)),
candidates: Arc::new(Mutex::new(Some(candidates))),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer)]),
)])),
);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let report = scheduler.handle_tick().await.unwrap();
// Validate the returned GcJobReport
assert_eq!(
report.per_datanode_reports.len(),
1,
"Should process 1 datanode"
);
assert_eq!(
report.failed_datanodes.len(),
0,
"Should have 0 failed datanodes"
);
assert_eq!(*ctx.get_table_to_region_stats_calls.lock().unwrap(), 1);
assert_eq!(*ctx.get_file_references_calls.lock().unwrap(), 1);
assert_eq!(*ctx.gc_regions_calls.lock().unwrap(), 1);
let tracker = scheduler.region_gc_tracker.lock().await;
assert!(
tracker.contains_key(&region_id),
"Tracker should have one region: {:?}",
tracker
);
}

View File

@@ -0,0 +1,390 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use common_meta::datanode::RegionManifestInfo;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;
use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat};
use crate::gc::{GcScheduler, GcSchedulerOptions};
/// Candidate Selection Tests
#[tokio::test]
async fn test_gc_candidate_filtering_by_role() {
init_default_ut_logging();
let table_id = 1;
let leader_region = RegionId::new(table_id, 1);
let follower_region = RegionId::new(table_id, 2);
let mut leader_stat = mock_region_stat(
leader_region,
RegionRole::Leader,
TEST_REGION_SIZE_200MB,
10,
); // 200MB
let mut follower_stat = mock_region_stat(
follower_region,
RegionRole::Follower,
TEST_REGION_SIZE_200MB,
10,
); // 200MB
// Set up manifest info for scoring
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut leader_stat.region_manifest
{
*file_removed_cnt = 5;
}
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut follower_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![leader_stat.clone(), follower_stat.clone()])]);
let ctx = Arc::new(MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
..Default::default()
});
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let stats = ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
// Should only select leader regions
assert_eq!(
candidates.len(),
1,
"Expected 1 table with candidates, got {}",
candidates.len()
);
if let Some(table_candidates) = candidates.get(&table_id) {
assert_eq!(
table_candidates.len(),
1,
"Expected 1 candidate for table {}, got {}",
table_id,
table_candidates.len()
);
assert_eq!(
table_candidates[0].region_id, leader_region,
"Expected leader region {}, got {}",
leader_region, table_candidates[0].region_id
);
} else {
panic!("Expected table {} to have candidates", table_id);
}
}
#[tokio::test]
async fn test_gc_candidate_size_threshold() {
init_default_ut_logging();
let table_id = 1;
let small_region = RegionId::new(table_id, 1);
let large_region = RegionId::new(table_id, 2);
let mut small_stat = mock_region_stat(small_region, RegionRole::Leader, 50_000_000, 5); // 50MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut small_stat.region_manifest
{
*file_removed_cnt = 3;
}
let mut large_stat =
mock_region_stat(large_region, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut large_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![small_stat, large_stat])]);
let ctx = Arc::new(MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
..Default::default()
});
let config = GcSchedulerOptions {
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let stats = ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
// Should only select large region
assert_eq!(
candidates.len(),
1,
"Expected 1 table with candidates, got {}",
candidates.len()
);
if let Some(table_candidates) = candidates.get(&table_id) {
assert_eq!(
table_candidates.len(),
1,
"Expected 1 candidate for table {}, got {}",
table_id,
table_candidates.len()
);
assert_eq!(
table_candidates[0].region_id, large_region,
"Expected large region {}, got {}",
large_region, table_candidates[0].region_id
);
} else {
panic!("Expected table {} to have candidates", table_id);
}
}
#[tokio::test]
async fn test_gc_candidate_scoring() {
init_default_ut_logging();
let table_id = 1;
let low_score_region = RegionId::new(table_id, 1);
let high_score_region = RegionId::new(table_id, 2);
let mut low_stat = mock_region_stat(
low_score_region,
RegionRole::Leader,
TEST_REGION_SIZE_200MB,
5,
); // 200MB
// Set low file removal rate for low_score_region
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut low_stat.region_manifest
{
*file_removed_cnt = 2;
}
let mut high_stat = mock_region_stat(
high_score_region,
RegionRole::Leader,
TEST_REGION_SIZE_200MB,
50,
); // 200MB
// Set high file removal rate for high_score_region
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut high_stat.region_manifest
{
*file_removed_cnt = 20;
}
let table_stats = HashMap::from([(table_id, vec![low_stat, high_stat])]);
let ctx = Arc::new(MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
..Default::default()
});
let config = GcSchedulerOptions {
sst_count_weight: 1.0,
file_removed_count_weight: 0.5,
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let stats = ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
// Should select both regions but high score region should be first
assert_eq!(
candidates.len(),
1,
"Expected 1 table with candidates, got {}",
candidates.len()
);
if let Some(table_candidates) = candidates.get(&table_id) {
assert_eq!(
table_candidates.len(),
2,
"Expected 2 candidates for table {}, got {}",
table_id,
table_candidates.len()
);
// Higher score region should come first (sorted by score descending)
assert_eq!(
table_candidates[0].region_id, high_score_region,
"High score region should be first"
);
assert!(
table_candidates[0].score > table_candidates[1].score,
"High score region should have higher score: {} > {}",
table_candidates[0].score,
table_candidates[1].score
);
} else {
panic!("Expected table {} to have candidates", table_id);
}
}
#[tokio::test]
async fn test_gc_candidate_regions_per_table_threshold() {
init_default_ut_logging();
let table_id = 1;
// Create 10 regions for the same table
let mut region_stats = Vec::new();
for i in 0..10 {
let region_id = RegionId::new(table_id, i + 1);
let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB
// Set different file removal rates to create different scores
// Higher region IDs get higher scores (better GC candidates)
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut stat.region_manifest
{
*file_removed_cnt = (i as u64 + 1) * 2; // Region 1: 2, Region 2: 4, ..., Region 10: 20
}
region_stats.push(stat);
}
let table_stats = HashMap::from([(table_id, region_stats)]);
let ctx = Arc::new(MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
..Default::default()
});
// Set regions_per_table_threshold to 3
let config = GcSchedulerOptions {
regions_per_table_threshold: 3,
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let stats = ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
// Should have 1 table with candidates
assert_eq!(
candidates.len(),
1,
"Expected 1 table with candidates, got {}",
candidates.len()
);
if let Some(table_candidates) = candidates.get(&table_id) {
// Should only have 3 candidates due to regions_per_table_threshold
assert_eq!(
table_candidates.len(),
3,
"Expected 3 candidates for table {} due to regions_per_table_threshold, got {}",
table_id,
table_candidates.len()
);
// Verify that the top 3 scoring regions are selected
// Regions 8, 9, 10 should have the highest scores (file_removed_cnt: 16, 18, 20)
// They should be returned in descending order by score
let expected_regions = vec![10, 9, 8];
let actual_regions: Vec<u32> = table_candidates
.iter()
.map(|c| c.region_id.region_number())
.collect();
assert_eq!(
actual_regions, expected_regions,
"Expected regions {:?} to be selected, got {:?}",
expected_regions, actual_regions
);
// Verify they are sorted by score in descending order
for i in 0..table_candidates.len() - 1 {
assert!(
table_candidates[i].score >= table_candidates[i + 1].score,
"Candidates should be sorted by score descending: {} >= {}",
table_candidates[i].score,
table_candidates[i + 1].score
);
}
} else {
panic!("Expected table {} to have candidates", table_id);
}
}
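For orientation, the selection behaviour these tests pin down can be summarised with a short sketch. Only the config fields (sst_count_weight, file_removed_count_weight, min_region_size_threshold, regions_per_table_threshold) and the filter/sort/truncate behaviour come from the tests; the linear formula itself is an assumption for illustration, not necessarily what select_gc_candidates computes.
// Illustrative only: a linear score consistent with the tests above; the real
// select_gc_candidates may weigh regions differently.
fn candidate_score(
    sst_count: u64,
    file_removed_cnt: u64,
    sst_count_weight: f64,
    file_removed_count_weight: f64,
) -> f64 {
    sst_count as f64 * sst_count_weight + file_removed_cnt as f64 * file_removed_count_weight
}
// Per table: regions below min_region_size_threshold are skipped, the remaining regions
// are sorted by score descending, and at most regions_per_table_threshold candidates are kept.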


@@ -0,0 +1,516 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use common_meta::key::table_route::PhysicalTableRouteValue;
use common_meta::peer::Peer;
use common_meta::rpc::router::{Region, RegionRoute};
use common_telemetry::{info, init_default_ut_logging};
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
use crate::gc::mock::{
MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat, new_candidate,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};
/// Concurrent Processing Tests
#[tokio::test]
async fn test_concurrent_table_processing_limits() {
init_default_ut_logging();
let mut candidates = HashMap::new();
let mut gc_reports = HashMap::new();
// Create many tables with candidates
for table_id in 1..=10 {
let region_id = RegionId::new(table_id, 1);
candidates.insert(table_id, vec![new_candidate(region_id, 1.0)]);
gc_reports.insert(
region_id,
GcReport {
deleted_files: HashMap::from([(region_id, vec![FileId::random()])]),
..Default::default()
},
);
}
let ctx = MockSchedulerCtx {
candidates: Arc::new(Mutex::new(Some(candidates))),
file_refs: Arc::new(Mutex::new(Some(FileRefsManifest {
manifest_version: (1..=10).map(|i| (RegionId::new(i, 1), 1)).collect(),
..Default::default()
}))),
gc_reports: Arc::new(Mutex::new(gc_reports)),
..Default::default()
}
.with_table_routes(
(1..=10)
.map(|table_id| {
let region_id = RegionId::new(table_id, 1);
(table_id, (table_id, vec![(region_id, Peer::new(1, ""))]))
})
.collect(),
);
let ctx = Arc::new(ctx);
let config = GcSchedulerOptions {
max_concurrent_tables: 3, // Set a low limit
retry_backoff_duration: Duration::from_millis(50), // for faster test
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
// Convert table-based candidates to datanode-based candidates
let peer = Peer::new(1, "");
let datanode_to_candidates = HashMap::from([(
peer,
candidates
.into_iter()
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
.collect(),
)]);
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
// Should process all datanodes
assert_eq!(report.per_datanode_reports.len(), 1);
assert_eq!(report.failed_datanodes.len(), 0);
}
#[tokio::test]
async fn test_datanode_processes_tables_with_partial_gc_failures() {
init_default_ut_logging();
let table1 = 1;
let region1 = RegionId::new(table1, 1);
let table2 = 2;
let region2 = RegionId::new(table2, 1);
let peer = Peer::new(1, "");
let mut candidates = HashMap::new();
candidates.insert(table1, vec![new_candidate(region1, 1.0)]);
candidates.insert(table2, vec![new_candidate(region2, 1.0)]);
// Set up GC reports for success and failure
let mut gc_reports = HashMap::new();
gc_reports.insert(
region1,
GcReport {
deleted_files: HashMap::from([(region1, vec![])]),
..Default::default()
},
);
// region2 will have no GC report, simulating failure
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region1, 1), (region2, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
gc_reports: Arc::new(Mutex::new(gc_reports)),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
candidates: Arc::new(Mutex::new(Some(candidates))),
..Default::default()
}
.with_table_routes(HashMap::from([
(table1, (table1, vec![(region1, peer.clone())])),
(table2, (table2, vec![(region2, peer.clone())])),
])),
);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
// Convert table-based candidates to datanode-based candidates
let datanode_to_candidates = HashMap::from([(
peer,
candidates
.into_iter()
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
.collect(),
)]);
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
// Should have one datanode with mixed results
assert_eq!(report.per_datanode_reports.len(), 1);
// also check one failed region (region2 has no GC report, so it should be in need_retry_regions)
let datanode_report = report.per_datanode_reports.values().next().unwrap();
assert_eq!(datanode_report.need_retry_regions.len(), 1);
assert_eq!(report.failed_datanodes.len(), 0);
}
// Region Concurrency Tests
#[tokio::test]
async fn test_region_gc_concurrency_limit() {
init_default_ut_logging();
let table_id = 1;
let peer = Peer::new(1, "");
// Create multiple regions for the same table
let mut region_stats = Vec::new();
let mut candidates = Vec::new();
let mut gc_reports = HashMap::new();
for i in 1..=10 {
let region_id = RegionId::new(table_id, i as u32);
let region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
region_stats.push(region_stat);
candidates.push(mock_candidate(region_id));
gc_reports.insert(
region_id,
GcReport {
deleted_files: HashMap::from([(
region_id,
vec![FileId::random(), FileId::random()],
)]),
..Default::default()
},
);
}
let table_stats = HashMap::from([(table_id, region_stats)]);
let file_refs = FileRefsManifest {
manifest_version: (1..=10)
.map(|i| (RegionId::new(table_id, i as u32), 1))
.collect(),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
gc_reports: Arc::new(Mutex::new(gc_reports)),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(
table_id,
(1..=10)
.map(|i| (RegionId::new(table_id, i as u32), peer.clone()))
.collect(),
),
)])),
);
// Configure low concurrency limit
let config = GcSchedulerOptions {
region_gc_concurrency: 3, // Only 3 regions can be processed concurrently
retry_backoff_duration: Duration::from_millis(50), // for faster test
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let start_time = Instant::now();
let report = scheduler
.process_datanode_gc(
peer,
candidates.into_iter().map(|c| (table_id, c)).collect(),
)
.await
.unwrap();
let duration = start_time.elapsed();
// All regions should be processed successfully
// Check that all 10 regions have deleted files
assert_eq!(report.deleted_files.len(), 10);
for i in 1..=10 {
let region_id = RegionId::new(table_id, i as u32);
assert!(report.deleted_files.contains_key(&region_id));
assert_eq!(report.deleted_files[&region_id].len(), 2); // Each region has 2 deleted files
}
assert!(report.need_retry_regions.is_empty());
// Verify that concurrency limit was respected (this is hard to test directly,
// but we can verify that the processing completed successfully)
info!(
"Processed 10 regions with concurrency limit 3 in {:?}",
duration
);
}
#[tokio::test]
async fn test_region_gc_concurrency_with_partial_failures() {
init_default_ut_logging();
let table_id = 1;
let peer = Peer::new(1, "");
// Create multiple regions with mixed success/failure
let mut region_stats = Vec::new();
let mut candidates = Vec::new();
let mut gc_reports = HashMap::new();
// Create the context first so we can set errors on it
let ctx = Arc::new(MockSchedulerCtx::default());
for i in 1..=6 {
let region_id = RegionId::new(table_id, i as u32);
let region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
region_stats.push(region_stat);
candidates.push(mock_candidate(region_id));
if i % 2 == 0 {
// Even regions will succeed
gc_reports.insert(
region_id,
GcReport {
deleted_files: HashMap::from([(
region_id,
vec![FileId::random(), FileId::random()],
)]),
..Default::default()
},
);
} else {
// Odd regions will fail - don't add them to gc_reports
// This will cause them to be marked as needing retry
}
}
let table_stats = HashMap::from([(table_id, region_stats)]);
let file_refs = FileRefsManifest {
manifest_version: (1..=6)
.map(|i| (RegionId::new(table_id, i as u32), 1))
.collect(),
..Default::default()
};
// Update the context with the data
*ctx.table_to_region_stats.lock().unwrap() = Some(table_stats);
*ctx.gc_reports.lock().unwrap() = gc_reports;
*ctx.file_refs.lock().unwrap() = Some(file_refs);
let region_routes = (1..=6)
.map(|i| RegionRoute {
region: Region::new_test(RegionId::new(table_id, i as u32)),
leader_peer: Some(peer.clone()),
..Default::default()
})
.collect();
*ctx.table_routes.lock().unwrap() = HashMap::from([(
table_id,
(table_id, PhysicalTableRouteValue::new(region_routes)),
)]);
// Configure concurrency limit
let config = GcSchedulerOptions {
region_gc_concurrency: 2, // Process 2 regions concurrently
retry_backoff_duration: Duration::from_millis(50), // for faster test
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let datanode_to_candidates = HashMap::from([(
peer.clone(),
candidates.into_iter().map(|c| (table_id, c)).collect(),
)]);
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
let report = report.per_datanode_reports.get(&peer.id).unwrap();
// Should have 3 successful and 3 failed regions
// Even regions (2, 4, 6) should succeed, odd regions (1, 3, 5) should fail
let mut successful_regions = 0;
let mut failed_regions = 0;
for i in 1..=6 {
let region_id = RegionId::new(table_id, i as u32);
if i % 2 == 0 {
// Even regions should succeed
if report.deleted_files.contains_key(&region_id) {
successful_regions += 1;
}
} else {
// Odd regions should fail - they should be in need_retry_regions
if report.need_retry_regions.contains(&region_id) {
failed_regions += 1;
}
}
}
// In the new implementation, regions that cause gc_regions to return an error
// are added to need_retry_regions. Let's check if we have the expected mix.
info!(
"Successful regions: {}, Failed regions: {}",
successful_regions, failed_regions
);
info!(
"Deleted files: {:?}",
report.deleted_files.keys().collect::<Vec<_>>()
);
info!("Need retry regions: {:?}", report.need_retry_regions);
// The exact count might vary depending on how the mock handles errors,
// but we should have some successful and some failed regions
assert!(
successful_regions > 0,
"Should have at least some successful regions"
);
assert!(
failed_regions > 0,
"Should have at least some failed regions"
);
}
#[tokio::test]
async fn test_region_gc_concurrency_with_retryable_errors() {
init_default_ut_logging();
let table_id = 1;
let peer = Peer::new(1, "");
// Create multiple regions
let mut region_stats = Vec::new();
let mut candidates = Vec::new();
for i in 1..=5 {
let region_id = RegionId::new(table_id, i as u32);
let region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
region_stats.push(region_stat);
candidates.push(mock_candidate(region_id));
}
let table_stats = HashMap::from([(table_id, region_stats)]);
let file_refs = FileRefsManifest {
manifest_version: (1..=5)
.map(|i| (RegionId::new(table_id, i as u32), 1))
.collect(),
..Default::default()
};
let gc_report = (1..=5)
.map(|i| {
let region_id = RegionId::new(table_id, i as u32);
(
region_id,
// mock the actual GC report with deleted files on success (even when no files were deleted)
GcReport::new(HashMap::from([(region_id, vec![])]), HashSet::new()),
)
})
.collect();
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
gc_reports: Arc::new(Mutex::new(gc_report)),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(
table_id,
(1..=5)
.map(|i| (RegionId::new(table_id, i as u32), peer.clone()))
.collect(),
),
)])),
);
// Configure concurrency limit
let config = GcSchedulerOptions {
region_gc_concurrency: 2, // Process 2 regions concurrently
retry_backoff_duration: Duration::from_millis(50),
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let datanode_to_candidates = HashMap::from([(
peer.clone(),
candidates.into_iter().map(|c| (table_id, c)).collect(),
)]);
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
let report = report.per_datanode_reports.get(&peer.id).unwrap();
// In the new implementation without retry logic, all regions should be processed
// The exact behavior depends on how the mock handles the regions
info!(
"Deleted files: {:?}",
report.deleted_files.keys().collect::<Vec<_>>()
);
info!("Need retry regions: {:?}", report.need_retry_regions);
// We should have processed all 5 regions in some way
let total_processed = report.deleted_files.len() + report.need_retry_regions.len();
assert_eq!(total_processed, 5, "Should have processed all 5 regions");
}
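The concurrency tests above only assert the end state, since the in-flight limit is hard to observe directly. Below is a minimal sketch of the bounded-concurrency pattern being exercised, assuming a futures-style buffer_unordered pipeline; this is not the scheduler's actual implementation, and gc_one is a hypothetical closure standing in for the per-region GC call.
use futures::stream::{self, StreamExt};
use store_api::storage::{GcReport, RegionId};

// Keep at most `concurrency` region GC futures in flight at any time and
// collect their reports as they complete (completion order is not preserved).
async fn gc_regions_bounded<F, Fut>(
    regions: Vec<RegionId>,
    concurrency: usize,
    gc_one: F,
) -> Vec<GcReport>
where
    F: Fn(RegionId) -> Fut,
    Fut: std::future::Future<Output = GcReport>,
{
    stream::iter(regions)
        .map(gc_one)
        .buffer_unordered(concurrency)
        .collect::<Vec<_>>()
        .await
}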


@@ -0,0 +1,197 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use common_meta::datanode::RegionManifestInfo;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;
use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat};
use crate::gc::{GcScheduler, GcSchedulerOptions};
/// Configuration Tests
#[tokio::test]
async fn test_different_gc_weights() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let mut region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB to pass size threshold
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut region_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
let ctx = Arc::new(MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
..Default::default()
});
// Test with different weights
let config1 = GcSchedulerOptions {
sst_count_weight: 2.0,
file_removed_count_weight: 0.5,
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
..Default::default()
};
let scheduler1 = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: config1,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let stats = ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates1 = scheduler1.select_gc_candidates(&stats).await.unwrap();
let config2 = GcSchedulerOptions {
sst_count_weight: 0.5,
file_removed_count_weight: 2.0,
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
..Default::default()
};
let scheduler2 = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: config2,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let stats = &ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates2 = scheduler2.select_gc_candidates(stats).await.unwrap();
// Both should select the region but with different scores
assert_eq!(
candidates1.len(),
1,
"Expected 1 table with candidates for config1, got {}",
candidates1.len()
);
assert_eq!(
candidates2.len(),
1,
"Expected 1 table with candidates for config2, got {}",
candidates2.len()
);
// Verify the region is actually selected
assert!(
candidates1.contains_key(&table_id),
"Config1 should contain table_id {}",
table_id
);
assert!(
candidates2.contains_key(&table_id),
"Config2 should contain table_id {}",
table_id
);
}
#[tokio::test]
async fn test_regions_per_table_threshold() {
init_default_ut_logging();
let table_id = 1;
let mut region_stats = Vec::new();
// Create many regions
for i in 1..=10 {
let region_id = RegionId::new(table_id, i as u32);
let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut stat.region_manifest
{
*file_removed_cnt = 5;
}
region_stats.push(stat);
}
let table_stats = HashMap::from([(table_id, region_stats)]);
let ctx = Arc::new(MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
..Default::default()
});
let config = GcSchedulerOptions {
regions_per_table_threshold: 3, // Limit to 3 regions per table
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let stats = ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
assert_eq!(
candidates.len(),
1,
"Expected 1 table with candidates, got {}",
candidates.len()
);
if let Some(table_candidates) = candidates.get(&table_id) {
// Should be limited to 3 regions
assert_eq!(
table_candidates.len(),
3,
"Expected 3 candidates for table {}, got {}",
table_id,
table_candidates.len()
);
} else {
panic!("Expected table {} to have candidates", table_id);
}
}


@@ -0,0 +1,293 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use common_meta::datanode::RegionManifestInfo;
use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
use crate::gc::mock::{
MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};
/// Error Handling Tests
#[tokio::test]
async fn test_gc_regions_failure_handling() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
// Create a region stat with proper size and file_removed_cnt to ensure it gets selected as a candidate
let mut region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut region_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
// Create a context that will return an error for gc_regions
let mut gc_reports = HashMap::new();
gc_reports.insert(region_id, GcReport::default());
// Inject an error for gc_regions method
let gc_error = crate::error::UnexpectedSnafu {
violated: "Simulated GC failure for testing".to_string(),
}
.build();
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
file_refs: HashMap::from([(region_id, HashSet::from([FileId::random()]))]),
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
gc_reports: Arc::new(Mutex::new(gc_reports)),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer)]),
)]))
.with_gc_regions_error(gc_error),
);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
// This should handle the failure gracefully
let report = scheduler.handle_tick().await.unwrap();
// Validate the report shows the failure handling
assert_eq!(
report.per_datanode_reports.len(),
1,
"Should process 1 datanode despite failure"
);
assert_eq!(
report.failed_datanodes.len(),
0,
"Should have 0 failed datanodes (failure handled via need_retry_regions)"
);
// Check that the region is in need_retry_regions due to the failure
let datanode_report = report.per_datanode_reports.values().next().unwrap();
assert_eq!(
datanode_report.need_retry_regions.len(),
1,
"Should have 1 region in need_retry_regions due to failure"
);
assert!(
datanode_report.need_retry_regions.contains(&region_id),
"Region should be in need_retry_regions"
);
// Verify that calls were made despite potential failures
assert_eq!(
*ctx.get_table_to_region_stats_calls.lock().unwrap(),
1,
"Expected 1 call to get_table_to_region_stats"
);
assert!(
*ctx.get_file_references_calls.lock().unwrap() >= 1,
"Expected at least 1 call to get_file_references"
);
assert!(
*ctx.gc_regions_calls.lock().unwrap() >= 1,
"Expected at least 1 call to gc_regions"
);
}
#[tokio::test]
async fn test_get_file_references_failure() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
// Create a region stat with proper size and file_removed_cnt to ensure it gets selected as a candidate
let mut region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut region_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
// Create context with empty file refs (simulating failure)
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
file_refs: Arc::new(Mutex::new(Some(FileRefsManifest::default()))),
gc_reports: Arc::new(Mutex::new(HashMap::from([(
region_id,
new_empty_report_with([region_id]),
)]))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer)]),
)])),
);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions {
retry_backoff_duration: Duration::from_millis(10), // shorten for test
..Default::default()
},
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let report = scheduler.handle_tick().await.unwrap();
// Validate the report shows the expected results
// In the new implementation, even if get_file_references fails, we still create a datanode report
assert_eq!(
report.per_datanode_reports.len(),
1,
"Should process 1 datanode"
);
assert_eq!(
report.failed_datanodes.len(),
0,
"Should have 0 failed datanodes (failure handled gracefully)"
);
// The region should be processed but may have empty results due to file refs failure
let datanode_report = report.per_datanode_reports.values().next().unwrap();
// The current implementation still processes the region even with file refs failure
// and creates an empty entry in deleted_files
assert!(
datanode_report.deleted_files.contains_key(&region_id),
"Should have region in deleted_files (even if empty)"
);
assert!(
datanode_report.deleted_files[&region_id].is_empty(),
"Should have empty deleted files due to file refs failure"
);
// Should still attempt to get file references (may be called multiple times due to retry logic)
assert!(
*ctx.get_file_references_calls.lock().unwrap() >= 1,
"Expected at least 1 call to get_file_references, got {}",
*ctx.get_file_references_calls.lock().unwrap()
);
}
#[tokio::test]
async fn test_get_table_route_failure() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
// Create a region stat with proper size and file_removed_cnt to ensure it gets selected as a candidate
let mut region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut region_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
// Inject an error for get_table_route method to simulate failure
let route_error = crate::error::UnexpectedSnafu {
violated: "Simulated table route failure for testing".to_string(),
}
.build();
// Create context with table route error injection
let ctx = Arc::new(MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
..Default::default()
});
ctx.set_table_route_error(route_error);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
// Get candidates first
let stats = &ctx
.table_to_region_stats
.lock()
.unwrap()
.clone()
.unwrap_or_default();
let candidates = scheduler.select_gc_candidates(stats).await.unwrap();
// Convert table-based candidates to datanode-based candidates
let datanode_to_candidates = HashMap::from([(
Peer::new(1, ""),
candidates
.into_iter()
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
.collect(),
)]);
// This should handle table route failure gracefully
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
// Should record the datanode as failed while handling the route error gracefully
assert_eq!(
report.per_datanode_reports.len(),
0,
"Expected 0 datanode report"
);
assert_eq!(
report.failed_datanodes.len(),
1,
"Expected 1 failed datanodes (route error handled gracefully)"
);
assert!(
report.failed_datanodes.contains_key(&1),
"Failed datanodes should contain the datanode with route error"
);
}


@@ -0,0 +1,272 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat};
use crate::gc::{GcScheduler, GcSchedulerOptions};
// Full File Listing Tests
#[tokio::test]
async fn test_full_file_listing_first_time_gc() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
let gc_report = GcReport {
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
..Default::default()
};
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer.clone())]),
)])),
);
// Configure short full file listing interval for testing
let config = GcSchedulerOptions {
full_file_listing_interval: Duration::from_secs(3600), // 1 hour
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
// First GC - should use full listing since region has never been GC'd
let reports = scheduler
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
.await
.unwrap();
assert_eq!(reports.deleted_files.len(), 1);
// Verify that full listing was used by checking the tracker
let tracker = scheduler.region_gc_tracker.lock().await;
let gc_info = tracker
.get(&region_id)
.expect("Region should be in tracker");
assert!(
gc_info.last_full_listing_time.is_some(),
"First GC should use full listing"
);
}
#[tokio::test]
async fn test_full_file_listing_interval_enforcement() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
let gc_report = GcReport {
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
..Default::default()
};
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer.clone())]),
)])),
);
// Configure very short full file listing interval for testing
let config = GcSchedulerOptions {
full_file_listing_interval: Duration::from_millis(100), // 100ms
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
// First GC - should use full listing
let reports1 = scheduler
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
.await
.unwrap();
assert_eq!(reports1.deleted_files.len(), 1);
// Get the first full listing time
let first_full_listing_time = {
let tracker = scheduler.region_gc_tracker.lock().await;
let gc_info = tracker
.get(&region_id)
.expect("Region should be in tracker");
gc_info
.last_full_listing_time
.expect("Should have full listing time")
};
// Wait for interval to pass
tokio::time::sleep(Duration::from_millis(150)).await;
// Second GC - should use full listing again since interval has passed
let _reports2 = scheduler
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
.await
.unwrap();
// Verify that full listing was used again
let tracker = scheduler.region_gc_tracker.lock().await;
let gc_info = tracker
.get(&region_id)
.expect("Region should be in tracker");
let second_full_listing_time = gc_info
.last_full_listing_time
.expect("Should have full listing time");
assert!(
second_full_listing_time > first_full_listing_time,
"Second GC should update full listing time"
);
}
#[tokio::test]
async fn test_full_file_listing_no_interval_passed() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
let gc_report = GcReport {
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
..Default::default()
};
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer.clone())]),
)])),
);
// Configure long full file listing interval
let config = GcSchedulerOptions {
full_file_listing_interval: Duration::from_secs(3600), // 1 hour
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
// First GC - should use full listing
let reports1 = scheduler
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
.await
.unwrap();
assert_eq!(reports1.deleted_files.len(), 1);
// Get the first full listing time
let first_full_listing_time = {
let tracker = scheduler.region_gc_tracker.lock().await;
let gc_info = tracker
.get(&region_id)
.expect("Region should be in tracker");
gc_info
.last_full_listing_time
.expect("Should have full listing time")
};
// Second GC immediately - should NOT use full listing since interval hasn't passed
let reports2 = scheduler
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
.await
.unwrap();
assert_eq!(reports2.deleted_files.len(), 1);
// Verify that full listing time was NOT updated
let tracker = scheduler.region_gc_tracker.lock().await;
let gc_info = tracker
.get(&region_id)
.expect("Region should be in tracker");
let second_full_listing_time = gc_info
.last_full_listing_time
.expect("Should have full listing time");
assert_eq!(
second_full_listing_time, first_full_listing_time,
"Second GC should not update full listing time when interval hasn't passed"
);
}
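These tests rely on a per-region decision about when to fall back to a full file listing. The following sketch of that check is consistent with the tracker change shown in a diff further below; RegionGcInfo here is a local stand-in for crate::gc::tracker::RegionGcInfo.
use std::time::{Duration, Instant};

struct RegionGcInfo {
    last_full_listing_time: Option<Instant>,
}

// Full listing is used when the region has never had one, or when the configured
// interval has elapsed since the last one.
fn should_full_listing(info: Option<&RegionGcInfo>, now: Instant, interval: Duration) -> bool {
    match info.and_then(|i| i.last_full_listing_time) {
        Some(last) => now.saturating_duration_since(last) >= interval,
        None => true,
    }
}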


@@ -0,0 +1,252 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use common_meta::datanode::RegionManifestInfo;
use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
use crate::gc::mock::{
MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};
// Integration Flow Tests
#[tokio::test]
async fn test_full_gc_workflow() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
let mut region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut region_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
let mut gc_reports = HashMap::new();
gc_reports.insert(
region_id,
GcReport {
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
..Default::default()
},
);
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
gc_reports: Arc::new(Mutex::new(gc_reports)),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer)]),
)])),
);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
// Run the full workflow
let report = scheduler.handle_tick().await.unwrap();
// Validate the returned GcJobReport - should have 1 datanode report
assert_eq!(
report.per_datanode_reports.len(),
1,
"Should process 1 datanode"
);
assert_eq!(
report.failed_datanodes.len(),
0,
"Should have no failed datanodes"
);
// Get the datanode report
let datanode_report = report.per_datanode_reports.values().next().unwrap();
// Check that the region was processed successfully
assert!(
datanode_report.deleted_files.contains_key(&region_id),
"Should have deleted files for region"
);
assert_eq!(
datanode_report.deleted_files[&region_id].len(),
2,
"Should have 2 deleted files"
);
assert!(
datanode_report.need_retry_regions.is_empty(),
"Should have no retry regions"
);
// Verify all steps were executed
assert_eq!(
*ctx.get_table_to_region_stats_calls.lock().unwrap(),
1,
"Expected 1 call to get_table_to_region_stats"
);
assert_eq!(
*ctx.get_file_references_calls.lock().unwrap(),
1,
"Expected 1 call to get_file_references"
);
assert_eq!(
*ctx.gc_regions_calls.lock().unwrap(),
1,
"Expected 1 call to gc_regions"
);
}
#[tokio::test]
async fn test_tracker_cleanup() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
// Create a region stat with proper file_removed_cnt to ensure it gets selected as a candidate
let mut region_stat =
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
if let RegionManifestInfo::Mito {
file_removed_cnt, ..
} = &mut region_stat.region_manifest
{
*file_removed_cnt = 5;
}
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
let mut gc_reports = HashMap::new();
gc_reports.insert(region_id, new_empty_report_with([region_id]));
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region_id, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
gc_reports: Arc::new(Mutex::new(gc_reports)),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer)]),
)])),
);
let old_region_gc_tracker = {
let mut tracker = HashMap::new();
tracker.insert(
region_id,
crate::gc::tracker::RegionGcInfo {
last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago
last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago
},
);
// insert a region of a different table that should likewise be cleaned up
tracker.insert(
RegionId::new(2, 1),
crate::gc::tracker::RegionGcInfo {
last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago
last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago
},
);
tracker
};
// Use a custom config with shorter cleanup interval to trigger cleanup
let config = GcSchedulerOptions {
// 30 minutes
tracker_cleanup_interval: Duration::from_secs(1800),
..Default::default()
};
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config,
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(old_region_gc_tracker)),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(
Instant::now() - Duration::from_secs(3600), // Old cleanup time (1 hour ago)
)),
};
let report = scheduler.handle_tick().await.unwrap();
// Validate the returned GcJobReport - should have 1 datanode report
assert_eq!(
report.per_datanode_reports.len(),
1,
"Should process 1 datanode"
);
assert_eq!(
report.failed_datanodes.len(),
0,
"Should have no failed datanodes"
);
// Get the datanode report
let datanode_report = report.per_datanode_reports.values().next().unwrap();
// Check that the region was processed successfully
assert!(
datanode_report.deleted_files.contains_key(&region_id),
"Should have deleted files for region"
);
assert!(
datanode_report.need_retry_regions.is_empty(),
"Should have no retry regions"
);
// Verify tracker was updated
let tracker = scheduler.region_gc_tracker.lock().await;
assert!(
tracker.contains_key(&region_id),
"Tracker should contain region {}",
region_id
);
// only one entry should remain after cleanup
assert_eq!(tracker.len(), 1, "Tracker should only have 1 entry");
}
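The cleanup assertion above (a single surviving tracker entry) implies stale entries are evicted once the cleanup runs. Below is a hypothetical sketch of such an eviction rule, assuming retention is keyed off the last GC time; the actual rule used by GcScheduler is not shown here and may differ.
use std::collections::HashMap;
use std::time::{Duration, Instant};

// Drop tracker entries whose last GC is older than the retention window, keeping
// entries for regions that were GC'd recently (region id -> last GC time).
fn cleanup_tracker(tracker: &mut HashMap<u64, Instant>, now: Instant, retention: Duration) {
    tracker.retain(|_region_id, last_gc| now.saturating_duration_since(*last_gc) < retention);
}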


@@ -0,0 +1,155 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::storage::{FileRefsManifest, GcReport, RegionId};
use crate::gc::mock::{MockSchedulerCtx, new_candidate};
use crate::gc::{GcScheduler, GcSchedulerOptions};
/// Edge Case Tests
#[tokio::test]
async fn test_empty_file_refs_manifest() {
init_default_ut_logging();
let table_id = 1;
let region_id = RegionId::new(table_id, 1);
let peer = Peer::new(1, "");
let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);
// Empty file refs manifest
let file_refs = FileRefsManifest::default();
let ctx = Arc::new(
MockSchedulerCtx {
file_refs: Arc::new(Mutex::new(Some(file_refs))),
candidates: Arc::new(Mutex::new(Some(candidates))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(table_id, vec![(region_id, peer)]),
)])),
);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
// Convert table-based candidates to datanode-based candidates
let peer = Peer::new(1, "");
let datanode_to_candidates = HashMap::from([(
peer,
candidates
.into_iter()
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
.collect(),
)]);
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
assert_eq!(report.per_datanode_reports.len(), 1);
assert_eq!(report.failed_datanodes.len(), 0);
// Should handle empty file refs gracefully
}
#[tokio::test]
async fn test_multiple_regions_per_table() {
init_default_ut_logging();
let table_id = 1;
let region1 = RegionId::new(table_id, 1);
let region2 = RegionId::new(table_id, 2);
let region3 = RegionId::new(table_id, 3);
let peer = Peer::new(1, "");
let candidates = HashMap::from([(
table_id,
vec![
new_candidate(region1, 1.0),
new_candidate(region2, 2.0),
new_candidate(region3, 3.0),
],
)]);
let mut gc_reports = HashMap::new();
gc_reports.insert(region1, GcReport::default());
gc_reports.insert(region2, GcReport::default());
gc_reports.insert(region3, GcReport::default());
let file_refs = FileRefsManifest {
manifest_version: HashMap::from([(region1, 1), (region2, 1), (region3, 1)]),
..Default::default()
};
let ctx = Arc::new(
MockSchedulerCtx {
gc_reports: Arc::new(Mutex::new(gc_reports)),
file_refs: Arc::new(Mutex::new(Some(file_refs))),
candidates: Arc::new(Mutex::new(Some(candidates))),
..Default::default()
}
.with_table_routes(HashMap::from([(
table_id,
(
table_id,
vec![
(region1, peer.clone()),
(region2, peer.clone()),
(region3, peer.clone()),
],
),
)])),
);
let scheduler = GcScheduler {
ctx: ctx.clone(),
receiver: GcScheduler::channel().1,
config: GcSchedulerOptions::default(),
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
};
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
// Convert table-based candidates to datanode-based candidates
let datanode_to_candidates = HashMap::from([(
peer.clone(),
candidates
.into_iter()
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
.collect(),
)]);
let report = scheduler
.parallel_process_datanodes(datanode_to_candidates)
.await;
assert_eq!(report.per_datanode_reports.len(), 1);
assert_eq!(report.failed_datanodes.len(), 0);
}


@@ -50,7 +50,7 @@ impl GcScheduler {
let now = Instant::now();
// Check if enough time has passed since last cleanup
if now.duration_since(last_cleanup) < self.config.tracker_cleanup_interval {
if now.saturating_duration_since(last_cleanup) < self.config.tracker_cleanup_interval {
return Ok(());
}
@@ -92,7 +92,7 @@ impl GcScheduler {
if let Some(gc_info) = gc_tracker.get(&region_id) {
if let Some(last_full_listing) = gc_info.last_full_listing_time {
let elapsed = now.duration_since(last_full_listing);
let elapsed = now.saturating_duration_since(last_full_listing);
elapsed >= self.config.full_file_listing_interval
} else {
// Never did full listing for this region, do it now


@@ -32,7 +32,7 @@ use collect_leader_region_handler::CollectLeaderRegionHandler;
use collect_stats_handler::CollectStatsHandler;
use common_base::Plugins;
use common_meta::datanode::Stat;
use common_meta::instruction::{Instruction, InstructionReply};
use common_meta::instruction::InstructionReply;
use common_meta::sequence::Sequence;
use common_telemetry::{debug, info, warn};
use dashmap::DashMap;
@@ -114,16 +114,19 @@ pub enum HandleControl {
#[derive(Debug, Default)]
pub struct HeartbeatAccumulator {
pub header: Option<ResponseHeader>,
pub instructions: Vec<Instruction>,
mailbox_message: Option<MailboxMessage>,
pub stat: Option<Stat>,
pub inactive_region_ids: HashSet<RegionId>,
pub region_lease: Option<RegionLease>,
}
impl HeartbeatAccumulator {
pub fn into_mailbox_message(self) -> Option<MailboxMessage> {
// TODO(jiachun): to HeartbeatResponse payload
None
pub(crate) fn take_mailbox_message(&mut self) -> Option<MailboxMessage> {
self.mailbox_message.take()
}
pub fn set_mailbox_message(&mut self, message: MailboxMessage) {
let _ = self.mailbox_message.insert(message);
}
}
@@ -275,6 +278,15 @@ impl Pushers {
async fn remove(&self, pusher_id: &str) -> Option<Pusher> {
self.0.write().await.remove(pusher_id)
}
pub(crate) async fn clear(&self) -> Vec<String> {
let mut pushers = self.0.write().await;
let keys = pushers.keys().cloned().collect::<Vec<_>>();
if !keys.is_empty() {
pushers.clear();
}
keys
}
}
#[derive(Clone)]
@@ -309,12 +321,11 @@ impl HeartbeatHandlerGroup {
}
/// Deregisters the heartbeat response [`Pusher`] with the given key from the group.
///
/// Returns the [`Pusher`] if it exists.
pub async fn deregister_push(&self, pusher_id: PusherId) -> Option<Pusher> {
METRIC_META_HEARTBEAT_CONNECTION_NUM.dec();
pub async fn deregister_push(&self, pusher_id: PusherId) {
info!("Pusher unregister: {}", pusher_id);
self.pushers.remove(&pusher_id.string_key()).await
if self.pushers.remove(&pusher_id.string_key()).await.is_some() {
METRIC_META_HEARTBEAT_CONNECTION_NUM.dec();
}
}
/// Returns the [`Pushers`] of the group.
@@ -351,10 +362,11 @@ impl HeartbeatHandlerGroup {
}
}
let header = std::mem::take(&mut acc.header);
let mailbox_message = acc.take_mailbox_message();
let res = HeartbeatResponse {
header,
region_lease: acc.region_lease,
..Default::default()
mailbox_message,
};
Ok(res)
}
@@ -382,7 +394,9 @@ impl HeartbeatMailbox {
/// Parses the [Instruction] from [MailboxMessage].
#[cfg(test)]
pub fn json_instruction(msg: &MailboxMessage) -> Result<Instruction> {
pub(crate) fn json_instruction(
msg: &MailboxMessage,
) -> Result<common_meta::instruction::Instruction> {
let Payload::Json(payload) =
msg.payload
.as_ref()
@@ -519,6 +533,14 @@ impl Mailbox for HeartbeatMailbox {
Ok(())
}
async fn reset(&self) {
let keys = self.pushers.clear().await;
if !keys.is_empty() {
info!("Reset mailbox, deregister pushers: {:?}", keys);
METRIC_META_HEARTBEAT_CONNECTION_NUM.sub(keys.len() as i64);
}
}
}
/// The builder to build the group of heartbeat handlers.


@@ -452,6 +452,7 @@ pub struct MetaStateHandler {
greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
leadership_change_notifier: LeadershipChangeNotifier,
mailbox: MailboxRef,
state: StateRef,
}
@@ -475,6 +476,9 @@ impl MetaStateHandler {
pub async fn on_leader_stop(&self) {
self.state.write().unwrap().next_state(become_follower());
// Enforces the mailbox to clear all pushers.
// The remaining heartbeat connections will be closed by the remote peer or keep-alive detection.
self.mailbox.reset().await;
self.leadership_change_notifier
.notify_on_leader_stop()
.await;
@@ -602,6 +606,7 @@ impl Metasrv {
state: self.state.clone(),
leader_cached_kv_backend: leader_cached_kv_backend.clone(),
leadership_change_notifier,
mailbox: self.mailbox.clone(),
};
let _handle = common_runtime::spawn_global(async move {
loop {


@@ -207,6 +207,9 @@ pub trait Mailbox: Send + Sync {
async fn broadcast(&self, ch: &BroadcastChannel, msg: &MailboxMessage) -> Result<()>;
async fn on_recv(&self, id: MessageId, maybe_msg: Result<MailboxMessage>) -> Result<()>;
/// Reset all pushers of the mailbox.
async fn reset(&self);
}
#[cfg(test)]


@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_meta::distributed_time_constants::default_etcd_client_options;
use common_meta::kv_backend::etcd::create_etcd_tls_options;
use etcd_client::{Client, ConnectOptions};
use etcd_client::Client;
use servers::tls::{TlsMode, TlsOption};
use snafu::ResultExt;
@@ -30,14 +31,15 @@ pub async fn create_etcd_client_with_tls(
.filter(|x| !x.is_empty())
.collect::<Vec<_>>();
let connect_options = tls_config
.map(|c| create_etcd_tls_options(&convert_tls_option(c)))
.transpose()
.context(BuildTlsOptionsSnafu)?
.flatten()
.map(|tls_options| ConnectOptions::new().with_tls(tls_options));
let mut connect_options = default_etcd_client_options();
if let Some(tls_config) = tls_config
&& let Some(tls_options) = create_etcd_tls_options(&convert_tls_option(tls_config))
.context(BuildTlsOptionsSnafu)?
{
connect_options = connect_options.with_tls(tls_options);
}
Client::connect(&etcd_endpoints, connect_options)
Client::connect(&etcd_endpoints, Some(connect_options))
.await
.context(error::ConnectEtcdSnafu)
}


@@ -14,6 +14,7 @@ async-stream.workspace = true
async-trait.workspace = true
base64.workspace = true
bytes.workspace = true
fxhash = "0.2"
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
@@ -31,7 +32,6 @@ lazy_static = "1.4"
mito-codec.workspace = true
mito2.workspace = true
moka.workspace = true
mur3 = "0.1"
object-store.workspace = true
prometheus.workspace = true
serde.workspace = true
@@ -47,6 +47,12 @@ common-meta = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
mito2 = { workspace = true, features = ["test"] }
common-wal = { workspace = true }
criterion = { version = "0.4", features = ["async", "async_tokio"] }
mur3 = "0.1"
[[bench]]
name = "bench_tsid_generator"
harness = false
[package.metadata.cargo-udeps.ignore]
normal = ["aquamarine"]
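With the [[bench]] target above registered (harness = false, as criterion provides its own harness), the comparison can presumably be run from this crate's directory with:
cargo bench --bench bench_tsid_generator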


@@ -0,0 +1,273 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::hash::Hasher;
use criterion::{Criterion, black_box, criterion_group, criterion_main};
use fxhash::FxHasher;
use mur3::Hasher128;
// A random number (from original implementation)
const TSID_HASH_SEED: u32 = 846793005;
/// Original TSID generator using mur3::Hasher128
/// Hashes both label name and value for each label pair
struct OriginalTsidGenerator {
hasher: Hasher128,
}
impl OriginalTsidGenerator {
fn new() -> Self {
Self {
hasher: Hasher128::with_seed(TSID_HASH_SEED),
}
}
/// Writes a label pair (name and value) to the generator.
fn write_label(&mut self, name: &str, value: &str) {
use std::hash::Hash;
name.hash(&mut self.hasher);
value.hash(&mut self.hasher);
}
/// Generates a new TSID.
fn finish(&mut self) -> u64 {
// TSID is 64 bits, so simply truncate the 128-bit hash
let (hash, _) = self.hasher.finish128();
hash
}
}
/// Current TSID generator using fxhash::FxHasher
/// Fast path: pre-computes label name hash, only hashes values
struct CurrentTsidGenerator {
hasher: FxHasher,
}
impl CurrentTsidGenerator {
fn new() -> Self {
Self {
hasher: FxHasher::default(),
}
}
fn new_with_label_name_hash(label_name_hash: u64) -> Self {
let mut hasher = FxHasher::default();
hasher.write_u64(label_name_hash);
Self { hasher }
}
/// Writes a label value to the generator.
fn write_str(&mut self, value: &str) {
self.hasher.write(value.as_bytes());
self.hasher.write_u8(0xff);
}
/// Generates a new TSID.
fn finish(&mut self) -> u64 {
self.hasher.finish()
}
}
/// Pre-computes label name hash (used in fast path)
fn compute_label_name_hash(labels: &[(&str, &str)]) -> u64 {
let mut hasher = FxHasher::default();
for (name, _) in labels {
hasher.write(name.as_bytes());
hasher.write_u8(0xff);
}
hasher.finish()
}
fn bench_tsid_generator_small(c: &mut Criterion) {
let labels = vec![("namespace", "greptimedb"), ("host", "127.0.0.1")];
let mut group = c.benchmark_group("tsid_generator_small_2_labels");
group.bench_function("original_mur3", |b| {
b.iter(|| {
let mut tsid_gen = OriginalTsidGenerator::new();
for (name, value) in &labels {
tsid_gen.write_label(black_box(name), black_box(value));
}
black_box(tsid_gen.finish())
})
});
let label_name_hash = compute_label_name_hash(&labels);
group.bench_function("current_fxhash_fast_path", |b| {
b.iter(|| {
let mut tsid_gen =
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
for (_, value) in &labels {
tsid_gen.write_str(black_box(value));
}
black_box(tsid_gen.finish())
})
});
group.finish();
}
fn bench_tsid_generator_medium(c: &mut Criterion) {
let labels = vec![
("namespace", "greptimedb"),
("host", "127.0.0.1"),
("region", "us-west-2"),
("env", "production"),
("service", "api"),
];
let mut group = c.benchmark_group("tsid_generator_medium_5_labels");
group.bench_function("original_mur3", |b| {
b.iter(|| {
let mut tsid_gen = OriginalTsidGenerator::new();
for (name, value) in &labels {
tsid_gen.write_label(black_box(name), black_box(value));
}
black_box(tsid_gen.finish())
})
});
let label_name_hash = compute_label_name_hash(&labels);
group.bench_function("current_fxhash_fast_path", |b| {
b.iter(|| {
let mut tsid_gen =
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
for (_, value) in &labels {
tsid_gen.write_str(black_box(value));
}
black_box(tsid_gen.finish())
})
});
group.finish();
}
fn bench_tsid_generator_large(c: &mut Criterion) {
let labels = vec![
("namespace", "greptimedb"),
("host", "127.0.0.1"),
("region", "us-west-2"),
("env", "production"),
("service", "api"),
("version", "v1.0.0"),
("cluster", "cluster-1"),
("dc", "dc1"),
("rack", "rack-1"),
("pod", "pod-123"),
];
let mut group = c.benchmark_group("tsid_generator_large_10_labels");
group.bench_function("original_mur3", |b| {
b.iter(|| {
let mut tsid_gen = OriginalTsidGenerator::new();
for (name, value) in &labels {
tsid_gen.write_label(black_box(name), black_box(value));
}
black_box(tsid_gen.finish())
})
});
let label_name_hash = compute_label_name_hash(&labels);
group.bench_function("current_fxhash_fast_path", |b| {
b.iter(|| {
let mut tsid_gen =
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
for (_, value) in &labels {
tsid_gen.write_str(black_box(value));
}
black_box(tsid_gen.finish())
})
});
group.finish();
}
fn bench_tsid_generator_slow_path(c: &mut Criterion) {
// Simulate slow path: some labels have null values (empty strings)
let labels_with_nulls = vec![
("namespace", "greptimedb"),
("host", "127.0.0.1"),
("region", ""), // null
("env", "production"),
];
let labels_all_non_null = vec![
("namespace", "greptimedb"),
("host", "127.0.0.1"),
("env", "production"),
];
let mut group = c.benchmark_group("tsid_generator_slow_path_with_nulls");
// Original: always hashes name and value
group.bench_function("original_mur3_with_nulls", |b| {
b.iter(|| {
let mut tsid_gen = OriginalTsidGenerator::new();
for (name, value) in &labels_with_nulls {
if !value.is_empty() {
tsid_gen.write_label(black_box(name), black_box(value));
}
}
black_box(tsid_gen.finish())
})
});
// Current slow path: recomputes label name hash
group.bench_function("current_fxhash_slow_path", |b| {
b.iter(|| {
// Step 1: Compute label name hash for non-null labels
let mut name_hasher = CurrentTsidGenerator::new();
for (name, value) in &labels_with_nulls {
if !value.is_empty() {
name_hasher.write_str(black_box(name));
}
}
let label_name_hash = name_hasher.finish();
// Step 2: Use label name hash and hash values
let mut tsid_gen = CurrentTsidGenerator::new_with_label_name_hash(label_name_hash);
for (_, value) in &labels_with_nulls {
if !value.is_empty() {
tsid_gen.write_str(black_box(value));
}
}
black_box(tsid_gen.finish())
})
});
// Current fast path: pre-computed (for comparison)
let label_name_hash = compute_label_name_hash(&labels_all_non_null);
group.bench_function("current_fxhash_fast_path_no_nulls", |b| {
b.iter(|| {
let mut tsid_gen =
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
for (_, value) in &labels_all_non_null {
tsid_gen.write_str(black_box(value));
}
black_box(tsid_gen.finish())
})
});
group.finish();
}
criterion_group!(
benches,
bench_tsid_generator_small,
bench_tsid_generator_medium,
bench_tsid_generator_large,
bench_tsid_generator_slow_path
);
criterion_main!(benches);

View File

@@ -119,7 +119,7 @@ mod tests {
.index_file_path
.map(|path| path.replace(&e.file_id, "<file_id>"));
e.file_id = "<file_id>".to_string();
e.index_file_id = e.index_file_id.map(|_| "<index_file_id>".to_string());
e.index_version = 0;
format!("\n{:?}", e)
})
.sorted()
@@ -128,12 +128,12 @@ mod tests {
assert_eq!(
debug_format,
r#"
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#,
);
// list from storage
let storage_entries = mito

View File

@@ -272,15 +272,15 @@ mod tests {
.unwrap();
let batches = RecordBatches::try_collect(stream).await.unwrap();
let expected = "\
+-------------------------+----------------+------------+----------------------+-------+
| greptime_timestamp | greptime_value | __table_id | __tsid | job |
+-------------------------+----------------+------------+----------------------+-------+
| 1970-01-01T00:00:00 | 0.0 | 3 | 12881218023286672757 | tag_0 |
| 1970-01-01T00:00:00.001 | 1.0 | 3 | 12881218023286672757 | tag_0 |
| 1970-01-01T00:00:00.002 | 2.0 | 3 | 12881218023286672757 | tag_0 |
| 1970-01-01T00:00:00.003 | 3.0 | 3 | 12881218023286672757 | tag_0 |
| 1970-01-01T00:00:00.004 | 4.0 | 3 | 12881218023286672757 | tag_0 |
+-------------------------+----------------+------------+----------------------+-------+";
+-------------------------+----------------+------------+---------------------+-------+
| greptime_timestamp | greptime_value | __table_id | __tsid | job |
+-------------------------+----------------+------------+---------------------+-------+
| 1970-01-01T00:00:00 | 0.0 | 3 | 2955007454552897459 | tag_0 |
| 1970-01-01T00:00:00.001 | 1.0 | 3 | 2955007454552897459 | tag_0 |
| 1970-01-01T00:00:00.002 | 2.0 | 3 | 2955007454552897459 | tag_0 |
| 1970-01-01T00:00:00.003 | 3.0 | 3 | 2955007454552897459 | tag_0 |
| 1970-01-01T00:00:00.004 | 4.0 | 3 | 2955007454552897459 | tag_0 |
+-------------------------+----------------+------------+---------------------+-------+";
assert_eq!(expected, batches.pretty_print().unwrap(), "physical region");
// read data from logical region

View File

@@ -13,11 +13,12 @@
// limitations under the License.
use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;
use std::hash::Hasher;
use api::v1::value::ValueData;
use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value};
use datatypes::value::ValueRef;
use fxhash::FxHasher;
use mito_codec::row_converter::SparsePrimaryKeyCodec;
use smallvec::SmallVec;
use snafu::ResultExt;
@@ -30,9 +31,6 @@ use store_api::storage::{ColumnId, TableId};
use crate::error::{EncodePrimaryKeySnafu, Result};
// A random number
const TSID_HASH_SEED: u32 = 846793005;
/// A row modifier modifies [`Rows`].
///
/// - For [`PrimaryKeyEncoding::Sparse`] encoding,
@@ -75,6 +73,7 @@ impl RowModifier {
let num_output_column = num_column - num_primary_key_column + 1;
let mut buffer = vec![];
for mut iter in iter.iter_mut() {
let (table_id, tsid) = Self::fill_internal_columns(table_id, &iter);
let mut values = Vec::with_capacity(num_output_column);
@@ -147,47 +146,72 @@ impl RowModifier {
/// Fills internal columns of a row with table name and a hash of tag values.
pub fn fill_internal_columns(table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) {
let mut hasher = TsidGenerator::default();
for (name, value) in iter.primary_keys_with_name() {
// The type is checked before. So only null is ignored.
if let Some(ValueData::StringValue(string)) = &value.value_data {
hasher.write_label(name, string);
let ts_id = if !iter.has_null_labels() {
// No null labels in row, we can safely reuse the precomputed label name hash.
let mut ts_id_gen = TsidGenerator::new(iter.index.label_name_hash);
for (_, value) in iter.primary_keys_with_name() {
// The type is checked before. So only null is ignored.
if let Some(ValueData::StringValue(string)) = &value.value_data {
ts_id_gen.write_str(string);
} else {
unreachable!(
"Should not contain null or non-string value: {:?}, table id: {}",
value, table_id
);
}
}
}
let hash = hasher.finish();
ts_id_gen.finish()
} else {
// Slow path: row contains null, recompute label hash
let mut hasher = TsidGenerator::default();
// 1. Find out label names with non-null values and get the hash.
for (name, value) in iter.primary_keys_with_name() {
// The type is checked before. So only null is ignored.
if let Some(ValueData::StringValue(_)) = &value.value_data {
hasher.write_str(name);
}
}
let label_name_hash = hasher.finish();
// 2. Use label name hash as seed and continue with label values.
let mut final_hasher = TsidGenerator::new(label_name_hash);
for (_, value) in iter.primary_keys_with_name() {
if let Some(ValueData::StringValue(value)) = &value.value_data {
final_hasher.write_str(value);
}
}
final_hasher.finish()
};
(
ValueData::U32Value(table_id).into(),
ValueData::U64Value(hash).into(),
ValueData::U64Value(ts_id).into(),
)
}
}
/// Tsid generator.
#[derive(Default)]
pub struct TsidGenerator {
hasher: mur3::Hasher128,
}
impl Default for TsidGenerator {
fn default() -> Self {
Self {
hasher: mur3::Hasher128::with_seed(TSID_HASH_SEED),
}
}
hasher: FxHasher,
}
impl TsidGenerator {
pub fn new(label_name_hash: u64) -> Self {
let mut hasher = FxHasher::default();
hasher.write_u64(label_name_hash);
Self { hasher }
}
/// Writes a label pair to the generator.
pub fn write_label(&mut self, name: &str, value: &str) {
name.hash(&mut self.hasher);
value.hash(&mut self.hasher);
pub fn write_str(&mut self, value: &str) {
self.hasher.write(value.as_bytes());
self.hasher.write_u8(0xff);
}
/// Generates a new TSID.
pub fn finish(&mut self) -> u64 {
// TSID is 64 bits, simply truncate the 128 bits hash
let (hash, _) = self.hasher.finish128();
hash
self.hasher.finish()
}
}
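
One detail worth noting: write_str appends a 0xff terminator after each value so adjacent values cannot run together. A small standalone illustration of that property (not part of the patch):

use std::hash::Hasher;

use fxhash::FxHasher;

// Hash a list of label values the same way TsidGenerator::write_str does,
// terminating each value with 0xff.
fn hash_values(values: &[&str]) -> u64 {
    let mut hasher = FxHasher::default();
    for v in values {
        hasher.write(v.as_bytes());
        hasher.write_u8(0xff);
    }
    hasher.finish()
}

fn main() {
    // Without the terminator both inputs would feed the identical byte stream
    // "abc"; with it the byte streams differ, so the hashes are expected to
    // differ as well.
    println!("{}", hash_values(&["ab", "c"]));
    println!("{}", hash_values(&["a", "bc"]));
}
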
@@ -202,6 +226,8 @@ struct ValueIndex {
struct IterIndex {
indices: Vec<ValueIndex>,
num_primary_key_column: usize,
/// Precomputed hash for label names.
label_name_hash: u64,
}
impl IterIndex {
@@ -252,15 +278,22 @@ impl IterIndex {
}
}
let num_primary_key_column = primary_key_indices.len() + reserved_indices.len();
let indices = reserved_indices
.into_iter()
.chain(primary_key_indices.values().cloned())
.chain(ts_index)
.chain(field_indices)
.collect();
let mut indices = Vec::with_capacity(num_primary_key_column + 2);
indices.extend(reserved_indices);
let mut label_name_hasher = TsidGenerator::default();
for (pk_name, pk_index) in primary_key_indices {
// primary_key_indices already sorted.
label_name_hasher.write_str(pk_name);
indices.push(pk_index);
}
let label_name_hash = label_name_hasher.finish();
indices.extend(ts_index);
indices.extend(field_indices);
IterIndex {
indices,
num_primary_key_column,
label_name_hash,
}
}
}
@@ -314,6 +347,13 @@ impl RowIter<'_> {
})
}
/// Returns true if any label in current row is null.
fn has_null_labels(&self) -> bool {
self.index.indices[..self.index.num_primary_key_column]
.iter()
.any(|idx| self.row.values[idx.index].value_data.is_none())
}
/// Returns the primary keys.
pub fn primary_keys(&self) -> impl Iterator<Item = (ColumnId, ValueRef<'_>)> {
self.index.indices[..self.index.num_primary_key_column]
@@ -399,9 +439,9 @@ mod tests {
let result = encoder.modify_rows_sparse(rows_iter, table_id).unwrap();
assert_eq!(result.rows[0].values.len(), 1);
let encoded_primary_key = vec![
128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 131, 9, 166, 190, 173, 37, 39, 240, 0, 0,
0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2,
128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 37, 196, 242, 181, 117, 224, 7, 137, 0,
0, 0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
1, 1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2,
];
assert_eq!(
result.rows[0].values[0],
@@ -477,7 +517,7 @@ mod tests {
assert_eq!(result.rows[0].values[2], ValueData::U32Value(1025).into());
assert_eq!(
result.rows[0].values[3],
ValueData::U64Value(9442261431637846000).into()
ValueData::U64Value(2721566936019240841).into()
);
assert_eq!(result.schema, expected_dense_schema());
}
@@ -496,7 +536,7 @@ mod tests {
let row_iter = rows_iter.iter_mut().next().unwrap();
let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter);
assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
assert_eq!(tsid, ValueData::U64Value(9442261431637846000).into());
assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());
// Change the column order
let schema = vec![
@@ -524,6 +564,264 @@ mod tests {
let row_iter = rows_iter.iter_mut().next().unwrap();
let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter);
assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
assert_eq!(tsid, ValueData::U64Value(9442261431637846000).into());
assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());
}
/// Helper function to create a schema with multiple label columns
fn create_multi_label_schema(labels: &[&str]) -> Vec<ColumnSchema> {
labels
.iter()
.map(|name| ColumnSchema {
column_name: name.to_string(),
datatype: ColumnDataType::String as i32,
semantic_type: SemanticType::Tag as _,
datatype_extension: None,
options: None,
})
.collect()
}
/// Helper function to create a name_to_column_id map
fn create_name_to_column_id(labels: &[&str]) -> HashMap<String, ColumnId> {
labels
.iter()
.enumerate()
.map(|(idx, name)| (name.to_string(), idx as ColumnId + 1))
.collect()
}
/// Helper function to create a row with string values
fn create_row_with_values(values: &[&str]) -> Row {
Row {
values: values
.iter()
.map(|v| ValueData::StringValue(v.to_string()).into())
.collect(),
}
}
/// Helper function to create a row with some null values
fn create_row_with_nulls(values: &[Option<&str>]) -> Row {
Row {
values: values
.iter()
.map(|v| {
v.map(|s| ValueData::StringValue(s.to_string()).into())
.unwrap_or(Value { value_data: None })
})
.collect(),
}
}
/// Helper function to extract TSID from a row
fn extract_tsid(
schema: Vec<ColumnSchema>,
row: Row,
name_to_column_id: &HashMap<String, ColumnId>,
table_id: TableId,
) -> u64 {
let rows = Rows {
schema,
rows: vec![row],
};
let mut rows_iter = RowsIter::new(rows, name_to_column_id);
let row_iter = rows_iter.iter_mut().next().unwrap();
let (_, tsid_value) = RowModifier::fill_internal_columns(table_id, &row_iter);
match tsid_value.value_data {
Some(ValueData::U64Value(tsid)) => tsid,
_ => panic!("Expected U64Value for TSID"),
}
}
#[test]
fn test_tsid_same_for_different_label_orders() {
// Test that rows with the same label name-value pairs but in different orders
// produce the same TSID
let table_id = 1025;
// Schema 1: a, b, c
let schema1 = create_multi_label_schema(&["a", "b", "c"]);
let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]);
let row1 = create_row_with_values(&["A", "B", "C"]);
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
// Schema 2: b, a, c (different order)
let schema2 = create_multi_label_schema(&["b", "a", "c"]);
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
let row2 = create_row_with_values(&["B", "A", "C"]);
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
// Schema 3: c, b, a (another different order)
let schema3 = create_multi_label_schema(&["c", "b", "a"]);
let name_to_column_id3 = create_name_to_column_id(&["a", "b", "c"]);
let row3 = create_row_with_values(&["C", "B", "A"]);
let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id);
// All should have the same TSID since label names are sorted lexicographically
// and we're using the same label name-value pairs
assert_eq!(
tsid1, tsid2,
"TSID should be same for different column orders"
);
assert_eq!(
tsid2, tsid3,
"TSID should be same for different column orders"
);
}
#[test]
fn test_tsid_same_with_null_labels() {
// Test that rows that differ only by null label values produce the same TSID
let table_id = 1025;
// Row 1: a=A, b=B (no nulls, fast path)
let schema1 = create_multi_label_schema(&["a", "b"]);
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
let row1 = create_row_with_values(&["A", "B"]);
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
// Row 2: a=A, b=B, c=null (has null, slow path)
let schema2 = create_multi_label_schema(&["a", "b", "c"]);
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None]);
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
// Both should have the same TSID since null labels are ignored
assert_eq!(
tsid1, tsid2,
"TSID should be same when only difference is null label values"
);
}
#[test]
fn test_tsid_same_with_multiple_null_labels() {
// Test with multiple null labels
let table_id = 1025;
// Row 1: a=A, b=B (no nulls)
let schema1 = create_multi_label_schema(&["a", "b"]);
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
let row1 = create_row_with_values(&["A", "B"]);
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
// Row 2: a=A, b=B, c=null, d=null (multiple nulls)
let schema2 = create_multi_label_schema(&["a", "b", "c", "d"]);
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c", "d"]);
let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None, None]);
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
assert_eq!(
tsid1, tsid2,
"TSID should be same when only difference is multiple null label values"
);
}
#[test]
fn test_tsid_different_with_different_non_null_values() {
// Test that rows with different non-null values produce different TSIDs
let table_id = 1025;
// Row 1: a=A, b=B
let schema1 = create_multi_label_schema(&["a", "b"]);
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
let row1 = create_row_with_values(&["A", "B"]);
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
// Row 2: a=A, b=C (different value for b)
let schema2 = create_multi_label_schema(&["a", "b"]);
let name_to_column_id2 = create_name_to_column_id(&["a", "b"]);
let row2 = create_row_with_values(&["A", "C"]);
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
assert_ne!(
tsid1, tsid2,
"TSID should be different when label values differ"
);
}
#[test]
fn test_tsid_fast_path_vs_slow_path_consistency() {
// Test that fast path (no nulls) and slow path (with nulls) produce
// the same TSID for the same non-null label values
let table_id = 1025;
// Fast path: a=A, b=B (no nulls)
let schema_fast = create_multi_label_schema(&["a", "b"]);
let name_to_column_id_fast = create_name_to_column_id(&["a", "b"]);
let row_fast = create_row_with_values(&["A", "B"]);
let tsid_fast = extract_tsid(schema_fast, row_fast, &name_to_column_id_fast, table_id);
// Slow path: a=A, b=B, c=null (has null, triggers slow path)
let schema_slow = create_multi_label_schema(&["a", "b", "c"]);
let name_to_column_id_slow = create_name_to_column_id(&["a", "b", "c"]);
let row_slow = create_row_with_nulls(&[Some("A"), Some("B"), None]);
let tsid_slow = extract_tsid(schema_slow, row_slow, &name_to_column_id_slow, table_id);
assert_eq!(
tsid_fast, tsid_slow,
"Fast path and slow path should produce same TSID for same non-null values"
);
}
#[test]
fn test_tsid_with_null_in_middle() {
// Test with null in the middle of labels
let table_id = 1025;
// Row 1: a=A, b=B, c=C
let schema1 = create_multi_label_schema(&["a", "b", "c"]);
let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]);
let row1 = create_row_with_values(&["A", "B", "C"]);
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
// Row 2: a=A, b=null, c=C (null in middle)
let schema2 = create_multi_label_schema(&["a", "b", "c"]);
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
let row2 = create_row_with_nulls(&[Some("A"), None, Some("C")]);
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
// b is null in row2 and null labels are ignored, so row2 is equivalent to a=A, c=C,
// while row1 is a=A, b=B, c=C; the two TSIDs should therefore differ
assert_ne!(
tsid1, tsid2,
"TSID should be different when a non-null value becomes null"
);
// Row 3: a=A, c=C (no b at all, equivalent to row2)
let schema3 = create_multi_label_schema(&["a", "c"]);
let name_to_column_id3 = create_name_to_column_id(&["a", "c"]);
let row3 = create_row_with_values(&["A", "C"]);
let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id);
// Row2 (a=A, b=null, c=C) should be same as row3 (a=A, c=C)
assert_eq!(
tsid2, tsid3,
"TSID should be same when null label is ignored"
);
}
#[test]
fn test_tsid_all_null_labels() {
// Test with all labels being null
let table_id = 1025;
// Row with all nulls
let schema = create_multi_label_schema(&["a", "b", "c"]);
let name_to_column_id = create_name_to_column_id(&["a", "b", "c"]);
let row = create_row_with_nulls(&[None, None, None]);
let tsid = extract_tsid(schema.clone(), row, &name_to_column_id, table_id);
// Should still produce a TSID (based on label names only when all values are null)
// This tests that the slow path handles the case where all values are null
// The TSID will be based on the label name hash only
// Test that it's consistent - same schema with all nulls should produce same TSID
let row2 = create_row_with_nulls(&[None, None, None]);
let tsid2 = extract_tsid(schema, row2, &name_to_column_id, table_id);
assert_eq!(
tsid, tsid2,
"TSID should be consistent when all label values are null"
);
}
}

Some files were not shown because too many files have changed in this diff.