chore: feature gate vector_index (#7428)

Signed-off-by: discord9 <discord9@163.com>
fix: using anonymous s3 access when ak and sk are not provided (#7425)
2025-12-23 14:40:01 +00:00 · 2025-12-17 07:14:25 +00:00 · 2025-12-17 06:34:29 +00:00 · 2025-12-17 04:18:23 +00:00 · 2025-12-17 03:14:35 +00:00 · 2025-12-17 01:29:36 +00:00
321 changed files with 21494 additions and 5197 deletions
--- a/.github/actions/setup-greptimedb-cluster/action.yml
+++ b/.github/actions/setup-greptimedb-cluster/action.yml
@@ -51,7 +51,7 @@ runs:
    run: |
      helm upgrade \
        --install my-greptimedb \
-        --set meta.backendStorage.etcd.endpoints=${{ inputs.etcd-endpoints }} \
+        --set 'meta.backendStorage.etcd.endpoints[0]=${{ inputs.etcd-endpoints }}' \
        --set meta.enableRegionFailover=${{ inputs.enable-region-failover }} \
        --set image.registry=${{ inputs.image-registry }} \
        --set image.repository=${{ inputs.image-repository }}  \
--- a/.github/scripts/deploy-greptimedb.sh
+++ b/.github/scripts/deploy-greptimedb.sh
@@ -81,7 +81,7 @@ function deploy_greptimedb_cluster() {
    --create-namespace \
    --set image.tag="$GREPTIMEDB_IMAGE_TAG" \
    --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \
-    --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
+    --set "meta.backendStorage.etcd.endpoints[0]=etcd.$install_namespace.svc.cluster.local:2379" \
    --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
    -n "$install_namespace"

@@ -119,7 +119,7 @@ function deploy_greptimedb_cluster_with_s3_storage() {
    --create-namespace \
    --set image.tag="$GREPTIMEDB_IMAGE_TAG" \
    --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \
-    --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
+    --set "meta.backendStorage.etcd.endpoints[0]=etcd.$install_namespace.svc.cluster.local:2379" \
    --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
    --set objectStorage.s3.bucket="$AWS_CI_TEST_BUCKET" \
    --set objectStorage.s3.region="$AWS_REGION" \
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -49,14 +49,9 @@ on:
        description: Do not run integration tests during the build
        type: boolean
        default: true
-      build_linux_amd64_artifacts:
+      build_linux_artifacts:
        type: boolean
-        description: Build linux-amd64 artifacts
-        required: false
-        default: false
-      build_linux_arm64_artifacts:
-        type: boolean
-        description: Build linux-arm64 artifacts
+        description: Build linux artifacts (both amd64 and arm64)
        required: false
        default: false
      build_macos_artifacts:
@@ -144,7 +139,7 @@ jobs:
          ./.github/scripts/check-version.sh "${{ steps.create-version.outputs.version }}"

      - name: Allocate linux-amd64 runner
-        if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
+        if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
        uses: ./.github/actions/start-runner
        id: start-linux-amd64-runner
        with:
@@ -158,7 +153,7 @@ jobs:
          subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }}

      - name: Allocate linux-arm64 runner
-        if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
+        if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
        uses: ./.github/actions/start-runner
        id: start-linux-arm64-runner
        with:
@@ -173,7 +168,7 @@ jobs:

  build-linux-amd64-artifacts:
    name: Build linux-amd64 artifacts
-    if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
+    if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
    needs: [
      allocate-runners,
    ]
@@ -195,7 +190,7 @@ jobs:

  build-linux-arm64-artifacts:
    name: Build linux-arm64 artifacts
-    if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
+    if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
    needs: [
      allocate-runners,
    ]
@@ -217,7 +212,7 @@ jobs:

  run-multi-lang-tests:
    name: Run Multi-language SDK Tests
-    if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
+    if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
    needs: [
      allocate-runners,
      build-linux-amd64-artifacts,
@@ -386,7 +381,18 @@ jobs:

  publish-github-release:
    name: Create GitHub release and upload artifacts
-    if: ${{ inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule' }}
+    # Use always() to run even when optional jobs (macos, windows) are skipped.
+    # Then check that required jobs succeeded and optional jobs didn't fail.
+    if: |
+      always() &&
+      (inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule') &&
+      needs.allocate-runners.result == 'success' &&
+      (needs.build-linux-amd64-artifacts.result == 'success' || needs.build-linux-amd64-artifacts.result == 'skipped') &&
+      (needs.build-linux-arm64-artifacts.result == 'success' || needs.build-linux-arm64-artifacts.result == 'skipped') &&
+      (needs.build-macos-artifacts.result == 'success' || needs.build-macos-artifacts.result == 'skipped') &&
+      (needs.build-windows-artifacts.result == 'success' || needs.build-windows-artifacts.result == 'skipped') &&
+      (needs.release-images-to-dockerhub.result == 'success' || needs.release-images-to-dockerhub.result == 'skipped') &&
+      (needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped')
    needs: [ # The job has to wait until all the artifacts are built.
      allocate-runners,
      build-linux-amd64-artifacts,
--- a/AUTHOR.md
+++ b/AUTHOR.md
@@ -2,41 +2,41 @@

 ## Individual Committers (in alphabetical order)

-* [CookiePieWw](https://github.com/CookiePieWw)
-* [etolbakov](https://github.com/etolbakov)
-* [irenjj](https://github.com/irenjj)
-* [KKould](https://github.com/KKould)
-* [Lanqing Yang](https://github.com/lyang24)
-* [NiwakaDev](https://github.com/NiwakaDev)
-* [tisonkun](https://github.com/tisonkun)
+- [apdong2022](https://github.com/apdong2022)
+- [beryl678](https://github.com/beryl678)
+- [CookiePieWw](https://github.com/CookiePieWw)
+- [etolbakov](https://github.com/etolbakov)
+- [irenjj](https://github.com/irenjj)
+- [KKould](https://github.com/KKould)
+- [Lanqing Yang](https://github.com/lyang24)
+- [nicecui](https://github.com/nicecui)
+- [NiwakaDev](https://github.com/NiwakaDev)
+- [paomian](https://github.com/paomian)
+- [tisonkun](https://github.com/tisonkun)
+- [Wenjie0329](https://github.com/Wenjie0329)
+- [zhaoyingnan01](https://github.com/zhaoyingnan01)
+- [zhongzc](https://github.com/zhongzc)
+- [ZonaHex](https://github.com/ZonaHex)
+- [zyy17](https://github.com/zyy17)

 ## Team Members (in alphabetical order)

-* [apdong2022](https://github.com/apdong2022)
-* [beryl678](https://github.com/beryl678)
-* [daviderli614](https://github.com/daviderli614)
-* [discord9](https://github.com/discord9)
-* [evenyag](https://github.com/evenyag)
-* [fengjiachun](https://github.com/fengjiachun)
-* [fengys1996](https://github.com/fengys1996)
-* [GrepTime](https://github.com/GrepTime)
-* [holalengyu](https://github.com/holalengyu)
-* [killme2008](https://github.com/killme2008)
-* [MichaelScofield](https://github.com/MichaelScofield)
-* [nicecui](https://github.com/nicecui)
-* [paomian](https://github.com/paomian)
-* [shuiyisong](https://github.com/shuiyisong)
-* [sunchanglong](https://github.com/sunchanglong)
-* [sunng87](https://github.com/sunng87)
-* [v0y4g3r](https://github.com/v0y4g3r)
-* [waynexia](https://github.com/waynexia)
-* [Wenjie0329](https://github.com/Wenjie0329)
-* [WenyXu](https://github.com/WenyXu)
-* [xtang](https://github.com/xtang)
-* [zhaoyingnan01](https://github.com/zhaoyingnan01)
-* [zhongzc](https://github.com/zhongzc)
-* [ZonaHex](https://github.com/ZonaHex)
-* [zyy17](https://github.com/zyy17)
+- [daviderli614](https://github.com/daviderli614)
+- [discord9](https://github.com/discord9)
+- [evenyag](https://github.com/evenyag)
+- [fengjiachun](https://github.com/fengjiachun)
+- [fengys1996](https://github.com/fengys1996)
+- [GrepTime](https://github.com/GrepTime)
+- [holalengyu](https://github.com/holalengyu)
+- [killme2008](https://github.com/killme2008)
+- [MichaelScofield](https://github.com/MichaelScofield)
+- [shuiyisong](https://github.com/shuiyisong)
+- [sunchanglong](https://github.com/sunchanglong)
+- [sunng87](https://github.com/sunng87)
+- [v0y4g3r](https://github.com/v0y4g3r)
+- [waynexia](https://github.com/waynexia)
+- [WenyXu](https://github.com/WenyXu)
+- [xtang](https://github.com/xtang)

 ## All Contributors

--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,6 +21,7 @@ members = [
    "src/common/grpc-expr",
    "src/common/macro",
    "src/common/mem-prof",
+    "src/common/memory-manager",
    "src/common/meta",
    "src/common/options",
    "src/common/plugins",
@@ -74,7 +75,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 edition = "2024"
 license = "Apache-2.0"

@@ -131,7 +132,7 @@ datafusion-functions = "50"
 datafusion-functions-aggregate-common = "50"
 datafusion-optimizer = "50"
 datafusion-orc = "0.5"
-datafusion-pg-catalog = "0.12.2"
+datafusion-pg-catalog = "0.12.3"
 datafusion-physical-expr = "50"
 datafusion-physical-plan = "50"
 datafusion-sql = "50"
@@ -139,6 +140,7 @@ datafusion-substrait = "50"
 deadpool = "0.12"
 deadpool-postgres = "0.14"
 derive_builder = "0.20"
+derive_more = { version = "2.1", features = ["full"] }
 dotenv = "0.15"
 either = "1.15"
 etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62df834f0cffda355eba96691fe1a9a332b75a7", features = [
@@ -148,7 +150,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0df99f09f1d6785055b2d9da96fc4ecc2bdf6803" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0423fa30203187c75e2937a668df1da699c8b96c" }
 hex = "0.4"
 http = "1"
 humantime = "2.1"
@@ -200,7 +202,8 @@ reqwest = { version = "0.12", default-features = false, features = [
    "stream",
    "multipart",
 ] }
-rskafka = { git = "https://github.com/WenyXu/rskafka.git", rev = "7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76", features = [
+# Branch: feat/request-timeout
+rskafka = { git = "https://github.com/GreptimeTeam/rskafka.git", rev = "f5688f83e7da591cda3f2674c2408b4c0ed4ed50", features = [
    "transport-tls",
 ] }
 rstest = "0.25"
@@ -264,6 +267,7 @@ common-grpc = { path = "src/common/grpc" }
 common-grpc-expr = { path = "src/common/grpc-expr" }
 common-macro = { path = "src/common/macro" }
 common-mem-prof = { path = "src/common/mem-prof" }
+common-memory-manager = { path = "src/common/memory-manager" }
 common-meta = { path = "src/common/meta" }
 common-options = { path = "src/common/options" }
 common-plugins = { path = "src/common/plugins" }
@@ -316,18 +320,18 @@ git = "https://github.com/GreptimeTeam/greptime-meter.git"
 rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"

 [patch.crates-io]
-datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
-datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
+datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
+datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
 sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" }                           # branch = "v0.58.x"

 [profile.release]
--- a/config/config.md
+++ b/config/config.md
@@ -108,9 +108,6 @@
 | `storage` | -- | -- | The data storage options. |
 | `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
 | `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
-| `storage.enable_read_cache` | Bool | `true` | Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. |
-| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.<br/>A local file directory, defaults to `{data_home}`. An empty string means disabling. |
-| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. |
 | `storage.bucket` | String | Unset | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
 | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
 | `storage.access_key_id` | String | Unset | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
@@ -141,6 +138,8 @@
 | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). |
 | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). |
 | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). |
+| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. |
+| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.<br/>Options: "wait" (default, 10s), "wait(&lt;duration&gt;)", "fail" |
 | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
 | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
 | `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`. |
@@ -154,6 +153,8 @@
 | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
 | `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
 | `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
+| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).<br/>When disabled, cache refilling on read won't happen. |
+| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). |
 | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
 | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
 | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
@@ -294,7 +295,6 @@
 | `meta_client` | -- | -- | The metasrv client options. |
 | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
 | `meta_client.timeout` | String | `3s` | Operation timeout. |
-| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
 | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
 | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
 | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -457,7 +457,6 @@
 | `meta_client` | -- | -- | The metasrv client options. |
 | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
 | `meta_client.timeout` | String | `3s` | Operation timeout. |
-| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
 | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
 | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
 | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -488,9 +487,6 @@
 | `storage` | -- | -- | The data storage options. |
 | `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
 | `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
-| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.<br/>A local file directory, defaults to `{data_home}`. An empty string means disabling. |
-| `storage.enable_read_cache` | Bool | `true` | Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. |
-| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. |
 | `storage.bucket` | String | Unset | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
 | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
 | `storage.access_key_id` | String | Unset | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
@@ -523,6 +519,8 @@
 | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). |
 | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). |
 | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). |
+| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. |
+| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.<br/>Options: "wait" (default, 10s), "wait(&lt;duration&gt;)", "fail" |
 | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
 | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
 | `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size` |
@@ -536,6 +534,8 @@
 | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
 | `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
 | `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
+| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).<br/>When disabled, cache refilling on read won't happen. |
+| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). |
 | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
 | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
 | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
@@ -629,7 +629,6 @@
 | `meta_client` | -- | -- | The metasrv client options. |
 | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
 | `meta_client.timeout` | String | `3s` | Operation timeout. |
-| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
 | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
 | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
 | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -99,9 +99,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
 ## Operation timeout.
 timeout = "3s"

-## Heartbeat timeout.
-heartbeat_timeout = "500ms"
-
 ## DDL timeout.
 ddl_timeout = "10s"

@@ -284,18 +281,6 @@ data_home = "./greptimedb_data"
 ## - `Oss`: the data is stored in the Aliyun OSS.
 type = "File"

-## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.
-## A local file directory, defaults to `{data_home}`. An empty string means disabling.
-## @toml2docs:none-default
-#+ cache_path = ""
-
-## Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage.
-#+ enable_read_cache = true
-
-## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger.
-## @toml2docs:none-default
-cache_capacity = "5GiB"
-
 ## The S3 bucket name.
 ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
 ## @toml2docs:none-default
@@ -455,6 +440,15 @@ compress_manifest = false
 ## @toml2docs:none-default="Auto"
 #+ max_background_purges = 8

+## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit.
+## @toml2docs:none-default="0"
+#+ experimental_compaction_memory_limit = "0"
+
+## Behavior when compaction cannot acquire memory from the budget.
+## Options: "wait" (default, 10s), "wait(<duration>)", "fail"
+## @toml2docs:none-default="wait"
+#+ experimental_compaction_on_exhausted = "wait"
+
 ## Interval to auto flush a region if it has not flushed yet.
 auto_flush_interval = "1h"

@@ -510,6 +504,13 @@ preload_index_cache = true
 ## 1GiB is reserved for index files and 4GiB for data files.
 index_cache_percent = 20

+## Enable refilling cache on read operations (default: true).
+## When disabled, cache refilling on read won't happen.
+enable_refill_cache_on_read = true
+
+## Capacity for manifest cache (default: 256MB).
+manifest_cache_size = "256MB"
+
 ## Buffer size for SST writing.
 sst_write_buffer_size = "8MB"

--- a/config/flownode.example.toml
+++ b/config/flownode.example.toml
@@ -78,9 +78,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
 ## Operation timeout.
 timeout = "3s"

-## Heartbeat timeout.
-heartbeat_timeout = "500ms"
-
 ## DDL timeout.
 ddl_timeout = "10s"

--- a/config/frontend.example.toml
+++ b/config/frontend.example.toml
@@ -226,9 +226,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
 ## Operation timeout.
 timeout = "3s"

-## Heartbeat timeout.
-heartbeat_timeout = "500ms"
-
 ## DDL timeout.
 ddl_timeout = "10s"

--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -388,18 +388,6 @@ data_home = "./greptimedb_data"
 ## - `Oss`: the data is stored in the Aliyun OSS.
 type = "File"

-## Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage.
-#+ enable_read_cache = true
-
-## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.
-## A local file directory, defaults to `{data_home}`. An empty string means disabling.
-## @toml2docs:none-default
-#+ cache_path = ""
-
-## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger.
-## @toml2docs:none-default
-cache_capacity = "5GiB"
-
 ## The S3 bucket name.
 ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
 ## @toml2docs:none-default
@@ -546,6 +534,15 @@ compress_manifest = false
 ## @toml2docs:none-default="Auto"
 #+ max_background_purges = 8

+## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit.
+## @toml2docs:none-default="0"
+#+ experimental_compaction_memory_limit = "0"
+
+## Behavior when compaction cannot acquire memory from the budget.
+## Options: "wait" (default, 10s), "wait(<duration>)", "fail"
+## @toml2docs:none-default="wait"
+#+ experimental_compaction_on_exhausted = "wait"
+
 ## Interval to auto flush a region if it has not flushed yet.
 auto_flush_interval = "1h"

@@ -601,6 +598,13 @@ preload_index_cache = true
 ## 1GiB is reserved for index files and 4GiB for data files.
 index_cache_percent = 20

+## Enable refilling cache on read operations (default: true).
+## When disabled, cache refilling on read won't happen.
+enable_refill_cache_on_read = true
+
+## Capacity for manifest cache (default: 256MB).
+manifest_cache_size = "256MB"
+
 ## Buffer size for SST writing.
 sst_write_buffer_size = "8MB"

--- a/flake.lock
+++ b/flake.lock
@@ -8,11 +8,11 @@
        "rust-analyzer-src": "rust-analyzer-src"
      },
      "locked": {
-        "lastModified": 1760078406,
-        "narHash": "sha256-JeJK0ZA845PtkCHkfo4KjeI1mYrsr2s3cxBYKhF4BoE=",
+        "lastModified": 1765252472,
+        "narHash": "sha256-byMt/uMi7DJ8tRniFopDFZMO3leSjGp6GS4zWOFT+uQ=",
        "owner": "nix-community",
        "repo": "fenix",
-        "rev": "351277c60d104944122ee389cdf581c5ce2c6732",
+        "rev": "8456b985f6652e3eef0632ee9992b439735c5544",
        "type": "github"
      },
      "original": {
@@ -41,16 +41,16 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1759994382,
-        "narHash": "sha256-wSK+3UkalDZRVHGCRikZ//CyZUJWDJkBDTQX1+G77Ow=",
+        "lastModified": 1764983851,
+        "narHash": "sha256-y7RPKl/jJ/KAP/VKLMghMgXTlvNIJMHKskl8/Uuar7o=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "5da4a26309e796daa7ffca72df93dbe53b8164c7",
+        "rev": "d9bc5c7dceb30d8d6fafa10aeb6aa8a48c218454",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixos-25.05",
+        "ref": "nixos-25.11",
        "repo": "nixpkgs",
        "type": "github"
      }
@@ -65,11 +65,11 @@
    "rust-analyzer-src": {
      "flake": false,
      "locked": {
-        "lastModified": 1760014945,
-        "narHash": "sha256-ySdl7F9+oeWNHVrg3QL/brazqmJvYFEdpGnF3pyoDH8=",
+        "lastModified": 1765120009,
+        "narHash": "sha256-nG76b87rkaDzibWbnB5bYDm6a52b78A+fpm+03pqYIw=",
        "owner": "rust-lang",
        "repo": "rust-analyzer",
-        "rev": "90d2e1ce4dfe7dc49250a8b88a0f08ffdb9cb23f",
+        "rev": "5e3e9c4e61bba8a5e72134b9ffefbef8f531d008",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -2,7 +2,7 @@
  description = "Development environment flake";

  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05";
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
    fenix = {
      url = "github:nix-community/fenix";
      inputs.nixpkgs.follows = "nixpkgs";
@@ -48,7 +48,7 @@
            gnuplot ## for cargo bench
          ];

-          LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath buildInputs;
+          buildInputs = buildInputs;
          NIX_HARDENING_ENABLE = "";
        };
      });
--- a/src/api/src/helper.rs
+++ b/src/api/src/helper.rs
@@ -708,6 +708,7 @@ fn ddl_request_type(request: &DdlRequest) -> &'static str {
        Some(Expr::CreateView(_)) => "ddl.create_view",
        Some(Expr::DropView(_)) => "ddl.drop_view",
        Some(Expr::AlterDatabase(_)) => "ddl.alter_database",
+        Some(Expr::CommentOn(_)) => "ddl.comment_on",
        None => "ddl.empty",
    }
 }
--- a/src/auth/Cargo.toml
+++ b/src/auth/Cargo.toml
@@ -15,11 +15,11 @@ workspace = true
 api.workspace = true
 async-trait.workspace = true
 common-base.workspace = true
+common-config.workspace = true
 common-error.workspace = true
 common-macro.workspace = true
 common-telemetry.workspace = true
 digest = "0.10"
-notify.workspace = true
 sha1 = "0.10"
 snafu.workspace = true
 sql.workspace = true
--- a/src/auth/src/error.rs
+++ b/src/auth/src/error.rs
@@ -75,11 +75,12 @@ pub enum Error {
        username: String,
    },

-    #[snafu(display("Failed to initialize a watcher for file {}", path))]
+    #[snafu(display("Failed to initialize a file watcher"))]
    FileWatch {
-        path: String,
        #[snafu(source)]
-        error: notify::Error,
+        source: common_config::error::Error,
+        #[snafu(implicit)]
+        location: Location,
    },

    #[snafu(display("User is not authorized to perform this action"))]
--- a/src/auth/src/user_provider/watch_file_user_provider.rs
+++ b/src/auth/src/user_provider/watch_file_user_provider.rs
@@ -12,16 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use std::path::Path;
-use std::sync::mpsc::channel;
 use std::sync::{Arc, Mutex};

 use async_trait::async_trait;
+use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};
 use common_telemetry::{info, warn};
-use notify::{EventKind, RecursiveMode, Watcher};
-use snafu::{ResultExt, ensure};
+use snafu::ResultExt;

-use crate::error::{FileWatchSnafu, InvalidConfigSnafu, Result};
+use crate::error::{FileWatchSnafu, Result};
 use crate::user_provider::{UserInfoMap, authenticate_with_credential, load_credential_from_file};
 use crate::{Identity, Password, UserInfoRef, UserProvider};

@@ -41,61 +39,36 @@ impl WatchFileUserProvider {
    pub fn new(filepath: &str) -> Result<Self> {
        let credential = load_credential_from_file(filepath)?;
        let users = Arc::new(Mutex::new(credential));
-        let this = WatchFileUserProvider {
-            users: users.clone(),
-        };

-        let (tx, rx) = channel::<notify::Result<notify::Event>>();
-        let mut debouncer =
-            notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;
-        let mut dir = Path::new(filepath).to_path_buf();
-        ensure!(
-            dir.pop(),
-            InvalidConfigSnafu {
-                value: filepath,
-                msg: "UserProvider path must be a file path",
-            }
-        );
-        debouncer
-            .watch(&dir, RecursiveMode::NonRecursive)
-            .context(FileWatchSnafu { path: filepath })?;
+        let users_clone = users.clone();
+        let filepath_owned = filepath.to_string();

-        let filepath = filepath.to_string();
-        std::thread::spawn(move || {
-            let filename = Path::new(&filepath).file_name();
-            let _hold = debouncer;
-            while let Ok(res) = rx.recv() {
-                if let Ok(event) = res {
-                    let is_this_file = event.paths.iter().any(|p| p.file_name() == filename);
-                    let is_relevant_event = matches!(
-                        event.kind,
-                        EventKind::Modify(_) | EventKind::Create(_) | EventKind::Remove(_)
+        FileWatcherBuilder::new()
+            .watch_path(filepath)
+            .context(FileWatchSnafu)?
+            .config(FileWatcherConfig::new())
+            .spawn(move || match load_credential_from_file(&filepath_owned) {
+                Ok(credential) => {
+                    let mut users = users_clone.lock().expect("users credential must be valid");
+                    #[cfg(not(test))]
+                    info!("User provider file {} reloaded", &filepath_owned);
+                    #[cfg(test)]
+                    info!(
+                        "User provider file {} reloaded: {:?}",
+                        &filepath_owned, credential
                    );
-                    if is_this_file && is_relevant_event {
-                        info!(?event.kind, "User provider file {} changed", &filepath);
-                        match load_credential_from_file(&filepath) {
-                            Ok(credential) => {
-                                let mut users =
-                                    users.lock().expect("users credential must be valid");
-                                #[cfg(not(test))]
-                                info!("User provider file {filepath} reloaded");
-                                #[cfg(test)]
-                                info!("User provider file {filepath} reloaded: {credential:?}");
-                                *users = credential;
-                            }
-                            Err(err) => {
-                                warn!(
-                                    ?err,
-                                    "Fail to load credential from file {filepath}; keep the old one",
-                                )
-                            }
-                        }
-                    }
+                    *users = credential;
                }
-            }
-        });
+                Err(err) => {
+                    warn!(
+                        ?err,
+                        "Fail to load credential from file {}; keep the old one", &filepath_owned
+                    )
+                }
+            })
+            .context(FileWatchSnafu)?;

-        Ok(this)
+        Ok(WatchFileUserProvider { users })
    }
 }

--- a/src/catalog/src/system_schema/information_schema.rs
+++ b/src/catalog/src/system_schema/information_schema.rs
@@ -428,7 +428,7 @@ pub trait InformationExtension {
 }

 /// The request to inspect the datanode.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq)]
 pub struct DatanodeInspectRequest {
    /// Kind to fetch from datanode.
    pub kind: DatanodeInspectKind,
--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -145,6 +145,17 @@ impl ObjbenchCommand {
        let region_meta = extract_region_metadata(&self.source, &parquet_meta)?;
        let num_rows = parquet_meta.file_metadata().num_rows() as u64;
        let num_row_groups = parquet_meta.num_row_groups() as u64;
+        let max_row_group_uncompressed_size: u64 = parquet_meta
+            .row_groups()
+            .iter()
+            .map(|rg| {
+                rg.columns()
+                    .iter()
+                    .map(|c| c.uncompressed_size() as u64)
+                    .sum::<u64>()
+            })
+            .max()
+            .unwrap_or(0);

        println!(
            "{} Metadata loaded - rows: {}, size: {} bytes",
@@ -160,10 +171,11 @@ impl ObjbenchCommand {
            time_range: Default::default(),
            level: 0,
            file_size,
+            max_row_group_uncompressed_size,
            available_indexes: Default::default(),
            indexes: Default::default(),
            index_file_size: 0,
-            index_file_id: None,
+            index_version: 0,
            num_rows,
            num_row_groups,
            sequence: None,
@@ -564,7 +576,7 @@ fn new_noop_file_purger() -> FilePurgerRef {
    #[derive(Debug)]
    struct Noop;
    impl FilePurger for Noop {
-        fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {}
+        fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) {}
    }
    Arc::new(Noop)
 }
--- a/src/cmd/src/frontend.rs
+++ b/src/cmd/src/frontend.rs
@@ -35,6 +35,7 @@ use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder};
 use common_meta::heartbeat::handler::HandlerGroupExecutor;
 use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
 use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
+use common_meta::heartbeat::handler::suspend::SuspendHandler;
 use common_query::prelude::set_default_prefix;
 use common_stat::ResourceStatImpl;
 use common_telemetry::info;
@@ -45,13 +46,13 @@ use frontend::frontend::Frontend;
 use frontend::heartbeat::HeartbeatTask;
 use frontend::instance::builder::FrontendBuilder;
 use frontend::server::Services;
-use meta_client::{MetaClientOptions, MetaClientType};
+use meta_client::{MetaClientOptions, MetaClientRef, MetaClientType};
 use plugins::frontend::context::{
    CatalogManagerConfigureContext, DistributedCatalogManagerConfigureContext,
 };
 use servers::addrs;
 use servers::grpc::GrpcOptions;
-use servers::tls::{TlsMode, TlsOption};
+use servers::tls::{TlsMode, TlsOption, merge_tls_option};
 use snafu::{OptionExt, ResultExt};
 use tracing_appender::non_blocking::WorkerGuard;

@@ -255,7 +256,7 @@ impl StartCommand {

        if let Some(addr) = &self.rpc_bind_addr {
            opts.grpc.bind_addr.clone_from(addr);
-            opts.grpc.tls = tls_opts.clone();
+            opts.grpc.tls = merge_tls_option(&opts.grpc.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.rpc_server_addr {
@@ -290,13 +291,13 @@ impl StartCommand {
        if let Some(addr) = &self.mysql_addr {
            opts.mysql.enable = true;
            opts.mysql.addr.clone_from(addr);
-            opts.mysql.tls = tls_opts.clone();
+            opts.mysql.tls = merge_tls_option(&opts.mysql.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.postgres_addr {
            opts.postgres.enable = true;
            opts.postgres.addr.clone_from(addr);
-            opts.postgres.tls = tls_opts;
+            opts.postgres.tls = merge_tls_option(&opts.postgres.tls, tls_opts.clone());
        }

        if let Some(enable) = self.influxdb_enable {
@@ -440,30 +441,13 @@ impl StartCommand {
        };
        let catalog_manager = builder.build();

-        let executor = HandlerGroupExecutor::new(vec![
-            Arc::new(ParseMailboxMessageHandler),
-            Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())),
-        ]);
-
-        let mut resource_stat = ResourceStatImpl::default();
-        resource_stat.start_collect_cpu_usage();
-
-        let heartbeat_task = HeartbeatTask::new(
-            &opts,
-            meta_client.clone(),
-            opts.heartbeat.clone(),
-            Arc::new(executor),
-            Arc::new(resource_stat),
-        );
-        let heartbeat_task = Some(heartbeat_task);
-
        let instance = FrontendBuilder::new(
            opts.clone(),
            cached_meta_backend.clone(),
            layered_cache_registry.clone(),
            catalog_manager,
            client,
-            meta_client,
+            meta_client.clone(),
            process_manager,
        )
        .with_plugin(plugins.clone())
@@ -471,6 +455,9 @@ impl StartCommand {
        .try_build()
        .await
        .context(error::StartFrontendSnafu)?;
+
+        let heartbeat_task = Some(create_heartbeat_task(&opts, meta_client, &instance));
+
        let instance = Arc::new(instance);

        let servers = Services::new(opts, instance.clone(), plugins)
@@ -487,6 +474,28 @@ impl StartCommand {
    }
 }

+/// Builds the frontend [`HeartbeatTask`], wiring in the mailbox-parsing, suspend and
+/// cache-invalidation handlers together with a CPU-usage-collecting resource stat.
+pub fn create_heartbeat_task(
+    options: &frontend::frontend::FrontendOptions,
+    meta_client: MetaClientRef,
+    instance: &frontend::instance::Instance,
+) -> HeartbeatTask {
+    let handlers = HandlerGroupExecutor::new(vec![
+        Arc::new(ParseMailboxMessageHandler),
+        Arc::new(SuspendHandler::new(instance.suspend_state())),
+        Arc::new(InvalidateCacheHandler::new(
+            instance.cache_invalidator().clone(),
+        )),
+    ]);
+
+    // Start CPU usage collection before the stat is shared with the task.
+    let mut resource_stat = ResourceStatImpl::default();
+    resource_stat.start_collect_cpu_usage();
+
+    HeartbeatTask::new(options, meta_client, Arc::new(handlers), Arc::new(resource_stat))
+}
+
 #[cfg(test)]
 mod tests {
    use std::io::Write;
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -62,7 +62,7 @@ use plugins::frontend::context::{
    CatalogManagerConfigureContext, StandaloneCatalogManagerConfigureContext,
 };
 use plugins::standalone::context::DdlManagerConfigureContext;
-use servers::tls::{TlsMode, TlsOption};
+use servers::tls::{TlsMode, TlsOption, merge_tls_option};
 use snafu::ResultExt;
 use standalone::StandaloneInformationExtension;
 use standalone::options::StandaloneOptions;
@@ -293,19 +293,20 @@ impl StartCommand {
                    ),
                }.fail();
            }
-            opts.grpc.bind_addr.clone_from(addr)
+            opts.grpc.bind_addr.clone_from(addr);
+            opts.grpc.tls = merge_tls_option(&opts.grpc.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.mysql_addr {
            opts.mysql.enable = true;
            opts.mysql.addr.clone_from(addr);
-            opts.mysql.tls = tls_opts.clone();
+            opts.mysql.tls = merge_tls_option(&opts.mysql.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.postgres_addr {
            opts.postgres.enable = true;
            opts.postgres.addr.clone_from(addr);
-            opts.postgres.tls = tls_opts;
+            opts.postgres.tls = merge_tls_option(&opts.postgres.tls, tls_opts.clone());
        }

        if self.influxdb_enable {
@@ -765,7 +766,6 @@ mod tests {
            user_provider: Some("static_user_provider:cmd:test=test".to_string()),
            mysql_addr: Some("127.0.0.1:4002".to_string()),
            postgres_addr: Some("127.0.0.1:4003".to_string()),
-            tls_watch: true,
            ..Default::default()
        };

@@ -782,8 +782,6 @@ mod tests {

        assert_eq!("./greptimedb_data/test/logs", opts.logging.dir);
        assert_eq!("debug", opts.logging.level.unwrap());
-        assert!(opts.mysql.tls.watch);
-        assert!(opts.postgres.tls.watch);
    }

    #[test]
--- a/src/cmd/tests/load_config_test.rs
+++ b/src/cmd/tests/load_config_test.rs
@@ -52,7 +52,6 @@ fn test_load_datanode_example_config() {
            meta_client: Some(MetaClientOptions {
                metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
                timeout: Duration::from_secs(3),
-                heartbeat_timeout: Duration::from_millis(500),
                ddl_timeout: Duration::from_secs(10),
                connect_timeout: Duration::from_secs(1),
                tcp_nodelay: true,
@@ -118,7 +117,6 @@ fn test_load_frontend_example_config() {
            meta_client: Some(MetaClientOptions {
                metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
                timeout: Duration::from_secs(3),
-                heartbeat_timeout: Duration::from_millis(500),
                ddl_timeout: Duration::from_secs(10),
                connect_timeout: Duration::from_secs(1),
                tcp_nodelay: true,
@@ -241,7 +239,6 @@ fn test_load_flownode_example_config() {
            meta_client: Some(MetaClientOptions {
                metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
                timeout: Duration::from_secs(3),
-                heartbeat_timeout: Duration::from_millis(500),
                ddl_timeout: Duration::from_secs(10),
                connect_timeout: Duration::from_secs(1),
                tcp_nodelay: true,
--- a/src/common/config/Cargo.toml
+++ b/src/common/config/Cargo.toml
@@ -11,8 +11,10 @@ workspace = true
 common-base.workspace = true
 common-error.workspace = true
 common-macro.workspace = true
+common-telemetry.workspace = true
 config.workspace = true
 humantime-serde.workspace = true
+notify.workspace = true
 object-store.workspace = true
 serde.workspace = true
 serde_json.workspace = true
--- a/src/common/config/src/error.rs
+++ b/src/common/config/src/error.rs
@@ -49,14 +49,41 @@ pub enum Error {
        #[snafu(implicit)]
        location: Location,
    },
+
+    #[snafu(display("Failed to watch file: {}", path))]
+    FileWatch {
+        path: String,
+        #[snafu(source)]
+        error: notify::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to canonicalize path: {}", path))]
+    CanonicalizePath {
+        path: String,
+        #[snafu(source)]
+        error: std::io::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Invalid path '{}': expected a file, not a directory", path))]
+    InvalidPath {
+        path: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
 }

 impl ErrorExt for Error {
    fn status_code(&self) -> StatusCode {
        match self {
-            Error::TomlFormat { .. } | Error::LoadLayeredConfig { .. } => {
-                StatusCode::InvalidArguments
-            }
+            Error::TomlFormat { .. }
+            | Error::LoadLayeredConfig { .. }
+            | Error::FileWatch { .. }
+            | Error::InvalidPath { .. }
+            | Error::CanonicalizePath { .. } => StatusCode::InvalidArguments,
            Error::SerdeJson { .. } => StatusCode::Unexpected,
        }
    }
--- a/src/common/config/src/file_watcher.rs
+++ b/src/common/config/src/file_watcher.rs
@@ -0,0 +1,355 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Common file watching utilities for configuration hot-reloading.
+//!
+//! This module provides a generic file watcher that can be used to watch
+//! files for changes and trigger callbacks when changes occur.
+//!
+//! The watcher monitors the parent directory of each file rather than the
+//! file itself. This ensures that file deletions and recreations are properly
+//! tracked, which is common with editors that use atomic saves or when
+//! configuration files are replaced.
+
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+use std::sync::mpsc::channel;
+
+use common_telemetry::{error, info, warn};
+use notify::{EventKind, RecursiveMode, Watcher};
+use snafu::ResultExt;
+
+use crate::error::{CanonicalizePathSnafu, FileWatchSnafu, InvalidPathSnafu, Result};
+
+/// Configuration for the file watcher behavior.
+#[derive(Debug, Clone, Default)]
+pub struct FileWatcherConfig {
+    /// Whether to include Remove events in addition to Modify and Create.
+    pub include_remove_events: bool,
+}
+
+impl FileWatcherConfig {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn with_modify_and_create(mut self) -> Self {
+        self.include_remove_events = false;
+        self
+    }
+
+    pub fn with_remove_events(mut self) -> Self {
+        self.include_remove_events = true;
+        self
+    }
+}
+
+/// A builder for creating file watchers with flexible configuration.
+///
+/// The watcher monitors the parent directory of each file to handle file
+/// deletion and recreation properly. Events are filtered to only trigger
+/// callbacks for the specific files being watched.
+pub struct FileWatcherBuilder {
+    config: FileWatcherConfig,
+    /// Canonicalized paths of files to watch.
+    file_paths: Vec<PathBuf>,
+}
+
+impl FileWatcherBuilder {
+    /// Create a new builder with default configuration.
+    pub fn new() -> Self {
+        Self {
+            config: FileWatcherConfig::default(),
+            file_paths: Vec::new(),
+        }
+    }
+
+    /// Set the watcher configuration.
+    pub fn config(mut self, config: FileWatcherConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Add a file path to watch; it is stored canonicalized for comparison with events.
+    ///
+    /// Returns an error unless the path is an existing regular file (directories and
+    /// missing paths are rejected), or if the path cannot be canonicalized.
+    pub fn watch_path<P: AsRef<Path>>(mut self, path: P) -> Result<Self> {
+        let path = path.as_ref();
+        snafu::ensure!(
+            path.is_file(),
+            InvalidPathSnafu {
+                path: path.display().to_string(),
+            }
+        );
+        // Canonicalize the path for reliable comparison with event paths
+        let canonical = path.canonicalize().context(CanonicalizePathSnafu {
+            path: path.display().to_string(),
+        })?;
+        self.file_paths.push(canonical);
+        Ok(self)
+    }
+
+    /// Add every path yielded by `paths` to the watch list, in iteration order.
+    ///
+    /// Each path is handed to [`Self::watch_path`]; the first error it reports is
+    /// returned right away and the remaining paths are not examined.
+    ///
+    /// On success the builder is returned with all paths recorded.
+    pub fn watch_paths<P: AsRef<Path>, I: IntoIterator<Item = P>>(self, paths: I) -> Result<Self> {
+        // Thread the builder through each `watch_path` call, stopping at the first failure.
+        paths
+            .into_iter()
+            .try_fold(self, |builder, path| builder.watch_path(path))
+    }
+
+    /// Build and spawn the file watcher with the given callback.
+    ///
+    /// The callback is invoked when relevant file events are detected for
+    /// the watched files. The watcher monitors the parent directories to
+    /// handle file deletion and recreation properly.
+    ///
+    /// The spawned watcher thread runs for the lifetime of the process.
+    pub fn spawn<F>(self, callback: F) -> Result<()>
+    where
+        F: Fn() + Send + 'static,
+    {
+        let (tx, rx) = channel::<notify::Result<notify::Event>>();
+        let mut watcher =
+            notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;
+
+        // Collect unique parent directories to watch
+        let mut watched_dirs: HashSet<PathBuf> = HashSet::new();
+        for file_path in &self.file_paths {
+            if let Some(parent) = file_path.parent()
+                && watched_dirs.insert(parent.to_path_buf())
+            {
+                watcher
+                    .watch(parent, RecursiveMode::NonRecursive)
+                    .context(FileWatchSnafu {
+                        path: parent.display().to_string(),
+                    })?;
+            }
+        }
+
+        let config = self.config;
+        let watched_files: HashSet<PathBuf> = self.file_paths.iter().cloned().collect();
+
+        info!(
+            "Spawning file watcher for paths: {:?} (watching parent directories)",
+            self.file_paths
+                .iter()
+                .map(|p| p.display().to_string())
+                .collect::<Vec<_>>()
+        );
+
+        std::thread::spawn(move || {
+            // Keep watcher alive in the thread
+            let _watcher = watcher;
+
+            while let Ok(res) = rx.recv() {
+                match res {
+                    Ok(event) => {
+                        if !is_relevant_event(&event.kind, &config) {
+                            continue;
+                        }
+
+                        // Check if any of the event paths match our watched files
+                        let is_watched_file = event.paths.iter().any(|event_path| {
+                            // Try to canonicalize the event path for comparison
+                            // If the file was deleted, canonicalize will fail, so we also
+                            // compare the raw path
+                            if let Ok(canonical) = event_path.canonicalize()
+                                && watched_files.contains(&canonical)
+                            {
+                                return true;
+                            }
+                            // For deleted files, compare using the raw path
+                            watched_files.contains(event_path)
+                        });
+
+                        if !is_watched_file {
+                            continue;
+                        }
+
+                        info!(?event.kind, ?event.paths, "Detected file change");
+                        callback();
+                    }
+                    Err(err) => {
+                        warn!("File watcher error: {}", err);
+                    }
+                }
+            }
+
+            error!("File watcher channel closed unexpectedly");
+        });
+
+        Ok(())
+    }
+}
+
+impl Default for FileWatcherBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Tell whether an event of `kind` should trigger the callback under `config`.
+fn is_relevant_event(kind: &EventKind, config: &FileWatcherConfig) -> bool {
+    // Removals are opt-in; modifications and creations always count.
+    if matches!(kind, EventKind::Remove(_)) {
+        return config.include_remove_events;
+    }
+    matches!(kind, EventKind::Modify(_) | EventKind::Create(_))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::time::Duration;
+
+    use common_test_util::temp_dir::create_temp_dir;
+
+    use super::*;
+
+    #[test]
+    fn test_file_watcher_detects_changes() {
+        common_telemetry::init_default_ut_logging();
+
+        let dir = create_temp_dir("test_file_watcher");
+        let file_path = dir.path().join("test_file.txt");
+
+        // Create initial file
+        std::fs::write(&file_path, "initial content").unwrap();
+
+        let counter = Arc::new(AtomicUsize::new(0));
+        let counter_clone = counter.clone();
+
+        FileWatcherBuilder::new()
+            .watch_path(&file_path)
+            .unwrap()
+            .config(FileWatcherConfig::new())
+            .spawn(move || {
+                counter_clone.fetch_add(1, Ordering::SeqCst);
+            })
+            .unwrap();
+
+        // Give watcher time to start
+        std::thread::sleep(Duration::from_millis(100));
+
+        // Modify the file
+        std::fs::write(&file_path, "modified content").unwrap();
+
+        // Wait for the event to be processed
+        std::thread::sleep(Duration::from_millis(500));
+
+        assert!(
+            counter.load(Ordering::SeqCst) >= 1,
+            "Watcher should have detected at least one change"
+        );
+    }
+
+    #[test]
+    fn test_file_watcher_detects_delete_and_recreate() {
+        common_telemetry::init_default_ut_logging();
+
+        let dir = create_temp_dir("test_file_watcher_recreate");
+        let file_path = dir.path().join("test_file.txt");
+
+        // Create initial file
+        std::fs::write(&file_path, "initial content").unwrap();
+
+        let counter = Arc::new(AtomicUsize::new(0));
+        let counter_clone = counter.clone();
+
+        FileWatcherBuilder::new()
+            .watch_path(&file_path)
+            .unwrap()
+            .config(FileWatcherConfig::new())
+            .spawn(move || {
+                counter_clone.fetch_add(1, Ordering::SeqCst);
+            })
+            .unwrap();
+
+        // Give watcher time to start
+        std::thread::sleep(Duration::from_millis(100));
+
+        // Delete the file
+        std::fs::remove_file(&file_path).unwrap();
+        std::thread::sleep(Duration::from_millis(100));
+
+        // Recreate the file - this should still be detected because we watch the directory
+        std::fs::write(&file_path, "recreated content").unwrap();
+
+        // Wait for the event to be processed
+        std::thread::sleep(Duration::from_millis(500));
+
+        assert!(
+            counter.load(Ordering::SeqCst) >= 1,
+            "Watcher should have detected file recreation"
+        );
+    }
+
+    #[test]
+    fn test_file_watcher_ignores_other_files() {
+        common_telemetry::init_default_ut_logging();
+
+        let dir = create_temp_dir("test_file_watcher_other");
+        let watched_file = dir.path().join("watched.txt");
+        let other_file = dir.path().join("other.txt");
+
+        // Create both files
+        std::fs::write(&watched_file, "watched content").unwrap();
+        std::fs::write(&other_file, "other content").unwrap();
+
+        let counter = Arc::new(AtomicUsize::new(0));
+        let counter_clone = counter.clone();
+
+        FileWatcherBuilder::new()
+            .watch_path(&watched_file)
+            .unwrap()
+            .config(FileWatcherConfig::new())
+            .spawn(move || {
+                counter_clone.fetch_add(1, Ordering::SeqCst);
+            })
+            .unwrap();
+
+        // Give watcher time to start
+        std::thread::sleep(Duration::from_millis(100));
+
+        // Modify the other file - should NOT trigger callback
+        std::fs::write(&other_file, "modified other content").unwrap();
+
+        // Wait for potential event
+        std::thread::sleep(Duration::from_millis(500));
+
+        assert_eq!(
+            counter.load(Ordering::SeqCst),
+            0,
+            "Watcher should not have detected changes to other files"
+        );
+
+        // Now modify the watched file - SHOULD trigger callback
+        std::fs::write(&watched_file, "modified watched content").unwrap();
+
+        // Wait for the event to be processed
+        std::thread::sleep(Duration::from_millis(500));
+
+        assert!(
+            counter.load(Ordering::SeqCst) >= 1,
+            "Watcher should have detected change to watched file"
+        );
+    }
+}
--- a/src/common/config/src/lib.rs
+++ b/src/common/config/src/lib.rs
@@ -14,6 +14,7 @@

 pub mod config;
 pub mod error;
+pub mod file_watcher;

 use std::time::Duration;

--- a/src/common/error/src/lib.rs
+++ b/src/common/error/src/lib.rs
@@ -21,6 +21,8 @@ pub mod status_code;
 use http::{HeaderMap, HeaderValue};
 pub use snafu;

+use crate::status_code::StatusCode;
+
 // HACK - these headers are here for shared in gRPC services. For common HTTP headers,
 // please define in `src/servers/src/http/header.rs`.
 pub const GREPTIME_DB_HEADER_ERROR_CODE: &str = "x-greptime-err-code";
@@ -46,6 +48,29 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap {
    header
 }

+/// Read the [StatusCode] and the error message carried by the given [HeaderMap].
+///
+/// Returns `None` unless both headers are present and usable. Note that an illegal code,
+/// for example a random number that is not pre-defined as a [StatusCode], also yields
+/// `None`.
+pub fn from_header_to_err_code_msg(headers: &HeaderMap) -> Option<(StatusCode, &str)> {
+    // The code header has to be a string holding a `u32` that maps to a `StatusCode`.
+    let code = headers
+        .get(GREPTIME_DB_HEADER_ERROR_CODE)?
+        .to_str()
+        .ok()?
+        .parse::<u32>()
+        .ok()
+        .and_then(StatusCode::from_u32)?;
+
+    // The message header only has to be readable as a string.
+    let msg = headers
+        .get(GREPTIME_DB_HEADER_ERROR_MSG)
+        .and_then(|value| value.to_str().ok())?;
+
+    Some((code, msg))
+}
+
 /// Returns the external root cause of the source error (exclude the current error).
 pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> {
    // There are some divergence about the behavior of the `sources()` API
--- a/src/common/error/src/status_code.rs
+++ b/src/common/error/src/status_code.rs
@@ -42,6 +42,8 @@ pub enum StatusCode {
    External = 1007,
    /// The request is deadline exceeded (typically server-side).
    DeadlineExceeded = 1008,
+    /// The service is suspended for some reason, for example because resources exceed a limit.
+    Suspended = 1009,
    // ====== End of common status code ================

    // ====== Begin of SQL related status code =========
@@ -175,7 +177,8 @@ impl StatusCode {
            | StatusCode::AccessDenied
            | StatusCode::PermissionDenied
            | StatusCode::RequestOutdated
-            | StatusCode::External => false,
+            | StatusCode::External
+            | StatusCode::Suspended => false,
        }
    }

@@ -223,7 +226,8 @@ impl StatusCode {
            | StatusCode::InvalidAuthHeader
            | StatusCode::AccessDenied
            | StatusCode::PermissionDenied
-            | StatusCode::RequestOutdated => false,
+            | StatusCode::RequestOutdated
+            | StatusCode::Suspended => false,
        }
    }

@@ -347,7 +351,8 @@ pub fn status_to_tonic_code(status_code: StatusCode) -> Code {
        | StatusCode::RegionNotReady => Code::Unavailable,
        StatusCode::RuntimeResourcesExhausted
        | StatusCode::RateLimited
-        | StatusCode::RegionBusy => Code::ResourceExhausted,
+        | StatusCode::RegionBusy
+        | StatusCode::Suspended => Code::ResourceExhausted,
        StatusCode::UnsupportedPasswordType
        | StatusCode::UserPasswordMismatch
        | StatusCode::AuthHeaderNotFound
--- a/src/common/function/Cargo.toml
+++ b/src/common/function/Cargo.toml
@@ -39,7 +39,7 @@ datafusion-functions-aggregate-common.workspace = true
 datafusion-pg-catalog.workspace = true
 datafusion-physical-expr.workspace = true
 datatypes.workspace = true
-derive_more = { version = "1", default-features = false, features = ["display"] }
+derive_more.workspace = true
 geo = { version = "0.29", optional = true }
 geo-types = { version = "0.7", optional = true }
 geohash = { version = "0.13", optional = true }
--- a/src/common/function/src/scalars/geo/relation.rs
+++ b/src/common/function/src/scalars/geo/relation.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::fmt::Display;
 use std::sync::Arc;

 use datafusion_common::arrow::array::{Array, AsArray, BooleanBuilder};
--- a/src/common/function/src/system/pg_catalog.rs
+++ b/src/common/function/src/system/pg_catalog.rs
@@ -387,6 +387,8 @@ impl PGCatalogFunction {
        registry.register(pg_catalog::create_pg_stat_get_numscans());
        registry.register(pg_catalog::create_pg_get_constraintdef());
        registry.register(pg_catalog::create_pg_get_partition_ancestors_udf());
+        registry.register(pg_catalog::quote_ident_udf::create_quote_ident_udf());
+        registry.register(pg_catalog::quote_ident_udf::create_parse_ident_udf());
        registry.register_scalar(ObjDescriptionFunction::new());
        registry.register_scalar(ColDescriptionFunction::new());
        registry.register_scalar(ShobjDescriptionFunction::new());
--- a/src/common/grpc/Cargo.toml
+++ b/src/common/grpc/Cargo.toml
@@ -12,6 +12,7 @@ api.workspace = true
 arrow-flight.workspace = true
 bytes.workspace = true
 common-base.workspace = true
+common-config.workspace = true
 common-error.workspace = true
 common-macro.workspace = true
 common-recordbatch.workspace = true
@@ -23,7 +24,6 @@ datatypes.workspace = true
 flatbuffers = "25.2"
 hyper.workspace = true
 lazy_static.workspace = true
-notify.workspace = true
 prost.workspace = true
 serde.workspace = true
 serde_json.workspace = true
--- a/src/common/grpc/src/error.rs
+++ b/src/common/grpc/src/error.rs
@@ -38,11 +38,10 @@ pub enum Error {
        location: Location,
    },

-    #[snafu(display("Failed to watch config file path: {}", path))]
+    #[snafu(display("Failed to watch config file"))]
    FileWatch {
-        path: String,
        #[snafu(source)]
-        error: notify::Error,
+        source: common_config::error::Error,
        #[snafu(implicit)]
        location: Location,
    },
--- a/src/common/grpc/src/flight/do_put.rs
+++ b/src/common/grpc/src/flight/do_put.rs
@@ -46,13 +46,16 @@ pub struct DoPutResponse {
    request_id: i64,
    /// The successfully ingested rows number.
    affected_rows: AffectedRows,
+    /// The elapsed time in seconds for handling the bulk insert.
+    elapsed_secs: f64,
 }

 impl DoPutResponse {
-    pub fn new(request_id: i64, affected_rows: AffectedRows) -> Self {
+    pub fn new(request_id: i64, affected_rows: AffectedRows, elapsed_secs: f64) -> Self {
        Self {
            request_id,
            affected_rows,
+            elapsed_secs,
        }
    }

@@ -63,6 +66,10 @@ impl DoPutResponse {
    pub fn affected_rows(&self) -> AffectedRows {
        self.affected_rows
    }
+
+    pub fn elapsed_secs(&self) -> f64 {
+        self.elapsed_secs
+    }
 }

 impl TryFrom<PutResult> for DoPutResponse {
@@ -86,8 +93,11 @@ mod tests {

    #[test]
    fn test_serde_do_put_response() {
-        let x = DoPutResponse::new(42, 88);
+        let x = DoPutResponse::new(42, 88, 0.123);
        let serialized = serde_json::to_string(&x).unwrap();
-        assert_eq!(serialized, r#"{"request_id":42,"affected_rows":88}"#);
+        assert_eq!(
+            serialized,
+            r#"{"request_id":42,"affected_rows":88,"elapsed_secs":0.123}"#
+        );
    }
 }
--- a/src/common/grpc/src/reloadable_tls.rs
+++ b/src/common/grpc/src/reloadable_tls.rs
@@ -15,11 +15,10 @@
 use std::path::Path;
 use std::result::Result as StdResult;
 use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::mpsc::channel;
 use std::sync::{Arc, RwLock};

+use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};
 use common_telemetry::{error, info};
-use notify::{EventKind, RecursiveMode, Watcher};
 use snafu::ResultExt;

 use crate::error::{FileWatchSnafu, Result};
@@ -119,45 +118,28 @@ where
        return Ok(());
    }

+    let watch_paths: Vec<_> = tls_config
+        .get_tls_option()
+        .watch_paths()
+        .iter()
+        .map(|p| p.to_path_buf())
+        .collect();
+
    let tls_config_for_watcher = tls_config.clone();

-    let (tx, rx) = channel::<notify::Result<notify::Event>>();
-    let mut watcher = notify::recommended_watcher(tx).context(FileWatchSnafu { path: "<none>" })?;
-
-    // Watch all paths returned by the TlsConfigLoader
-    for path in tls_config.get_tls_option().watch_paths() {
-        watcher
-            .watch(path, RecursiveMode::NonRecursive)
-            .with_context(|_| FileWatchSnafu {
-                path: path.display().to_string(),
-            })?;
-    }
-
-    info!("Spawning background task for watching TLS cert/key file changes");
-    std::thread::spawn(move || {
-        let _watcher = watcher;
-        loop {
-            match rx.recv() {
-                Ok(Ok(event)) => {
-                    if let EventKind::Modify(_) | EventKind::Create(_) = event.kind {
-                        info!("Detected TLS cert/key file change: {:?}", event);
-                        if let Err(err) = tls_config_for_watcher.reload() {
-                            error!("Failed to reload TLS config: {}", err);
-                        } else {
-                            info!("Reloaded TLS cert/key file successfully.");
-                            on_reload();
-                        }
-                    }
-                }
-                Ok(Err(err)) => {
-                    error!("Failed to watch TLS cert/key file: {}", err);
-                }
-                Err(err) => {
-                    error!("TLS cert/key file watcher channel closed: {}", err);
-                }
+    FileWatcherBuilder::new()
+        .watch_paths(&watch_paths)
+        .context(FileWatchSnafu)?
+        .config(FileWatcherConfig::new())
+        .spawn(move || {
+            if let Err(err) = tls_config_for_watcher.reload() {
+                error!("Failed to reload TLS config: {}", err);
+            } else {
+                info!("Reloaded TLS cert/key file successfully.");
+                on_reload();
            }
-        }
-    });
+        })
+        .context(FileWatchSnafu)?;

    Ok(())
 }
--- a/src/common/memory-manager/Cargo.toml
+++ b/src/common/memory-manager/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "common-memory-manager"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[lints]
+workspace = true
+
+[dependencies]
+common-error = { workspace = true }
+common-macro = { workspace = true }
+common-telemetry = { workspace = true }
+humantime = { workspace = true }
+serde = { workspace = true }
+snafu = { workspace = true }
+tokio = { workspace = true, features = ["sync"] }
+
+[dev-dependencies]
+tokio = { workspace = true, features = ["rt", "macros"] }
--- a/src/common/memory-manager/src/error.rs
+++ b/src/common/memory-manager/src/error.rs
@@ -0,0 +1,53 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::Snafu;
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error {
+    #[snafu(display(
+        "Memory limit exceeded: requested {requested_bytes} bytes, limit {limit_bytes} bytes"
+    ))]
+    MemoryLimitExceeded {
+        requested_bytes: u64,
+        limit_bytes: u64,
+    },
+
+    #[snafu(display("Memory semaphore unexpectedly closed"))]
+    MemorySemaphoreClosed,
+}
+
+impl ErrorExt for Error {
+    /// Maps each variant to the status code reported for it.
+    fn status_code(&self) -> StatusCode {
+        match self {
+            Self::MemoryLimitExceeded { .. } => StatusCode::RuntimeResourcesExhausted,
+            Self::MemorySemaphoreClosed => StatusCode::Unexpected,
+        }
+    }
+
+    /// Returns this error as a `&dyn Any`.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
--- a/src/common/memory-manager/src/guard.rs
+++ b/src/common/memory-manager/src/guard.rs
@@ -0,0 +1,138 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::{fmt, mem};
+
+use common_telemetry::debug;
+use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};
+
+use crate::manager::{MemoryMetrics, MemoryQuota, bytes_to_permits, permits_to_bytes};
+
+/// Guard representing a slice of reserved memory.
+pub struct MemoryGuard<M: MemoryMetrics> {
+    pub(crate) state: GuardState<M>,
+}
+
+pub(crate) enum GuardState<M: MemoryMetrics> {
+    Unlimited,
+    Limited {
+        permit: OwnedSemaphorePermit,
+        quota: MemoryQuota<M>,
+    },
+}
+
+impl<M: MemoryMetrics> MemoryGuard<M> {
+    /// Builds a guard that holds no permit, so nothing is charged against a quota.
+    pub(crate) fn unlimited() -> Self {
+        let state = GuardState::Unlimited;
+        Self { state }
+    }
+
+    /// Builds a guard that owns `permit` and keeps `quota` for permit and metric updates.
+    pub(crate) fn limited(permit: OwnedSemaphorePermit, quota: MemoryQuota<M>) -> Self {
+        let state = GuardState::Limited { permit, quota };
+        Self { state }
+    }
+
+    /// Reports how many bytes this guard currently holds; unlimited guards report 0.
+    pub fn granted_bytes(&self) -> u64 {
+        let GuardState::Limited { permit, .. } = &self.state else {
+            return 0;
+        };
+        permits_to_bytes(permit.num_permits() as u32)
+    }
+
+    /// Attempts to grow this guard by `bytes` without waiting.
+    ///
+    /// Returns `true` once the extra permits have been merged into this guard; unlimited
+    /// guards and zero-byte requests always succeed. Returns `false`, leaving the guard
+    /// unchanged, when the semaphore cannot supply the permits right away.
+    pub fn request_additional(&mut self, bytes: u64) -> bool {
+        // Guards without a quota have nothing to account for.
+        let GuardState::Limited { permit, quota } = &mut self.state else {
+            return true;
+        };
+
+        // Asking for nothing trivially succeeds and touches neither permits nor metrics.
+        if bytes == 0 {
+            return true;
+        }
+
+        let wanted = bytes_to_permits(bytes);
+        let semaphore = quota.semaphore.clone();
+        let extra = match semaphore.try_acquire_many_owned(wanted) {
+            Ok(extra) => extra,
+            // Both failure modes are counted as a rejection.
+            Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
+                quota.metrics.inc_rejected("request_additional");
+                return false;
+            }
+        };
+
+        // Fold the freshly acquired permits into the one this guard already owns.
+        permit.merge(extra);
+        quota.update_in_use_metric();
+        debug!("Allocated additional {} bytes", bytes);
+        true
+    }
+
+    /// Hands part of the granted memory back to the semaphore while the guard stays alive.
+    ///
+    /// `bytes` is converted with `bytes_to_permits` before splitting. Returns `true` for
+    /// unlimited guards, zero-byte requests and successful splits; returns `false` when
+    /// the permit cannot be split by that amount.
+    pub fn early_release_partial(&mut self, bytes: u64) -> bool {
+        let GuardState::Limited { permit, quota } = &mut self.state else {
+            return true;
+        };
+
+        if bytes == 0 {
+            return true;
+        }
+
+        let to_release = bytes_to_permits(bytes) as usize;
+        let Some(freed) = permit.split(to_release) else {
+            // `split` yields `None` when this guard holds fewer permits than requested.
+            return false;
+        };
+
+        // Compute the size for the log line before the permits go back to the semaphore.
+        let released_bytes = permits_to_bytes(freed.num_permits() as u32);
+        drop(freed);
+        quota.update_in_use_metric();
+        debug!("Early released {} bytes from memory guard", released_bytes);
+        true
+    }
+}
+
+impl<M: MemoryMetrics> Drop for MemoryGuard<M> {
+    fn drop(&mut self) {
+        let previous = mem::replace(&mut self.state, GuardState::Unlimited);
+        if let GuardState::Limited { permit, quota } = previous {
+            let released = permits_to_bytes(permit.num_permits() as u32);
+            // Release the permit first so the metric below reflects the freed memory.
+            drop(permit);
+            quota.update_in_use_metric();
+            debug!("Released memory: {} bytes", released);
+        }
+    }
+}
+
+impl<M: MemoryMetrics> fmt::Debug for MemoryGuard<M> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut out = f.debug_struct("MemoryGuard");
+        out.field("granted_bytes", &self.granted_bytes());
+        out.finish()
+    }
+}
--- a/src/common/memory-manager/src/lib.rs
+++ b/src/common/memory-manager/src/lib.rs
@@ -0,0 +1,47 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Generic memory management for resource-constrained operations.
+//!
+//! This crate provides a reusable memory quota system based on semaphores,
+//! allowing different subsystems (compaction, flush, index build, etc.) to
+//! share the same allocation logic while using their own metrics.
+
+mod error;
+mod guard;
+mod manager;
+mod policy;
+
+#[cfg(test)]
+mod tests;
+
+pub use error::{Error, Result};
+pub use guard::MemoryGuard;
+pub use manager::{MemoryManager, MemoryMetrics, PERMIT_GRANULARITY_BYTES};
+pub use policy::{DEFAULT_MEMORY_WAIT_TIMEOUT, OnExhaustedPolicy};
+
+/// [MemoryMetrics] implementation that ignores every call; meant for testing.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct NoOpMetrics;
+
+impl MemoryMetrics for NoOpMetrics {
+    #[inline(always)]
+    fn set_limit(&self, _bytes: i64) {}
+
+    #[inline(always)]
+    fn set_in_use(&self, _bytes: i64) {}
+
+    #[inline(always)]
+    fn inc_rejected(&self, _reason: &str) {}
+}
--- a/src/common/memory-manager/src/manager.rs
+++ b/src/common/memory-manager/src/manager.rs
@@ -0,0 +1,173 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use snafu::ensure;
+use tokio::sync::{Semaphore, TryAcquireError};
+
+use crate::error::{MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result};
+use crate::guard::MemoryGuard;
+
+/// Minimum bytes controlled by one semaphore permit.
+pub const PERMIT_GRANULARITY_BYTES: u64 = 1 << 20; // 1 MB
+
+/// Trait for recording memory usage metrics.
+pub trait MemoryMetrics: Clone + Send + Sync + 'static {
+    fn set_limit(&self, bytes: i64);
+    fn set_in_use(&self, bytes: i64);
+    fn inc_rejected(&self, reason: &str);
+}
+
+/// Generic memory manager for quota-controlled operations.
+#[derive(Clone)]
+pub struct MemoryManager<M: MemoryMetrics> {
+    quota: Option<MemoryQuota<M>>,
+}
+
+#[derive(Clone)]
+pub(crate) struct MemoryQuota<M: MemoryMetrics> {
+    pub(crate) semaphore: Arc<Semaphore>,
+    pub(crate) limit_permits: u32,
+    pub(crate) metrics: M,
+}
+
+impl<M: MemoryMetrics> MemoryManager<M> {
+    /// Builds a manager that caps reservations at `limit_bytes`, rounded up to whole permits.
+    ///
+    /// A `limit_bytes` of 0 disables the limit: no semaphore is created.
+    pub fn new(limit_bytes: u64, metrics: M) -> Self {
+        let quota = if limit_bytes == 0 {
+            metrics.set_limit(0);
+            None
+        } else {
+            let limit_permits = bytes_to_permits(limit_bytes);
+            // Publish the permit-aligned limit, which may exceed the requested bytes.
+            metrics.set_limit(permits_to_bytes(limit_permits) as i64);
+            let semaphore = Arc::new(Semaphore::new(limit_permits as usize));
+            Some(MemoryQuota {
+                semaphore,
+                limit_permits,
+                metrics,
+            })
+        };
+        Self { quota }
+    }
+
+    /// Reports the permit-aligned limit in bytes, or 0 when no limit is configured.
+    pub fn limit_bytes(&self) -> u64 {
+        match &self.quota {
+            Some(quota) => permits_to_bytes(quota.limit_permits),
+            None => 0,
+        }
+    }
+
+    /// Reports the bytes currently in use (limit minus available), or 0 when unlimited.
+    pub fn used_bytes(&self) -> u64 {
+        match &self.quota {
+            Some(quota) => permits_to_bytes(quota.used_permits()),
+            None => 0,
+        }
+    }
+
+    /// Reports the bytes still available, or 0 when no limit is configured.
+    pub fn available_bytes(&self) -> u64 {
+        match &self.quota {
+            Some(quota) => permits_to_bytes(quota.available_permits_clamped()),
+            None => 0,
+        }
+    }
+
+    /// Reserves `bytes` of memory, suspending the caller until the permits are free.
+    ///
+    /// # Errors
+    /// - `MemoryLimitExceeded` if the request needs more permits than the total limit
+    /// - `MemorySemaphoreClosed` if the semaphore reports it has been closed
+    pub async fn acquire(&self, bytes: u64) -> Result<MemoryGuard<M>> {
+        let Some(quota) = &self.quota else {
+            return Ok(MemoryGuard::unlimited());
+        };
+
+        // A request larger than the whole limit could never be granted, so fail fast.
+        let permits = bytes_to_permits(bytes);
+        ensure!(
+            permits <= quota.limit_permits,
+            MemoryLimitExceededSnafu {
+                requested_bytes: bytes,
+                limit_bytes: permits_to_bytes(quota.limit_permits),
+            }
+        );
+
+        let semaphore = quota.semaphore.clone();
+        let permit = match semaphore.acquire_many_owned(permits).await {
+            Ok(permit) => permit,
+            Err(_) => return Err(MemorySemaphoreClosedSnafu.build()),
+        };
+        quota.update_in_use_metric();
+
+        let guard = MemoryGuard::limited(permit, quota.clone());
+        Ok(guard)
+    }
+
+    /// Reserves `bytes` without waiting; returns `None` and counts a rejection when the
+    /// permits are not available at once.
+    pub fn try_acquire(&self, bytes: u64) -> Option<MemoryGuard<M>> {
+        let Some(quota) = &self.quota else {
+            return Some(MemoryGuard::unlimited());
+        };
+
+        let permits = bytes_to_permits(bytes);
+        let semaphore = quota.semaphore.clone();
+        match semaphore.try_acquire_many_owned(permits) {
+            Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
+                quota.metrics.inc_rejected("try_acquire");
+                None
+            }
+            Ok(permit) => {
+                quota.update_in_use_metric();
+                Some(MemoryGuard::limited(permit, quota.clone()))
+            }
+        }
+    }
+}
+
+impl<M: MemoryMetrics> MemoryQuota<M> {
+    pub(crate) fn used_permits(&self) -> u32 {
+        let free = self.available_permits_clamped();
+        self.limit_permits.saturating_sub(free)
+    }
+
+    /// Free permits on the semaphore, capped at the limit so the `u32` cast cannot truncate.
+    pub(crate) fn available_permits_clamped(&self) -> u32 {
+        let cap = self.limit_permits as usize;
+        self.semaphore.available_permits().min(cap) as u32
+    }
+
+    pub(crate) fn update_in_use_metric(&self) {
+        let in_use = permits_to_bytes(self.used_permits()) as i64;
+        self.metrics.set_in_use(in_use);
+    }
+}
+
+/// Rounds `bytes` up to whole permits, capped at `Semaphore::MAX_PERMITS` and `u32::MAX`.
+pub(crate) fn bytes_to_permits(bytes: u64) -> u32 {
+    let cap = (Semaphore::MAX_PERMITS as u64).min(u32::MAX as u64);
+    let rounded_up = bytes.saturating_add(PERMIT_GRANULARITY_BYTES - 1);
+    let permits = rounded_up.saturating_div(PERMIT_GRANULARITY_BYTES);
+    permits.min(cap) as u32
+}
+
+pub(crate) fn permits_to_bytes(permits: u32) -> u64 {
+    u64::from(permits).saturating_mul(PERMIT_GRANULARITY_BYTES)
+}
--- a/src/common/memory-manager/src/policy.rs
+++ b/src/common/memory-manager/src/policy.rs
@@ -0,0 +1,83 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Duration;
+
+use humantime::{format_duration, parse_duration};
+use serde::{Deserialize, Serialize};
+
+/// Default wait timeout for memory acquisition.
+pub const DEFAULT_MEMORY_WAIT_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// Defines how to react when memory cannot be acquired immediately.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum OnExhaustedPolicy {
+    /// Wait until enough memory is released, bounded by timeout.
+    Wait { timeout: Duration },
+
+    /// Fail immediately if memory is not available.
+    Fail,
+}
+
+impl Default for OnExhaustedPolicy {
+    /// Defaults to waiting, bounded by [DEFAULT_MEMORY_WAIT_TIMEOUT].
+    fn default() -> Self {
+        let timeout = DEFAULT_MEMORY_WAIT_TIMEOUT;
+        Self::Wait { timeout }
+    }
+}
+
+impl Serialize for OnExhaustedPolicy {
+    /// Writes the policy as the string `fail`, `wait` or `wait(<duration>)`.
+    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        // The default timeout is written as a bare "wait"; other timeouts are spelled out.
+        let rendered = match *self {
+            Self::Fail => String::from("fail"),
+            Self::Wait { timeout } if timeout == DEFAULT_MEMORY_WAIT_TIMEOUT => {
+                String::from("wait")
+            }
+            Self::Wait { timeout } => format!("wait({})", format_duration(timeout)),
+        };
+
+        serializer.serialize_str(&rendered)
+    }
+}
+
+impl<'de> Deserialize<'de> for OnExhaustedPolicy {
+    /// Parses `fail`, `skip`, `wait` or `wait(<duration>)`, ignoring ASCII case.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const WAIT_OPEN: &str = "wait(";
+
+        let raw = String::deserialize(deserializer)?;
+        let lower = raw.to_ascii_lowercase();
+        match lower.as_str() {
+            // "skip" is the legacy spelling of "fail".
+            "skip" | "fail" => Ok(Self::Fail),
+            "wait" => Ok(Self::default()),
+            other if other.starts_with(WAIT_OPEN) && other.ends_with(')') => {
+                // Slice the original text, not `lower`, so the duration keeps its case.
+                let inner = &raw[WAIT_OPEN.len()..raw.len() - 1];
+                let timeout = parse_duration(inner).map_err(serde::de::Error::custom)?;
+                Ok(Self::Wait { timeout })
+            }
+            _ => Err(serde::de::Error::custom(format!(
+                "invalid memory policy: {}, expected wait, wait(<duration>), fail",
+                raw
+            ))),
+        }
+    }
+}
--- a/src/common/memory-manager/src/tests.rs
+++ b/src/common/memory-manager/src/tests.rs
@@ -0,0 +1,247 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use tokio::time::{Duration, sleep};
+
+use crate::{MemoryManager, NoOpMetrics, PERMIT_GRANULARITY_BYTES};
+
+#[test]
+fn test_try_acquire_unlimited() {
+    let manager = MemoryManager::new(0, NoOpMetrics);
+    let guard = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap();
+    assert_eq!(manager.limit_bytes(), 0);
+    assert_eq!(guard.granted_bytes(), 0);
+}
+
+#[test]
+fn test_try_acquire_limited_success_and_release() {
+    let bytes = 2 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(bytes, NoOpMetrics);
+    {
+        let guard = manager.try_acquire(PERMIT_GRANULARITY_BYTES).unwrap();
+        assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES);
+        assert_eq!(manager.used_bytes(), PERMIT_GRANULARITY_BYTES);
+        drop(guard);
+    }
+    assert_eq!(manager.used_bytes(), 0);
+}
+
+#[test]
+fn test_try_acquire_exceeds_limit() {
+    let limit = PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+    let result = manager.try_acquire(limit + PERMIT_GRANULARITY_BYTES);
+    assert!(result.is_none());
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_blocks_and_unblocks() {
+    let bytes = 2 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(bytes, NoOpMetrics);
+    let guard = manager.try_acquire(bytes).unwrap();
+
+    // Spawn a task that will block on acquire()
+    let waiter = {
+        let manager = manager.clone();
+        tokio::spawn(async move {
+            // This will block until memory is available
+            let _guard = manager.acquire(bytes).await.unwrap();
+        })
+    };
+
+    sleep(Duration::from_millis(10)).await;
+    // Release memory - this should unblock the waiter
+    drop(guard);
+
+    // Waiter should complete now
+    waiter.await.unwrap();
+}
+
+#[test]
+fn test_request_additional_success() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES; // 10MB limit
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    // Acquire base quota (5MB)
+    let base = 5 * PERMIT_GRANULARITY_BYTES;
+    let mut guard = manager.try_acquire(base).unwrap();
+    assert_eq!(guard.granted_bytes(), base);
+    assert_eq!(manager.used_bytes(), base);
+
+    // Request additional memory (3MB) - should succeed and merge
+    assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[test]
+fn test_request_additional_exceeds_limit() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES; // 10MB limit
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    // Acquire base quota (5MB)
+    let base = 5 * PERMIT_GRANULARITY_BYTES;
+    let mut guard = manager.try_acquire(base).unwrap();
+
+    // Request additional memory (3MB) - should succeed
+    assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+
+    // Request more (3MB) - should fail (would exceed 10MB limit)
+    let result = guard.request_additional(3 * PERMIT_GRANULARITY_BYTES);
+    assert!(!result);
+
+    // Still at 8MB
+    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[test]
+fn test_request_additional_auto_release_on_guard_drop() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    {
+        let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+        // Request additional - memory is merged into guard
+        assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
+        assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+
+        // When guard drops, all memory (base + additional) is released together
+    }
+
+    // After scope, all memory should be released
+    assert_eq!(manager.used_bytes(), 0);
+}
+
+#[test]
+fn test_request_additional_unlimited() {
+    let manager = MemoryManager::new(0, NoOpMetrics); // Unlimited
+    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    // Should always succeed with unlimited manager
+    assert!(guard.request_additional(100 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(manager.used_bytes(), 0);
+}
+
+#[test]
+fn test_request_additional_zero_bytes() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    // Request 0 bytes should succeed without affecting anything
+    assert!(guard.request_additional(0));
+    assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[test]
+fn test_early_release_partial_success() {
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
+    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+
+    // Release half
+    assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 4 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 4 * PERMIT_GRANULARITY_BYTES);
+
+    // Released memory should be available to others
+    let _guard2 = manager.try_acquire(4 * PERMIT_GRANULARITY_BYTES).unwrap();
+    assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[test]
+fn test_early_release_partial_exceeds_granted() {
+    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
+    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    // Try to release more than granted - should fail
+    assert!(!guard.early_release_partial(10 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[test]
+fn test_early_release_partial_unlimited() {
+    let manager = MemoryManager::new(0, NoOpMetrics);
+    let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    // Unlimited guard - release should succeed (no-op)
+    assert!(guard.early_release_partial(50 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 0);
+}
+
+#[test]
+fn test_request_and_early_release_symmetry() {
+    let limit = 20 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
+
+    // Request additional
+    assert!(guard.request_additional(5 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
+
+    // Early release some
+    assert!(guard.early_release_partial(3 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
+
+    // Request again
+    assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
+
+    // Early release again
+    assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
+
+    drop(guard);
+    assert_eq!(manager.used_bytes(), 0);
+}
+
+#[test]
+fn test_small_allocation_rounds_up() {
+    // Test that allocations smaller than PERMIT_GRANULARITY_BYTES
+    // round up to 1 permit and can use request_additional()
+    let limit = 10 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(512 * 1024).unwrap(); // 512KB
+    assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); // Rounds up to 1MB
+    assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more
+    assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
+}
+
+#[test]
+fn test_acquire_zero_bytes_lazy_allocation() {
+    // Test that try_acquire(0) takes 0 permits but the guard can request_additional() later
+    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(0).unwrap();
+    assert_eq!(guard.granted_bytes(), 0); // No permits consumed
+    assert_eq!(manager.used_bytes(), 0);
+
+    assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation
+    assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
+}
--- a/src/common/meta/src/cluster.rs
+++ b/src/common/meta/src/cluster.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::fmt::{Display, Formatter};
 use std::hash::{DefaultHasher, Hash, Hasher};
 use std::str::FromStr;

@@ -60,7 +61,7 @@ pub trait ClusterInfo {
 }

 /// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-0-{role}-{node_id}`.
-#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)]
 pub struct NodeInfoKey {
    /// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
    pub role: Role,
@@ -135,7 +136,7 @@ pub struct NodeInfo {
    pub hostname: String,
 }

-#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)]
 pub enum Role {
    Datanode,
    Frontend,
@@ -241,6 +242,12 @@ impl From<&NodeInfoKey> for Vec<u8> {
    }
 }

+impl Display for NodeInfoKey {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}-{}", self.role, self.node_id)
+    }
+}
+
 impl FromStr for NodeInfo {
    type Err = Error;

--- a/src/common/meta/src/ddl.rs
+++ b/src/common/meta/src/ddl.rs
@@ -31,6 +31,7 @@ use crate::region_registry::LeaderRegionRegistryRef;
 pub mod alter_database;
 pub mod alter_logical_tables;
 pub mod alter_table;
+pub mod comment_on;
 pub mod create_database;
 pub mod create_flow;
 pub mod create_logical_tables;
--- a/src/common/meta/src/ddl/alter_table/executor.rs
+++ b/src/common/meta/src/ddl/alter_table/executor.rs
@@ -301,8 +301,8 @@ fn build_new_table_info(
        | AlterKind::UnsetTableOptions { .. }
        | AlterKind::SetIndexes { .. }
        | AlterKind::UnsetIndexes { .. }
-        | AlterKind::DropDefaults { .. } => {}
-        AlterKind::SetDefaults { .. } => {}
+        | AlterKind::DropDefaults { .. }
+        | AlterKind::SetDefaults { .. } => {}
    }

    info!(
--- a/src/common/meta/src/ddl/comment_on.rs
+++ b/src/common/meta/src/ddl/comment_on.rs
@@ -0,0 +1,509 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use async_trait::async_trait;
+use chrono::Utc;
+use common_catalog::format_full_table_name;
+use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
+use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status};
+use common_telemetry::tracing::info;
+use datatypes::schema::COMMENT_KEY as COLUMN_COMMENT_KEY;
+use serde::{Deserialize, Serialize};
+use snafu::{OptionExt, ResultExt, ensure};
+use store_api::storage::TableId;
+use strum::AsRefStr;
+use table::metadata::RawTableInfo;
+use table::requests::COMMENT_KEY as TABLE_COMMENT_KEY;
+use table::table_name::TableName;
+
+use crate::cache_invalidator::Context;
+use crate::ddl::DdlContext;
+use crate::ddl::utils::map_to_procedure_error;
+use crate::error::{ColumnNotFoundSnafu, FlowNotFoundSnafu, Result, TableNotFoundSnafu};
+use crate::instruction::CacheIdent;
+use crate::key::flow::flow_info::{FlowInfoKey, FlowInfoValue};
+use crate::key::table_info::{TableInfoKey, TableInfoValue};
+use crate::key::table_name::TableNameKey;
+use crate::key::{DeserializedValueWithBytes, FlowId, MetadataKey, MetadataValue};
+use crate::lock_key::{CatalogLock, FlowNameLock, SchemaLock, TableNameLock};
+use crate::rpc::ddl::{CommentObjectType, CommentOnTask};
+use crate::rpc::store::PutRequest;
+
+pub struct CommentOnProcedure {
+    pub context: DdlContext,
+    pub data: CommentOnData,
+}
+
+impl CommentOnProcedure {
+    pub const TYPE_NAME: &'static str = "metasrv-procedure::CommentOn";
+
+    pub fn new(task: CommentOnTask, context: DdlContext) -> Self {
+        Self {
+            context,
+            data: CommentOnData::new(task),
+        }
+    }
+
+    pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
+        let data = serde_json::from_str(json).context(FromJsonSnafu)?;
+
+        Ok(Self { context, data })
+    }
+
+    pub async fn on_prepare(&mut self) -> Result<Status> {
+        match self.data.object_type {
+            CommentObjectType::Table | CommentObjectType::Column => {
+                self.prepare_table_or_column().await?;
+            }
+            CommentObjectType::Flow => {
+                self.prepare_flow().await?;
+            }
+        }
+
+        // Fast path: if comment is unchanged, skip update
+        if self.data.is_unchanged {
+            let object_desc = match self.data.object_type {
+                CommentObjectType::Table => format!(
+                    "table {}",
+                    format_full_table_name(
+                        &self.data.catalog_name,
+                        &self.data.schema_name,
+                        &self.data.object_name,
+                    )
+                ),
+                CommentObjectType::Column => format!(
+                    "column {}.{}",
+                    format_full_table_name(
+                        &self.data.catalog_name,
+                        &self.data.schema_name,
+                        &self.data.object_name,
+                    ),
+                    self.data.column_name.as_ref().unwrap()
+                ),
+                CommentObjectType::Flow => {
+                    format!("flow {}.{}", self.data.catalog_name, self.data.object_name)
+                }
+            };
+            info!("Comment unchanged for {}, skipping update", object_desc);
+            return Ok(Status::done());
+        }
+
+        self.data.state = CommentOnState::UpdateMetadata;
+        Ok(Status::executing(true))
+    }
+
+    async fn prepare_table_or_column(&mut self) -> Result<()> {
+        let table_name_key = TableNameKey::new(
+            &self.data.catalog_name,
+            &self.data.schema_name,
+            &self.data.object_name,
+        );
+
+        let table_id = self
+            .context
+            .table_metadata_manager
+            .table_name_manager()
+            .get(table_name_key)
+            .await?
+            .with_context(|| TableNotFoundSnafu {
+                table_name: format_full_table_name(
+                    &self.data.catalog_name,
+                    &self.data.schema_name,
+                    &self.data.object_name,
+                ),
+            })?
+            .table_id();
+
+        let table_info = self
+            .context
+            .table_metadata_manager
+            .table_info_manager()
+            .get(table_id)
+            .await?
+            .with_context(|| TableNotFoundSnafu {
+                table_name: format_full_table_name(
+                    &self.data.catalog_name,
+                    &self.data.schema_name,
+                    &self.data.object_name,
+                ),
+            })?;
+
+        // For column comments, validate the column exists
+        if self.data.object_type == CommentObjectType::Column {
+            let column_name = self.data.column_name.as_ref().unwrap();
+            let column_exists = table_info
+                .table_info
+                .meta
+                .schema
+                .column_schemas
+                .iter()
+                .any(|col| &col.name == column_name);
+
+            ensure!(
+                column_exists,
+                ColumnNotFoundSnafu {
+                    column_name,
+                    column_id: 0u32, // column_id is not known here
+                }
+            );
+        }
+
+        self.data.table_id = Some(table_id);
+
+        // Check if comment is unchanged for early exit optimization
+        match self.data.object_type {
+            CommentObjectType::Table => {
+                let current_comment = &table_info.table_info.desc;
+                if &self.data.comment == current_comment {
+                    self.data.is_unchanged = true;
+                }
+            }
+            CommentObjectType::Column => {
+                let column_name = self.data.column_name.as_ref().unwrap();
+                let column_schema = table_info
+                    .table_info
+                    .meta
+                    .schema
+                    .column_schemas
+                    .iter()
+                    .find(|col| &col.name == column_name)
+                    .unwrap(); // Safe: validated above
+
+                let current_comment = column_schema.metadata().get(COLUMN_COMMENT_KEY);
+                if self.data.comment.as_deref() == current_comment.map(String::as_str) {
+                    self.data.is_unchanged = true;
+                }
+            }
+            CommentObjectType::Flow => {
+                // this branch is handled in `prepare_flow`
+            }
+        }
+
+        self.data.table_info = Some(table_info);
+
+        Ok(())
+    }
+
+    async fn prepare_flow(&mut self) -> Result<()> {
+        let flow_name_value = self
+            .context
+            .flow_metadata_manager
+            .flow_name_manager()
+            .get(&self.data.catalog_name, &self.data.object_name)
+            .await?
+            .with_context(|| FlowNotFoundSnafu {
+                flow_name: &self.data.object_name,
+            })?;
+
+        let flow_id = flow_name_value.flow_id();
+        let flow_info = self
+            .context
+            .flow_metadata_manager
+            .flow_info_manager()
+            .get_raw(flow_id)
+            .await?
+            .with_context(|| FlowNotFoundSnafu {
+                flow_name: &self.data.object_name,
+            })?;
+
+        self.data.flow_id = Some(flow_id);
+
+        // Check if comment is unchanged for early exit optimization
+        let current_comment = &flow_info.get_inner_ref().comment;
+        let new_comment = self.data.comment.as_deref().unwrap_or("");
+        if new_comment == current_comment.as_str() {
+            self.data.is_unchanged = true;
+        }
+
+        self.data.flow_info = Some(flow_info);
+
+        Ok(())
+    }
+
+    pub async fn on_update_metadata(&mut self) -> Result<Status> {
+        // Cached infos are `#[serde(skip)]`: reload them when resumed from a persisted dump.
+        let is_flow = self.data.object_type == CommentObjectType::Flow;
+        if is_flow && self.data.flow_info.is_none() {
+            self.prepare_flow().await?;
+        } else if !is_flow && self.data.table_info.is_none() {
+            self.prepare_table_or_column().await?;
+        }
+        match self.data.object_type {
+            CommentObjectType::Table => self.update_table_comment().await?,
+            CommentObjectType::Column => self.update_column_comment().await?,
+            CommentObjectType::Flow => self.update_flow_comment().await?,
+        }
+        self.data.state = CommentOnState::InvalidateCache;
+        Ok(Status::executing(true))
+    }
+
+    async fn update_table_comment(&mut self) -> Result<()> {
+        let table_info_value = self.data.table_info.as_ref().unwrap();
+        let mut new_table_info = table_info_value.table_info.clone();
+
+        new_table_info.desc = self.data.comment.clone();
+
+        // Sync comment to table options
+        sync_table_comment_option(
+            &mut new_table_info.meta.options,
+            new_table_info.desc.as_deref(),
+        );
+
+        self.update_table_info(table_info_value, new_table_info)
+            .await?;
+
+        info!(
+            "Updated comment for table {}.{}.{}",
+            self.data.catalog_name, self.data.schema_name, self.data.object_name
+        );
+
+        Ok(())
+    }
+
+    async fn update_column_comment(&mut self) -> Result<()> {
+        let table_info_value = self.data.table_info.as_ref().unwrap();
+        let mut new_table_info = table_info_value.table_info.clone();
+
+        let column_name = self.data.column_name.as_ref().unwrap();
+        let column_schema = new_table_info
+            .meta
+            .schema
+            .column_schemas
+            .iter_mut()
+            .find(|col| &col.name == column_name)
+            .unwrap(); // Safe: validated in prepare
+
+        update_column_comment_metadata(column_schema, self.data.comment.clone());
+
+        self.update_table_info(table_info_value, new_table_info)
+            .await?;
+
+        info!(
+            "Updated comment for column {}.{}.{}.{}",
+            self.data.catalog_name, self.data.schema_name, self.data.object_name, column_name
+        );
+
+        Ok(())
+    }
+
+    async fn update_flow_comment(&mut self) -> Result<()> {
+        let flow_id = self.data.flow_id.unwrap();
+        let flow_info_value = self.data.flow_info.as_ref().unwrap();
+
+        let mut new_flow_info = flow_info_value.get_inner_ref().clone();
+        new_flow_info.comment = self.data.comment.clone().unwrap_or_default();
+        new_flow_info.updated_time = Utc::now();
+
+        let raw_value = new_flow_info.try_as_raw_value()?;
+
+        self.context
+            .table_metadata_manager
+            .kv_backend()
+            .put(
+                PutRequest::new()
+                    .with_key(FlowInfoKey::new(flow_id).to_bytes())
+                    .with_value(raw_value),
+            )
+            .await?;
+
+        info!(
+            "Updated comment for flow {}.{}",
+            self.data.catalog_name, self.data.object_name
+        );
+
+        Ok(())
+    }
+
+    async fn update_table_info(
+        &self,
+        current_table_info: &DeserializedValueWithBytes<TableInfoValue>,
+        new_table_info: RawTableInfo,
+    ) -> Result<()> {
+        let table_id = current_table_info.table_info.ident.table_id;
+        let new_table_info_value = current_table_info.update(new_table_info);
+        let raw_value = new_table_info_value.try_as_raw_value()?;
+
+        self.context
+            .table_metadata_manager
+            .kv_backend()
+            .put(
+                PutRequest::new()
+                    .with_key(TableInfoKey::new(table_id).to_bytes())
+                    .with_value(raw_value),
+            )
+            .await?;
+
+        Ok(())
+    }
+
+    pub async fn on_invalidate_cache(&mut self) -> Result<Status> {
+        let cache_invalidator = &self.context.cache_invalidator;
+
+        match self.data.object_type {
+            CommentObjectType::Table | CommentObjectType::Column => {
+                let table_id = self.data.table_id.unwrap();
+                let table_name = TableName::new(
+                    self.data.catalog_name.clone(),
+                    self.data.schema_name.clone(),
+                    self.data.object_name.clone(),
+                );
+
+                let cache_ident = vec![
+                    CacheIdent::TableId(table_id),
+                    CacheIdent::TableName(table_name),
+                ];
+
+                cache_invalidator
+                    .invalidate(&Context::default(), &cache_ident)
+                    .await?;
+            }
+            CommentObjectType::Flow => {
+                let flow_id = self.data.flow_id.unwrap();
+                let cache_ident = vec![CacheIdent::FlowId(flow_id)];
+
+                cache_invalidator
+                    .invalidate(&Context::default(), &cache_ident)
+                    .await?;
+            }
+        }
+
+        Ok(Status::done())
+    }
+}
+
+#[async_trait]
+impl Procedure for CommentOnProcedure {
+    fn type_name(&self) -> &str {
+        Self::TYPE_NAME
+    }
+
+    async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
+        match self.data.state {
+            CommentOnState::Prepare => self.on_prepare().await,
+            CommentOnState::UpdateMetadata => self.on_update_metadata().await,
+            CommentOnState::InvalidateCache => self.on_invalidate_cache().await,
+        }
+        .map_err(map_to_procedure_error)
+    }
+
+    fn dump(&self) -> ProcedureResult<String> {
+        serde_json::to_string(&self.data).context(ToJsonSnafu)
+    }
+
+    fn lock_key(&self) -> LockKey {
+        let catalog = &self.data.catalog_name;
+        let schema = &self.data.schema_name;
+
+        let lock_key = match self.data.object_type {
+            CommentObjectType::Table | CommentObjectType::Column => {
+                vec![
+                    CatalogLock::Read(catalog).into(),
+                    SchemaLock::read(catalog, schema).into(),
+                    TableNameLock::new(catalog, schema, &self.data.object_name).into(),
+                ]
+            }
+            CommentObjectType::Flow => {
+                vec![
+                    CatalogLock::Read(catalog).into(),
+                    FlowNameLock::new(catalog, &self.data.object_name).into(),
+                ]
+            }
+        };
+
+        LockKey::new(lock_key)
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize, AsRefStr)]
+enum CommentOnState {
+    Prepare,
+    UpdateMetadata,
+    InvalidateCache,
+}
+
+/// The data of comment on procedure.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct CommentOnData {
+    state: CommentOnState,
+    catalog_name: String,
+    schema_name: String,
+    object_type: CommentObjectType,
+    object_name: String,
+    /// Column name (only for Column comments)
+    column_name: Option<String>,
+    comment: Option<String>,
+    /// Cached table ID (for Table/Column)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    table_id: Option<TableId>,
+    /// Cached table info (for Table/Column); skipped by serde, so `None` after `from_json`
+    #[serde(skip)]
+    table_info: Option<DeserializedValueWithBytes<TableInfoValue>>,
+    /// Cached flow ID (for Flow)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    flow_id: Option<FlowId>,
+    /// Cached flow info (for Flow); skipped by serde, so `None` after `from_json`
+    #[serde(skip)]
+    flow_info: Option<DeserializedValueWithBytes<FlowInfoValue>>,
+    /// Whether the comment is unchanged (optimization for early exit)
+    #[serde(skip)]
+    is_unchanged: bool,
+}
+
+impl CommentOnData {
+    pub fn new(task: CommentOnTask) -> Self {
+        Self {
+            state: CommentOnState::Prepare,
+            catalog_name: task.catalog_name,
+            schema_name: task.schema_name,
+            object_type: task.object_type,
+            object_name: task.object_name,
+            column_name: task.column_name,
+            comment: task.comment,
+            table_id: None,
+            table_info: None,
+            flow_id: None,
+            flow_info: None,
+            is_unchanged: false,
+        }
+    }
+}
+
+fn update_column_comment_metadata(
+    column_schema: &mut datatypes::schema::ColumnSchema,
+    comment: Option<String>,
+) {
+    match comment {
+        Some(value) => {
+            column_schema
+                .mut_metadata()
+                .insert(COLUMN_COMMENT_KEY.to_string(), value);
+        }
+        None => {
+            column_schema.mut_metadata().remove(COLUMN_COMMENT_KEY);
+        }
+    }
+}
+
+fn sync_table_comment_option(options: &mut table::requests::TableOptions, comment: Option<&str>) {
+    match comment {
+        Some(value) => {
+            options
+                .extra_options
+                .insert(TABLE_COMMENT_KEY.to_string(), value.to_string());
+        }
+        None => {
+            options.extra_options.remove(TABLE_COMMENT_KEY);
+        }
+    }
+}
--- a/src/common/meta/src/ddl_manager.rs
+++ b/src/common/meta/src/ddl_manager.rs
@@ -27,6 +27,7 @@ use store_api::storage::TableId;
 use crate::ddl::alter_database::AlterDatabaseProcedure;
 use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure;
 use crate::ddl::alter_table::AlterTableProcedure;
+use crate::ddl::comment_on::CommentOnProcedure;
 use crate::ddl::create_database::CreateDatabaseProcedure;
 use crate::ddl::create_flow::CreateFlowProcedure;
 use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
@@ -52,18 +53,18 @@ use crate::rpc::ddl::DdlTask::CreateTrigger;
 #[cfg(feature = "enterprise")]
 use crate::rpc::ddl::DdlTask::DropTrigger;
 use crate::rpc::ddl::DdlTask::{
-    AlterDatabase, AlterLogicalTables, AlterTable, CreateDatabase, CreateFlow, CreateLogicalTables,
-    CreateTable, CreateView, DropDatabase, DropFlow, DropLogicalTables, DropTable, DropView,
-    TruncateTable,
+    AlterDatabase, AlterLogicalTables, AlterTable, CommentOn, CreateDatabase, CreateFlow,
+    CreateLogicalTables, CreateTable, CreateView, DropDatabase, DropFlow, DropLogicalTables,
+    DropTable, DropView, TruncateTable,
 };
 #[cfg(feature = "enterprise")]
 use crate::rpc::ddl::trigger::CreateTriggerTask;
 #[cfg(feature = "enterprise")]
 use crate::rpc::ddl::trigger::DropTriggerTask;
 use crate::rpc::ddl::{
-    AlterDatabaseTask, AlterTableTask, CreateDatabaseTask, CreateFlowTask, CreateTableTask,
-    CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask, QueryContext,
-    SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask,
+    AlterDatabaseTask, AlterTableTask, CommentOnTask, CreateDatabaseTask, CreateFlowTask,
+    CreateTableTask, CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask,
+    QueryContext, SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask,
 };
 use crate::rpc::router::RegionRoute;

@@ -192,7 +193,8 @@ impl DdlManager {
            TruncateTableProcedure,
            CreateDatabaseProcedure,
            DropDatabaseProcedure,
-            DropViewProcedure
+            DropViewProcedure,
+            CommentOnProcedure
        );

        for (type_name, loader_factory) in loaders {
@@ -408,6 +410,19 @@ impl DdlManager {
        self.submit_procedure(procedure_with_id).await
    }

+    /// Submits and executes a comment on task.
+    #[tracing::instrument(skip_all)]
+    pub async fn submit_comment_on_task(
+        &self,
+        comment_on_task: CommentOnTask,
+    ) -> Result<(ProcedureId, Option<Output>)> {
+        let context = self.create_context();
+        let procedure = CommentOnProcedure::new(comment_on_task, context);
+        let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
+
+        self.submit_procedure(procedure_with_id).await
+    }
+
    async fn submit_procedure(
        &self,
        procedure_with_id: ProcedureWithId,
@@ -476,6 +491,7 @@ impl DdlManager {
                    handle_create_view_task(self, create_view_task).await
                }
                DropView(drop_view_task) => handle_drop_view_task(self, drop_view_task).await,
+                CommentOn(comment_on_task) => handle_comment_on_task(self, comment_on_task).await,
                #[cfg(feature = "enterprise")]
                CreateTrigger(create_trigger_task) => {
                    handle_create_trigger_task(
@@ -907,6 +923,26 @@ async fn handle_create_view_task(
    })
 }

+async fn handle_comment_on_task(
+    ddl_manager: &DdlManager,
+    comment_on_task: CommentOnTask,
+) -> Result<SubmitDdlTaskResponse> {
+    let (id, _) = ddl_manager
+        .submit_comment_on_task(comment_on_task.clone())
+        .await?;
+
+    let procedure_id = id.to_string();
+    info!(
+        "Comment on {}.{}.{} is updated via procedure_id {id:?}",
+        comment_on_task.catalog_name, comment_on_task.schema_name, comment_on_task.object_name
+    );
+
+    Ok(SubmitDdlTaskResponse {
+        key: procedure_id.into(),
+        ..Default::default()
+    })
+}
+
 #[cfg(test)]
 mod tests {
    use std::sync::Arc;
--- a/src/common/meta/src/distributed_time_constants.rs
+++ b/src/common/meta/src/distributed_time_constants.rs
@@ -14,6 +14,8 @@

 use std::time::Duration;

+use etcd_client::ConnectOptions;
+
 /// Heartbeat interval time (is the basic unit of various time).
 pub const HEARTBEAT_INTERVAL_MILLIS: u64 = 3000;

@@ -41,6 +43,23 @@ pub const POSTGRES_KEEP_ALIVE_SECS: u64 = 30;
 /// In a lease, there are two opportunities for renewal.
 pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2;

+/// The timeout of the heartbeat request.
+pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
+
+/// The keep-alive interval of the heartbeat channel (a `Duration`, despite the `_SECS` suffix).
+pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration = Duration::from_secs(15);
+
+/// The keep-alive timeout of the heartbeat channel (a `Duration`, despite the `_SECS` suffix).
+pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration = Duration::from_secs(5);
+
+/// The default options for the etcd client.
+pub fn default_etcd_client_options() -> ConnectOptions {
+    ConnectOptions::new()
+        .with_keep_alive_while_idle(true)
+        .with_keep_alive(Duration::from_secs(15), Duration::from_secs(5))
+        .with_connect_timeout(Duration::from_secs(10))
+}
+
 /// The default mailbox round-trip timeout.
 pub const MAILBOX_RTT_SECS: u64 = 1;

--- a/src/common/meta/src/error.rs
+++ b/src/common/meta/src/error.rs
@@ -272,13 +272,6 @@ pub enum Error {
        location: Location,
    },

-    #[snafu(display("Failed to send message: {err_msg}"))]
-    SendMessage {
-        err_msg: String,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
    #[snafu(display("Failed to serde json"))]
    SerdeJson {
        #[snafu(source)]
@@ -1118,7 +1111,7 @@ impl ErrorExt for Error {
            | DeserializeFlexbuffers { .. }
            | ConvertTimeRanges { .. } => StatusCode::Unexpected,

-            SendMessage { .. } | GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,
+            GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,

            SchemaAlreadyExists { .. } => StatusCode::DatabaseAlreadyExists,

--- a/src/common/meta/src/heartbeat/handler.rs
+++ b/src/common/meta/src/heartbeat/handler.rs
@@ -23,6 +23,7 @@ use crate::heartbeat::mailbox::{IncomingMessage, MailboxRef};

 pub mod invalidate_table_cache;
 pub mod parse_mailbox_message;
+pub mod suspend;
 #[cfg(test)]
 mod tests;

--- a/src/common/meta/src/heartbeat/handler/suspend.rs
+++ b/src/common/meta/src/heartbeat/handler/suspend.rs
@@ -0,0 +1,69 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use async_trait::async_trait;
+use common_telemetry::{info, warn};
+
+use crate::error::Result;
+use crate::heartbeat::handler::{
+    HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext,
+};
+use crate::instruction::Instruction;
+
+/// A heartbeat response handler that handles the special "suspend" instruction.
+/// It simply sets or clears (if previously set) the inner suspend atomic state.
+pub struct SuspendHandler {
+    suspend: Arc<AtomicBool>,
+}
+
+impl SuspendHandler {
+    pub fn new(suspend: Arc<AtomicBool>) -> Self {
+        Self { suspend }
+    }
+}
+
+#[async_trait]
+impl HeartbeatResponseHandler for SuspendHandler {
+    fn is_acceptable(&self, context: &HeartbeatResponseHandlerContext) -> bool {
+        matches!(
+            context.incoming_message,
+            Some((_, Instruction::Suspend)) | None
+        )
+    }
+
+    async fn handle(&self, context: &mut HeartbeatResponseHandlerContext) -> Result<HandleControl> {
+        let flip_state = |expect: bool| {
+            self.suspend
+                .compare_exchange(expect, !expect, Ordering::Relaxed, Ordering::Relaxed)
+                .is_ok()
+        };
+
+        if let Some((_, Instruction::Suspend)) = context.incoming_message.take() {
+            if flip_state(false) {
+                warn!("Suspend instruction received from meta, entering suspension state");
+            }
+        } else {
+            // Suspended components should always try to leave this state by themselves; we don't
+            // want an "un-suspend" instruction to resume them, as that can be error-prone.
+            // So if the "suspend" instruction is not found in the heartbeat, just unset the state.
+            if flip_state(true) {
+                info!("clear suspend state");
+            }
+        }
+        Ok(HandleControl::Continue)
+    }
+}
--- a/src/common/meta/src/heartbeat/mailbox.rs
+++ b/src/common/meta/src/heartbeat/mailbox.rs
@@ -15,8 +15,8 @@
 use std::sync::Arc;

 use tokio::sync::mpsc::Sender;
+use tokio::sync::mpsc::error::SendError;

-use crate::error::{self, Result};
 use crate::instruction::{Instruction, InstructionReply};

 pub type IncomingMessage = (MessageMeta, Instruction);
@@ -51,13 +51,8 @@ impl HeartbeatMailbox {
        Self { sender }
    }

-    pub async fn send(&self, message: OutgoingMessage) -> Result<()> {
-        self.sender.send(message).await.map_err(|e| {
-            error::SendMessageSnafu {
-                err_msg: e.to_string(),
-            }
-            .build()
-        })
+    pub async fn send(&self, message: OutgoingMessage) -> Result<(), SendError<OutgoingMessage>> {
+        self.sender.send(message).await
    }
 }

--- a/src/common/meta/src/instruction.rs
+++ b/src/common/meta/src/instruction.rs
@@ -539,6 +539,8 @@ pub enum Instruction {
    GetFileRefs(GetFileRefs),
    /// Triggers garbage collection for a region.
    GcRegions(GcRegions),
+    /// Temporarily suspend serving reads or writes.
+    Suspend,
 }

 impl Instruction {
--- a/src/common/meta/src/key/table_info.rs
+++ b/src/common/meta/src/key/table_info.rs
@@ -94,7 +94,7 @@ impl TableInfoValue {
        }
    }

-    pub(crate) fn update(&self, new_table_info: RawTableInfo) -> Self {
+    pub fn update(&self, new_table_info: RawTableInfo) -> Self {
        Self {
            table_info: new_table_info,
            version: self.version + 1,
--- a/src/common/meta/src/rpc/ddl.rs
+++ b/src/common/meta/src/rpc/ddl.rs
@@ -23,19 +23,20 @@ use api::v1::alter_database_expr::Kind as PbAlterDatabaseKind;
 use api::v1::meta::ddl_task_request::Task;
 use api::v1::meta::{
    AlterDatabaseTask as PbAlterDatabaseTask, AlterTableTask as PbAlterTableTask,
-    AlterTableTasks as PbAlterTableTasks, CreateDatabaseTask as PbCreateDatabaseTask,
-    CreateFlowTask as PbCreateFlowTask, CreateTableTask as PbCreateTableTask,
-    CreateTableTasks as PbCreateTableTasks, CreateViewTask as PbCreateViewTask,
-    DdlTaskRequest as PbDdlTaskRequest, DdlTaskResponse as PbDdlTaskResponse,
-    DropDatabaseTask as PbDropDatabaseTask, DropFlowTask as PbDropFlowTask,
-    DropTableTask as PbDropTableTask, DropTableTasks as PbDropTableTasks,
-    DropViewTask as PbDropViewTask, Partition, ProcedureId,
+    AlterTableTasks as PbAlterTableTasks, CommentOnTask as PbCommentOnTask,
+    CreateDatabaseTask as PbCreateDatabaseTask, CreateFlowTask as PbCreateFlowTask,
+    CreateTableTask as PbCreateTableTask, CreateTableTasks as PbCreateTableTasks,
+    CreateViewTask as PbCreateViewTask, DdlTaskRequest as PbDdlTaskRequest,
+    DdlTaskResponse as PbDdlTaskResponse, DropDatabaseTask as PbDropDatabaseTask,
+    DropFlowTask as PbDropFlowTask, DropTableTask as PbDropTableTask,
+    DropTableTasks as PbDropTableTasks, DropViewTask as PbDropViewTask, Partition, ProcedureId,
    TruncateTableTask as PbTruncateTableTask,
 };
 use api::v1::{
-    AlterDatabaseExpr, AlterTableExpr, CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr,
-    CreateViewExpr, DropDatabaseExpr, DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval,
-    ExpireAfter, Option as PbOption, QueryContext as PbQueryContext, TruncateTableExpr,
+    AlterDatabaseExpr, AlterTableExpr, CommentObjectType as PbCommentObjectType, CommentOnExpr,
+    CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr, CreateViewExpr, DropDatabaseExpr,
+    DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval, ExpireAfter, Option as PbOption,
+    QueryContext as PbQueryContext, TruncateTableExpr,
 };
 use base64::Engine as _;
 use base64::engine::general_purpose;
@@ -78,6 +79,7 @@ pub enum DdlTask {
    DropView(DropViewTask),
    #[cfg(feature = "enterprise")]
    CreateTrigger(trigger::CreateTriggerTask),
+    CommentOn(CommentOnTask),
 }

 impl DdlTask {
@@ -200,6 +202,11 @@ impl DdlTask {
            view_info,
        })
    }
+
+    /// Wraps a [`CommentOnTask`] into the [`DdlTask::CommentOn`] variant.
+    pub fn new_comment_on(task: CommentOnTask) -> Self {
+        Self::CommentOn(task)
+    }
 }

 impl TryFrom<Task> for DdlTask {
@@ -278,6 +285,7 @@ impl TryFrom<Task> for DdlTask {
                    .fail()
                }
            }
+            Task::CommentOnTask(comment_on) => Ok(DdlTask::CommentOn(comment_on.try_into()?)),
        }
    }
 }
@@ -332,6 +340,7 @@ impl TryFrom<SubmitDdlTaskRequest> for PbDdlTaskRequest {
            DdlTask::CreateTrigger(task) => Task::CreateTriggerTask(task.try_into()?),
            #[cfg(feature = "enterprise")]
            DdlTask::DropTrigger(task) => Task::DropTriggerTask(task.into()),
+            DdlTask::CommentOn(task) => Task::CommentOnTask(task.into()),
        };

        Ok(Self {
@@ -1277,6 +1286,119 @@ impl From<DropFlowTask> for PbDropFlowTask {
    }
 }

+/// Identifies the object being commented on by its ID (a table or a flow).
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub enum CommentObjectId {
+    Table(TableId), // NOTE(review): presumably also used for column comments — confirm
+    Flow(FlowId), // ID of the flow being commented on
+}
+
+/// DDL task that sets a comment on a table, a column, or a flow.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct CommentOnTask {
+    pub catalog_name: String,
+    pub schema_name: String,
+    pub object_type: CommentObjectType,
+    pub object_name: String, // used as the table name by `table_ref`
+    /// Column name (only for Column comments); `None` when the protobuf field is empty.
+    pub column_name: Option<String>,
+    /// Object ID (Table or Flow) for validation and cache invalidation; absent from the proto.
+    pub object_id: Option<CommentObjectId>,
+    pub comment: Option<String>, // `None` when the protobuf comment string is empty
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub enum CommentObjectType {
+    Table, // decoded from protobuf value 0
+    Column, // decoded from protobuf value 1
+    Flow, // decoded from protobuf value 2
+}
+
+impl CommentOnTask {
+    pub fn table_ref(&self) -> TableReference<'_> {
+        TableReference {
+            catalog: &self.catalog_name,
+            schema: &self.schema_name,
+            table: &self.object_name, // NOTE(review): assumes the object name is a table name — confirm
+        }
+    }
+}
+
+// Maps the internal object type onto its protobuf counterpart, variant by variant.
+impl From<CommentObjectType> for PbCommentObjectType {
+    fn from(object_type: CommentObjectType) -> Self {
+        match object_type {
+            CommentObjectType::Table => Self::Table,
+            CommentObjectType::Column => Self::Column,
+            CommentObjectType::Flow => Self::Flow,
+        }
+    }
+}
+
+impl TryFrom<i32> for CommentObjectType {
+    type Error = error::Error;
+    /// Decodes a raw protobuf enum value: 0 is Table, 1 is Column, 2 is Flow, others fail.
+    fn try_from(value: i32) -> Result<Self> {
+        match value {
+            0 => Ok(Self::Table),
+            1 => Ok(Self::Column),
+            2 => Ok(Self::Flow),
+            unknown => {
+                let err_msg = format!(
+                    "Invalid CommentObjectType value: {}. Valid values are: 0 (Table), 1 (Column), 2 (Flow)",
+                    unknown
+                );
+                error::InvalidProtoMsgSnafu { err_msg }.fail()
+            }
+        }
+    }
+}
+
+// Decodes the protobuf `PbCommentOnTask` into a `CommentOnTask`.
+impl TryFrom<PbCommentOnTask> for CommentOnTask {
+    type Error = error::Error;
+
+    /// Empty `column_name`/`comment` strings map to `None`; `object_id` is left unset.
+    fn try_from(pb: PbCommentOnTask) -> Result<Self> {
+        let CommentOnExpr {
+            catalog_name,
+            schema_name,
+            object_type,
+            object_name,
+            column_name,
+            comment,
+        } = pb.comment_on.context(error::InvalidProtoMsgSnafu {
+            err_msg: "expected comment_on",
+        })?;
+        let non_empty = |text: String| Some(text).filter(|text| !text.is_empty());
+        Ok(Self {
+            catalog_name,
+            schema_name,
+            object_type: object_type.try_into()?,
+            object_name,
+            column_name: non_empty(column_name),
+            comment: non_empty(comment),
+            object_id: None,
+        })
+    }
+}
+
+impl From<CommentOnTask> for PbCommentOnTask {
+    /// Encodes the task as protobuf: `None` strings become empty and `object_id` is dropped.
+    fn from(task: CommentOnTask) -> Self {
+        Self {
+            comment_on: Some(CommentOnExpr {
+                object_type: PbCommentObjectType::from(task.object_type) as i32,
+                catalog_name: task.catalog_name,
+                schema_name: task.schema_name,
+                object_name: task.object_name,
+                column_name: task.column_name.unwrap_or_default(),
+                comment: task.comment.unwrap_or_default(),
+            }),
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct QueryContext {
    pub(crate) current_catalog: String,
--- a/src/common/meta/src/wal_options_allocator/topic_creator.rs
+++ b/src/common/meta/src/wal_options_allocator/topic_creator.rs
@@ -14,7 +14,7 @@

 use common_telemetry::{debug, error, info};
 use common_wal::config::kafka::common::{
-    DEFAULT_BACKOFF_CONFIG, KafkaConnectionConfig, KafkaTopicConfig,
+    DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT, KafkaConnectionConfig, KafkaTopicConfig,
 };
 use rskafka::client::error::Error as RsKafkaError;
 use rskafka::client::error::ProtocolError::TopicAlreadyExists;
@@ -205,11 +205,13 @@ impl KafkaTopicCreator {
        self.partition_client(topic).await.unwrap()
    }
 }
+
 /// Builds a kafka [Client](rskafka::client::Client).
 pub async fn build_kafka_client(connection: &KafkaConnectionConfig) -> Result<Client> {
    // Builds an kafka controller client for creating topics.
    let mut builder = ClientBuilder::new(connection.broker_endpoints.clone())
-        .backoff_config(DEFAULT_BACKOFF_CONFIG);
+        .backoff_config(DEFAULT_BACKOFF_CONFIG)
+        .connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
    if let Some(sasl) = &connection.sasl {
        builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
    };
--- a/src/common/procedure/src/local/runner.rs
+++ b/src/common/procedure/src/local/runner.rs
@@ -331,8 +331,29 @@ impl Runner {
                        }

                        match status {
-                            Status::Executing { .. } => {}
+                            Status::Executing { .. } => {
+                                let prev_state = self.meta.state();
+                                if !matches!(prev_state, ProcedureState::Running) {
+                                    info!(
+                                        "Set Procedure {}-{} state to running, prev_state: {:?}",
+                                        self.procedure.type_name(),
+                                        self.meta.id,
+                                        prev_state
+                                    );
+                                    self.meta.set_state(ProcedureState::Running);
+                                }
+                            }
                            Status::Suspended { subprocedures, .. } => {
+                                let prev_state = self.meta.state();
+                                if !matches!(prev_state, ProcedureState::Running) {
+                                    info!(
+                                        "Set Procedure {}-{} state to running, prev_state: {:?}",
+                                        self.procedure.type_name(),
+                                        self.meta.id,
+                                        prev_state
+                                    );
+                                    self.meta.set_state(ProcedureState::Running);
+                                }
                                self.on_suspended(subprocedures).await;
                            }
                            Status::Done { output } => {
@@ -393,8 +414,12 @@ impl Runner {
                            return;
                        }

-                        self.meta
-                            .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
+                        if self.procedure.rollback_supported() {
+                            self.meta
+                                .set_state(ProcedureState::prepare_rollback(Arc::new(e)));
+                        } else {
+                            self.meta.set_state(ProcedureState::failed(Arc::new(e)));
+                        }
                    }
                }
            }
@@ -1080,20 +1105,10 @@ mod tests {
        let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone());
        runner.manager_ctx.start();

-        runner.execute_once(&ctx).await;
-        let state = runner.meta.state();
-        assert!(state.is_prepare_rollback(), "{state:?}");
-
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_failed(), "{state:?}");
-        check_files(
-            &object_store,
-            &procedure_store,
-            ctx.procedure_id,
-            &["0000000000.rollback"],
-        )
-        .await;
+        check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await;
    }

    #[tokio::test]
@@ -1146,6 +1161,8 @@ mod tests {
            async move {
                if times == 1 {
                    Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
+                } else if times == 2 {
+                    Ok(Status::executing(false))
                } else {
                    Ok(Status::done())
                }
@@ -1172,6 +1189,10 @@ mod tests {
        let state = runner.meta.state();
        assert!(state.is_retrying(), "{state:?}");

+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(state.is_running(), "{state:?}");
+
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_done(), "{state:?}");
@@ -1185,6 +1206,86 @@ mod tests {
        .await;
    }

+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_execute_on_retry_later_error_with_child() {
+        common_telemetry::init_default_ut_logging();
+        let mut times = 0;
+        let child_id = ProcedureId::random();
+
+        let exec_fn = move |_| {
+            times += 1;
+            async move {
+                debug!("times: {}", times);
+                if times == 1 { // 1st call: ask the runner to retry later
+                    Err(Error::retry_later(MockError::new(StatusCode::Unexpected)))
+                } else if times == 2 { // 2nd call: suspend on a child procedure that always fails
+                    let exec_fn = |_| {
+                        async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }
+                            .boxed()
+                    };
+                    let fail = ProcedureAdapter {
+                        data: "fail".to_string(),
+                        lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
+                        poison_keys: PoisonKeys::default(),
+                        exec_fn,
+                        rollback_fn: None,
+                    };
+
+                    Ok(Status::Suspended {
+                        subprocedures: vec![ProcedureWithId {
+                            id: child_id,
+                            procedure: Box::new(fail),
+                        }],
+                        persist: true,
+                    })
+                } else { // any later call: report completion
+                    Ok(Status::done())
+                }
+            }
+            .boxed()
+        };
+
+        let retry_later = ProcedureAdapter {
+            data: "retry_later".to_string(),
+            lock_key: LockKey::single_exclusive("catalog.schema.table"),
+            poison_keys: PoisonKeys::default(),
+            exec_fn,
+            rollback_fn: None,
+        };
+
+        let dir = create_temp_dir("retry_later");
+        let meta = retry_later.new_meta(ROOT_ID);
+        let ctx = context_without_provider(meta.id);
+        let object_store = test_util::new_object_store(&dir);
+        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
+        let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone());
+        runner.manager_ctx.start();
+        debug!("execute_once 1");
+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(state.is_retrying(), "{state:?}"); // the retry-later error leaves it retrying
+
+        let moved_meta = meta.clone();
+        tokio::spawn(async move { // NOTE(review): presumably wakes the runner's wait on the child — confirm
+            moved_meta.child_notify.notify_one();
+        });
+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(state.is_running(), "{state:?}"); // suspending must move retrying back to running
+
+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(state.is_done(), "{state:?}");
+        assert!(meta.state().is_done());
+        check_files(
+            &object_store,
+            &procedure_store,
+            ctx.procedure_id,
+            &["0000000000.step", "0000000001.commit"],
+        )
+        .await;
+    }
+
    #[tokio::test]
    async fn test_execute_exceed_max_retry_later() {
        let exec_fn =
@@ -1304,7 +1405,7 @@ mod tests {
    async fn test_child_error() {
        let mut times = 0;
        let child_id = ProcedureId::random();
-
+        common_telemetry::init_default_ut_logging();
        let exec_fn = move |ctx: Context| {
            times += 1;
            async move {
@@ -1529,7 +1630,7 @@ mod tests {

        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
-        assert!(state.is_prepare_rollback(), "{state:?}");
+        assert!(state.is_failed(), "{state:?}");

        let procedure_id = runner
            .manager_ctx
@@ -1596,11 +1697,6 @@ mod tests {
        let state = runner.meta.state();
        assert!(state.is_running(), "{state:?}");

-        runner.execute_once(&ctx).await;
-        let state = runner.meta.state();
-        assert!(state.is_prepare_rollback(), "{state:?}");
-        assert!(meta.state().is_prepare_rollback());
-
        runner.execute_once(&ctx).await;
        let state = runner.meta.state();
        assert!(state.is_failed(), "{state:?}");
--- a/src/common/query/src/lib.rs
+++ b/src/common/query/src/lib.rs
@@ -46,6 +46,22 @@ pub enum OutputData {
    Stream(SendableRecordBatchStream),
 }

+impl OutputData {
+    /// Consumes the output and renders it as a human-readable string.
+    ///
+    /// If collecting or rendering record batches fails, the error message is returned instead.
+    pub async fn pretty_print(self) -> String {
+        let rendered = match self {
+            Self::AffectedRows(x) => return format!("Affected Rows: {x}"),
+            Self::RecordBatches(batches) => batches.pretty_print(),
+            Self::Stream(stream) => common_recordbatch::util::collect_batches(stream)
+                .await
+                .and_then(|batches| batches.pretty_print()),
+        };
+        rendered.unwrap_or_else(|err| err.to_string())
+    }
+}
+
 /// OutputMeta stores meta information produced/generated during the execution
 #[derive(Debug, Default)]
 pub struct OutputMeta {
--- a/src/common/stat/src/resource.rs
+++ b/src/common/stat/src/resource.rs
@@ -58,10 +58,14 @@ pub fn get_total_memory_bytes() -> i64 {
    }
 }

-/// Get the total CPU cores. The result will be rounded to the nearest integer.
-/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2.
+/// Get the total CPU cores. The result will be rounded up to the next integer (ceiling).
+/// For example, if the total CPU is 1.1 cores (1100 millicores) or 1.5 cores (1500 millicores), the result will be 2.
 pub fn get_total_cpu_cores() -> usize {
-    ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize
+    cpu_cores(get_total_cpu_millicores())
+}
+
+fn cpu_cores(cpu_millicores: i64) -> usize {
+    (cpu_millicores as f64 / 1_000.0).ceil() as usize // round up; `as` saturates negatives to 0
+}

 /// Get the total memory in readable size.
@@ -178,6 +182,13 @@ mod tests {
    #[test]
    fn test_get_total_cpu_cores() {
        assert!(get_total_cpu_cores() > 0);
+        assert_eq!(cpu_cores(1), 1);
+        assert_eq!(cpu_cores(100), 1);
+        assert_eq!(cpu_cores(500), 1);
+        assert_eq!(cpu_cores(1000), 1);
+        assert_eq!(cpu_cores(1100), 2);
+        assert_eq!(cpu_cores(1900), 2);
+        assert_eq!(cpu_cores(10_000), 10);
    }

    #[test]
--- a/src/common/wal/src/config/kafka/common.rs
+++ b/src/common/wal/src/config/kafka/common.rs
@@ -36,6 +36,9 @@ pub const DEFAULT_BACKOFF_CONFIG: BackoffConfig = BackoffConfig {
    deadline: Some(Duration::from_secs(3)),
 };

+/// Default connect timeout (10 seconds) for the Kafka client.
+pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
+
 /// Default interval for auto WAL pruning.
 pub const DEFAULT_AUTO_PRUNE_INTERVAL: Duration = Duration::from_mins(30);
 /// Default limit for concurrent auto pruning tasks.
--- a/src/datanode/src/datanode.rs
+++ b/src/datanode/src/datanode.rs
@@ -22,6 +22,7 @@ use common_base::Plugins;
 use common_error::ext::BoxedError;
 use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
 use common_meta::cache::{LayeredCacheRegistry, SchemaCacheRef, TableSchemaCacheRef};
+use common_meta::cache_invalidator::CacheInvalidatorRef;
 use common_meta::datanode::TopicStatsReporter;
 use common_meta::key::runtime_switch::RuntimeSwitchManager;
 use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef};
@@ -281,21 +282,11 @@ impl DatanodeBuilder {
            open_all_regions.await?;
        }

-        let mut resource_stat = ResourceStatImpl::default();
-        resource_stat.start_collect_cpu_usage();
-
        let heartbeat_task = if let Some(meta_client) = meta_client {
-            Some(
-                HeartbeatTask::try_new(
-                    &self.opts,
-                    region_server.clone(),
-                    meta_client,
-                    cache_registry,
-                    self.plugins.clone(),
-                    Arc::new(resource_stat),
-                )
-                .await?,
-            )
+            let task = self
+                .create_heartbeat_task(&region_server, meta_client, cache_registry)
+                .await?;
+            Some(task)
        } else {
            None
        };
@@ -324,6 +315,29 @@ impl DatanodeBuilder {
        })
    }

+    /// Builds the [`HeartbeatTask`] from this builder's options and plugins.
+    /// The task receives a fresh resource stat whose CPU usage collection is already started.
+    async fn create_heartbeat_task(
+        &self,
+        region_server: &RegionServer,
+        meta_client: MetaClientRef,
+        cache_invalidator: CacheInvalidatorRef,
+    ) -> Result<HeartbeatTask> {
+        let mut resource_stat = ResourceStatImpl::default();
+        resource_stat.start_collect_cpu_usage();
+        let resource_stat = Arc::new(resource_stat);
+
+        HeartbeatTask::try_new(
+            &self.opts,
+            region_server.clone(),
+            meta_client,
+            cache_invalidator,
+            self.plugins.clone(),
+            resource_stat,
+        )
+        .await
+    }
+
    /// Builds [ObjectStoreManager] from [StorageConfig].
    pub async fn build_object_store_manager(cfg: &StorageConfig) -> Result<ObjectStoreManagerRef> {
        let object_store = store::new_object_store(cfg.store.clone(), &cfg.data_home).await?;
--- a/src/datanode/src/error.rs
+++ b/src/datanode/src/error.rs
@@ -410,14 +410,6 @@ pub enum Error {
        location: Location,
    },

-    #[snafu(display("Failed to build cache store"))]
-    BuildCacheStore {
-        #[snafu(source)]
-        error: object_store::Error,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
    #[snafu(display("Not yet implemented: {what}"))]
    NotYetImplemented { what: String },
 }
@@ -493,7 +485,6 @@ impl ErrorExt for Error {
            SerializeJson { .. } => StatusCode::Internal,

            ObjectStore { source, .. } => source.status_code(),
-            BuildCacheStore { .. } => StatusCode::StorageUnavailable,
        }
    }

--- a/src/datanode/src/heartbeat.rs
+++ b/src/datanode/src/heartbeat.rs
@@ -25,6 +25,7 @@ use common_meta::datanode::REGION_STATISTIC_KEY;
 use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
 use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
 use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
+use common_meta::heartbeat::handler::suspend::SuspendHandler;
 use common_meta::heartbeat::handler::{
    HandlerGroupExecutor, HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
 };
@@ -91,6 +92,7 @@ impl HeartbeatTask {
        let resp_handler_executor = Arc::new(HandlerGroupExecutor::new(vec![
            region_alive_keeper.clone(),
            Arc::new(ParseMailboxMessageHandler),
+            Arc::new(SuspendHandler::new(region_server.suspend_state())),
            Arc::new(
                RegionHeartbeatResponseHandler::new(region_server.clone())
                    .with_open_region_parallelism(opts.init_regions_parallelism),
--- a/src/datanode/src/heartbeat/handler.rs
+++ b/src/datanode/src/heartbeat/handler.rs
@@ -99,26 +99,30 @@ impl RegionHeartbeatResponseHandler {
        self
    }

-    fn build_handler(&self, instruction: &Instruction) -> MetaResult<Box<InstructionHandlers>> {
+    fn build_handler(
+        &self,
+        instruction: &Instruction,
+    ) -> MetaResult<Option<Box<InstructionHandlers>>> {
        match instruction {
-            Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler.into())),
-            Instruction::OpenRegions(_) => Ok(Box::new(
+            Instruction::CloseRegions(_) => Ok(Some(Box::new(CloseRegionsHandler.into()))),
+            Instruction::OpenRegions(_) => Ok(Some(Box::new(
                OpenRegionsHandler {
                    open_region_parallelism: self.open_region_parallelism,
                }
                .into(),
-            )),
-            Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())),
-            Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())),
-            Instruction::UpgradeRegions(_) => Ok(Box::new(
+            ))),
+            Instruction::FlushRegions(_) => Ok(Some(Box::new(FlushRegionsHandler.into()))),
+            Instruction::DowngradeRegions(_) => Ok(Some(Box::new(DowngradeRegionsHandler.into()))),
+            Instruction::UpgradeRegions(_) => Ok(Some(Box::new(
                UpgradeRegionsHandler {
                    upgrade_region_parallelism: self.open_region_parallelism,
                }
                .into(),
-            )),
-            Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())),
-            Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())),
+            ))),
+            Instruction::GetFileRefs(_) => Ok(Some(Box::new(GetFileRefsHandler.into()))),
+            Instruction::GcRegions(_) => Ok(Some(Box::new(GcRegionsHandler.into()))),
            Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
+            Instruction::Suspend => Ok(None),
        }
    }
 }
@@ -216,30 +220,24 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
            .context(InvalidHeartbeatResponseSnafu)?;

        let mailbox = ctx.mailbox.clone();
-        let region_server = self.region_server.clone();
-        let downgrade_tasks = self.downgrade_tasks.clone();
-        let flush_tasks = self.flush_tasks.clone();
-        let gc_tasks = self.gc_tasks.clone();
-        let handler = self.build_handler(&instruction)?;
-        let _handle = common_runtime::spawn_global(async move {
-            let reply = handler
-                .handle(
-                    &HandlerContext {
-                        region_server,
-                        downgrade_tasks,
-                        flush_tasks,
-                        gc_tasks,
-                    },
-                    instruction,
-                )
-                .await;
-
-            if let Some(reply) = reply
-                && let Err(e) = mailbox.send((meta, reply)).await
-            {
-                error!(e; "Failed to send reply to mailbox");
-            }
-        });
+        if let Some(handler) = self.build_handler(&instruction)? {
+            let context = HandlerContext {
+                region_server: self.region_server.clone(),
+                downgrade_tasks: self.downgrade_tasks.clone(),
+                flush_tasks: self.flush_tasks.clone(),
+                gc_tasks: self.gc_tasks.clone(),
+            };
+            let _handle = common_runtime::spawn_global(async move {
+                let reply = handler.handle(&context, instruction).await;
+                if let Some(reply) = reply
+                    && let Err(e) = mailbox.send((meta, reply)).await
+                {
+                    let error = e.to_string();
+                    let (meta, reply) = e.0;
+                    error!("Failed to send reply {reply} to {meta:?}: {error}");
+                }
+            });
+        }

        Ok(HandleControl::Continue)
    }
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -17,6 +17,7 @@ mod catalog;
 use std::collections::HashMap;
 use std::fmt::Debug;
 use std::ops::Deref;
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, RwLock};
 use std::time::Duration;

@@ -52,7 +53,9 @@ pub use query::dummy_catalog::{
    DummyCatalogList, DummyTableProviderFactory, TableProviderFactoryRef,
 };
 use serde_json;
-use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult};
+use servers::error::{
+    self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult, SuspendedSnafu,
+};
 use servers::grpc::FlightCompression;
 use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream};
 use servers::grpc::region_server::RegionServerHandler;
@@ -89,6 +92,7 @@ use crate::region_server::catalog::{NameAwareCatalogList, NameAwareDataSourceInj
 pub struct RegionServer {
    inner: Arc<RegionServerInner>,
    flight_compression: FlightCompression,
+    suspend: Arc<AtomicBool>,
 }

 pub struct RegionStat {
@@ -136,6 +140,7 @@ impl RegionServer {
                ),
            )),
            flight_compression,
+            suspend: Arc::new(AtomicBool::new(false)),
        }
    }

@@ -595,6 +600,14 @@ impl RegionServer {
            .handle_sync_region(engine_with_status.engine(), region_id, manifest_info)
            .await
    }
+
+    fn is_suspended(&self) -> bool {
+        self.suspend.load(Ordering::Relaxed) // same flag that `suspend_state()` hands out
+    }
+
+    pub(crate) fn suspend_state(&self) -> Arc<AtomicBool> {
+        Arc::clone(&self.suspend) // hands out the shared flag itself, not a snapshot
+    }
 }

 #[async_trait]
@@ -644,6 +657,8 @@ impl FlightCraft for RegionServer {
        &self,
        request: Request<Ticket>,
    ) -> TonicResult<Response<TonicStream<FlightData>>> {
+        ensure!(!self.is_suspended(), SuspendedSnafu);
+
        let ticket = request.into_inner().ticket;
        let request = api::v1::region::QueryRequest::decode(ticket.as_ref())
            .context(servers_error::InvalidFlightTicketSnafu)?;
@@ -1261,7 +1276,6 @@ impl RegionServerInner {
            .with_context(|_| HandleRegionRequestSnafu { region_id })?
            .new_opened_logical_region_ids()
        else {
-            warn!("No new opened logical regions");
            return Ok(());
        };

--- a/src/datanode/src/store.rs
+++ b/src/datanode/src/store.rs
@@ -14,15 +14,10 @@

 //! object storage utilities

-use std::sync::Arc;
-
-use common_telemetry::info;
-use object_store::config::ObjectStorageCacheConfig;
+use common_telemetry::{info, warn};
 use object_store::factory::new_raw_object_store;
-use object_store::layers::LruCacheLayer;
-use object_store::services::Fs;
 use object_store::util::{clean_temp_dir, join_dir, with_instrument_layers, with_retry_layers};
-use object_store::{ATOMIC_WRITE_DIR, Access, ObjectStore, ObjectStoreBuilder};
+use object_store::{ATOMIC_WRITE_DIR, ObjectStore};
 use snafu::prelude::*;

 use crate::config::ObjectStoreConfig;
@@ -47,23 +42,58 @@ pub(crate) async fn new_object_store_without_cache(
    Ok(object_store)
 }

+/// Removes the on-disk leftovers of the retired LRU read cache.
+///
+/// This is a no-op unless `store` is an object storage whose cache config has the read cache
+/// enabled. The cache base directory is `cache_path`, or `data_home` when that is empty.
+/// Cleanup failures are only logged as warnings.
+fn clean_old_read_cache(store: &ObjectStoreConfig, data_home: &str) {
+    if !store.is_object_storage() {
+        return;
+    }
+
+    // Only cleans if read cache was enabled
+    let cache_config = match store.cache_config() {
+        Some(config) if config.enable_read_cache => config,
+        _ => return,
+    };
+
+    let cache_base_dir = match cache_config.cache_path.as_str() {
+        "" => data_home,
+        configured => configured,
+    };
+
+    // Cleans up the old read cache directory
+    let old_read_cache_dir = join_dir(cache_base_dir, "cache/object/read");
+    info!(
+        "Cleaning up old read cache directory: {}",
+        old_read_cache_dir
+    );
+    if let Err(e) = clean_temp_dir(&old_read_cache_dir) {
+        warn!(e; "Failed to clean old read cache directory {}", old_read_cache_dir);
+    }
+
+    // Cleans up the atomic temp dir used by the cache layer
+    let cache_atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR);
+    info!(
+        "Cleaning up old cache atomic temp directory: {}",
+        cache_atomic_temp_dir
+    );
+    if let Err(e) = clean_temp_dir(&cache_atomic_temp_dir) {
+        warn!(e; "Failed to clean old cache atomic temp directory {}", cache_atomic_temp_dir);
+    }
+}
+
 pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Result<ObjectStore> {
+    // Cleans up old LRU read cache directories.
+    // TODO: Remove this line after the 1.0 release.
+    clean_old_read_cache(&store, data_home);
+
    let object_store = new_raw_object_store(&store, data_home)
        .await
        .context(error::ObjectStoreSnafu)?;
-    // Enable retry layer and cache layer for non-fs object storages
+    // Enables retry layer for non-fs object storages
    let object_store = if store.is_object_storage() {
-        let object_store = {
-            // It's safe to unwrap here because we already checked above.
-            let cache_config = store.cache_config().unwrap();
-            if let Some(cache_layer) = build_cache_layer(cache_config, data_home).await? {
-                // Adds cache layer
-                object_store.layer(cache_layer)
-            } else {
-                object_store
-            }
-        };
-
        // Adds retry layer
        with_retry_layers(object_store)
    } else {
@@ -73,40 +103,3 @@ pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Resu
    let object_store = with_instrument_layers(object_store, true);
    Ok(object_store)
 }
-
-async fn build_cache_layer(
-    cache_config: &ObjectStorageCacheConfig,
-    data_home: &str,
-) -> Result<Option<LruCacheLayer<impl Access>>> {
-    // No need to build cache layer if read cache is disabled.
-    if !cache_config.enable_read_cache {
-        return Ok(None);
-    }
-    let cache_base_dir = if cache_config.cache_path.is_empty() {
-        data_home
-    } else {
-        &cache_config.cache_path
-    };
-    let atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR);
-    clean_temp_dir(&atomic_temp_dir).context(error::ObjectStoreSnafu)?;
-
-    let cache_store = Fs::default()
-        .root(cache_base_dir)
-        .atomic_write_dir(&atomic_temp_dir)
-        .build()
-        .context(error::BuildCacheStoreSnafu)?;
-
-    let cache_layer = LruCacheLayer::new(
-        Arc::new(cache_store),
-        cache_config.cache_capacity.0 as usize,
-    )
-    .context(error::BuildCacheStoreSnafu)?;
-    cache_layer.recover_cache(false).await;
-
-    info!(
-        "Enabled local object storage cache, path: {}, capacity: {}.",
-        cache_config.cache_path, cache_config.cache_capacity
-    );
-
-    Ok(Some(cache_layer))
-}
--- a/src/datanode/src/tests.rs
+++ b/src/datanode/src/tests.rs
@@ -24,8 +24,8 @@ use common_query::Output;
 use common_runtime::Runtime;
 use common_runtime::runtime::{BuilderBuild, RuntimeTrait};
 use datafusion::catalog::TableFunction;
+use datafusion::dataframe::DataFrame;
 use datafusion_expr::{AggregateUDF, LogicalPlan};
-use query::dataframe::DataFrame;
 use query::planner::LogicalPlanner;
 use query::query_engine::{DescribeResult, QueryEngineState};
 use query::{QueryEngine, QueryEngineContext};
@@ -33,9 +33,9 @@ use servers::grpc::FlightCompression;
 use session::context::QueryContextRef;
 use store_api::metadata::RegionMetadataRef;
 use store_api::region_engine::{
-    RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
-    RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
-    SettableRegionRoleState, SyncManifestResponse,
+    CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
+    RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
+    SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
 };
 use store_api::region_request::{AffectedRows, RegionRequest};
 use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -299,6 +299,14 @@ impl RegionEngine for MockRegionEngine {
        unimplemented!()
    }

+    /// Mock implementation: copying a region is not provided by this test engine,
+    /// so any call panics through `unimplemented!()`.
+    async fn copy_region_from(
+        &self,
+        _region_id: RegionId,
+        _request: CopyRegionFromRequest,
+    ) -> Result<CopyRegionFromResponse, BoxedError> {
+        unimplemented!()
+    }
+
    fn as_any(&self) -> &dyn Any {
        self
    }
--- a/src/datatypes/src/schema.rs
+++ b/src/datatypes/src/schema.rs
@@ -33,7 +33,8 @@ pub use crate::schema::column_schema::{
    COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
    COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
    FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
-    SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY,
+    SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, VECTOR_INDEX_KEY,
+    VectorDistanceMetric, VectorIndexEngineType, VectorIndexOptions,
 };
 pub use crate::schema::constraint::ColumnDefaultConstraint;
 pub use crate::schema::raw::RawSchema;
--- a/src/datatypes/src/schema/column_schema.rs
+++ b/src/datatypes/src/schema/column_schema.rs
@@ -46,6 +46,8 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
 pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
 /// Key used to store skip options in arrow field's metadata.
 pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
+/// Key used to store vector index options in arrow field's metadata.
+pub const VECTOR_INDEX_KEY: &str = "greptime:vector_index";

 /// Keys used in fulltext options
 pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
@@ -216,6 +218,53 @@ impl ColumnSchema {
        self.metadata.contains_key(INVERTED_INDEX_KEY)
    }

+    /// Reports whether this column carries vector index options.
+    ///
+    /// Options that fail to deserialize are logged as a warning and the column is
+    /// reported as not vector-indexed.
+    pub fn is_vector_indexed(&self) -> bool {
+        self.vector_index_options()
+            .map(|options| options.is_some())
+            .unwrap_or_else(|err| {
+                common_telemetry::warn!(
+                    "Failed to deserialize vector_index_options for column '{}': {}",
+                    self.name,
+                    err
+                );
+                false
+            })
+    }
+
+    /// Reads the vector index options stored under [`VECTOR_INDEX_KEY`].
+    ///
+    /// Returns `Ok(None)` when the key is absent, and an error when the stored JSON
+    /// cannot be deserialized.
+    pub fn vector_index_options(&self) -> Result<Option<VectorIndexOptions>> {
+        self.metadata
+            .get(VECTOR_INDEX_KEY)
+            .map(|json| serde_json::from_str(json).context(error::DeserializeSnafu { json }))
+            .transpose()
+    }
+
+    /// Stores `options` as JSON under [`VECTOR_INDEX_KEY`], replacing any previous value.
+    ///
+    /// Returns an error, leaving the metadata untouched, if serialization fails.
+    pub fn set_vector_index_options(&mut self, options: &VectorIndexOptions) -> Result<()> {
+        let encoded = serde_json::to_string(options).context(error::SerializeSnafu)?;
+        self.metadata.insert(VECTOR_INDEX_KEY.to_string(), encoded);
+        Ok(())
+    }
+
+    /// Removes the vector index options.
+    ///
+    /// Drops the [`VECTOR_INDEX_KEY`] entry from the column metadata; a missing entry
+    /// is left as is.
+    pub fn unset_vector_index_options(&mut self) {
+        self.metadata.remove(VECTOR_INDEX_KEY);
+    }
+
+    /// Builder-style variant of [`Self::set_vector_index_options`]: stores `options`
+    /// and hands the column schema back, or returns the serialization error.
+    pub fn with_vector_index_options(mut self, options: &VectorIndexOptions) -> Result<Self> {
+        self.set_vector_index_options(options).map(|()| self)
+    }
+
    /// Set default constraint.
    ///
    /// If a default constraint exists for the column, this method will
@@ -964,6 +1013,181 @@ impl TryFrom<HashMap<String, String>> for SkippingIndexOptions {
    }
 }

+/// Distance metric for vector similarity search.
+///
+/// Because of `rename_all = "lowercase"`, serde writes the variants as `l2sq`,
+/// `cosine` and `innerproduct`; `ip` is additionally accepted as an alias for
+/// `InnerProduct` when deserializing.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default, Visit, VisitMut)]
+#[serde(rename_all = "lowercase")]
+pub enum VectorDistanceMetric {
+    /// Squared Euclidean distance (L2^2).
+    #[default]
+    L2sq,
+    /// Cosine distance (1 - cosine similarity).
+    Cosine,
+    /// Inner product (negative, for maximum inner product search).
+    #[serde(alias = "ip")]
+    InnerProduct,
+}
+
+impl fmt::Display for VectorDistanceMetric {
+    /// Writes the short metric name: `l2sq`, `cosine` or `ip`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let label = match self {
+            VectorDistanceMetric::L2sq => "l2sq",
+            VectorDistanceMetric::Cosine => "cosine",
+            VectorDistanceMetric::InnerProduct => "ip",
+        };
+        f.write_str(label)
+    }
+}
+
+impl std::str::FromStr for VectorDistanceMetric {
+    type Err = String;
+
+    /// Parses a metric name, ignoring case.
+    ///
+    /// Accepted spellings:
+    /// - `l2sq`, `l2`, `euclidean` for [`VectorDistanceMetric::L2sq`]
+    /// - `cosine`, `cos` for [`VectorDistanceMetric::Cosine`]
+    /// - `inner_product`, `innerproduct`, `ip`, `dot` for
+    ///   [`VectorDistanceMetric::InnerProduct`]
+    ///
+    /// Any other input yields a descriptive error message.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "l2sq" | "l2" | "euclidean" => Ok(VectorDistanceMetric::L2sq),
+            "cosine" | "cos" => Ok(VectorDistanceMetric::Cosine),
+            // `innerproduct` is the name serde emits for this variant
+            // (`rename_all = "lowercase"`), so it must parse back as well.
+            "inner_product" | "innerproduct" | "ip" | "dot" => {
+                Ok(VectorDistanceMetric::InnerProduct)
+            }
+            _ => Err(format!(
+                "Unknown distance metric: {}. Expected: l2sq, cosine, or ip",
+                s
+            )),
+        }
+    }
+}
+
+impl VectorDistanceMetric {
+    /// Encodes the metric as the single byte stored in the index blob.
+    pub fn as_u8(&self) -> u8 {
+        match *self {
+            VectorDistanceMetric::L2sq => 0,
+            VectorDistanceMetric::Cosine => 1,
+            VectorDistanceMetric::InnerProduct => 2,
+        }
+    }
+
+    /// Decodes a metric from its blob byte; unknown bytes yield `None`.
+    pub fn try_from_u8(v: u8) -> Option<Self> {
+        [Self::L2sq, Self::Cosine, Self::InnerProduct]
+            .iter()
+            .copied()
+            .find(|metric| metric.as_u8() == v)
+    }
+}
+
+/// Default HNSW connectivity parameter.
+const DEFAULT_VECTOR_INDEX_CONNECTIVITY: u32 = 16;
+/// Default expansion factor during index construction.
+const DEFAULT_VECTOR_INDEX_EXPANSION_ADD: u32 = 128;
+/// Default expansion factor during search.
+const DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH: u32 = 64;
+
+/// Serde default provider for `VectorIndexOptions::connectivity`.
+fn default_vector_index_connectivity() -> u32 {
+    DEFAULT_VECTOR_INDEX_CONNECTIVITY
+}
+
+/// Serde default provider for `VectorIndexOptions::expansion_add`.
+fn default_vector_index_expansion_add() -> u32 {
+    DEFAULT_VECTOR_INDEX_EXPANSION_ADD
+}
+
+/// Serde default provider for `VectorIndexOptions::expansion_search`.
+fn default_vector_index_expansion_search() -> u32 {
+    DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH
+}
+
+/// Supported vector index engine types.
+///
+/// Serialized by serde with lowercase variant names (`usearch`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, Visit, VisitMut)]
+#[serde(rename_all = "lowercase")]
+pub enum VectorIndexEngineType {
+    /// USearch HNSW implementation.
+    #[default]
+    Usearch,
+    // Future: Vsag,
+}
+
+impl VectorIndexEngineType {
+    /// Encodes the engine type as the single byte stored in the index blob.
+    pub fn as_u8(&self) -> u8 {
+        match *self {
+            VectorIndexEngineType::Usearch => 0,
+        }
+    }
+
+    /// Decodes an engine type from its blob byte; unknown bytes yield `None`.
+    pub fn try_from_u8(v: u8) -> Option<Self> {
+        if v == 0 { Some(Self::Usearch) } else { None }
+    }
+}
+
+impl fmt::Display for VectorIndexEngineType {
+    /// Writes the lowercase engine name.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let label = match self {
+            VectorIndexEngineType::Usearch => "usearch",
+        };
+        f.write_str(label)
+    }
+}
+
+impl std::str::FromStr for VectorIndexEngineType {
+    type Err = String;
+
+    /// Parses an engine name, ignoring case; only `usearch` is recognized.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        if s.to_lowercase() == "usearch" {
+            return Ok(Self::Usearch);
+        }
+        Err(format!(
+            "Unknown vector index engine: {}. Expected: usearch",
+            s
+        ))
+    }
+}
+
+/// Options for vector index (HNSW).
+///
+/// Serialized with kebab-case field names (e.g. `expansion-add`); every field
+/// falls back to its default when missing from the serialized form.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
+#[serde(rename_all = "kebab-case")]
+pub struct VectorIndexOptions {
+    /// Vector index engine type (default: usearch).
+    #[serde(default)]
+    pub engine: VectorIndexEngineType,
+    /// Distance metric for similarity search.
+    #[serde(default)]
+    pub metric: VectorDistanceMetric,
+    /// HNSW connectivity parameter (M in the paper).
+    /// Higher values improve recall but increase memory usage.
+    #[serde(default = "default_vector_index_connectivity")]
+    pub connectivity: u32,
+    /// Expansion factor during index construction (ef_construction).
+    /// Higher values improve index quality but slow down construction.
+    #[serde(default = "default_vector_index_expansion_add")]
+    pub expansion_add: u32,
+    /// Expansion factor during search (ef_search).
+    /// Higher values improve recall but slow down search.
+    #[serde(default = "default_vector_index_expansion_search")]
+    pub expansion_search: u32,
+}
+
+impl Default for VectorIndexOptions {
+    /// Builds options with the default engine and metric and the same HNSW
+    /// parameters that serde falls back to for missing fields.
+    fn default() -> Self {
+        let engine = VectorIndexEngineType::default();
+        let metric = VectorDistanceMetric::default();
+        Self {
+            engine,
+            metric,
+            connectivity: default_vector_index_connectivity(),
+            expansion_add: default_vector_index_expansion_add(),
+            expansion_search: default_vector_index_expansion_search(),
+        }
+    }
+}
+
+impl fmt::Display for VectorIndexOptions {
+    /// Renders all options as a comma-separated `key=value` list.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self {
+            engine,
+            metric,
+            connectivity,
+            expansion_add,
+            expansion_search,
+        } = self;
+        write!(
+            f,
+            "engine={}, metric={}, connectivity={}, expansion_add={}, expansion_search={}",
+            engine, metric, connectivity, expansion_add, expansion_search
+        )
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::sync::Arc;
--- a/src/file-engine/src/engine.rs
+++ b/src/file-engine/src/engine.rs
@@ -26,10 +26,10 @@ use object_store::ObjectStore;
 use snafu::{OptionExt, ensure};
 use store_api::metadata::RegionMetadataRef;
 use store_api::region_engine::{
-    RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
-    RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
-    SetRegionRoleStateSuccess, SettableRegionRoleState, SinglePartitionScanner,
-    SyncManifestResponse,
+    CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
+    RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
+    SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
+    SinglePartitionScanner, SyncManifestResponse,
 };
 use store_api::region_request::{
    AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
@@ -163,6 +163,19 @@ impl RegionEngine for FileRegionEngine {
        ))
    }

+    /// Copying regions is not supported by the file engine; always returns an
+    /// `Unsupported` error naming the operation.
+    async fn copy_region_from(
+        &self,
+        _region_id: RegionId,
+        _request: CopyRegionFromRequest,
+    ) -> Result<CopyRegionFromResponse, BoxedError> {
+        let unsupported = UnsupportedSnafu {
+            operation: "copy_region_from",
+        }
+        .build();
+        Err(BoxedError::new(unsupported))
+    }
+
    fn role(&self, region_id: RegionId) -> Option<RegionRole> {
        self.inner.state(region_id)
    }
--- a/src/flow/src/batching_mode/frontend_client.rs
+++ b/src/flow/src/batching_mode/frontend_client.rs
@@ -110,6 +110,26 @@ impl FrontendClient {
        )
    }

+    /// Reports whether the frontend client is ready to use.
+    ///
+    /// A distributed client always counts as initialized. A standalone client is
+    /// initialized once its database client has been set; if the lock guarding it
+    /// cannot be acquired, `false` is returned.
+    pub fn is_initialized(&self) -> bool {
+        match self {
+            FrontendClient::Distributed { .. } => true,
+            FrontendClient::Standalone {
+                database_client, ..
+            } => database_client
+                .lock()
+                .map(|client| client.is_some())
+                .unwrap_or(false),
+        }
+    }
+
    pub fn from_meta_client(
        meta_client: Arc<MetaClient>,
        auth: Option<FlowAuthHeader>,
--- a/src/frontend/Cargo.toml
+++ b/src/frontend/Cargo.toml
@@ -17,6 +17,7 @@ arc-swap = "1.0"
 async-stream.workspace = true
 async-trait.workspace = true
 auth.workspace = true
+axum.workspace = true
 bytes.workspace = true
 cache.workspace = true
 catalog.workspace = true
@@ -85,6 +86,9 @@ common-test-util.workspace = true
 datanode.workspace = true
 datatypes.workspace = true
 futures.workspace = true
+hyper-util = { workspace = true, features = ["tokio"] }
+meta-srv.workspace = true
+reqwest.workspace = true
 serde_json.workspace = true
 strfmt = "0.2"
 tower.workspace = true
--- a/src/frontend/src/error.rs
+++ b/src/frontend/src/error.rs
@@ -364,6 +364,12 @@ pub enum Error {
        #[snafu(implicit)]
        location: Location,
    },
+
+    #[snafu(display("Service suspended"))]
+    Suspended {
+        #[snafu(implicit)]
+        location: Location,
+    },
 }

 pub type Result<T> = std::result::Result<T, Error>;
@@ -444,6 +450,8 @@ impl ErrorExt for Error {
            Error::StatementTimeout { .. } => StatusCode::Cancelled,

            Error::AcquireLimiter { .. } => StatusCode::Internal,
+
+            Error::Suspended { .. } => StatusCode::Suspended,
        }
    }

--- a/src/frontend/src/frontend.rs
+++ b/src/frontend/src/frontend.rs
@@ -141,7 +141,43 @@ impl Frontend {

 #[cfg(test)]
 mod tests {
+    use std::sync::atomic::{AtomicBool, Ordering};
+    use std::time::Duration;
+
+    use api::v1::meta::heartbeat_server::HeartbeatServer;
+    use api::v1::meta::mailbox_message::Payload;
+    use api::v1::meta::{
+        AskLeaderRequest, AskLeaderResponse, HeartbeatRequest, HeartbeatResponse, MailboxMessage,
+        Peer, ResponseHeader, Role, heartbeat_server,
+    };
+    use async_trait::async_trait;
+    use client::{Client, Database};
+    use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+    use common_error::ext::ErrorExt;
+    use common_error::from_header_to_err_code_msg;
+    use common_error::status_code::StatusCode;
+    use common_grpc::channel_manager::ChannelManager;
+    use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS;
+    use common_meta::heartbeat::handler::HandlerGroupExecutor;
+    use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
+    use common_meta::heartbeat::handler::suspend::SuspendHandler;
+    use common_meta::instruction::Instruction;
+    use common_stat::ResourceStatImpl;
+    use meta_client::MetaClientRef;
+    use meta_client::client::MetaClientBuilder;
+    use meta_srv::service::GrpcStream;
+    use servers::grpc::{FlightCompression, GRPC_SERVER};
+    use servers::http::HTTP_SERVER;
+    use servers::http::result::greptime_result_v1::GreptimedbV1Response;
+    use tokio::sync::mpsc;
+    use tonic::codec::CompressionEncoding;
+    use tonic::codegen::tokio_stream::StreamExt;
+    use tonic::codegen::tokio_stream::wrappers::ReceiverStream;
+    use tonic::{Request, Response, Status, Streaming};
+
    use super::*;
+    use crate::instance::builder::FrontendBuilder;
+    use crate::server::Services;

    #[test]
    fn test_toml() {
@@ -149,4 +185,277 @@ mod tests {
        let toml_string = toml::to_string(&opts).unwrap();
        let _parsed: FrontendOptions = toml::from_str(&toml_string).unwrap();
    }
+
+    /// Test heartbeat service whose responses can be switched to carry a
+    /// "suspend" instruction.
+    struct SuspendableHeartbeatServer {
+        /// While `true`, every heartbeat response includes `Instruction::Suspend`.
+        suspend: Arc<AtomicBool>,
+    }
+
+    #[async_trait]
+    impl heartbeat_server::Heartbeat for SuspendableHeartbeatServer {
+        type HeartbeatStream = GrpcStream<HeartbeatResponse>;
+
+        /// Answers every incoming heartbeat request with a success response.
+        ///
+        /// When the `suspend` flag is set, the response additionally carries a mailbox
+        /// message whose JSON payload is `Instruction::Suspend`.
+        async fn heartbeat(
+            &self,
+            request: Request<Streaming<HeartbeatRequest>>,
+        ) -> std::result::Result<Response<Self::HeartbeatStream>, Status> {
+            let (tx, rx) = mpsc::channel(4);
+
+            // Serve the request stream in a background task; responses flow back
+            // through the channel wrapped in the returned stream.
+            common_runtime::spawn_global({
+                let mut requests = request.into_inner();
+                let suspend = self.suspend.clone();
+                async move {
+                    while let Some(request) = requests.next().await {
+                        // Forward a stream error to the client and stop serving.
+                        if let Err(e) = request {
+                            let _ = tx.send(Err(e)).await;
+                            return;
+                        }
+
+                        // The flag is re-read for every request, so toggling it takes
+                        // effect on the next heartbeat.
+                        let mailbox_message =
+                            suspend.load(Ordering::Relaxed).then(|| MailboxMessage {
+                                payload: Some(Payload::Json(
+                                    serde_json::to_string(&Instruction::Suspend).unwrap(),
+                                )),
+                                ..Default::default()
+                            });
+                        let response = HeartbeatResponse {
+                            header: Some(ResponseHeader::success()),
+                            mailbox_message,
+                            ..Default::default()
+                        };
+
+                        let _ = tx.send(Ok(response)).await;
+                    }
+                }
+            });
+
+            Ok(Response::new(Box::pin(ReceiverStream::new(rx))))
+        }
+
+        /// Always reports a leader peer at the fixed address `localhost:0`.
+        async fn ask_leader(
+            &self,
+            _: Request<AskLeaderRequest>,
+        ) -> std::result::Result<Response<AskLeaderResponse>, Status> {
+            Ok(Response::new(AskLeaderResponse {
+                header: Some(ResponseHeader::success()),
+                leader: Some(Peer {
+                    addr: "localhost:0".to_string(),
+                    ..Default::default()
+                }),
+            }))
+        }
+    }
+
+    /// Builds a heartbeat-enabled meta client that talks to `heartbeat_server` over an
+    /// in-memory duplex pipe.
+    ///
+    /// The server half of the pipe is served by a tonic server in a background task;
+    /// the client half is handed out by a custom connector registered for
+    /// `localhost:0`. The client is then started against `options.metasrv_addrs`.
+    async fn create_meta_client(
+        options: &MetaClientOptions,
+        heartbeat_server: Arc<SuspendableHeartbeatServer>,
+    ) -> MetaClientRef {
+        let (client, server) = tokio::io::duplex(1024);
+
+        // create the heartbeat server:
+        common_runtime::spawn_global(async move {
+            let mut router = tonic::transport::Server::builder();
+            let router = router.add_service(
+                HeartbeatServer::from_arc(heartbeat_server)
+                    .accept_compressed(CompressionEncoding::Zstd)
+                    .send_compressed(CompressionEncoding::Zstd),
+            );
+            // The only "incoming connection" is the server half of the duplex pipe.
+            router
+                .serve_with_incoming(futures::stream::iter([Ok::<_, std::io::Error>(server)]))
+                .await
+        });
+
+        // Move client to an option so we can _move_ the inner value
+        // on the first attempt to connect. All other attempts will fail.
+        let mut client = Some(client);
+        let connector = tower::service_fn(move |_| {
+            let client = client.take();
+            async move {
+                if let Some(client) = client {
+                    Ok(hyper_util::rt::TokioIo::new(client))
+                } else {
+                    Err(std::io::Error::other("client already taken"))
+                }
+            }
+        });
+        // Register the connector for the address used as the metasrv address.
+        let manager = ChannelManager::new();
+        manager
+            .reset_with_connector("localhost:0", connector)
+            .unwrap();
+
+        // create the heartbeat client:
+        let mut client = MetaClientBuilder::new(0, Role::Frontend)
+            .enable_heartbeat()
+            .heartbeat_channel_manager(manager)
+            .build();
+        client.start(&options.metasrv_addrs).await.unwrap();
+        Arc::new(client)
+    }
+
+    /// Builds and starts a frontend for the test.
+    ///
+    /// The instance comes from `FrontendBuilder::new_test`, and the heartbeat task is
+    /// wired with a `SuspendHandler` that shares the instance's suspend flag.
+    async fn create_frontend(
+        options: &FrontendOptions,
+        meta_client: MetaClientRef,
+    ) -> Result<Frontend> {
+        let instance = Arc::new(
+            FrontendBuilder::new_test(options, meta_client.clone())
+                .try_build()
+                .await?,
+        );
+
+        let servers =
+            Services::new(options.clone(), instance.clone(), Default::default()).build()?;
+
+        // Heartbeat responses go through the mailbox parser and the suspend handler.
+        let executor = Arc::new(HandlerGroupExecutor::new(vec![
+            Arc::new(ParseMailboxMessageHandler),
+            Arc::new(SuspendHandler::new(instance.suspend_state())),
+        ]));
+        let heartbeat_task = Some(HeartbeatTask::new(
+            options,
+            meta_client,
+            executor,
+            Arc::new(ResourceStatImpl::default()),
+        ));
+
+        let mut frontend = Frontend {
+            instance,
+            servers,
+            heartbeat_task,
+        };
+        frontend.start().await?;
+        Ok(frontend)
+    }
+
+    /// Runs `SELECT 1` through the HTTP SQL endpoint and checks the outcome.
+    ///
+    /// `expected` is either the JSON-serialized output of a successful response, or
+    /// the error code and message extracted from the response headers. Panics with
+    /// both values when the actual outcome does not match.
+    async fn verify_suspend_state_by_http(
+        frontend: &Frontend,
+        expected: std::result::Result<&str, (StatusCode, &str)>,
+    ) {
+        let addr = frontend.server_handlers().addr(HTTP_SERVER).unwrap();
+        let response = reqwest::get(format!("http://{}/v1/sql?sql=SELECT 1", addr))
+            .await
+            .unwrap();
+
+        // If the headers carry an error code and message, treat the response as an error.
+        let headers = response.headers();
+        let response = if let Some((code, error)) = from_header_to_err_code_msg(headers) {
+            Err((code, error))
+        } else {
+            Ok(response.text().await.unwrap())
+        };
+
+        match (response, expected) {
+            (Ok(response), Ok(expected)) => {
+                let response: GreptimedbV1Response = serde_json::from_str(&response).unwrap();
+                let response = serde_json::to_string(response.output()).unwrap();
+                assert_eq!(&response, expected);
+            }
+            (Err(actual), Err(expected)) => assert_eq!(actual, expected),
+            // A success/failure mismatch is a test failure, not an impossible state:
+            // report both sides instead of a bare `unreachable!()`.
+            (actual, expected) => {
+                panic!("unexpected HTTP result: actual = {actual:?}, expected = {expected:?}")
+            }
+        }
+    }
+
+    /// Runs `SELECT 1` through the gRPC endpoint and checks the outcome.
+    ///
+    /// `expected` is either the pretty-printed result table (surrounding whitespace
+    /// is trimmed before comparing), or the expected status code and output message
+    /// of the error. Panics with details when the actual outcome does not match.
+    async fn verify_suspend_state_by_grpc(
+        frontend: &Frontend,
+        expected: std::result::Result<&str, (StatusCode, &str)>,
+    ) {
+        let addr = frontend.server_handlers().addr(GRPC_SERVER).unwrap();
+        let client = Client::with_urls([addr.to_string()]);
+        let client = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
+        let response = client.sql("SELECT 1").await;
+
+        match (response, expected) {
+            (Ok(response), Ok(expected)) => {
+                let response = response.data.pretty_print().await;
+                assert_eq!(&response, expected.trim());
+            }
+            (Err(actual), Err(expected)) => {
+                assert_eq!(actual.status_code(), expected.0);
+                assert_eq!(actual.output_msg(), expected.1);
+            }
+            // A success/failure mismatch is a test failure, not an impossible state:
+            // report what happened instead of a bare `unreachable!()`.
+            (Ok(_), Err(expected)) => {
+                panic!("expected gRPC error {expected:?}, but the query succeeded")
+            }
+            (Err(actual), Ok(expected)) => panic!(
+                "expected gRPC output {:?}, but the query failed with {:?}: {:?}",
+                expected,
+                actual.status_code(),
+                actual.output_msg()
+            ),
+        }
+    }
+
+    /// End-to-end check of the suspend flow: the frontend serves queries normally,
+    /// rejects them with `StatusCode::Suspended` once the heartbeat server sends the
+    /// "suspend" instruction, and serves them again after the instruction stops.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn test_suspend_frontend() -> Result<()> {
+        common_telemetry::init_default_ut_logging();
+
+        let meta_client_options = MetaClientOptions {
+            metasrv_addrs: vec!["localhost:0".to_string()],
+            ..Default::default()
+        };
+        let options = FrontendOptions {
+            http: HttpOptions {
+                addr: "127.0.0.1:0".to_string(),
+                ..Default::default()
+            },
+            grpc: GrpcOptions {
+                bind_addr: "127.0.0.1:0".to_string(),
+                flight_compression: FlightCompression::None,
+                ..Default::default()
+            },
+            mysql: MysqlOptions {
+                enable: false,
+                ..Default::default()
+            },
+            postgres: PostgresOptions {
+                enable: false,
+                ..Default::default()
+            },
+            meta_client: Some(meta_client_options.clone()),
+            ..Default::default()
+        };
+
+        let server = Arc::new(SuspendableHeartbeatServer {
+            suspend: Arc::new(AtomicBool::new(false)),
+        });
+        let meta_client = create_meta_client(&meta_client_options, server.clone()).await;
+        let frontend = create_frontend(&options, meta_client).await?;
+
+        // NOTE(review): each sleep of one heartbeat interval is presumably meant to let
+        // at least one heartbeat round trip complete before the state is checked.
+        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
+        // initial state: not suspended:
+        assert!(!frontend.instance.is_suspended());
+        verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
+        verify_suspend_state_by_grpc(
+            &frontend,
+            Ok(r#"
+----------+
| Int64(1) |
+----------+
| 1        |
+----------+"#),
+        )
+        .await;
+
+        // make the heartbeat server return the "suspend" instruction,
+        server.suspend.store(true, Ordering::Relaxed);
+        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
+        // ... then the frontend is suspended:
+        assert!(frontend.instance.is_suspended());
+        verify_suspend_state_by_http(
+            &frontend,
+            Err((
+                StatusCode::Suspended,
+                "error: Service suspended, execution_time_ms: 0",
+            )),
+        )
+        .await;
+        verify_suspend_state_by_grpc(&frontend, Err((StatusCode::Suspended, "Service suspended")))
+            .await;
+
+        // make the heartbeat server stop returning the "suspend" instruction,
+        server.suspend.store(false, Ordering::Relaxed);
+        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
+        // ... then frontend's suspend state is cleared:
+        assert!(!frontend.instance.is_suspended());
+        verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
+        verify_suspend_state_by_grpc(
+            &frontend,
+            Ok(r#"
+----------+
| Int64(1) |
+----------+
| 1        |
+----------+"#),
+        )
+        .await;
+        Ok(())
+    }
 }
--- a/src/frontend/src/heartbeat.rs
+++ b/src/frontend/src/heartbeat.rs
@@ -27,7 +27,6 @@ use common_stat::ResourceStatRef;
 use common_telemetry::{debug, error, info, warn};
 use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
 use servers::addrs;
-use servers::heartbeat_options::HeartbeatOptions;
 use snafu::ResultExt;
 use tokio::sync::mpsc;
 use tokio::sync::mpsc::Receiver;
@@ -54,7 +53,6 @@ impl HeartbeatTask {
    pub fn new(
        opts: &FrontendOptions,
        meta_client: Arc<MetaClient>,
-        heartbeat_opts: HeartbeatOptions,
        resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
        resource_stat: ResourceStatRef,
    ) -> Self {
@@ -68,8 +66,8 @@ impl HeartbeatTask {
                addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
            },
            meta_client,
-            report_interval: heartbeat_opts.interval,
-            retry_interval: heartbeat_opts.retry_interval,
+            report_interval: opts.heartbeat.interval,
+            retry_interval: opts.heartbeat.retry_interval,
            resp_handler_executor,
            start_time_ms: common_time::util::current_time_millis() as u64,
            resource_stat,
@@ -196,7 +194,8 @@ impl HeartbeatTask {
        let report_interval = self.report_interval;
        let start_time_ms = self.start_time_ms;
        let self_peer = Some(Peer {
-            // The peer id doesn't make sense for frontend, so we just set it 0.
+            // The node id will be actually calculated from its address (by hashing the address
+            // string) in the metasrv. So it can be set to 0 here, as a placeholder.
            id: 0,
            addr: self.peer_addr.clone(),
        });
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -26,7 +26,8 @@ mod region_query;
 pub mod standalone;

 use std::pin::Pin;
-use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+use std::sync::{Arc, atomic};
 use std::time::{Duration, SystemTime};

 use async_stream::stream;
@@ -83,6 +84,7 @@ use snafu::prelude::*;
 use sql::ast::ObjectNamePartExt;
 use sql::dialect::Dialect;
 use sql::parser::{ParseOptions, ParserContext};
+use sql::statements::comment::CommentObject;
 use sql::statements::copy::{CopyDatabase, CopyTable};
 use sql::statements::statement::Statement;
 use sql::statements::tql::Tql;
@@ -119,6 +121,7 @@ pub struct Instance {
    limiter: Option<LimiterRef>,
    process_manager: ProcessManagerRef,
    slow_query_options: SlowQueryOptions,
+    suspend: Arc<AtomicBool>,

    // cache for otlp metrics
    // first layer key: db-string
@@ -171,6 +174,14 @@ impl Instance {
    pub fn procedure_executor(&self) -> &ProcedureExecutorRef {
        self.statement_executor.procedure_executor()
    }
+
+    /// Returns a shared handle to the suspend flag, so that other components can
+    /// read or toggle the same flag this instance checks.
+    pub fn suspend_state(&self) -> Arc<AtomicBool> {
+        self.suspend.clone()
+    }
+
+    /// Returns the current value of the suspend flag (relaxed atomic load).
+    pub(crate) fn is_suspended(&self) -> bool {
+        self.suspend.load(atomic::Ordering::Relaxed)
+    }
 }

 fn parse_stmt(sql: &str, dialect: &(dyn Dialect + Send + Sync)) -> Result<Vec<Statement>> {
@@ -513,6 +524,10 @@ impl SqlQueryHandler for Instance {

    #[tracing::instrument(skip_all)]
    async fn do_query(&self, query: &str, query_ctx: QueryContextRef) -> Vec<Result<Output>> {
+        if self.is_suspended() {
+            return vec![error::SuspendedSnafu {}.fail()];
+        }
+
        let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
        let query_interceptor = query_interceptor_opt.as_ref();
        let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) {
@@ -580,6 +595,8 @@ impl SqlQueryHandler for Instance {
        plan: LogicalPlan,
        query_ctx: QueryContextRef,
    ) -> Result<Output> {
+        ensure!(!self.is_suspended(), error::SuspendedSnafu);
+
        if should_capture_statement(stmt.as_ref()) {
            // It's safe to unwrap here because we've already checked the type.
            let stmt = stmt.unwrap();
@@ -641,6 +658,10 @@ impl SqlQueryHandler for Instance {
        query: &PromQuery,
        query_ctx: QueryContextRef,
    ) -> Vec<Result<Output>> {
+        if self.is_suspended() {
+            return vec![error::SuspendedSnafu {}.fail()];
+        }
+
        // check will be done in prometheus handler's do_query
        let result = PrometheusHandler::do_query(self, query, query_ctx)
            .await
@@ -655,6 +676,8 @@ impl SqlQueryHandler for Instance {
        stmt: Statement,
        query_ctx: QueryContextRef,
    ) -> Result<Option<DescribeResult>> {
+        ensure!(!self.is_suspended(), error::SuspendedSnafu);
+
        if matches!(
            stmt,
            Statement::Insert(_) | Statement::Query(_) | Statement::Delete(_)
@@ -875,7 +898,7 @@ pub fn check_permission(
            validate_param(&stmt.table_name, query_ctx)?;
        }
        Statement::ShowCreateFlow(stmt) => {
-            validate_param(&stmt.flow_name, query_ctx)?;
+            validate_flow(&stmt.flow_name, query_ctx)?;
        }
        #[cfg(feature = "enterprise")]
        Statement::ShowCreateTrigger(stmt) => {
@@ -908,6 +931,12 @@ pub fn check_permission(
        // show charset and show collation won't be checked
        Statement::ShowCharset(_) | Statement::ShowCollation(_) => {}

+        Statement::Comment(comment) => match &comment.object {
+            CommentObject::Table(table) => validate_param(table, query_ctx)?,
+            CommentObject::Column { table, .. } => validate_param(table, query_ctx)?,
+            CommentObject::Flow(flow) => validate_flow(flow, query_ctx)?,
+        },
+
        Statement::Insert(insert) => {
            let name = insert.table_name().context(ParseSqlSnafu)?;
            validate_param(name, query_ctx)?;
@@ -993,6 +1022,27 @@ fn validate_param(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()>
        .context(SqlExecInterceptedSnafu)
 }

+fn validate_flow(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> {
+    let catalog = match &name.0[..] {
+        [_flow] => query_ctx.current_catalog().to_string(),
+        [catalog, _flow] => catalog.to_string_unquoted(),
+        _ => {
+            return InvalidSqlSnafu {
+                err_msg: format!(
+                    "expect flow name to be <catalog>.<flow_name> or <flow_name>, actual: {name}",
+                ),
+            }
+            .fail();
+        }
+    };
+
+    let schema = query_ctx.current_schema();
+
+    validate_catalog_and_schema(&catalog, &schema, query_ctx)
+        .map_err(BoxedError::new)
+        .context(SqlExecInterceptedSnafu)
+}
+
 fn validate_database(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> {
    let (catalog, schema) = match &name.0[..] {
        [schema] => (
@@ -1251,6 +1301,28 @@ mod tests {

        // test describe table
        let sql = "DESC TABLE {catalog}{schema}demo;";
-        replace_test(sql, plugins, &query_ctx);
+        replace_test(sql, plugins.clone(), &query_ctx);
+
+        let comment_flow_cases = [
+            ("COMMENT ON FLOW my_flow IS 'comment';", true),
+            ("COMMENT ON FLOW greptime.my_flow IS 'comment';", true),
+            ("COMMENT ON FLOW wrongcatalog.my_flow IS 'comment';", false),
+        ];
+        for (sql, is_ok) in comment_flow_cases {
+            let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0];
+            let result = check_permission(plugins.clone(), stmt, &query_ctx);
+            assert_eq!(result.is_ok(), is_ok);
+        }
+
+        let show_flow_cases = [
+            ("SHOW CREATE FLOW my_flow;", true),
+            ("SHOW CREATE FLOW greptime.my_flow;", true),
+            ("SHOW CREATE FLOW wrongcatalog.my_flow;", false),
+        ];
+        for (sql, is_ok) in show_flow_cases {
+            let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0];
+            let result = check_permission(plugins.clone(), stmt, &query_ctx);
+            assert_eq!(result.is_ok(), is_ok);
+        }
    }
 }
--- a/src/frontend/src/instance/builder.rs
+++ b/src/frontend/src/instance/builder.rs
@@ -13,6 +13,7 @@
 // limitations under the License.

 use std::sync::Arc;
+use std::sync::atomic::AtomicBool;

 use cache::{TABLE_FLOWNODE_SET_CACHE_NAME, TABLE_ROUTE_CACHE_NAME};
 use catalog::CatalogManagerRef;
@@ -87,6 +88,33 @@ impl FrontendBuilder {
        }
    }

+    #[cfg(test)]
+    pub(crate) fn new_test(
+        options: &FrontendOptions,
+        meta_client: meta_client::MetaClientRef,
+    ) -> Self {
+        let kv_backend = Arc::new(common_meta::kv_backend::memory::MemoryKvBackend::new());
+
+        let layered_cache_registry = Arc::new(
+            common_meta::cache::LayeredCacheRegistryBuilder::default()
+                .add_cache_registry(cache::build_fundamental_cache_registry(kv_backend.clone()))
+                .build(),
+        );
+
+        Self::new(
+            options.clone(),
+            kv_backend,
+            layered_cache_registry,
+            catalog::memory::MemoryCatalogManager::with_default_setup(),
+            Arc::new(client::client_manager::NodeClients::default()),
+            meta_client,
+            Arc::new(catalog::process_manager::ProcessManager::new(
+                "".to_string(),
+                None,
+            )),
+        )
+    }
+
    pub fn with_local_cache_invalidator(self, cache_invalidator: CacheInvalidatorRef) -> Self {
        Self {
            local_cache_invalidator: Some(cache_invalidator),
@@ -242,6 +270,7 @@ impl FrontendBuilder {
            process_manager,
            otlp_metrics_table_legacy_cache: DashMap::new(),
            slow_query_options: self.options.slow_query.clone(),
+            suspend: Arc::new(AtomicBool::new(false)),
        })
    }
 }
--- a/src/frontend/src/instance/grpc.rs
+++ b/src/frontend/src/instance/grpc.rs
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::pin::Pin;
 use std::sync::Arc;
+use std::time::Instant;

 use api::helper::from_pb_time_ranges;
 use api::v1::ddl_request::{Expr as DdlExpr, Expr};
@@ -22,16 +24,18 @@ use api::v1::{
    DeleteRequests, DropFlowExpr, InsertIntoPlan, InsertRequests, RowDeleteRequests,
    RowInsertRequests,
 };
+use async_stream::try_stream;
 use async_trait::async_trait;
 use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
 use common_base::AffectedRows;
 use common_error::ext::BoxedError;
-use common_grpc::FlightData;
-use common_grpc::flight::FlightDecoder;
+use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
 use common_query::logical_plan::add_insert_to_logical_plan;
 use common_telemetry::tracing::{self};
 use datafusion::datasource::DefaultTableSource;
+use futures::Stream;
+use futures::stream::StreamExt;
 use query::parser::PromQuery;
 use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef};
 use servers::query_handler::grpc::GrpcQueryHandler;
@@ -230,6 +234,11 @@ impl GrpcQueryHandler for Instance {
                    DdlExpr::DropView(_) => {
                        todo!("implemented in the following PR")
                    }
+                    DdlExpr::CommentOn(expr) => {
+                        self.statement_executor
+                            .comment_by_expr(expr, ctx.clone())
+                            .await?
+                    }
                }
            }
        };
@@ -240,10 +249,8 @@ impl GrpcQueryHandler for Instance {

    async fn put_record_batch(
        &self,
-        table_name: &TableName,
+        request: servers::grpc::flight::PutRecordBatchRequest,
        table_ref: &mut Option<TableRef>,
-        decoder: &mut FlightDecoder,
-        data: FlightData,
        ctx: QueryContextRef,
    ) -> Result<AffectedRows> {
        let table = if let Some(table) = table_ref {
@@ -252,15 +259,15 @@ impl GrpcQueryHandler for Instance {
            let table = self
                .catalog_manager()
                .table(
-                    &table_name.catalog_name,
-                    &table_name.schema_name,
-                    &table_name.table_name,
+                    &request.table_name.catalog_name,
+                    &request.table_name.schema_name,
+                    &request.table_name.table_name,
                    None,
                )
                .await
                .context(CatalogSnafu)?
                .with_context(|| TableNotFoundSnafu {
-                    table_name: table_name.to_string(),
+                    table_name: request.table_name.to_string(),
                })?;
            *table_ref = Some(table.clone());
            table
@@ -279,10 +286,87 @@ impl GrpcQueryHandler for Instance {
        // do we check limit for bulk insert?

        self.inserter
-            .handle_bulk_insert(table, decoder, data)
+            .handle_bulk_insert(
+                table,
+                request.flight_data,
+                request.record_batch,
+                request.schema_bytes,
+            )
            .await
            .context(TableOperationSnafu)
    }
+
+    fn handle_put_record_batch_stream(
+        &self,
+        mut stream: servers::grpc::flight::PutRecordBatchRequestStream,
+        ctx: QueryContextRef,
+    ) -> Pin<Box<dyn Stream<Item = Result<DoPutResponse>> + Send>> {
+        // Clone all necessary data to make it 'static
+        let catalog_manager = self.catalog_manager().clone();
+        let plugins = self.plugins.clone();
+        let inserter = self.inserter.clone();
+        let ctx = ctx.clone();
+        let mut table_ref: Option<TableRef> = None;
+        let mut table_checked = false;
+
+        Box::pin(try_stream! {
+            // Process each request in the stream
+            while let Some(request_result) = stream.next().await {
+                let request = request_result.map_err(|e| {
+                    let error_msg = format!("Stream error: {:?}", e);
+                    IncompleteGrpcRequestSnafu { err_msg: error_msg }.build()
+                })?;
+
+                // Resolve table and check permissions on first RecordBatch (after schema is received)
+                if !table_checked {
+                    let table_name = &request.table_name;
+
+                    plugins
+                        .get::<PermissionCheckerRef>()
+                        .as_ref()
+                        .check_permission(ctx.current_user(), PermissionReq::BulkInsert)
+                        .context(PermissionSnafu)?;
+
+                    // Resolve table reference
+                    table_ref = Some(
+                        catalog_manager
+                            .table(
+                                &table_name.catalog_name,
+                                &table_name.schema_name,
+                                &table_name.table_name,
+                                None,
+                            )
+                            .await
+                            .context(CatalogSnafu)?
+                            .with_context(|| TableNotFoundSnafu {
+                                table_name: table_name.to_string(),
+                            })?,
+                    );
+
+                    // Run the gRPC interceptor's pre-bulk-insert hook on the resolved table
+                    let interceptor_ref = plugins.get::<GrpcQueryInterceptorRef<Error>>();
+                    let interceptor = interceptor_ref.as_ref();
+                    interceptor.pre_bulk_insert(table_ref.clone().unwrap(), ctx.clone())?;
+
+                    table_checked = true;
+                }
+
+                let request_id = request.request_id;
+                let start = Instant::now();
+                let rows = inserter
+                    .handle_bulk_insert(
+                        table_ref.clone().unwrap(),
+                        request.flight_data,
+                        request.record_batch,
+                        request.schema_bytes,
+                    )
+                    .await
+                    .context(TableOperationSnafu)?;
+                let elapsed_secs = start.elapsed().as_secs_f64();
+                yield DoPutResponse::new(request_id, rows, elapsed_secs);
+            }
+        })
+    }
 }

 fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryContextRef) {
@@ -330,6 +414,9 @@ fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryConte
        Expr::DropView(expr) => {
            check_and_fill!(expr);
        }
+        Expr::CommentOn(expr) => {
+            check_and_fill!(expr);
+        }
    }
 }

--- a/src/frontend/src/instance/jaeger.rs
+++ b/src/frontend/src/instance/jaeger.rs
@@ -65,8 +65,7 @@ impl JaegerQueryHandler for Instance {
        // It's equivalent to `SELECT DISTINCT(service_name) FROM {db}.{trace_table}`.
        Ok(query_trace_table(
            ctx,
-            self.catalog_manager(),
-            self.query_engine(),
+            self,
            vec![SelectExpr::from(col(SERVICE_NAME_COLUMN))],
            vec![],
            vec![],
@@ -107,8 +106,7 @@ impl JaegerQueryHandler for Instance {
        // ```.
        Ok(query_trace_table(
            ctx,
-            self.catalog_manager(),
-            self.query_engine(),
+            self,
            vec![
                SelectExpr::from(col(SPAN_NAME_COLUMN)),
                SelectExpr::from(col(SPAN_KIND_COLUMN)),
@@ -160,8 +158,7 @@ impl JaegerQueryHandler for Instance {

        Ok(query_trace_table(
            ctx,
-            self.catalog_manager(),
-            self.query_engine(),
+            self,
            selects,
            filters,
            vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
@@ -220,8 +217,7 @@ impl JaegerQueryHandler for Instance {
        // ```.
        let output = query_trace_table(
            ctx.clone(),
-            self.catalog_manager(),
-            self.query_engine(),
+            self,
            vec![wildcard()],
            filters,
            vec![],
@@ -285,8 +281,7 @@ impl JaegerQueryHandler for Instance {
                // query all spans
                Ok(query_trace_table(
                    ctx,
-                    self.catalog_manager(),
-                    self.query_engine(),
+                    self,
                    vec![wildcard()],
                    filters,
                    vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
@@ -303,8 +298,7 @@ impl JaegerQueryHandler for Instance {
 #[allow(clippy::too_many_arguments)]
 async fn query_trace_table(
    ctx: QueryContextRef,
-    catalog_manager: &CatalogManagerRef,
-    query_engine: &QueryEngineRef,
+    instance: &Instance,
    selects: Vec<SelectExpr>,
    filters: Vec<Expr>,
    sorts: Vec<SortExpr>,
@@ -334,7 +328,8 @@ async fn query_trace_table(
        }
    };

-    let table = catalog_manager
+    let table = instance
+        .catalog_manager()
        .table(
            ctx.current_catalog(),
            &ctx.current_schema(),
@@ -367,7 +362,7 @@ async fn query_trace_table(
        .map(|s| format!("\"{}\"", s))
        .collect::<HashSet<String>>();

-    let df_context = create_df_context(query_engine)?;
+    let df_context = create_df_context(instance.query_engine())?;

    let dataframe = df_context
        .read_table(Arc::new(DfTableProviderAdapter::new(table)))
--- a/src/frontend/src/instance/promql.rs
+++ b/src/frontend/src/instance/promql.rs
@@ -136,7 +136,7 @@ impl Instance {
                table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
            })?;

-        let scan_plan = dataframe.into_logical_plan();
+        let scan_plan = dataframe.into_unoptimized_plan();
        let filter_conditions =
            PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
                .context(PrometheusLabelValuesQueryPlanSnafu)?;
--- a/src/frontend/src/server.rs
+++ b/src/frontend/src/server.rs
@@ -16,16 +16,21 @@ use std::net::SocketAddr;
 use std::sync::Arc;

 use auth::UserProviderRef;
+use axum::extract::{Request, State};
+use axum::middleware::Next;
+use axum::response::IntoResponse;
 use common_base::Plugins;
 use common_config::Configurable;
 use common_telemetry::info;
 use meta_client::MetaClientOptions;
 use servers::error::Error as ServerError;
 use servers::grpc::builder::GrpcServerBuilder;
+use servers::grpc::flight::FlightCraftRef;
 use servers::grpc::frontend_grpc_handler::FrontendGrpcHandler;
 use servers::grpc::greptime_handler::GreptimeRequestHandler;
 use servers::grpc::{GrpcOptions, GrpcServer};
 use servers::http::event::LogValidatorRef;
+use servers::http::result::error_result::ErrorResponse;
 use servers::http::utils::router::RouterConfigurator;
 use servers::http::{HttpServer, HttpServerBuilder};
 use servers::interceptor::LogIngestInterceptorRef;
@@ -38,6 +43,7 @@ use servers::query_handler::sql::ServerSqlQueryHandlerAdapter;
 use servers::server::{Server, ServerHandlers};
 use servers::tls::{ReloadableTlsServerConfig, maybe_watch_server_tls_config};
 use snafu::ResultExt;
+use tonic::Status;

 use crate::error::{self, Result, StartServerSnafu, TomlFormatSnafu};
 use crate::frontend::FrontendOptions;
@@ -52,6 +58,7 @@ where
    grpc_server_builder: Option<GrpcServerBuilder>,
    http_server_builder: Option<HttpServerBuilder>,
    plugins: Plugins,
+    flight_handler: Option<FlightCraftRef>,
 }

 impl<T> Services<T>
@@ -65,6 +72,7 @@ where
            grpc_server_builder: None,
            http_server_builder: None,
            plugins,
+            flight_handler: None,
        }
    }

@@ -122,7 +130,16 @@ where
            builder = builder.with_extra_router(configurator.router());
        }

-        builder
+        builder.add_layer(axum::middleware::from_fn_with_state(
+            self.instance.clone(),
+            async move |State(state): State<Arc<Instance>>, request: Request, next: Next| {
+                if state.is_suspended() {
+                    return ErrorResponse::from_error(servers::error::SuspendedSnafu.build())
+                        .into_response();
+                }
+                next.run(request).await
+            },
+        ))
    }

    pub fn with_grpc_server_builder(self, builder: GrpcServerBuilder) -> Self {
@@ -139,6 +156,13 @@ where
        }
    }

+    pub fn with_flight_handler(self, flight_handler: FlightCraftRef) -> Self {
+        Self {
+            flight_handler: Some(flight_handler),
+            ..self
+        }
+    }
+
    fn build_grpc_server(
        &mut self,
        grpc: &GrpcOptions,
@@ -173,6 +197,12 @@ where
            grpc.flight_compression,
        );

+        // Use custom flight handler if provided, otherwise use the default GreptimeRequestHandler
+        let flight_handler = self
+            .flight_handler
+            .clone()
+            .unwrap_or_else(|| Arc::new(greptime_request_handler.clone()) as FlightCraftRef);
+
        let grpc_server = builder
            .name(name)
            .database_handler(greptime_request_handler.clone())
@@ -181,7 +211,17 @@ where
                self.instance.clone(),
                user_provider.clone(),
            ))
-            .flight_handler(Arc::new(greptime_request_handler));
+            .flight_handler(flight_handler)
+            .add_layer(axum::middleware::from_fn_with_state(
+                self.instance.clone(),
+                async move |State(state): State<Arc<Instance>>, request: Request, next: Next| {
+                    if state.is_suspended() {
+                        let status = Status::from(servers::error::SuspendedSnafu.build());
+                        return status.into_http();
+                    }
+                    next.run(request).await
+                },
+            ));

        let grpc_server = if !external {
            let frontend_grpc_handler =
--- a/src/index/Cargo.toml
+++ b/src/index/Cargo.toml
@@ -7,6 +7,9 @@ license.workspace = true
 [lints]
 workspace = true

+[features]
+vector_index = ["dep:usearch"]
+
 [dependencies]
 async-trait.workspace = true
 asynchronous-codec = "0.7.0"
@@ -17,6 +20,7 @@ common-error.workspace = true
 common-macro.workspace = true
 common-runtime.workspace = true
 common-telemetry.workspace = true
+datatypes.workspace = true
 fastbloom = "0.8"
 fst.workspace = true
 futures.workspace = true
@@ -25,6 +29,7 @@ itertools.workspace = true
 jieba-rs = "0.8"
 lazy_static.workspace = true
 mockall.workspace = true
+nalgebra.workspace = true
 pin-project.workspace = true
 prost.workspace = true
 puffin.workspace = true
@@ -39,6 +44,7 @@ tantivy = { version = "0.24", features = ["zstd-compression"] }
 tantivy-jieba = "0.16"
 tokio.workspace = true
 tokio-util.workspace = true
+usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
 uuid.workspace = true

 [dev-dependencies]
--- a/src/index/src/bloom_filter/applier.rs
+++ b/src/index/src/bloom_filter/applier.rs
@@ -21,7 +21,7 @@ use itertools::Itertools;

 use crate::Bytes;
 use crate::bloom_filter::error::Result;
-use crate::bloom_filter::reader::BloomFilterReader;
+use crate::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader};

 /// `InListPredicate` contains a list of acceptable values. A value needs to match at least
 /// one of the elements (logical OR semantic) for the predicate to be satisfied.
@@ -38,7 +38,7 @@ pub struct BloomFilterApplier {

 impl BloomFilterApplier {
    pub async fn new(reader: Box<dyn BloomFilterReader + Send>) -> Result<Self> {
-        let meta = reader.metadata().await?;
+        let meta = reader.metadata(None).await?;

        Ok(Self { reader, meta })
    }
@@ -50,6 +50,7 @@ impl BloomFilterApplier {
        &mut self,
        predicates: &[InListPredicate],
        search_ranges: &[Range<usize>],
+        metrics: Option<&mut BloomFilterReadMetrics>,
    ) -> Result<Vec<Range<usize>>> {
        if predicates.is_empty() {
            // If no predicates, return empty result
@@ -57,7 +58,7 @@ impl BloomFilterApplier {
        }

        let segments = self.row_ranges_to_segments(search_ranges);
-        let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments).await?;
+        let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments, metrics).await?;
        let matching_row_ranges = self.find_matching_rows(seg_locations, bloom_filters, predicates);
        Ok(intersect_ranges(search_ranges, &matching_row_ranges))
    }
@@ -95,6 +96,7 @@ impl BloomFilterApplier {
    async fn load_bloom_filters(
        &mut self,
        segments: &[usize],
+        metrics: Option<&mut BloomFilterReadMetrics>,
    ) -> Result<(Vec<(u64, usize)>, Vec<BloomFilter>)> {
        let segment_locations = segments
            .iter()
@@ -108,7 +110,10 @@ impl BloomFilterApplier {
            .map(|i| self.meta.bloom_filter_locs[i as usize])
            .collect::<Vec<_>>();

-        let bloom_filters = self.reader.bloom_filter_vec(&bloom_filter_locs).await?;
+        let bloom_filters = self
+            .reader
+            .bloom_filter_vec(&bloom_filter_locs, metrics)
+            .await?;

        Ok((segment_locations, bloom_filters))
    }
@@ -422,7 +427,10 @@ mod tests {
        ];

        for (predicates, search_range, expected) in cases {
-            let result = applier.search(&predicates, &[search_range]).await.unwrap();
+            let result = applier
+                .search(&predicates, &[search_range], None)
+                .await
+                .unwrap();
            assert_eq!(
                result, expected,
                "Expected {:?}, got {:?}",
--- a/src/index/src/bloom_filter/reader.rs
+++ b/src/index/src/bloom_filter/reader.rs
@@ -13,6 +13,7 @@
 // limitations under the License.

 use std::ops::{Range, Rem};
+use std::time::{Duration, Instant};

 use async_trait::async_trait;
 use bytemuck::try_cast_slice;
@@ -34,6 +35,72 @@ const BLOOM_META_LEN_SIZE: u64 = 4;
 /// Default prefetch size of bloom filter meta.
 pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB

+/// Metrics for bloom filter read operations.
+#[derive(Default, Clone)]
+pub struct BloomFilterReadMetrics {
+    /// Total byte size to read.
+    pub total_bytes: u64,
+    /// Total number of ranges to read.
+    pub total_ranges: usize,
+    /// Elapsed time to fetch data.
+    pub fetch_elapsed: Duration,
+    /// Number of cache hits.
+    pub cache_hit: usize,
+    /// Number of cache misses.
+    pub cache_miss: usize,
+}
+
+impl std::fmt::Debug for BloomFilterReadMetrics {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            total_bytes,
+            total_ranges,
+            fetch_elapsed,
+            cache_hit,
+            cache_miss,
+        } = self;
+
+        // If both total_bytes and cache_hit are 0, we didn't read anything.
+        if *total_bytes == 0 && *cache_hit == 0 {
+            return write!(f, "{{}}");
+        }
+        write!(f, "{{")?;
+
+        if *total_bytes > 0 {
+            write!(f, "\"total_bytes\":{}", total_bytes)?;
+        }
+        if *cache_hit > 0 {
+            if *total_bytes > 0 {
+                write!(f, ", ")?;
+            }
+            write!(f, "\"cache_hit\":{}", cache_hit)?;
+        }
+
+        if *total_ranges > 0 {
+            write!(f, ", \"total_ranges\":{}", total_ranges)?;
+        }
+        if !fetch_elapsed.is_zero() {
+            write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
+        }
+        if *cache_miss > 0 {
+            write!(f, ", \"cache_miss\":{}", cache_miss)?;
+        }
+
+        write!(f, "}}")
+    }
+}
+
+impl BloomFilterReadMetrics {
+    /// Accumulates the metrics from `other` into this one.
+    pub fn merge_from(&mut self, other: &Self) {
+        self.total_bytes += other.total_bytes;
+        self.total_ranges += other.total_ranges;
+        self.fetch_elapsed += other.fetch_elapsed;
+        self.cache_hit += other.cache_hit;
+        self.cache_miss += other.cache_miss;
+    }
+}
+
 /// Safely converts bytes to Vec<u64> using bytemuck for optimal performance.
 /// Faster than chunking and converting each piece individually.
 ///
@@ -79,25 +146,33 @@ pub fn bytes_to_u64_vec(bytes: &Bytes) -> Vec<u64> {
 #[async_trait]
 pub trait BloomFilterReader: Sync {
    /// Reads range of bytes from the file.
-    async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes>;
+    async fn range_read(
+        &self,
+        offset: u64,
+        size: u32,
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<Bytes>;

    /// Reads bunch of ranges from the file.
-    async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
-        let mut results = Vec::with_capacity(ranges.len());
-        for range in ranges {
-            let size = (range.end - range.start) as u32;
-            let data = self.range_read(range.start, size).await?;
-            results.push(data);
-        }
-        Ok(results)
-    }
+    async fn read_vec(
+        &self,
+        ranges: &[Range<u64>],
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<Vec<Bytes>>;

    /// Reads the meta information of the bloom filter.
-    async fn metadata(&self) -> Result<BloomFilterMeta>;
+    async fn metadata(
+        &self,
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<BloomFilterMeta>;

    /// Reads a bloom filter with the given location.
-    async fn bloom_filter(&self, loc: &BloomFilterLoc) -> Result<BloomFilter> {
-        let bytes = self.range_read(loc.offset, loc.size as _).await?;
+    async fn bloom_filter(
+        &self,
+        loc: &BloomFilterLoc,
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<BloomFilter> {
+        let bytes = self.range_read(loc.offset, loc.size as _, metrics).await?;
        let vec = bytes_to_u64_vec(&bytes);
        let bm = BloomFilter::from_vec(vec)
            .seed(&SEED)
@@ -105,12 +180,16 @@ pub trait BloomFilterReader: Sync {
        Ok(bm)
    }

-    async fn bloom_filter_vec(&self, locs: &[BloomFilterLoc]) -> Result<Vec<BloomFilter>> {
+    async fn bloom_filter_vec(
+        &self,
+        locs: &[BloomFilterLoc],
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<Vec<BloomFilter>> {
        let ranges = locs
            .iter()
            .map(|l| l.offset..l.offset + l.size)
            .collect::<Vec<_>>();
-        let bss = self.read_vec(&ranges).await?;
+        let bss = self.read_vec(&ranges, metrics).await?;

        let mut result = Vec::with_capacity(bss.len());
        for (bs, loc) in bss.into_iter().zip(locs.iter()) {
@@ -140,24 +219,59 @@ impl<R: RangeReader> BloomFilterReaderImpl<R> {

 #[async_trait]
 impl<R: RangeReader> BloomFilterReader for BloomFilterReaderImpl<R> {
-    async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes> {
-        self.reader
+    async fn range_read(
+        &self,
+        offset: u64,
+        size: u32,
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<Bytes> {
+        let start = metrics.as_ref().map(|_| Instant::now());
+        let result = self
+            .reader
            .read(offset..offset + size as u64)
            .await
-            .context(IoSnafu)
+            .context(IoSnafu)?;
+
+        if let Some(m) = metrics {
+            m.total_ranges += 1;
+            m.total_bytes += size as u64;
+            if let Some(start) = start {
+                m.fetch_elapsed += start.elapsed();
+            }
+        }
+
+        Ok(result)
    }

-    async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
-        self.reader.read_vec(ranges).await.context(IoSnafu)
+    async fn read_vec(
+        &self,
+        ranges: &[Range<u64>],
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<Vec<Bytes>> {
+        let start = metrics.as_ref().map(|_| Instant::now());
+        let result = self.reader.read_vec(ranges).await.context(IoSnafu)?;
+
+        if let Some(m) = metrics {
+            m.total_ranges += ranges.len();
+            m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
+            if let Some(start) = start {
+                m.fetch_elapsed += start.elapsed();
+            }
+        }
+
+        Ok(result)
    }

-    async fn metadata(&self) -> Result<BloomFilterMeta> {
+    async fn metadata(
+        &self,
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<BloomFilterMeta> {
        let metadata = self.reader.metadata().await.context(IoSnafu)?;
        let file_size = metadata.content_length;

        let mut meta_reader =
            BloomFilterMetaReader::new(&self.reader, file_size, Some(DEFAULT_PREFETCH_SIZE));
-        meta_reader.metadata().await
+        meta_reader.metadata(metrics).await
    }
 }

@@ -183,7 +297,10 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
    ///
    /// It will first prefetch some bytes from the end of the file,
    /// then parse the metadata from the prefetch bytes.
-    pub async fn metadata(&mut self) -> Result<BloomFilterMeta> {
+    pub async fn metadata(
+        &mut self,
+        metrics: Option<&mut BloomFilterReadMetrics>,
+    ) -> Result<BloomFilterMeta> {
        ensure!(
            self.file_size >= BLOOM_META_LEN_SIZE,
            FileSizeTooSmallSnafu {
@@ -191,6 +308,7 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
            }
        );

+        let start = metrics.as_ref().map(|_| Instant::now());
        let meta_start = self.file_size.saturating_sub(self.prefetch_size);
        let suffix = self
            .reader
@@ -208,8 +326,28 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
                .read(metadata_start..self.file_size - BLOOM_META_LEN_SIZE)
                .await
                .context(IoSnafu)?;
+
+            if let Some(m) = metrics {
+                // suffix read + meta read
+                m.total_ranges += 2;
+                // Ignores the meta length size to simplify the calculation.
+                m.total_bytes += self.file_size.min(self.prefetch_size) + length;
+                if let Some(start) = start {
+                    m.fetch_elapsed += start.elapsed();
+                }
+            }
+
            BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
        } else {
+            if let Some(m) = metrics {
+                // suffix read only
+                m.total_ranges += 1;
+                m.total_bytes += self.file_size.min(self.prefetch_size);
+                if let Some(start) = start {
+                    m.fetch_elapsed += start.elapsed();
+                }
+            }
+
            let metadata_start = self.file_size - length - BLOOM_META_LEN_SIZE - meta_start;
            let meta = &suffix[metadata_start as usize..suffix_len - BLOOM_META_LEN_SIZE as usize];
            BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
@@ -290,7 +428,7 @@ mod tests {
        for prefetch in [0u64, file_size / 2, file_size, file_size + 10] {
            let mut reader =
                BloomFilterMetaReader::new(bytes.clone(), file_size as _, Some(prefetch));
-            let meta = reader.metadata().await.unwrap();
+            let meta = reader.metadata(None).await.unwrap();

            assert_eq!(meta.rows_per_segment, 2);
            assert_eq!(meta.segment_count, 2);
@@ -312,11 +450,11 @@ mod tests {
        let bytes = mock_bloom_filter_bytes().await;

        let reader = BloomFilterReaderImpl::new(bytes);
-        let meta = reader.metadata().await.unwrap();
+        let meta = reader.metadata(None).await.unwrap();

        assert_eq!(meta.bloom_filter_locs.len(), 2);
        let bf = reader
-            .bloom_filter(&meta.bloom_filter_locs[0])
+            .bloom_filter(&meta.bloom_filter_locs[0], None)
            .await
            .unwrap();
        assert!(bf.contains(&b"a"));
@@ -325,7 +463,7 @@ mod tests {
        assert!(bf.contains(&b"d"));

        let bf = reader
-            .bloom_filter(&meta.bloom_filter_locs[1])
+            .bloom_filter(&meta.bloom_filter_locs[1], None)
            .await
            .unwrap();
        assert!(bf.contains(&b"e"));
--- a/src/index/src/fulltext_index/tests.rs
+++ b/src/index/src/fulltext_index/tests.rs
@@ -74,7 +74,7 @@ async fn test_search(
    writer.finish().await.unwrap();

    let reader = puffin_manager.reader(&file_name).await.unwrap();
-    let index_dir = reader.dir(&blob_key).await.unwrap();
+    let (index_dir, _metrics) = reader.dir(&blob_key).await.unwrap();
    let searcher = TantivyFulltextIndexSearcher::new(index_dir.path(), config).unwrap();
    for (query, expected) in query_expected {
        let results = searcher.search(query).await.unwrap();
--- a/src/index/src/inverted_index/format/reader.rs
+++ b/src/index/src/inverted_index/format/reader.rs
@@ -15,6 +15,7 @@
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
+use std::time::Duration;

 use async_trait::async_trait;
 use bytes::Bytes;
@@ -29,37 +30,115 @@ pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
 mod blob;
 mod footer;

+/// Counters describing the I/O performed by inverted index read operations.
+#[derive(Default, Clone)]
+pub struct InvertedIndexReadMetrics {
+    /// Total number of bytes requested from the underlying source.
+    pub total_bytes: u64,
+    /// Total number of byte ranges requested from the underlying source.
+    pub total_ranges: usize,
+    /// Accumulated elapsed time spent fetching data.
+    pub fetch_elapsed: Duration,
+    /// Number of cache hits.
+    pub cache_hit: usize,
+    /// Number of cache misses.
+    pub cache_miss: usize,
+}
+
+impl std::fmt::Debug for InvertedIndexReadMetrics {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            total_bytes,
+            total_ranges,
+            fetch_elapsed,
+            cache_hit,
+            cache_miss,
+        } = self;
+        // Nothing was fetched and nothing was served from cache: print `{}`. Note that
+        // this also hides `total_ranges`, `fetch_elapsed` and `cache_miss` if they are set.
+        if *total_bytes == 0 && *cache_hit == 0 {
+            return write!(f, "{{}}");
+        }
+        write!(f, "{{")?;
+
+        if *total_bytes > 0 {
+            write!(f, "\"total_bytes\":{}", total_bytes)?;
+        }
+        if *cache_hit > 0 {
+            if *total_bytes > 0 {
+                write!(f, ", ")?;
+            }
+            write!(f, "\"cache_hit\":{}", cache_hit)?;
+        }
+        // One of the two fields above was written, so the rest can always lead with ", ".
+        if *total_ranges > 0 {
+            write!(f, ", \"total_ranges\":{}", total_ranges)?;
+        }
+        if !fetch_elapsed.is_zero() {
+            write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
+        }
+        if *cache_miss > 0 {
+            write!(f, ", \"cache_miss\":{}", cache_miss)?;
+        }
+
+        write!(f, "}}")
+    }
+}
+
+impl InvertedIndexReadMetrics {
+    /// Adds every counter and the elapsed time of `other` onto `self`; `other` is unchanged.
+    pub fn merge_from(&mut self, other: &Self) {
+        self.total_bytes += other.total_bytes;
+        self.total_ranges += other.total_ranges;
+        self.fetch_elapsed += other.fetch_elapsed;
+        self.cache_hit += other.cache_hit;
+        self.cache_miss += other.cache_miss;
+    }
+}
+
 /// InvertedIndexReader defines an asynchronous reader of inverted index data
 #[mockall::automock]
 #[async_trait]
 pub trait InvertedIndexReader: Send + Sync {
    /// Seeks to given offset and reads data with exact size as provided.
-    async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>>;
+    async fn range_read<'a>(
+        &self,
+        offset: u64,
+        size: u32,
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Vec<u8>>;

    /// Reads the bytes in the given ranges.
-    async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
-        let mut result = Vec::with_capacity(ranges.len());
-        for range in ranges {
-            let data = self
-                .range_read(range.start, (range.end - range.start) as u32)
-                .await?;
-            result.push(Bytes::from(data));
-        }
-        Ok(result)
-    }
+    async fn read_vec<'a>(
+        &self,
+        ranges: &[Range<u64>],
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Vec<Bytes>>;

    /// Retrieves metadata of all inverted indices stored within the blob.
-    async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>>;
+    async fn metadata<'a>(
+        &self,
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Arc<InvertedIndexMetas>>;

    /// Retrieves the finite state transducer (FST) map from the given offset and size.
-    async fn fst(&self, offset: u64, size: u32) -> Result<FstMap> {
-        let fst_data = self.range_read(offset, size).await?;
+    async fn fst<'a>(
+        &self,
+        offset: u64,
+        size: u32,
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<FstMap> {
+        let fst_data = self.range_read(offset, size, metrics).await?;
        FstMap::new(fst_data).context(DecodeFstSnafu)
    }

    /// Retrieves the multiple finite state transducer (FST) maps from the given ranges.
-    async fn fst_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<FstMap>> {
-        self.read_vec(ranges)
+    async fn fst_vec<'a>(
+        &mut self,
+        ranges: &[Range<u64>],
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Vec<FstMap>> {
+        self.read_vec(ranges, metrics)
            .await?
            .into_iter()
            .map(|bytes| FstMap::new(bytes.to_vec()).context(DecodeFstSnafu))
@@ -67,19 +146,28 @@ pub trait InvertedIndexReader: Send + Sync {
    }

    /// Retrieves the bitmap from the given offset and size.
-    async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
-        self.range_read(offset, size).await.and_then(|bytes| {
-            Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
-        })
+    async fn bitmap<'a>(
+        &self,
+        offset: u64,
+        size: u32,
+        bitmap_type: BitmapType,
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Bitmap> {
+        self.range_read(offset, size, metrics)
+            .await
+            .and_then(|bytes| {
+                Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
+            })
    }

    /// Retrieves the multiple bitmaps from the given ranges.
-    async fn bitmap_deque(
+    async fn bitmap_deque<'a>(
        &mut self,
        ranges: &[(Range<u64>, BitmapType)],
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
    ) -> Result<VecDeque<Bitmap>> {
        let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
-        let bytes = self.read_vec(&ranges).await?;
+        let bytes = self.read_vec(&ranges, metrics).await?;
        bytes
            .into_iter()
            .zip(types)
--- a/src/index/src/inverted_index/format/reader/blob.rs
+++ b/src/index/src/inverted_index/format/reader/blob.rs
@@ -14,6 +14,7 @@

 use std::ops::Range;
 use std::sync::Arc;
+use std::time::Instant;

 use async_trait::async_trait;
 use bytes::Bytes;
@@ -23,10 +24,10 @@ use snafu::{ResultExt, ensure};

 use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu};
 use crate::inverted_index::format::MIN_BLOB_SIZE;
-use crate::inverted_index::format::reader::InvertedIndexReader;
 use crate::inverted_index::format::reader::footer::{
    DEFAULT_PREFETCH_SIZE, InvertedIndexFooterReader,
 };
+use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};

 /// Inverted index blob reader, implements [`InvertedIndexReader`]
 pub struct InvertedIndexBlobReader<R> {
@@ -53,27 +54,58 @@ impl<R> InvertedIndexBlobReader<R> {

 #[async_trait]
 impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
-    async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>> {
+    async fn range_read<'a>(
+        &self,
+        offset: u64,
+        size: u32,
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Vec<u8>> {
+        // Start the timer only when the caller supplied metrics to fill in.
+        let start = metrics.as_ref().map(|_| Instant::now());
         let buf = self
             .source
             .read(offset..offset + size as u64)
             .await
             .context(CommonIoSnafu)?;
+        // Reached only on success: the `?` above returns early on a read error.
+        if let Some((m, start)) = metrics.zip(start) {
+            m.total_bytes += size as u64;
+            m.total_ranges += 1;
+            m.fetch_elapsed += start.elapsed();
+        }
+
         Ok(buf.into())
     }
 
-    async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
-        self.source.read_vec(ranges).await.context(CommonIoSnafu)
+    async fn read_vec<'a>(
+        &self,
+        ranges: &[Range<u64>],
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Vec<Bytes>> {
+        let start = metrics.as_ref().map(|_| Instant::now());
+        // All ranges are fetched through a single call on the underlying source.
+        let result = self.source.read_vec(ranges).await.context(CommonIoSnafu)?;
+        // Account for the requested range lengths; skipped when the read above failed.
+        if let Some((m, start)) = metrics.zip(start) {
+            m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
+            m.total_ranges += ranges.len();
+            m.fetch_elapsed += start.elapsed();
+        }
+
+        Ok(result)
     }
 
-    async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>> {
+    async fn metadata<'a>(
+        &self,
+        metrics: Option<&'a mut InvertedIndexReadMetrics>,
+    ) -> Result<Arc<InvertedIndexMetas>> {
         let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
         let blob_size = metadata.content_length;
         Self::validate_blob_size(blob_size)?;

         let mut footer_reader = InvertedIndexFooterReader::new(&self.source, blob_size)
             .with_prefetch_size(DEFAULT_PREFETCH_SIZE);
-        footer_reader.metadata().await.map(Arc::new)
+        footer_reader.metadata(metrics).await.map(Arc::new)
     }
 }

@@ -173,7 +205,7 @@ mod tests {
        let blob = create_inverted_index_blob();
        let blob_reader = InvertedIndexBlobReader::new(blob);

-        let metas = blob_reader.metadata().await.unwrap();
+        let metas = blob_reader.metadata(None).await.unwrap();
        assert_eq!(metas.metas.len(), 2);

        let meta0 = metas.metas.get("tag0").unwrap();
@@ -200,13 +232,14 @@ mod tests {
        let blob = create_inverted_index_blob();
        let blob_reader = InvertedIndexBlobReader::new(blob);

-        let metas = blob_reader.metadata().await.unwrap();
+        let metas = blob_reader.metadata(None).await.unwrap();
        let meta = metas.metas.get("tag0").unwrap();

        let fst_map = blob_reader
            .fst(
                meta.base_offset + meta.relative_fst_offset as u64,
                meta.fst_size,
+                None,
            )
            .await
            .unwrap();
@@ -219,6 +252,7 @@ mod tests {
            .fst(
                meta.base_offset + meta.relative_fst_offset as u64,
                meta.fst_size,
+                None,
            )
            .await
            .unwrap();
@@ -232,30 +266,30 @@ mod tests {
        let blob = create_inverted_index_blob();
        let blob_reader = InvertedIndexBlobReader::new(blob);

-        let metas = blob_reader.metadata().await.unwrap();
+        let metas = blob_reader.metadata(None).await.unwrap();
        let meta = metas.metas.get("tag0").unwrap();

        let bitmap = blob_reader
-            .bitmap(meta.base_offset, 26, BitmapType::Roaring)
+            .bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
            .await
            .unwrap();
        assert_eq!(bitmap, mock_bitmap());
        let bitmap = blob_reader
-            .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
+            .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
            .await
            .unwrap();
        assert_eq!(bitmap, mock_bitmap());

-        let metas = blob_reader.metadata().await.unwrap();
+        let metas = blob_reader.metadata(None).await.unwrap();
        let meta = metas.metas.get("tag1").unwrap();

        let bitmap = blob_reader
-            .bitmap(meta.base_offset, 26, BitmapType::Roaring)
+            .bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
            .await
            .unwrap();
        assert_eq!(bitmap, mock_bitmap());
        let bitmap = blob_reader
-            .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
+            .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
            .await
            .unwrap();
        assert_eq!(bitmap, mock_bitmap());
--- a/src/index/src/inverted_index/format/reader/footer.rs
+++ b/src/index/src/inverted_index/format/reader/footer.rs
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::time::Instant;
+
 use common_base::range_read::RangeReader;
 use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
 use prost::Message;
@@ -23,6 +25,7 @@ use crate::inverted_index::error::{
    UnexpectedZeroSegmentRowCountSnafu,
 };
 use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE;
+use crate::inverted_index::format::reader::InvertedIndexReadMetrics;

 pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB

@@ -54,12 +57,17 @@ impl<R> InvertedIndexFooterReader<R> {
 }

 impl<R: RangeReader> InvertedIndexFooterReader<R> {
-    pub async fn metadata(&mut self) -> Result<InvertedIndexMetas> {
+    pub async fn metadata(
+        &mut self,
+        mut metrics: Option<&mut InvertedIndexReadMetrics>,
+    ) -> Result<InvertedIndexMetas> {
        ensure!(
            self.blob_size >= FOOTER_PAYLOAD_SIZE_SIZE,
            BlobSizeTooSmallSnafu
        );

+        let start = metrics.as_ref().map(|_| Instant::now());
+
        let footer_start = self.blob_size.saturating_sub(self.prefetch_size());
        let suffix = self
            .source
@@ -73,19 +81,36 @@ impl<R: RangeReader> InvertedIndexFooterReader<R> {
        let footer_size = FOOTER_PAYLOAD_SIZE_SIZE;

        // Did not fetch the entire file metadata in the initial read, need to make a second request.
-        if length > suffix_len as u64 - footer_size {
+        let result = if length > suffix_len as u64 - footer_size {
            let metadata_start = self.blob_size - length - footer_size;
            let meta = self
                .source
                .read(metadata_start..self.blob_size - footer_size)
                .await
                .context(CommonIoSnafu)?;
+
+            if let Some(m) = metrics.as_deref_mut() {
+                m.total_bytes += self.blob_size.min(self.prefetch_size()) + length;
+                m.total_ranges += 2;
+            }
+
            self.parse_payload(&meta, length)
        } else {
+            if let Some(m) = metrics.as_deref_mut() {
+                m.total_bytes += self.blob_size.min(self.prefetch_size());
+                m.total_ranges += 1;
+            }
+
            let metadata_start = self.blob_size - length - footer_size - footer_start;
            let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize];
            self.parse_payload(meta, length)
+        };
+
+        if let Some(m) = metrics {
+            m.fetch_elapsed += start.unwrap().elapsed();
        }
+
+        result
    }

    fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> {
@@ -186,7 +211,7 @@ mod tests {
                reader = reader.with_prefetch_size(prefetch);
            }

-            let metas = reader.metadata().await.unwrap();
+            let metas = reader.metadata(None).await.unwrap();
            assert_eq!(metas.metas.len(), 1);
            let index_meta = &metas.metas.get("test").unwrap();
            assert_eq!(index_meta.name, "test");
@@ -210,7 +235,7 @@ mod tests {
                reader = reader.with_prefetch_size(prefetch);
            }

-            let result = reader.metadata().await;
+            let result = reader.metadata(None).await;
            assert_matches!(result, Err(Error::UnexpectedFooterPayloadSize { .. }));
        }
    }
@@ -233,7 +258,7 @@ mod tests {
                reader = reader.with_prefetch_size(prefetch);
            }

-            let result = reader.metadata().await;
+            let result = reader.metadata(None).await;
            assert_matches!(result, Err(Error::UnexpectedOffsetSize { .. }));
        }
    }
--- a/src/index/src/inverted_index/format/writer/blob.rs
+++ b/src/index/src/inverted_index/format/writer/blob.rs
@@ -122,7 +122,7 @@ mod tests {
            .unwrap();

        let reader = InvertedIndexBlobReader::new(blob);
-        let metadata = reader.metadata().await.unwrap();
+        let metadata = reader.metadata(None).await.unwrap();
        assert_eq!(metadata.total_row_count, 8);
        assert_eq!(metadata.segment_row_count, 1);
        assert_eq!(metadata.metas.len(), 0);
@@ -182,7 +182,7 @@ mod tests {
            .unwrap();

        let reader = InvertedIndexBlobReader::new(blob);
-        let metadata = reader.metadata().await.unwrap();
+        let metadata = reader.metadata(None).await.unwrap();
        assert_eq!(metadata.total_row_count, 8);
        assert_eq!(metadata.segment_row_count, 1);
        assert_eq!(metadata.metas.len(), 2);
@@ -198,13 +198,19 @@ mod tests {
            .fst(
                tag0.base_offset + tag0.relative_fst_offset as u64,
                tag0.fst_size,
+                None,
            )
            .await
            .unwrap();
        assert_eq!(fst0.len(), 3);
        let [offset, size] = unpack(fst0.get(b"a").unwrap());
        let bitmap = reader
-            .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
+            .bitmap(
+                tag0.base_offset + offset as u64,
+                size,
+                BitmapType::Roaring,
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
@@ -213,7 +219,12 @@ mod tests {
        );
        let [offset, size] = unpack(fst0.get(b"b").unwrap());
        let bitmap = reader
-            .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
+            .bitmap(
+                tag0.base_offset + offset as u64,
+                size,
+                BitmapType::Roaring,
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
@@ -222,7 +233,12 @@ mod tests {
        );
        let [offset, size] = unpack(fst0.get(b"c").unwrap());
        let bitmap = reader
-            .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
+            .bitmap(
+                tag0.base_offset + offset as u64,
+                size,
+                BitmapType::Roaring,
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
@@ -241,13 +257,19 @@ mod tests {
            .fst(
                tag1.base_offset + tag1.relative_fst_offset as u64,
                tag1.fst_size,
+                None,
            )
            .await
            .unwrap();
        assert_eq!(fst1.len(), 3);
        let [offset, size] = unpack(fst1.get(b"x").unwrap());
        let bitmap = reader
-            .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
+            .bitmap(
+                tag1.base_offset + offset as u64,
+                size,
+                BitmapType::Roaring,
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
@@ -256,7 +278,12 @@ mod tests {
        );
        let [offset, size] = unpack(fst1.get(b"y").unwrap());
        let bitmap = reader
-            .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
+            .bitmap(
+                tag1.base_offset + offset as u64,
+                size,
+                BitmapType::Roaring,
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
@@ -265,7 +292,12 @@ mod tests {
        );
        let [offset, size] = unpack(fst1.get(b"z").unwrap());
        let bitmap = reader
-            .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
+            .bitmap(
+                tag1.base_offset + offset as u64,
+                size,
+                BitmapType::Roaring,
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
--- a/src/index/src/inverted_index/search/fst_values_mapper.rs
+++ b/src/index/src/inverted_index/search/fst_values_mapper.rs
@@ -16,7 +16,7 @@ use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};

 use crate::bitmap::Bitmap;
 use crate::inverted_index::error::Result;
-use crate::inverted_index::format::reader::InvertedIndexReader;
+use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};

 /// `ParallelFstValuesMapper` enables parallel mapping of multiple FST value groups to their
 /// corresponding bitmaps within an inverted index.
@@ -35,7 +35,8 @@ impl<'a> ParallelFstValuesMapper<'a> {

    pub async fn map_values_vec(
        &mut self,
-        value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
+        value_and_meta_vec: &[(Vec<u64>, &InvertedIndexMeta)],
+        metrics: Option<&mut InvertedIndexReadMetrics>,
    ) -> Result<Vec<Bitmap>> {
        let groups = value_and_meta_vec
            .iter()
@@ -64,7 +65,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
        }

        common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
-        let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges).await?;
+        let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges, metrics).await?;
        let mut output = Vec::with_capacity(groups.len());

        for counter in groups {
@@ -95,23 +96,25 @@ mod tests {
    #[tokio::test]
    async fn test_map_values_vec() {
        let mut mock_reader = MockInvertedIndexReader::new();
-        mock_reader.expect_bitmap_deque().returning(|ranges| {
-            let mut output = VecDeque::new();
-            for (range, bitmap_type) in ranges {
-                let offset = range.start;
-                let size = range.end - range.start;
-                match (offset, size, bitmap_type) {
-                    (1, 1, BitmapType::Roaring) => {
-                        output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
+        mock_reader
+            .expect_bitmap_deque()
+            .returning(|ranges, _metrics| {
+                let mut output = VecDeque::new();
+                for (range, bitmap_type) in ranges {
+                    let offset = range.start;
+                    let size = range.end - range.start;
+                    match (offset, size, bitmap_type) {
+                        (1, 1, BitmapType::Roaring) => {
+                            output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
+                        }
+                        (2, 1, BitmapType::Roaring) => {
+                            output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
+                        }
+                        _ => unreachable!(),
                    }
-                    (2, 1, BitmapType::Roaring) => {
-                        output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
-                    }
-                    _ => unreachable!(),
                }
-            }
-            Ok(output)
-        });
+                Ok(output)
+            });

        let meta = InvertedIndexMeta {
            bitmap_type: BitmapType::Roaring.into(),
@@ -120,13 +123,13 @@ mod tests {
        let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);

        let result = values_mapper
-            .map_values_vec(&[(vec![], &meta)])
+            .map_values_vec(&[(vec![], &meta)], None)
            .await
            .unwrap();
        assert_eq!(result[0].count_ones(), 0);

        let result = values_mapper
-            .map_values_vec(&[(vec![value(1, 1)], &meta)])
+            .map_values_vec(&[(vec![value(1, 1)], &meta)], None)
            .await
            .unwrap();
        assert_eq!(
@@ -135,7 +138,7 @@ mod tests {
        );

        let result = values_mapper
-            .map_values_vec(&[(vec![value(2, 1)], &meta)])
+            .map_values_vec(&[(vec![value(2, 1)], &meta)], None)
            .await
            .unwrap();
        assert_eq!(
@@ -144,7 +147,7 @@ mod tests {
        );

        let result = values_mapper
-            .map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
+            .map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)], None)
            .await
            .unwrap();
        assert_eq!(
@@ -153,7 +156,7 @@ mod tests {
        );

        let result = values_mapper
-            .map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
+            .map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)], None)
            .await
            .unwrap();
        assert_eq!(
@@ -162,7 +165,10 @@ mod tests {
        );

        let result = values_mapper
-            .map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
+            .map_values_vec(
+                &[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)],
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
@@ -174,10 +180,13 @@ mod tests {
            Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
        );
        let result = values_mapper
-            .map_values_vec(&[
-                (vec![value(2, 1), value(1, 1)], &meta),
-                (vec![value(1, 1)], &meta),
-            ])
+            .map_values_vec(
+                &[
+                    (vec![value(2, 1), value(1, 1)], &meta),
+                    (vec![value(1, 1)], &meta),
+                ],
+                None,
+            )
            .await
            .unwrap();
        assert_eq!(
--- a/src/index/src/inverted_index/search/index_apply.rs
+++ b/src/index/src/inverted_index/search/index_apply.rs
@@ -19,7 +19,7 @@ pub use predicates_apply::PredicatesIndexApplier;

 use crate::bitmap::Bitmap;
 use crate::inverted_index::error::Result;
-use crate::inverted_index::format::reader::InvertedIndexReader;
+use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};

 /// The output of an apply operation.
 #[derive(Clone, Debug, PartialEq)]
@@ -44,10 +44,11 @@ pub trait IndexApplier: Send + Sync {
    /// Applies the predefined predicates to the data read by the given index reader, returning
    /// a list of relevant indices (e.g., post IDs, group IDs, row IDs).
    #[allow(unused_parens)]
-    async fn apply<'a>(
+    async fn apply<'a, 'b>(
        &self,
        context: SearchContext,
        reader: &mut (dyn InvertedIndexReader + 'a),
+        metrics: Option<&'b mut InvertedIndexReadMetrics>,
    ) -> Result<ApplyOutput>;

    /// Returns the memory usage of the applier.
--- a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs
+++ b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs
@@ -19,7 +19,7 @@ use greptime_proto::v1::index::InvertedIndexMetas;

 use crate::bitmap::Bitmap;
 use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
-use crate::inverted_index::format::reader::InvertedIndexReader;
+use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
 use crate::inverted_index::search::fst_apply::{
    FstApplier, IntersectionFstApplier, KeysFstApplier,
 };
@@ -43,12 +43,14 @@ pub struct PredicatesIndexApplier {
 impl IndexApplier for PredicatesIndexApplier {
    /// Applies all `FstApplier`s to the data in the inverted index reader, intersecting the individual
    /// bitmaps obtained for each index to result in a final set of indices.
-    async fn apply<'a>(
+    async fn apply<'a, 'b>(
        &self,
        context: SearchContext,
        reader: &mut (dyn InvertedIndexReader + 'a),
+        metrics: Option<&'b mut InvertedIndexReadMetrics>,
    ) -> Result<ApplyOutput> {
-        let metadata = reader.metadata().await?;
+        let mut metrics = metrics;
+        let metadata = reader.metadata(metrics.as_deref_mut()).await?;
        let mut output = ApplyOutput {
            matched_segment_ids: Bitmap::new_bitvec(),
            total_row_count: metadata.total_row_count as _,
@@ -84,7 +86,7 @@ impl IndexApplier for PredicatesIndexApplier {
            return Ok(output);
        }

-        let fsts = reader.fst_vec(&fst_ranges).await?;
+        let fsts = reader.fst_vec(&fst_ranges, metrics.as_deref_mut()).await?;
        let value_and_meta_vec = fsts
            .into_iter()
            .zip(appliers)
@@ -92,7 +94,7 @@ impl IndexApplier for PredicatesIndexApplier {
            .collect::<Vec<_>>();

        let mut mapper = ParallelFstValuesMapper::new(reader);
-        let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
+        let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec, metrics).await?;

        let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
        for bm in bm_vec {
@@ -221,26 +223,28 @@ mod tests {
        let mut mock_reader = MockInvertedIndexReader::new();
        mock_reader
            .expect_metadata()
-            .returning(|| Ok(mock_metas([("tag-0", 0)])));
-        mock_reader.expect_fst_vec().returning(|_ranges| {
+            .returning(|_| Ok(mock_metas([("tag-0", 0)])));
+        mock_reader.expect_fst_vec().returning(|_ranges, _metrics| {
            Ok(vec![
                FstMap::from_iter([(b"tag-0_value-0", fst_value(2, 1))]).unwrap(),
            ])
        });

-        mock_reader.expect_bitmap_deque().returning(|arg| {
-            assert_eq!(arg.len(), 1);
-            let range = &arg[0].0;
-            let bitmap_type = arg[0].1;
-            assert_eq!(*range, 2..3);
-            assert_eq!(bitmap_type, BitmapType::Roaring);
-            Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
-                &[0b10101010],
-                bitmap_type,
-            )]))
-        });
+        mock_reader
+            .expect_bitmap_deque()
+            .returning(|arg, _metrics| {
+                assert_eq!(arg.len(), 1);
+                let range = &arg[0].0;
+                let bitmap_type = arg[0].1;
+                assert_eq!(*range, 2..3);
+                assert_eq!(bitmap_type, BitmapType::Roaring);
+                Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
+                    &[0b10101010],
+                    bitmap_type,
+                )]))
+            });
        let output = applier
-            .apply(SearchContext::default(), &mut mock_reader)
+            .apply(SearchContext::default(), &mut mock_reader, None)
            .await
            .unwrap();
        assert_eq!(
@@ -252,14 +256,14 @@ mod tests {
        let mut mock_reader = MockInvertedIndexReader::new();
        mock_reader
            .expect_metadata()
-            .returning(|| Ok(mock_metas([("tag-0", 0)])));
-        mock_reader.expect_fst_vec().returning(|_range| {
+            .returning(|_| Ok(mock_metas([("tag-0", 0)])));
+        mock_reader.expect_fst_vec().returning(|_range, _metrics| {
            Ok(vec![
                FstMap::from_iter([(b"tag-0_value-1", fst_value(2, 1))]).unwrap(),
            ])
        });
        let output = applier
-            .apply(SearchContext::default(), &mut mock_reader)
+            .apply(SearchContext::default(), &mut mock_reader, None)
            .await
            .unwrap();
        assert_eq!(output.matched_segment_ids.count_ones(), 0);
@@ -279,8 +283,8 @@ mod tests {
        let mut mock_reader = MockInvertedIndexReader::new();
        mock_reader
            .expect_metadata()
-            .returning(|| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
-        mock_reader.expect_fst_vec().returning(|ranges| {
+            .returning(|_| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
+        mock_reader.expect_fst_vec().returning(|ranges, _metrics| {
            let mut output = vec![];
            for range in ranges {
                match range.start {
@@ -293,27 +297,29 @@ mod tests {
            }
            Ok(output)
        });
-        mock_reader.expect_bitmap_deque().returning(|ranges| {
-            let mut output = VecDeque::new();
-            for (range, bitmap_type) in ranges {
-                let offset = range.start;
-                let size = range.end - range.start;
-                match (offset, size, bitmap_type) {
-                    (1, 1, BitmapType::Roaring) => {
-                        output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
+        mock_reader
+            .expect_bitmap_deque()
+            .returning(|ranges, _metrics| {
+                let mut output = VecDeque::new();
+                for (range, bitmap_type) in ranges {
+                    let offset = range.start;
+                    let size = range.end - range.start;
+                    match (offset, size, bitmap_type) {
+                        (1, 1, BitmapType::Roaring) => {
+                            output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
+                        }
+                        (2, 1, BitmapType::Roaring) => {
+                            output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
+                        }
+                        _ => unreachable!(),
                    }
-                    (2, 1, BitmapType::Roaring) => {
-                        output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
-                    }
-                    _ => unreachable!(),
                }
-            }

-            Ok(output)
-        });
+                Ok(output)
+            });

        let output = applier
-            .apply(SearchContext::default(), &mut mock_reader)
+            .apply(SearchContext::default(), &mut mock_reader, None)
            .await
            .unwrap();
        assert_eq!(
@@ -331,10 +337,10 @@ mod tests {
        let mut mock_reader: MockInvertedIndexReader = MockInvertedIndexReader::new();
        mock_reader
            .expect_metadata()
-            .returning(|| Ok(mock_metas([("tag-0", 0)])));
+            .returning(|_| Ok(mock_metas([("tag-0", 0)])));

        let output = applier
-            .apply(SearchContext::default(), &mut mock_reader)
+            .apply(SearchContext::default(), &mut mock_reader, None)
            .await
            .unwrap();
        assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
@@ -343,7 +349,7 @@ mod tests {
    #[tokio::test]
    async fn test_index_applier_with_empty_index() {
        let mut mock_reader = MockInvertedIndexReader::new();
-        mock_reader.expect_metadata().returning(move || {
+        mock_reader.expect_metadata().returning(move |_| {
            Ok(Arc::new(InvertedIndexMetas {
                total_row_count: 0, // No rows
                segment_row_count: 1,
@@ -359,7 +365,7 @@ mod tests {
        };

        let output = applier
-            .apply(SearchContext::default(), &mut mock_reader)
+            .apply(SearchContext::default(), &mut mock_reader, None)
            .await
            .unwrap();
        assert!(output.matched_segment_ids.is_empty());
@@ -370,7 +376,7 @@ mod tests {
        let mut mock_reader = MockInvertedIndexReader::new();
        mock_reader
            .expect_metadata()
-            .returning(|| Ok(mock_metas(vec![])));
+            .returning(|_| Ok(mock_metas(vec![])));

        let mut mock_fst_applier = MockFstApplier::new();
        mock_fst_applier.expect_apply().never();
@@ -385,6 +391,7 @@ mod tests {
                    index_not_found_strategy: IndexNotFoundStrategy::ThrowError,
                },
                &mut mock_reader,
+                None,
            )
            .await;
        assert!(matches!(result, Err(Error::IndexNotFound { .. })));
@@ -395,6 +402,7 @@ mod tests {
                    index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty,
                },
                &mut mock_reader,
+                None,
            )
            .await
            .unwrap();
@@ -406,6 +414,7 @@ mod tests {
                    index_not_found_strategy: IndexNotFoundStrategy::Ignore,
                },
                &mut mock_reader,
+                None,
            )
            .await
            .unwrap();
--- a/src/index/src/lib.rs
+++ b/src/index/src/lib.rs
@@ -22,6 +22,8 @@ pub mod external_provider;
 pub mod fulltext_index;
 pub mod inverted_index;
 pub mod target;
+#[cfg(feature = "vector_index")]
+pub mod vector;

 pub type Bytes = Vec<u8>;
 pub type BytesRef<'a> = &'a [u8];
--- a/src/index/src/vector.rs
+++ b/src/index/src/vector.rs
@@ -0,0 +1,163 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Vector index types and options.
+//!
+//! This module re-exports types from `datatypes` and provides conversions
+//! to USearch types, as well as distance computation functions.
+
+pub use datatypes::schema::{VectorDistanceMetric, VectorIndexOptions};
+use nalgebra::DVectorView;
+pub use usearch::MetricKind;
+
+/// Maps a [`VectorDistanceMetric`] onto the equivalent USearch [`MetricKind`].
+///
+/// The mapping is one-to-one: `L2sq` -> `L2sq`, `Cosine` -> `Cos` and
+/// `InnerProduct` -> `IP`.
+pub fn distance_metric_to_usearch(metric: VectorDistanceMetric) -> MetricKind {
+    use VectorDistanceMetric::{Cosine, InnerProduct, L2sq};
+
+    match metric {
+        L2sq => MetricKind::L2sq,
+        Cosine => MetricKind::Cos,
+        InnerProduct => MetricKind::IP,
+    }
+}
+
+/// Returns the distance between `v1` and `v2` under the given `metric`.
+///
+/// * `L2sq` - squared Euclidean distance.
+/// * `Cosine` - cosine distance (`1 - cosine similarity`).
+/// * `InnerProduct` - the negated dot product, so a larger inner product
+///   yields a smaller distance.
+///
+/// If either slice is empty the result is 0.0 regardless of the metric.
+///
+/// **Note:** The caller must ensure that the two vectors have the same length;
+/// this function does not check it.
+pub fn compute_distance(v1: &[f32], v2: &[f32], metric: VectorDistanceMetric) -> f32 {
+    // There is nothing to compare when an operand has no components.
+    let degenerate = v1.is_empty() || v2.is_empty();
+
+    if degenerate {
+        0.0
+    } else {
+        match metric {
+            VectorDistanceMetric::L2sq => l2sq(v1, v2),
+            VectorDistanceMetric::Cosine => cosine(v1, v2),
+            VectorDistanceMetric::InnerProduct => -dot(v1, v2),
+        }
+    }
+}
+
+/// Squared Euclidean (L2) distance: the squared norm of `lhs - rhs`.
+fn l2sq(lhs: &[f32], rhs: &[f32]) -> f32 {
+    let a = DVectorView::from_slice(lhs, lhs.len());
+    let b = DVectorView::from_slice(rhs, rhs.len());
+    let delta = a - b;
+    delta.norm_squared()
+}
+
+/// Calculates the cosine distance (`1 - cosine similarity`) between two vectors.
+///
+/// Returns a value nominally in `[0.0, 2.0]`: 0.0 means identical direction, 1.0 means
+/// orthogonal and 2.0 means opposite direction. For degenerate cases (zero or near-zero
+/// magnitude vectors), returns 1.0 to avoid NaN and ensure safe index operations.
+///
+/// Only the norms decide whether an input is degenerate. The raw dot product scales with
+/// both magnitudes, so it is not a valid degeneracy test: two identical short vectors such
+/// as `[1e-4, 0.0]` have a dot product below `f32::EPSILON` yet a well-defined distance
+/// of 0.0. Orthogonal vectors still yield 1.0 through the regular formula.
+fn cosine(lhs: &[f32], rhs: &[f32]) -> f32 {
+    let lhs_vec = DVectorView::from_slice(lhs, lhs.len());
+    let rhs_vec = DVectorView::from_slice(rhs, rhs.len());
+
+    let lhs_norm = lhs_vec.norm();
+    let rhs_norm = rhs_vec.norm();
+
+    // Zero-magnitude vectors have undefined direction; return 1.0 as a safe fallback.
+    if lhs_norm < f32::EPSILON || rhs_norm < f32::EPSILON {
+        return 1.0;
+    }
+
+    let cos_similar = lhs_vec.dot(&rhs_vec) / (lhs_norm * rhs_norm);
+    let res = 1.0 - cos_similar;
+    // Clamp near-zero results to exactly 0.0 to avoid floating-point artifacts.
+    if res.abs() < f32::EPSILON { 0.0 } else { res }
+}
+
+/// Dot (inner) product of `lhs` and `rhs`.
+fn dot(lhs: &[f32], rhs: &[f32]) -> f32 {
+    let left = DVectorView::from_slice(lhs, lhs.len());
+    let right = DVectorView::from_slice(rhs, rhs.len());
+    left.dot(&right)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Tolerance used when comparing computed distances with hand-derived values.
+    const TOLERANCE: f32 = 1e-6;
+
+    #[test]
+    fn test_distance_metric_to_usearch() {
+        // Each metric paired with the USearch kind it must map to.
+        let cases = [
+            (VectorDistanceMetric::L2sq, MetricKind::L2sq),
+            (VectorDistanceMetric::Cosine, MetricKind::Cos),
+            (VectorDistanceMetric::InnerProduct, MetricKind::IP),
+        ];
+        for (metric, expected) in cases {
+            assert_eq!(distance_metric_to_usearch(metric), expected);
+        }
+    }
+
+    #[test]
+    fn test_vector_index_options_default() {
+        let defaults = VectorIndexOptions::default();
+        assert_eq!(defaults.metric, VectorDistanceMetric::L2sq);
+        assert_eq!(defaults.connectivity, 16);
+        assert_eq!(defaults.expansion_add, 128);
+        assert_eq!(defaults.expansion_search, 64);
+    }
+
+    #[test]
+    fn test_compute_distance_l2sq() {
+        let a = [1.0_f32, 2.0, 3.0];
+        let b = [4.0_f32, 5.0, 6.0];
+        // (4-1)^2 + (5-2)^2 + (6-3)^2 = 9 + 9 + 9 = 27
+        let dist = compute_distance(&a, &b, VectorDistanceMetric::L2sq);
+        assert!((dist - 27.0).abs() < TOLERANCE);
+    }
+
+    #[test]
+    fn test_compute_distance_cosine() {
+        let x_axis = [1.0_f32, 0.0, 0.0];
+        let y_axis = [0.0_f32, 1.0, 0.0];
+        // Orthogonal vectors: cosine similarity 0, hence distance 1.
+        let dist = compute_distance(&x_axis, &y_axis, VectorDistanceMetric::Cosine);
+        assert!((dist - 1.0).abs() < TOLERANCE);
+    }
+
+    #[test]
+    fn test_compute_distance_inner_product() {
+        let a = [1.0_f32, 2.0, 3.0];
+        let b = [4.0_f32, 5.0, 6.0];
+        // 1*4 + 2*5 + 3*6 = 32, and the distance is the negated product: -32.
+        let dist = compute_distance(&a, &b, VectorDistanceMetric::InnerProduct);
+        assert!((dist - (-32.0)).abs() < TOLERANCE);
+    }
+
+    #[test]
+    fn test_compute_distance_empty_vectors() {
+        // Empty vectors should return 0.0 uniformly for all metrics.
+        let metrics = [
+            VectorDistanceMetric::L2sq,
+            VectorDistanceMetric::Cosine,
+            VectorDistanceMetric::InnerProduct,
+        ];
+        for metric in metrics {
+            assert_eq!(compute_distance(&[], &[], metric), 0.0);
+        }
+    }
+}
--- a/src/log-store/src/kafka/client_manager.rs
+++ b/src/log-store/src/kafka/client_manager.rs
@@ -16,7 +16,7 @@ use std::collections::HashMap;
 use std::sync::Arc;

 use common_wal::config::kafka::DatanodeKafkaConfig;
-use common_wal::config::kafka::common::DEFAULT_BACKOFF_CONFIG;
+use common_wal::config::kafka::common::{DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT};
 use dashmap::DashMap;
 use rskafka::client::ClientBuilder;
 use rskafka::client::partition::{Compression, PartitionClient, UnknownTopicHandling};
@@ -78,7 +78,8 @@ impl ClientManager {
    ) -> Result<Self> {
        // Sets backoff config for the top-level kafka client and all clients constructed by it.
        let mut builder = ClientBuilder::new(config.connection.broker_endpoints.clone())
-            .backoff_config(DEFAULT_BACKOFF_CONFIG);
+            .backoff_config(DEFAULT_BACKOFF_CONFIG)
+            .connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
        if let Some(sasl) = &config.connection.sasl {
            builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
        };
--- a/src/meta-client/src/client.rs
+++ b/src/meta-client/src/client.rs
@@ -189,6 +189,9 @@ impl MetaClientBuilder {
        let mgr = client.channel_manager.clone();

        if self.enable_heartbeat {
+            if self.heartbeat_channel_manager.is_some() {
+                info!("Enable heartbeat channel using the heartbeat channel manager.");
+            }
            let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone());
            client.heartbeat = Some(HeartbeatClient::new(
                self.id,
--- a/src/meta-client/src/client/ask_leader.rs
+++ b/src/meta-client/src/client/ask_leader.rs
@@ -24,7 +24,7 @@ use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
 use common_telemetry::tracing_context::TracingContext;
 use common_telemetry::warn;
 use rand::seq::SliceRandom;
-use snafu::{OptionExt, ResultExt};
+use snafu::ResultExt;
 use tokio::time::timeout;
 use tonic::transport::Channel;

@@ -101,12 +101,14 @@ impl AskLeader {
        };

        let (tx, mut rx) = tokio::sync::mpsc::channel(peers.len());
+        let channel_manager = self.channel_manager.clone();

        for addr in &peers {
            let mut client = self.create_asker(addr)?;
            let tx_clone = tx.clone();
            let req = req.clone();
            let addr = addr.clone();
+            let channel_manager = channel_manager.clone();
            tokio::spawn(async move {
                match client.ask_leader(req).await {
                    Ok(res) => {
@@ -117,13 +119,19 @@ impl AskLeader {
                        };
                    }
                    Err(status) => {
+                        // Reset cached channel even on generic errors: the VIP may keep us on a dead
+                        // backend, so forcing a reconnect gives us a chance to hit a healthy peer.
+                        Self::reset_channels_with_manager(
+                            &channel_manager,
+                            std::slice::from_ref(&addr),
+                        );
                        warn!("Failed to ask leader from: {addr}, {status}");
                    }
                }
            });
        }

-        let leader = timeout(
+        let leader = match timeout(
            self.channel_manager
                .config()
                .timeout
@@ -131,8 +139,16 @@ impl AskLeader {
            rx.recv(),
        )
        .await
-        .context(error::AskLeaderTimeoutSnafu)?
-        .context(error::NoLeaderSnafu)?;
+        {
+            Ok(Some(leader)) => leader,
+            Ok(None) => return error::NoLeaderSnafu.fail(),
+            Err(e) => {
+                // All peers timed out. Reset channels to force reconnection,
+                // which may help escape dead backends in VIP/LB scenarios.
+                Self::reset_channels_with_manager(&self.channel_manager, &peers);
+                return Err(e).context(error::AskLeaderTimeoutSnafu);
+            }
+        };

        let mut leadership_group = self.leadership_group.write().unwrap();
        leadership_group.leader = Some(leader.clone());
@@ -169,6 +185,15 @@ impl AskLeader {
                .context(error::CreateChannelSnafu)?,
        ))
    }
+
+    /// Evicts the cached channels of `peers` from `channel_manager`; channels whose
+    /// address is not listed are retained. Does nothing when `peers` is empty.
+    fn reset_channels_with_manager(channel_manager: &ChannelManager, peers: &[String]) {
+        if !peers.is_empty() {
+            // Keep a channel only if its address matches none of the given peers.
+            channel_manager.retain_channel(|addr, _| peers.iter().all(|peer| peer != addr));
+        }
+    }
 }

 #[async_trait]
--- a/src/meta-client/src/lib.rs
+++ b/src/meta-client/src/lib.rs
@@ -18,6 +18,10 @@ use std::time::Duration;
 use client::RegionFollowerClientRef;
 use common_base::Plugins;
 use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
+use common_meta::distributed_time_constants::{
+    HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS, HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS,
+    HEARTBEAT_TIMEOUT,
+};
 use common_telemetry::{debug, info};
 use serde::{Deserialize, Serialize};

@@ -34,8 +38,6 @@ pub struct MetaClientOptions {
    #[serde(with = "humantime_serde")]
    pub timeout: Duration,
    #[serde(with = "humantime_serde")]
-    pub heartbeat_timeout: Duration,
-    #[serde(with = "humantime_serde")]
    pub ddl_timeout: Duration,
    #[serde(with = "humantime_serde")]
    pub connect_timeout: Duration,
@@ -52,7 +54,6 @@ impl Default for MetaClientOptions {
        Self {
            metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
            timeout: Duration::from_millis(3_000u64),
-            heartbeat_timeout: Duration::from_millis(500u64),
            ddl_timeout: Duration::from_millis(10_000u64),
            connect_timeout: Duration::from_millis(1_000u64),
            tcp_nodelay: true,
@@ -97,7 +98,11 @@ pub async fn create_meta_client(
        .timeout(meta_client_options.timeout)
        .connect_timeout(meta_client_options.connect_timeout)
        .tcp_nodelay(meta_client_options.tcp_nodelay);
-    let heartbeat_config = base_config.clone();
+    let heartbeat_config = base_config
+        .clone()
+        .timeout(HEARTBEAT_TIMEOUT)
+        .http2_keep_alive_interval(HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS)
+        .http2_keep_alive_timeout(HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS);

    if let MetaClientType::Frontend = client_type {
        let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);
--- a/src/meta-srv/src/bootstrap.rs
+++ b/src/meta-srv/src/bootstrap.rs
@@ -14,6 +14,7 @@

 use std::net::SocketAddr;
 use std::sync::Arc;
+use std::time::Duration;

 use api::v1::meta::cluster_server::ClusterServer;
 use api::v1::meta::heartbeat_server::HeartbeatServer;
@@ -49,16 +50,21 @@ use crate::metasrv::builder::MetasrvBuilder;
 use crate::metasrv::{
    BackendImpl, ElectionRef, Metasrv, MetasrvOptions, SelectTarget, SelectorRef,
 };
-use crate::selector::SelectorType;
 use crate::selector::lease_based::LeaseBasedSelector;
 use crate::selector::load_based::LoadBasedSelector;
 use crate::selector::round_robin::RoundRobinSelector;
 use crate::selector::weight_compute::RegionNumsBasedWeightCompute;
+use crate::selector::{Selector, SelectorType};
 use crate::service::admin;
 use crate::service::admin::admin_axum_router;
 use crate::utils::etcd::create_etcd_client_with_tls;
 use crate::{Result, error};

+/// The default HTTP/2 keep-alive interval applied to the gRPC server built by `router`.
+const DEFAULT_GRPC_KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(10);
+/// The default HTTP/2 keep-alive timeout applied to the gRPC server built by `router`.
+const DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(10);
+
 pub struct MetasrvInstance {
    metasrv: Arc<Metasrv>,

@@ -245,7 +251,12 @@ macro_rules! add_compressed_service {
 }

 pub fn router(metasrv: Arc<Metasrv>) -> Router {
-    let mut router = tonic::transport::Server::builder().accept_http1(true); // for admin services
+    let mut router = tonic::transport::Server::builder()
+        // for admin services
+        .accept_http1(true)
+        // For quick network failures detection.
+        .http2_keepalive_interval(Some(DEFAULT_GRPC_KEEP_ALIVE_INTERVAL))
+        .http2_keepalive_timeout(Some(DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT));
    let router = add_compressed_service!(router, HeartbeatServer::from_arc(metasrv.clone()));
    let router = add_compressed_service!(router, StoreServer::from_arc(metasrv.clone()));
    let router = add_compressed_service!(router, ClusterServer::from_arc(metasrv.clone()));
@@ -280,7 +291,7 @@ pub async fn metasrv_builder(

            use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS;
            use common_meta::kv_backend::rds::PgStore;
-            use deadpool_postgres::Config;
+            use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod};

            use crate::election::rds::postgres::{ElectionPgClient, PgElection};
            use crate::utils::postgres::create_postgres_pool;
@@ -294,9 +305,16 @@ pub async fn metasrv_builder(
            let mut cfg = Config::new();
            cfg.keepalives = Some(true);
            cfg.keepalives_idle = Some(Duration::from_secs(POSTGRES_KEEP_ALIVE_SECS));
-            // We use a separate pool for election since we need a different session keep-alive idle time.
-            let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
-                .await?;
+            cfg.manager = Some(ManagerConfig {
+                recycling_method: RecyclingMethod::Verified,
+            });
+            // Use a dedicated pool for the election client to allow customized session settings.
+            let pool = create_postgres_pool(
+                &opts.store_addrs,
+                Some(cfg.clone()),
+                opts.backend_tls.clone(),
+            )
+            .await?;

            let election_client = ElectionPgClient::new(
                pool,
@@ -316,8 +334,8 @@ pub async fn metasrv_builder(
            )
            .await?;

-            let pool =
-                create_postgres_pool(&opts.store_addrs, None, opts.backend_tls.clone()).await?;
+            let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
+                .await?;
            let kv_backend = PgStore::with_pg_pool(
                pool,
                opts.meta_schema_name.as_deref(),
@@ -393,7 +411,12 @@ pub async fn metasrv_builder(
        info!("Using selector from plugins");
        selector
    } else {
-        let selector = match opts.selector {
+        let selector: Arc<
+            dyn Selector<
+                    Context = crate::metasrv::SelectorContext,
+                    Output = Vec<common_meta::peer::Peer>,
+                >,
+        > = match opts.selector {
            SelectorType::LoadBased => Arc::new(LoadBasedSelector::new(
                RegionNumsBasedWeightCompute,
                meta_peer_client.clone(),
--- a/src/meta-srv/src/election/etcd.rs
+++ b/src/meta-srv/src/election/etcd.rs
@@ -63,22 +63,6 @@ pub struct EtcdElection {
 }

 impl EtcdElection {
-    pub async fn with_endpoints<E, S>(
-        leader_value: E,
-        endpoints: S,
-        store_key_prefix: String,
-    ) -> Result<ElectionRef>
-    where
-        E: AsRef<str>,
-        S: AsRef<[E]>,
-    {
-        let client = Client::connect(endpoints, None)
-            .await
-            .context(error::ConnectEtcdSnafu)?;
-
-        Self::with_etcd_client(leader_value, client, store_key_prefix).await
-    }
-
    pub async fn with_etcd_client<E>(
        leader_value: E,
        client: Client,
--- a/Show More
+++ b/Show More