ingest jsonbench data

parse partial struct json datatype in create sql
chore: expose symbols (#7451)
2025-12-23 06:30:05 +00:00 · 2025-12-22 19:32:03 +08:00 · 2025-12-22 05:39:03 +00:00 · 2025-12-22 05:13:39 +00:00 · 2025-12-19 13:16:15 +00:00 · 2025-12-19 07:36:44 +00:00
196 changed files with 10774 additions and 3306 deletions
--- a/.github/actions/setup-greptimedb-cluster/action.yml
+++ b/.github/actions/setup-greptimedb-cluster/action.yml
@@ -51,7 +51,7 @@ runs:
    run: |
      helm upgrade \
        --install my-greptimedb \
-        --set meta.backendStorage.etcd.endpoints=${{ inputs.etcd-endpoints }} \
+        --set 'meta.backendStorage.etcd.endpoints[0]=${{ inputs.etcd-endpoints }}' \
        --set meta.enableRegionFailover=${{ inputs.enable-region-failover }} \
        --set image.registry=${{ inputs.image-registry }} \
        --set image.repository=${{ inputs.image-repository }}  \
--- a/.github/scripts/create-version.sh
+++ b/.github/scripts/create-version.sh
@@ -49,6 +49,17 @@ function create_version() {
      echo "GITHUB_REF_NAME is empty in push event" >&2
      exit 1
    fi
+    
+    # For tag releases, ensure GITHUB_REF_NAME matches the version in Cargo.toml
+    CARGO_VERSION=$(grep '^version = ' Cargo.toml | cut -d '"' -f 2 | head -n 1)
+    EXPECTED_REF_NAME="v${CARGO_VERSION}"
+    
+    if [ "$GITHUB_REF_NAME" != "$EXPECTED_REF_NAME" ]; then
+      echo "Error: GITHUB_REF_NAME '$GITHUB_REF_NAME' does not match Cargo.toml version 'v${CARGO_VERSION}'" >&2
+      echo "Expected tag name: '$EXPECTED_REF_NAME'" >&2
+      exit 1
+    fi
+    
    echo "$GITHUB_REF_NAME"
  elif [ "$GITHUB_EVENT_NAME" = workflow_dispatch ]; then
    echo "$NEXT_RELEASE_VERSION-$(git rev-parse --short HEAD)-$(date "+%Y%m%d-%s")"
--- a/.github/scripts/deploy-greptimedb.sh
+++ b/.github/scripts/deploy-greptimedb.sh
@@ -81,7 +81,7 @@ function deploy_greptimedb_cluster() {
    --create-namespace \
    --set image.tag="$GREPTIMEDB_IMAGE_TAG" \
    --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \
-    --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
+    --set "meta.backendStorage.etcd.endpoints[0]=etcd.$install_namespace.svc.cluster.local:2379" \
    --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
    -n "$install_namespace"

@@ -119,7 +119,7 @@ function deploy_greptimedb_cluster_with_s3_storage() {
    --create-namespace \
    --set image.tag="$GREPTIMEDB_IMAGE_TAG" \
    --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \
-    --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
+    --set "meta.backendStorage.etcd.endpoints[0]=etcd.$install_namespace.svc.cluster.local:2379" \
    --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
    --set objectStorage.s3.bucket="$AWS_CI_TEST_BUCKET" \
    --set objectStorage.s3.region="$AWS_REGION" \
--- a/.github/workflows/check-git-deps.yml
+++ b/.github/workflows/check-git-deps.yml
@@ -0,0 +1,154 @@
+name: Check Git Dependencies on Main Branch
+
+on:
+    pull_request:
+        branches: [main]
+        paths:
+            - 'Cargo.toml'
+    push:
+        branches: [main]
+        paths:
+            - 'Cargo.toml'
+
+jobs:
+    check-git-deps:
+        runs-on: ubuntu-latest
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v6
+
+            - name: Check git dependencies
+              env:
+                  WHITELIST_DEPS: "greptime-proto,meter-core,meter-macros"
+              run: |
+                  #!/bin/bash
+                  set -e
+
+                  echo "Checking whitelisted git dependencies..."
+
+                  # Clone repo_url ($1) into a temp dir; return 0 only if commit ($2) is an ancestor of that repo's main branch, 1 otherwise
+                  check_commit_on_main() {
+                      local repo_url="$1"
+                      local commit="$2"
+                      local repo_name=$(basename "$repo_url" .git)
+
+                      echo "Checking $repo_name"
+                      echo "Repo: $repo_url"
+                      echo "Commit: $commit"
+
+                      # Scratch directory for the clone; every return path below removes it
+                      local temp_dir=$(mktemp -d)
+
+                      # Clone the repository (git's stderr is discarded; a failed clone is reported by the final FAIL branch)
+                      if git clone "$repo_url" "$temp_dir" 2>/dev/null; then
+                          cd "$temp_dir"
+
+                          # Pick the branch to test against: origin/main if present, else origin/master, else origin/HEAD's target
+                          local main_branch="main"
+                          if ! git rev-parse --verify origin/main >/dev/null 2>&1; then
+                              if git rev-parse --verify origin/master >/dev/null 2>&1; then
+                                  main_branch="master"
+                              else
+                                  # Neither exists: fall back to the branch that origin/HEAD points at
+                                  main_branch=$(git symbolic-ref refs/remotes/origin/HEAD | sed 's@^refs/remotes/origin/@@')
+                              fi
+                          fi
+
+                          echo "Main branch: $main_branch"
+
+                          # Check that the commit object exists in the clone
+                          if git cat-file -e "$commit" 2>/dev/null; then
+                              # Check that the commit is an ancestor of the chosen branch
+                              if git merge-base --is-ancestor "$commit" "origin/$main_branch" 2>/dev/null; then
+                                  echo "PASS: Commit $commit is on $main_branch branch"
+                                  cd - >/dev/null
+                                  rm -rf "$temp_dir"
+                                  return 0
+                              else
+                                  echo "FAIL: Commit $commit is NOT on $main_branch branch"
+
+                                  # Diagnostic only: report the first remote branch listed as containing the commit
+                                  local branch_name=$(git branch -r --contains "$commit" 2>/dev/null | head -1 | sed 's/^[[:space:]]*origin\///' | sed 's/[[:space:]]*$//')
+                                  if [[ -n "$branch_name" ]]; then
+                                      echo "Found on branch: $branch_name"
+                                  fi
+                                  cd - >/dev/null
+                                  rm -rf "$temp_dir"
+                                  return 1
+                              fi
+                          else
+                              echo "FAIL: Commit $commit not found in repository"
+                              cd - >/dev/null
+                              rm -rf "$temp_dir"
+                              return 1
+                          fi
+                      else
+                          echo "FAIL: Failed to clone $repo_url"
+                          rm -rf "$temp_dir"
+                          return 1
+                      fi
+                  }
+
+                  # Extract whitelisted git dependencies from Cargo.toml
+                  echo "Extracting git dependencies from Cargo.toml..."
+
+                  # Create temporary array to store dependencies
+                  declare -a deps=()
+
+                  # Build awk pattern from whitelist
+                  IFS=',' read -ra WHITELIST <<< "$WHITELIST_DEPS"
+                  awk_pattern=""
+                  for dep in "${WHITELIST[@]}"; do
+                      if [[ -n "$awk_pattern" ]]; then
+                          awk_pattern="$awk_pattern|"
+                      fi
+                      awk_pattern="$awk_pattern$dep"
+                  done
+
+                  # Collect "<git url> <rev>" pairs; the whitelist alternation is grouped so the git-URL suffix applies to every name
+                  while IFS= read -r line; do
+                      if [[ -n "$line" ]]; then
+                          deps+=("$line")
+                      fi
+                  done < <(awk -v pattern="$awk_pattern" '
+                  $0 ~ "(" pattern ").*git = \"https:/" {
+                      match($0, /git = "([^"]+)"/, arr)
+                      git_url = arr[1]
+                      if (match($0, /rev = "([^"]+)"/, rev_arr)) {
+                          rev = rev_arr[1]
+                          print git_url " " rev
+                      } else {
+                          # rev is not on this line: look for it on the line that follows
+                          getline
+                          if (match($0, /rev = "([^"]+)"/, rev_arr)) {
+                              rev = rev_arr[1]
+                              print git_url " " rev
+                          }
+                      }
+                  }
+                  ' Cargo.toml)
+
+                  echo "Found ${#deps[@]} dependencies to check:"
+                  for dep in "${deps[@]}"; do
+                      echo "  $dep"
+                  done
+
+                  failed=0
+
+                  for dep in "${deps[@]}"; do
+                      read -r repo_url commit <<< "$dep"
+                      if ! check_commit_on_main "$repo_url" "$commit"; then
+                          failed=1
+                      fi
+                  done
+
+                  echo "Check completed."
+
+                  if [[ $failed -eq 1 ]]; then
+                      echo "ERROR: Some git dependencies are not on their main branches!"
+                      echo "Please update the commits to point to main branch commits."
+                      exit 1
+                  else
+                      echo "SUCCESS: All git dependencies are on their main branches!"
+                  fi
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -212,7 +212,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"

 [[package]]
 name = "api"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "arrow-schema",
 "common-base",
@@ -733,7 +733,7 @@ dependencies = [

 [[package]]
 name = "auth"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -1383,7 +1383,7 @@ dependencies = [

 [[package]]
 name = "cache"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "catalog",
 "common-error",
@@ -1418,7 +1418,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

 [[package]]
 name = "catalog"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arrow",
@@ -1763,7 +1763,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"

 [[package]]
 name = "cli"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-stream",
 "async-trait",
@@ -1786,6 +1786,7 @@ dependencies = [
 "common-recordbatch",
 "common-runtime",
 "common-telemetry",
+ "common-test-util",
 "common-time",
 "common-version",
 "common-wal",
@@ -1816,7 +1817,7 @@ dependencies = [

 [[package]]
 name = "client"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arc-swap",
@@ -1849,7 +1850,7 @@ dependencies = [
 "snafu 0.8.6",
 "store-api",
 "substrait 0.37.3",
- "substrait 1.0.0-beta.2",
+ "substrait 1.0.0-beta.3",
 "tokio",
 "tokio-stream",
 "tonic 0.13.1",
@@ -1889,7 +1890,7 @@ dependencies = [

 [[package]]
 name = "cmd"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "auth",
@@ -1977,6 +1978,17 @@ dependencies = [
 "unicode-width 0.2.1",
 ]

+[[package]]
+name = "codespan-reporting"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
+dependencies = [
+ "serde",
+ "termcolor",
+ "unicode-width 0.2.1",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.4"
@@ -2012,7 +2024,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"

 [[package]]
 name = "common-base"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "anymap2",
 "async-trait",
@@ -2036,14 +2048,14 @@ dependencies = [

 [[package]]
 name = "common-catalog"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "const_format",
 ]

 [[package]]
 name = "common-config"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "common-base",
 "common-error",
@@ -2068,7 +2080,7 @@ dependencies = [

 [[package]]
 name = "common-datasource"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "arrow",
 "arrow-schema",
@@ -2103,7 +2115,7 @@ dependencies = [

 [[package]]
 name = "common-decimal"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "bigdecimal 0.4.8",
 "common-error",
@@ -2116,7 +2128,7 @@ dependencies = [

 [[package]]
 name = "common-error"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "common-macro",
 "http 1.3.1",
@@ -2127,7 +2139,7 @@ dependencies = [

 [[package]]
 name = "common-event-recorder"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -2149,7 +2161,7 @@ dependencies = [

 [[package]]
 name = "common-frontend"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -2171,7 +2183,7 @@ dependencies = [

 [[package]]
 name = "common-function"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "ahash 0.8.12",
 "api",
@@ -2231,7 +2243,7 @@ dependencies = [

 [[package]]
 name = "common-greptimedb-telemetry"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "common-runtime",
@@ -2248,7 +2260,7 @@ dependencies = [

 [[package]]
 name = "common-grpc"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arrow-flight",
@@ -2283,7 +2295,7 @@ dependencies = [

 [[package]]
 name = "common-grpc-expr"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "common-base",
@@ -2303,7 +2315,7 @@ dependencies = [

 [[package]]
 name = "common-macro"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "greptime-proto",
 "once_cell",
@@ -2314,7 +2326,7 @@ dependencies = [

 [[package]]
 name = "common-mem-prof"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "anyhow",
 "common-error",
@@ -2328,9 +2340,22 @@ dependencies = [
 "tokio",
 ]

+[[package]]
+name = "common-memory-manager"
+version = "1.0.0-beta.3"
+dependencies = [
+ "common-error",
+ "common-macro",
+ "common-telemetry",
+ "humantime",
+ "serde",
+ "snafu 0.8.6",
+ "tokio",
+]
+
 [[package]]
 name = "common-meta"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "anymap2",
 "api",
@@ -2402,7 +2427,7 @@ dependencies = [

 [[package]]
 name = "common-options"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "common-grpc",
 "humantime-serde",
@@ -2411,11 +2436,11 @@ dependencies = [

 [[package]]
 name = "common-plugins"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"

 [[package]]
 name = "common-pprof"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "common-error",
 "common-macro",
@@ -2427,7 +2452,7 @@ dependencies = [

 [[package]]
 name = "common-procedure"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-stream",
@@ -2456,7 +2481,7 @@ dependencies = [

 [[package]]
 name = "common-procedure-test"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "common-procedure",
@@ -2466,7 +2491,7 @@ dependencies = [

 [[package]]
 name = "common-query"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -2492,7 +2517,7 @@ dependencies = [

 [[package]]
 name = "common-recordbatch"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "arc-swap",
 "common-base",
@@ -2516,7 +2541,7 @@ dependencies = [

 [[package]]
 name = "common-runtime"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "clap 4.5.40",
@@ -2545,7 +2570,7 @@ dependencies = [

 [[package]]
 name = "common-session"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "serde",
 "strum 0.27.1",
@@ -2553,12 +2578,14 @@ dependencies = [

 [[package]]
 name = "common-sql"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
+ "arrow-schema",
 "common-base",
 "common-decimal",
 "common-error",
 "common-macro",
+ "common-telemetry",
 "common-time",
 "datafusion-sql",
 "datatypes",
@@ -2571,7 +2598,7 @@ dependencies = [

 [[package]]
 name = "common-stat"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "common-base",
 "common-runtime",
@@ -2586,7 +2613,7 @@ dependencies = [

 [[package]]
 name = "common-telemetry"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "backtrace",
 "common-base",
@@ -2615,7 +2642,7 @@ dependencies = [

 [[package]]
 name = "common-test-util"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "client",
 "common-grpc",
@@ -2628,7 +2655,7 @@ dependencies = [

 [[package]]
 name = "common-time"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "arrow",
 "chrono",
@@ -2646,7 +2673,7 @@ dependencies = [

 [[package]]
 name = "common-version"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "build-data",
 "cargo-manifest",
@@ -2657,7 +2684,7 @@ dependencies = [

 [[package]]
 name = "common-wal"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "common-base",
 "common-error",
@@ -2680,7 +2707,7 @@ dependencies = [

 [[package]]
 name = "common-workload"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "common-telemetry",
 "serde",
@@ -3156,6 +3183,68 @@ dependencies = [
 "cipher",
 ]

+[[package]]
+name = "cxx"
+version = "1.0.190"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7620f6cfc4dcca21f2b085b7a890e16c60fd66f560cd69ee60594908dc72ab1"
+dependencies = [
+ "cc",
+ "cxx-build",
+ "cxxbridge-cmd",
+ "cxxbridge-flags",
+ "cxxbridge-macro",
+ "foldhash 0.2.0",
+ "link-cplusplus",
+]
+
+[[package]]
+name = "cxx-build"
+version = "1.0.190"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a9bc1a22964ff6a355fbec24cf68266a0ed28f8b84c0864c386474ea3d0e479"
+dependencies = [
+ "cc",
+ "codespan-reporting 0.13.1",
+ "indexmap 2.11.4",
+ "proc-macro2",
+ "quote",
+ "scratch",
+ "syn 2.0.106",
+]
+
+[[package]]
+name = "cxxbridge-cmd"
+version = "1.0.190"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f29a879d35f7906e3c9b77d7a1005a6a0787d330c09dfe4ffb5f617728cb44"
+dependencies = [
+ "clap 4.5.40",
+ "codespan-reporting 0.13.1",
+ "indexmap 2.11.4",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.106",
+]
+
+[[package]]
+name = "cxxbridge-flags"
+version = "1.0.190"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d67109015f93f683e364085aa6489a5b2118b4a40058482101d699936a7836d6"
+
+[[package]]
+name = "cxxbridge-macro"
+version = "1.0.190"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d187e019e7b05a1f3e69a8396b70800ee867aa9fc2ab972761173ccee03742df"
+dependencies = [
+ "indexmap 2.11.4",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.106",
+]
+
 [[package]]
 name = "darling"
 version = "0.14.4"
@@ -3926,7 +4015,7 @@ dependencies = [

 [[package]]
 name = "datanode"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arrow-flight",
@@ -3990,7 +4079,7 @@ dependencies = [

 [[package]]
 name = "datatypes"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "arrow",
 "arrow-array",
@@ -4547,8 +4636,9 @@ dependencies = [

 [[package]]
 name = "etcd-client"
-version = "0.15.0"
-source = "git+https://github.com/GreptimeTeam/etcd-client?rev=f62df834f0cffda355eba96691fe1a9a332b75a7#f62df834f0cffda355eba96691fe1a9a332b75a7"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822"
 dependencies = [
 "http 1.3.1",
 "prost 0.13.5",
@@ -4664,7 +4754,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"

 [[package]]
 name = "file-engine"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -4796,7 +4886,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"

 [[package]]
 name = "flow"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arrow",
@@ -4865,7 +4955,7 @@ dependencies = [
 "sql",
 "store-api",
 "strum 0.27.1",
- "substrait 1.0.0-beta.2",
+ "substrait 1.0.0-beta.3",
 "table",
 "tokio",
 "tonic 0.13.1",
@@ -4903,6 +4993,12 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"

+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.2.2"
@@ -4920,7 +5016,7 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619"

 [[package]]
 name = "frontend"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arc-swap",
@@ -5367,7 +5463,7 @@ dependencies = [
 [[package]]
 name = "greptime-proto"
 version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0423fa30203187c75e2937a668df1da699c8b96c#0423fa30203187c75e2937a668df1da699c8b96c"
+source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=173efe5ec62722089db7c531c0b0d470a072b915#173efe5ec62722089db7c531c0b0d470a072b915"
 dependencies = [
 "prost 0.13.5",
 "prost-types 0.13.5",
@@ -5503,7 +5599,7 @@ checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5"
 dependencies = [
 "allocator-api2",
 "equivalent",
- "foldhash",
+ "foldhash 0.1.5",
 ]

 [[package]]
@@ -6135,7 +6231,7 @@ dependencies = [

 [[package]]
 name = "index"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "asynchronous-codec",
@@ -6148,6 +6244,7 @@ dependencies = [
 "common-telemetry",
 "common-test-util",
 "criterion 0.4.0",
+ "datatypes",
 "fastbloom",
 "fst",
 "futures",
@@ -6156,6 +6253,7 @@ dependencies = [
 "jieba-rs",
 "lazy_static",
 "mockall",
+ "nalgebra",
 "pin-project",
 "prost 0.13.5",
 "puffin",
@@ -6173,6 +6271,7 @@ dependencies = [
 "tempfile",
 "tokio",
 "tokio-util",
+ "usearch",
 "uuid",
 ]

@@ -7004,6 +7103,15 @@ dependencies = [
 "vcpkg",
 ]

+[[package]]
+name = "link-cplusplus"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "linked-hash-map"
 version = "0.5.6"
@@ -7064,7 +7172,7 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"

 [[package]]
 name = "log-query"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "chrono",
 "common-error",
@@ -7076,7 +7184,7 @@ dependencies = [

 [[package]]
 name = "log-store"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-stream",
 "async-trait",
@@ -7317,12 +7425,6 @@ dependencies = [
 "digest",
 ]

-[[package]]
-name = "md5"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
-
 [[package]]
 name = "md5"
 version = "0.8.0"
@@ -7383,7 +7485,7 @@ dependencies = [

 [[package]]
 name = "meta-client"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -7411,7 +7513,7 @@ dependencies = [

 [[package]]
 name = "meta-srv"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -7511,7 +7613,7 @@ dependencies = [

 [[package]]
 name = "metric-engine"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "aquamarine",
@@ -7608,7 +7710,7 @@ dependencies = [

 [[package]]
 name = "mito-codec"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "bytes",
@@ -7633,7 +7735,7 @@ dependencies = [

 [[package]]
 name = "mito2"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "aquamarine",
@@ -7651,6 +7753,7 @@ dependencies = [
 "common-function",
 "common-grpc",
 "common-macro",
+ "common-memory-manager",
 "common-meta",
 "common-query",
 "common-recordbatch",
@@ -7672,6 +7775,7 @@ dependencies = [
 "either",
 "futures",
 "greptime-proto",
+ "humantime",
 "humantime-serde",
 "index",
 "itertools 0.14.0",
@@ -8371,7 +8475,7 @@ dependencies = [

 [[package]]
 name = "object-store"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "anyhow",
 "bytes",
@@ -8384,7 +8488,6 @@ dependencies = [
 "futures",
 "humantime-serde",
 "lazy_static",
- "md5 0.7.0",
 "moka",
 "opendal",
 "prometheus",
@@ -8657,7 +8760,7 @@ dependencies = [

 [[package]]
 name = "operator"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "ahash 0.8.12",
 "api",
@@ -8717,7 +8820,7 @@ dependencies = [
 "sql",
 "sqlparser",
 "store-api",
- "substrait 1.0.0-beta.2",
+ "substrait 1.0.0-beta.3",
 "table",
 "tokio",
 "tokio-util",
@@ -9003,7 +9106,7 @@ dependencies = [

 [[package]]
 name = "partition"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -9219,9 +9322,9 @@ dependencies = [

 [[package]]
 name = "pgwire"
-version = "0.36.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70a2bcdcc4b20a88e0648778ecf00415bbd5b447742275439c22176835056f99"
+checksum = "02d86d57e732d40382ceb9bfea80901d839bae8571aa11c06af9177aed9dfb6c"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
@@ -9231,7 +9334,7 @@ dependencies = [
 "futures",
 "hex",
 "lazy-regex",
- "md5 0.8.0",
+ "md5",
 "postgres-types",
 "rand 0.9.1",
 "ring",
@@ -9240,6 +9343,7 @@ dependencies = [
 "ryu",
 "serde",
 "serde_json",
+ "smol_str",
 "stringprep",
 "thiserror 2.0.17",
 "tokio",
@@ -9360,7 +9464,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

 [[package]]
 name = "pipeline"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "ahash 0.8.12",
 "api",
@@ -9516,7 +9620,7 @@ dependencies = [

 [[package]]
 name = "plugins"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "auth",
 "catalog",
@@ -9818,7 +9922,7 @@ dependencies = [

 [[package]]
 name = "promql"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "ahash 0.8.12",
 "async-trait",
@@ -10101,7 +10205,7 @@ dependencies = [

 [[package]]
 name = "puffin"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-compression 0.4.19",
 "async-trait",
@@ -10143,7 +10247,7 @@ dependencies = [

 [[package]]
 name = "query"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "ahash 0.8.12",
 "api",
@@ -10210,7 +10314,7 @@ dependencies = [
 "sql",
 "sqlparser",
 "store-api",
- "substrait 1.0.0-beta.2",
+ "substrait 1.0.0-beta.3",
 "table",
 "tokio",
 "tokio-stream",
@@ -11282,6 +11386,12 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"

+[[package]]
+name = "scratch"
+version = "1.0.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2"
+
 [[package]]
 name = "scrypt"
 version = "0.11.0"
@@ -11398,10 +11508,11 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"

 [[package]]
 name = "serde"
-version = "1.0.219"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
 dependencies = [
+ "serde_core",
 "serde_derive",
 ]

@@ -11416,10 +11527,19 @@ dependencies = [
 ]

 [[package]]
-name = "serde_derive"
-version = "1.0.219"
+name = "serde_core"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -11546,7 +11666,7 @@ dependencies = [

 [[package]]
 name = "servers"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "ahash 0.8.12",
 "api",
@@ -11674,7 +11794,7 @@ dependencies = [

 [[package]]
 name = "session"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "ahash 0.8.12",
 "api",
@@ -11894,6 +12014,16 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "smol_str"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3498b0a27f93ef1402f20eefacfaa1691272ac4eca1cdc8c596cb0a245d6cbf5"
+dependencies = [
+ "borsh",
+ "serde_core",
+]
+
 [[package]]
 name = "snafu"
 version = "0.7.5"
@@ -12008,7 +12138,7 @@ dependencies = [

 [[package]]
 name = "sql"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arrow-buffer",
@@ -12068,7 +12198,7 @@ dependencies = [

 [[package]]
 name = "sqlness-runner"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "clap 4.5.40",
@@ -12099,7 +12229,7 @@ dependencies = [
 [[package]]
 name = "sqlparser"
 version = "0.58.0"
-source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=4b519a5caa95472cc3988f5556813a583dd35af1#4b519a5caa95472cc3988f5556813a583dd35af1"
+source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=a0ce2bc6eb3e804532932f39833c32432f5c9a39#a0ce2bc6eb3e804532932f39833c32432f5c9a39"
 dependencies = [
 "lazy_static",
 "log",
@@ -12123,7 +12253,7 @@ dependencies = [
 [[package]]
 name = "sqlparser_derive"
 version = "0.3.0"
-source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=4b519a5caa95472cc3988f5556813a583dd35af1#4b519a5caa95472cc3988f5556813a583dd35af1"
+source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=a0ce2bc6eb3e804532932f39833c32432f5c9a39#a0ce2bc6eb3e804532932f39833c32432f5c9a39"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -12345,7 +12475,7 @@ dependencies = [

 [[package]]
 name = "standalone"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "catalog",
@@ -12386,7 +12516,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"

 [[package]]
 name = "store-api"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "aquamarine",
@@ -12599,7 +12729,7 @@ dependencies = [

 [[package]]
 name = "substrait"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "async-trait",
 "bytes",
@@ -12722,7 +12852,7 @@ dependencies = [

 [[package]]
 name = "table"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "async-trait",
@@ -12991,7 +13121,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"

 [[package]]
 name = "tests-fuzz"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "arbitrary",
 "async-trait",
@@ -13035,7 +13165,7 @@ dependencies = [

 [[package]]
 name = "tests-integration"
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 dependencies = [
 "api",
 "arrow-flight",
@@ -13110,7 +13240,7 @@ dependencies = [
 "sqlx",
 "standalone",
 "store-api",
- "substrait 1.0.0-beta.2",
+ "substrait 1.0.0-beta.3",
 "table",
 "tempfile",
 "time",
@@ -14135,6 +14265,16 @@ version = "2.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"

+[[package]]
+name = "usearch"
+version = "2.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2cc9fc5f872a3a4f9081d5f42624d788231b763e1846c829b9968a3755ac884d"
+dependencies = [
+ "cxx",
+ "cxx-build",
+]
+
 [[package]]
 name = "utf8-ranges"
 version = "1.0.5"
@@ -14274,7 +14414,7 @@ dependencies = [
 "ciborium",
 "cidr",
 "clap 4.5.40",
- "codespan-reporting",
+ "codespan-reporting 0.12.0",
 "community-id",
 "convert_case 0.7.1",
 "crc",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,6 +21,7 @@ members = [
    "src/common/grpc-expr",
    "src/common/macro",
    "src/common/mem-prof",
+    "src/common/memory-manager",
    "src/common/meta",
    "src/common/options",
    "src/common/plugins",
@@ -74,7 +75,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "1.0.0-beta.2"
+version = "1.0.0-beta.3"
 edition = "2024"
 license = "Apache-2.0"

@@ -142,14 +143,14 @@ derive_builder = "0.20"
 derive_more = { version = "2.1", features = ["full"] }
 dotenv = "0.15"
 either = "1.15"
-etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62df834f0cffda355eba96691fe1a9a332b75a7", features = [
+etcd-client = { version = "0.16.1", features = [
    "tls",
    "tls-roots",
 ] }
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0423fa30203187c75e2937a668df1da699c8b96c" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "173efe5ec62722089db7c531c0b0d470a072b915" }
 hex = "0.4"
 http = "1"
 humantime = "2.1"
@@ -266,6 +267,7 @@ common-grpc = { path = "src/common/grpc" }
 common-grpc-expr = { path = "src/common/grpc-expr" }
 common-macro = { path = "src/common/macro" }
 common-mem-prof = { path = "src/common/mem-prof" }
+common-memory-manager = { path = "src/common/memory-manager" }
 common-meta = { path = "src/common/meta" }
 common-options = { path = "src/common/options" }
 common-plugins = { path = "src/common/plugins" }
@@ -330,7 +332,7 @@ datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.g
 datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
 datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
 datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
-sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" }                           # branch = "v0.58.x"
+sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "a0ce2bc6eb3e804532932f39833c32432f5c9a39" }                           # branch = "v0.58.x"

 [profile.release]
 debug = 1
--- a/config/config.md
+++ b/config/config.md
@@ -83,6 +83,8 @@
 | `wal.sync_period` | String | `10s` | Duration for fsyncing log files.<br/>**It's only used when the provider is `raft_engine`**. |
 | `wal.recovery_parallelism` | Integer | `2` | Parallelism during WAL recovery. |
 | `wal.broker_endpoints` | Array | -- | The Kafka broker endpoints.<br/>**It's only used when the provider is `kafka`**. |
+| `wal.connect_timeout` | String | `3s` | The connect timeout for kafka client.<br/>**It's only used when the provider is `kafka`**. |
+| `wal.timeout` | String | `3s` | The timeout for kafka client.<br/>**It's only used when the provider is `kafka`**. |
 | `wal.auto_create_topics` | Bool | `true` | Automatically create topics for WAL.<br/>Set to `true` to automatically create topics for WAL.<br/>Otherwise, use topics named `topic_name_prefix_[0..num_topics)` |
 | `wal.num_topics` | Integer | `64` | Number of topics.<br/>**It's only used when the provider is `kafka`**. |
 | `wal.selector_type` | String | `round_robin` | Topic selector type.<br/>Available selector types:<br/>- `round_robin` (default)<br/>**It's only used when the provider is `kafka`**. |
@@ -108,9 +110,6 @@
 | `storage` | -- | -- | The data storage options. |
 | `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
 | `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
-| `storage.enable_read_cache` | Bool | `true` | Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. |
-| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.<br/>A local file directory, defaults to `{data_home}`. An empty string means disabling. |
-| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. |
 | `storage.bucket` | String | Unset | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
 | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
 | `storage.access_key_id` | String | Unset | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
@@ -141,6 +140,8 @@
 | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). |
 | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). |
 | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). |
+| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. |
+| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.<br/>Options: "wait" (default, 10s), "wait(&lt;duration&gt;)", "fail" |
 | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
 | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
 | `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`. |
@@ -154,6 +155,8 @@
 | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
 | `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
 | `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
+| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).<br/>When disabled, cache refilling on read won't happen. |
+| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). |
 | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
 | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
 | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
@@ -351,6 +354,7 @@
 | `region_failure_detector_initialization_delay` | String | `10m` | The delay before starting region failure detection.<br/>This delay helps prevent Metasrv from triggering unnecessary region failovers before all Datanodes are fully started.<br/>Especially useful when the cluster is not deployed with GreptimeDB Operator and maintenance mode is not enabled. |
 | `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL.<br/>**This option is not recommended to be set to true, because it may lead to data loss during failover.** |
 | `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
+| `heartbeat_interval` | String | `3s` | Base heartbeat interval for calculating distributed time constants.<br/>The frontend heartbeat interval is 6 times the base heartbeat interval.<br/>The flownode/datanode heartbeat interval is equal to the base heartbeat interval.<br/>e.g., If the base heartbeat interval is 3s, the frontend heartbeat interval is 18s and the flownode/datanode heartbeat interval is 3s.<br/>If you change this value, you need to change the heartbeat interval of the flownode/frontend/datanode accordingly. |
 | `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
 | `runtime` | -- | -- | The runtime options. |
 | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
@@ -360,12 +364,18 @@
 | `backend_tls.cert_path` | String | `""` | Path to client certificate file (for client authentication)<br/>Like "/path/to/client.crt" |
 | `backend_tls.key_path` | String | `""` | Path to client private key file (for client authentication)<br/>Like "/path/to/client.key" |
 | `backend_tls.ca_cert_path` | String | `""` | Path to CA certificate file (for server certificate verification)<br/>Required when using custom CAs or self-signed certificates<br/>Leave empty to use system root certificates only<br/>Like "/path/to/ca.crt" |
+| `backend_client` | -- | -- | The backend client options.<br/>Currently, only applicable when using etcd as the metadata store. |
+| `backend_client.keep_alive_timeout` | String | `3s` | The keep alive timeout for backend client. |
+| `backend_client.keep_alive_interval` | String | `10s` | The keep alive interval for backend client. |
+| `backend_client.connect_timeout` | String | `3s` | The connect timeout for backend client. |
 | `grpc` | -- | -- | The gRPC server options. |
 | `grpc.bind_addr` | String | `127.0.0.1:3002` | The address to bind the gRPC server. |
 | `grpc.server_addr` | String | `127.0.0.1:3002` | The communication server address for the frontend and datanode to connect to metasrv.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `bind_addr`. |
 | `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
 | `grpc.max_recv_message_size` | String | `512MB` | The maximum receive message size for gRPC server. |
 | `grpc.max_send_message_size` | String | `512MB` | The maximum send message size for gRPC server. |
+| `grpc.http2_keep_alive_interval` | String | `10s` | The server side HTTP/2 keep-alive interval. |
+| `grpc.http2_keep_alive_timeout` | String | `3s` | The server side HTTP/2 keep-alive timeout. |
 | `http` | -- | -- | The HTTP server options. |
 | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
 | `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
@@ -475,6 +485,8 @@
 | `wal.sync_period` | String | `10s` | Duration for fsyncing log files.<br/>**It's only used when the provider is `raft_engine`**. |
 | `wal.recovery_parallelism` | Integer | `2` | Parallelism during WAL recovery. |
 | `wal.broker_endpoints` | Array | -- | The Kafka broker endpoints.<br/>**It's only used when the provider is `kafka`**. |
+| `wal.connect_timeout` | String | `3s` | The connect timeout for kafka client.<br/>**It's only used when the provider is `kafka`**. |
+| `wal.timeout` | String | `3s` | The timeout for kafka client.<br/>**It's only used when the provider is `kafka`**. |
 | `wal.max_batch_bytes` | String | `1MB` | The max size of a single producer batch.<br/>Warning: Kafka has a default limit of 1MB per message in a topic.<br/>**It's only used when the provider is `kafka`**. |
 | `wal.consumer_wait_timeout` | String | `100ms` | The consumer wait timeout.<br/>**It's only used when the provider is `kafka`**. |
 | `wal.create_index` | Bool | `true` | Whether to enable WAL index creation.<br/>**It's only used when the provider is `kafka`**. |
@@ -486,9 +498,6 @@
 | `storage` | -- | -- | The data storage options. |
 | `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
 | `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
-| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.<br/>A local file directory, defaults to `{data_home}`. An empty string means disabling. |
-| `storage.enable_read_cache` | Bool | `true` | Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. |
-| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. |
 | `storage.bucket` | String | Unset | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
 | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
 | `storage.access_key_id` | String | Unset | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
@@ -521,6 +530,8 @@
 | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). |
 | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). |
 | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). |
+| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. |
+| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.<br/>Options: "wait" (default, 10s), "wait(&lt;duration&gt;)", "fail" |
 | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
 | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
 | `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size` |
@@ -534,6 +545,8 @@
 | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
 | `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
 | `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
+| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).<br/>When disabled, cache refilling on read won't happen. |
+| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). |
 | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
 | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
 | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -169,6 +169,14 @@ recovery_parallelism = 2
 ## **It's only used when the provider is `kafka`**.
 broker_endpoints = ["127.0.0.1:9092"]

+## The connect timeout for kafka client.
+## **It's only used when the provider is `kafka`**.
+#+ connect_timeout = "3s"
+
+## The timeout for kafka client.
+## **It's only used when the provider is `kafka`**.
+#+ timeout = "3s"
+
 ## The max size of a single producer batch.
 ## Warning: Kafka has a default limit of 1MB per message in a topic.
 ## **It's only used when the provider is `kafka`**.
@@ -225,6 +233,7 @@ overwrite_entry_start_id = false
 # endpoint = "https://s3.amazonaws.com"
 # region = "us-west-2"
 # enable_virtual_host_style = false
+# disable_ec2_metadata = false

 # Example of using Oss as the storage.
 # [storage]
@@ -281,18 +290,6 @@ data_home = "./greptimedb_data"
 ## - `Oss`: the data is stored in the Aliyun OSS.
 type = "File"

-## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.
-## A local file directory, defaults to `{data_home}`. An empty string means disabling.
-## @toml2docs:none-default
-#+ cache_path = ""
-
-## Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage.
-#+ enable_read_cache = true
-
-## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger.
-## @toml2docs:none-default
-cache_capacity = "5GiB"
-
 ## The S3 bucket name.
 ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
 ## @toml2docs:none-default
@@ -452,6 +449,15 @@ compress_manifest = false
 ## @toml2docs:none-default="Auto"
 #+ max_background_purges = 8

+## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit.
+## @toml2docs:none-default="0"
+#+ experimental_compaction_memory_limit = "0"
+
+## Behavior when compaction cannot acquire memory from the budget.
+## Options: "wait" (default, 10s), "wait(<duration>)", "fail"
+## @toml2docs:none-default="wait"
+#+ experimental_compaction_on_exhausted = "wait"
+
 ## Interval to auto flush a region if it has not flushed yet.
 auto_flush_interval = "1h"

@@ -507,6 +513,13 @@ preload_index_cache = true
 ## 1GiB is reserved for index files and 4GiB for data files.
 index_cache_percent = 20

+## Enable refilling cache on read operations (default: true).
+## When disabled, cache refilling on read won't happen.
+enable_refill_cache_on_read = true
+
+## Capacity for manifest cache (default: 256MB).
+manifest_cache_size = "256MB"
+
 ## Buffer size for SST writing.
 sst_write_buffer_size = "8MB"

--- a/config/frontend.example.toml
+++ b/config/frontend.example.toml
@@ -131,7 +131,6 @@ key_path = ""
 ## For now, gRPC tls config does not support auto reload.
 watch = false

-
 ## MySQL server options.
 [mysql]
 ## Whether to enable.
--- a/config/metasrv.example.toml
+++ b/config/metasrv.example.toml
@@ -71,6 +71,13 @@ allow_region_failover_on_local_wal = false
 ## Max allowed idle time before removing node info from metasrv memory.
 node_max_idle_time = "24hours"

+## Base heartbeat interval for calculating distributed time constants.
+## The frontend heartbeat interval is 6 times the base heartbeat interval.
+## The flownode/datanode heartbeat interval is equal to the base heartbeat interval.
+## e.g., If the base heartbeat interval is 3s, the frontend heartbeat interval is 18s, the flownode/datanode heartbeat interval is 3s.
+## If you change this value, you need to change the heartbeat interval of the flownode/frontend/datanode accordingly.
+#+ heartbeat_interval = "3s"
+
 ## Whether to enable greptimedb telemetry. Enabled by default.
 #+ enable_telemetry = true

@@ -109,6 +116,16 @@ key_path = ""
 ## Like "/path/to/ca.crt"
 ca_cert_path = ""

+## The backend client options.
+## Currently, only applicable when using etcd as the metadata store.
+#+ [backend_client]
+## The keep alive timeout for backend client.
+#+ keep_alive_timeout = "3s"
+## The keep alive interval for backend client.
+#+ keep_alive_interval = "10s"
+## The connect timeout for backend client.
+#+ connect_timeout = "3s"
+
 ## The gRPC server options.
 [grpc]
 ## The address to bind the gRPC server.
@@ -123,6 +140,10 @@ runtime_size = 8
 max_recv_message_size = "512MB"
 ## The maximum send message size for gRPC server.
 max_send_message_size = "512MB"
+## The server side HTTP/2 keep-alive interval.
+#+ http2_keep_alive_interval = "10s"
+## The server side HTTP/2 keep-alive timeout.
+#+ http2_keep_alive_timeout = "3s"

 ## The HTTP server options.
 [http]
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -230,6 +230,14 @@ recovery_parallelism = 2
 ## **It's only used when the provider is `kafka`**.
 broker_endpoints = ["127.0.0.1:9092"]

+## The connect timeout for kafka client.
+## **It's only used when the provider is `kafka`**.
+#+ connect_timeout = "3s"
+
+## The timeout for kafka client.
+## **It's only used when the provider is `kafka`**.
+#+ timeout = "3s"
+
 ## Automatically create topics for WAL.
 ## Set to `true` to automatically create topics for WAL.
 ## Otherwise, use topics named `topic_name_prefix_[0..num_topics)`
@@ -332,6 +340,7 @@ max_running_procedures = 128
 # endpoint = "https://s3.amazonaws.com"
 # region = "us-west-2"
 # enable_virtual_host_style = false
+# disable_ec2_metadata = false

 # Example of using Oss as the storage.
 # [storage]
@@ -388,18 +397,6 @@ data_home = "./greptimedb_data"
 ## - `Oss`: the data is stored in the Aliyun OSS.
 type = "File"

-## Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage.
-#+ enable_read_cache = true
-
-## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.
-## A local file directory, defaults to `{data_home}`. An empty string means disabling.
-## @toml2docs:none-default
-#+ cache_path = ""
-
-## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger.
-## @toml2docs:none-default
-cache_capacity = "5GiB"
-
 ## The S3 bucket name.
 ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
 ## @toml2docs:none-default
@@ -546,6 +543,15 @@ compress_manifest = false
 ## @toml2docs:none-default="Auto"
 #+ max_background_purges = 8

+## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit.
+## @toml2docs:none-default="0"
+#+ experimental_compaction_memory_limit = "0"
+
+## Behavior when compaction cannot acquire memory from the budget.
+## Options: "wait" (default, 10s), "wait(<duration>)", "fail"
+## @toml2docs:none-default="wait"
+#+ experimental_compaction_on_exhausted = "wait"
+
 ## Interval to auto flush a region if it has not flushed yet.
 auto_flush_interval = "1h"

@@ -601,6 +607,13 @@ preload_index_cache = true
 ## 1GiB is reserved for index files and 4GiB for data files.
 index_cache_percent = 20

+## Enable refilling cache on read operations (default: true).
+## When disabled, cache refilling on read won't happen.
+enable_refill_cache_on_read = true
+
+## Capacity for manifest cache (default: 256MB).
+manifest_cache_size = "256MB"
+
 ## Buffer size for SST writing.
 sst_write_buffer_size = "8MB"

--- a/src/catalog/src/system_schema/information_schema.rs
+++ b/src/catalog/src/system_schema/information_schema.rs
@@ -428,7 +428,7 @@ pub trait InformationExtension {
 }

 /// The request to inspect the datanode.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq)]
 pub struct DatanodeInspectRequest {
    /// Kind to fetch from datanode.
    pub kind: DatanodeInspectKind,
--- a/src/cli/Cargo.toml
+++ b/src/cli/Cargo.toml
@@ -67,6 +67,7 @@ tracing-appender.workspace = true

 [dev-dependencies]
 common-meta = { workspace = true, features = ["testing"] }
+common-test-util.workspace = true
 common-version.workspace = true
 serde.workspace = true
 tempfile.workspace = true
--- a/src/cli/src/common.rs
+++ b/src/cli/src/common.rs
@@ -15,5 +15,8 @@
 mod object_store;
 mod store;

-pub use object_store::{ObjectStoreConfig, new_fs_object_store};
+pub use object_store::{
+    ObjectStoreConfig, PrefixedAzblobConnection, PrefixedGcsConnection, PrefixedOssConnection,
+    PrefixedS3Connection, new_fs_object_store,
+};
 pub use store::StoreConfig;
--- a/src/cli/src/common/object_store.rs
+++ b/src/cli/src/common/object_store.rs
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use common_base::secrets::SecretString;
+use common_base::secrets::{ExposeSecret, SecretString};
 use common_error::ext::BoxedError;
 use object_store::services::{Azblob, Fs, Gcs, Oss, S3};
 use object_store::util::{with_instrument_layers, with_retry_layers};
@@ -22,9 +22,69 @@ use snafu::ResultExt;

 use crate::error::{self};

+/// Trait to convert CLI field types to target struct field types.
+/// This enables `Option<SecretString>` (CLI) -> `SecretString` (target) conversions,
+/// allowing us to distinguish "not provided" from "provided but empty".
+trait IntoField<T> {
+    fn into_field(self) -> T;
+}
+
+/// Identity conversion for types that are the same.
+impl<T> IntoField<T> for T {
+    fn into_field(self) -> T {
+        self
+    }
+}
+
+/// Convert `Option<SecretString>` to `SecretString`, using default for None.
+impl IntoField<SecretString> for Option<SecretString> {
+    fn into_field(self) -> SecretString {
+        self.unwrap_or_default()
+    }
+}
+
+/// Trait for checking if a field is effectively empty.
+///
+/// **`is_empty()`**: Checks if the field has no meaningful value
+/// - Used when backend is enabled to validate required fields
+/// - `None`, `Some("")`, `false`, or `""` are considered empty
+trait FieldValidator {
+    /// Check if the field is empty (has no meaningful value).
+    fn is_empty(&self) -> bool;
+}
+
+/// String fields: empty if the string is empty
+impl FieldValidator for String {
+    fn is_empty(&self) -> bool {
+        self.is_empty()
+    }
+}
+
+/// Bool fields: false is considered "empty", true is "provided"
+impl FieldValidator for bool {
+    fn is_empty(&self) -> bool {
+        !self
+    }
+}
+
+/// Option<String> fields: None or empty content is empty
+impl FieldValidator for Option<String> {
+    fn is_empty(&self) -> bool {
+        self.as_ref().is_none_or(|s| s.is_empty())
+    }
+}
+
+/// Option<SecretString> fields: None or empty secret is empty
+/// For secrets, `Some("")` is treated the same as `None`, i.e. as "not provided".
+impl FieldValidator for Option<SecretString> {
+    fn is_empty(&self) -> bool {
+        self.as_ref().is_none_or(|s| s.expose_secret().is_empty())
+    }
+}
+
 macro_rules! wrap_with_clap_prefix {
    (
-        $new_name:ident, $prefix:literal, $base:ty, {
+        $new_name:ident, $prefix:literal, $enable_flag:literal, $base:ty, {
            $( $( #[doc = $doc:expr] )? $( #[alias = $alias:literal] )? $field:ident : $type:ty $( = $default:expr )? ),* $(,)?
        }
    ) => {
@@ -34,15 +94,16 @@ macro_rules! wrap_with_clap_prefix {
                $(
                    $( #[doc = $doc] )?
                    $( #[clap(alias = $alias)] )?
-                    #[clap(long $(, default_value_t = $default )? )]
-                    [<$prefix $field>]: $type,
+                    #[clap(long, requires = $enable_flag $(, default_value_t = $default )? )]
+                    pub [<$prefix $field>]: $type,
                )*
            }

            impl From<$new_name> for $base {
                fn from(w: $new_name) -> Self {
                    Self {
-                        $( $field: w.[<$prefix $field>] ),*
+                        // Use into_field() to handle Option<SecretString> -> SecretString conversion
+                        $( $field: w.[<$prefix $field>].into_field() ),*
                    }
                }
            }
@@ -50,9 +111,90 @@ macro_rules! wrap_with_clap_prefix {
    };
 }

+/// Macro for declarative backend validation.
+///
+/// # Validation Rules
+///
+/// For each storage backend (S3, OSS, GCS, Azblob), this macro validates:
+/// **When backend is enabled** (e.g., `--s3`): All required fields must be non-empty
+///
+/// Note: When backend is disabled, clap's `requires` attribute ensures no configuration
+/// fields can be provided at parse time.
+///
+/// # Syntax
+///
+/// ```ignore
+/// validate_backend!(
+///     enable: self.enable_s3,
+///     name: "S3",
+///     required: [(field1, "name1"), (field2, "name2"), ...],
+///     custom_validator: |missing| { ... }  // optional
+/// )
+/// ```
+///
+/// # Arguments
+///
+/// - `enable`: Boolean expression indicating if backend is enabled
+/// - `name`: Human-readable backend name for error messages
+/// - `required`: Array of (field_ref, field_name) tuples for required fields
+/// - `custom_validator`: Optional closure for complex validation logic
+///
+/// # Example
+///
+/// ```ignore
+/// validate_backend!(
+///     enable: self.enable_s3,
+///     name: "S3",
+///     required: [
+///         (&self.s3.s3_bucket, "bucket"),
+///         (&self.s3.s3_access_key_id, "access key ID"),
+///     ]
+/// )
+/// ```
+macro_rules! validate_backend {
+    (
+        enable: $enable:expr,
+        name: $backend_name:expr,
+        required: [ $( ($field:expr, $field_name:expr) ),* $(,)? ]
+        $(, custom_validator: $custom_validator:expr)?
+    ) => {{
+        if $enable {
+            // Check required fields when backend is enabled
+            let mut missing = Vec::new();
+            $(
+                if FieldValidator::is_empty($field) {
+                    missing.push($field_name);
+                }
+            )*
+
+            // Run custom validation if provided
+            $(
+                $custom_validator(&mut missing);
+            )?
+
+            if !missing.is_empty() {
+                return Err(BoxedError::new(
+                    error::MissingConfigSnafu {
+                        msg: format!(
+                            "{} {} must be set when --{} is enabled.",
+                            $backend_name,
+                            missing.join(", "),
+                            $backend_name.to_lowercase()
+                        ),
+                    }
+                    .build(),
+                ));
+            }
+        }
+
+        Ok(())
+    }};
+}
+
 wrap_with_clap_prefix! {
    PrefixedAzblobConnection,
    "azblob-",
+    "enable_azblob",
    AzblobConnection,
    {
        #[doc = "The container of the object store."]
@@ -60,9 +202,9 @@ wrap_with_clap_prefix! {
        #[doc = "The root of the object store."]
        root: String = Default::default(),
        #[doc = "The account name of the object store."]
-        account_name: SecretString = Default::default(),
+        account_name: Option<SecretString>,
        #[doc = "The account key of the object store."]
-        account_key: SecretString = Default::default(),
+        account_key: Option<SecretString>,
        #[doc = "The endpoint of the object store."]
        endpoint: String = Default::default(),
        #[doc = "The SAS token of the object store."]
@@ -70,9 +212,33 @@ wrap_with_clap_prefix! {
    }
 }

+impl PrefixedAzblobConnection {
+    /// Validates the Azure Blob connection options.
+    ///
+    /// The container, root, account name and endpoint are checked with
+    /// `FieldValidator::is_empty`; the account key is additionally reported as missing
+    /// when no SAS token is provided.
+    ///
+    /// # Errors
+    ///
+    /// Returns the boxed `MissingConfig` error built by `validate_backend!`, whose message
+    /// lists every missing option.
+    pub fn validate(&self) -> Result<(), BoxedError> {
+        validate_backend!(
+            // Validation is unconditional at this level.
+            enable: true,
+            name: "AzBlob",
+            required: [
+                (&self.azblob_container, "container"),
+                (&self.azblob_root, "root"),
+                (&self.azblob_account_name, "account name"),
+                (&self.azblob_endpoint, "endpoint"),
+            ],
+            custom_validator: |missing: &mut Vec<&str>| {
+                // account_key is only required if sas_token is not provided
+                if self.azblob_sas_token.is_none()
+                    && self.azblob_account_key.is_empty()
+                {
+                    missing.push("account key (when sas_token is not provided)");
+                }
+            }
+        )
+    }
+}
+
 wrap_with_clap_prefix! {
    PrefixedS3Connection,
    "s3-",
+    "enable_s3",
    S3Connection,
    {
        #[doc = "The bucket of the object store."]
@@ -80,21 +246,39 @@ wrap_with_clap_prefix! {
        #[doc = "The root of the object store."]
        root: String = Default::default(),
        #[doc = "The access key ID of the object store."]
-        access_key_id: SecretString = Default::default(),
+        access_key_id: Option<SecretString>,
        #[doc = "The secret access key of the object store."]
-        secret_access_key: SecretString = Default::default(),
+        secret_access_key: Option<SecretString>,
        #[doc = "The endpoint of the object store."]
        endpoint: Option<String>,
        #[doc = "The region of the object store."]
        region: Option<String>,
        #[doc = "Enable virtual host style for the object store."]
        enable_virtual_host_style: bool = Default::default(),
+        #[doc = "Disable EC2 metadata service for the object store."]
+        disable_ec2_metadata: bool = Default::default(),
+    }
+}
+
+impl PrefixedS3Connection {
+    /// Validates the S3 connection options.
+    ///
+    /// The bucket, access key ID, secret access key and region are checked with
+    /// `FieldValidator::is_empty`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the boxed `MissingConfig` error built by `validate_backend!`, whose message
+    /// lists every missing option.
+    pub fn validate(&self) -> Result<(), BoxedError> {
+        validate_backend!(
+            // Validation is unconditional at this level.
+            enable: true,
+            name: "S3",
+            required: [
+                (&self.s3_bucket, "bucket"),
+                (&self.s3_access_key_id, "access key ID"),
+                (&self.s3_secret_access_key, "secret access key"),
+                (&self.s3_region, "region"),
+            ]
+        )
     }
 }

 wrap_with_clap_prefix! {
    PrefixedOssConnection,
    "oss-",
+    "enable_oss",
    OssConnection,
    {
        #[doc = "The bucket of the object store."]
@@ -102,17 +286,33 @@ wrap_with_clap_prefix! {
        #[doc = "The root of the object store."]
        root: String = Default::default(),
        #[doc = "The access key ID of the object store."]
-        access_key_id: SecretString = Default::default(),
+        access_key_id: Option<SecretString>,
        #[doc = "The access key secret of the object store."]
-        access_key_secret: SecretString = Default::default(),
+        access_key_secret: Option<SecretString>,
        #[doc = "The endpoint of the object store."]
        endpoint: String = Default::default(),
    }
 }

+impl PrefixedOssConnection {
+    /// Validates the OSS connection options.
+    ///
+    /// The bucket, access key ID, access key secret and endpoint are checked with
+    /// `FieldValidator::is_empty`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the boxed `MissingConfig` error built by `validate_backend!`, whose message
+    /// lists every missing option.
+    pub fn validate(&self) -> Result<(), BoxedError> {
+        validate_backend!(
+            // Validation is unconditional at this level.
+            enable: true,
+            name: "OSS",
+            required: [
+                (&self.oss_bucket, "bucket"),
+                (&self.oss_access_key_id, "access key ID"),
+                (&self.oss_access_key_secret, "access key secret"),
+                (&self.oss_endpoint, "endpoint"),
+            ]
+        )
+    }
+}
+
 wrap_with_clap_prefix! {
    PrefixedGcsConnection,
    "gcs-",
+    "enable_gcs",
    GcsConnection,
    {
        #[doc = "The root of the object store."]
@@ -122,40 +322,72 @@ wrap_with_clap_prefix! {
        #[doc = "The scope of the object store."]
        scope: String = Default::default(),
        #[doc = "The credential path of the object store."]
-        credential_path: SecretString = Default::default(),
+        credential_path: Option<SecretString>,
        #[doc = "The credential of the object store."]
-        credential: SecretString = Default::default(),
+        credential: Option<SecretString>,
        #[doc = "The endpoint of the object store."]
        endpoint: String = Default::default(),
    }
 }

-/// common config for object store.
+impl PrefixedGcsConnection {
+    /// Validates the GCS connection options.
+    ///
+    /// Only the bucket, root and scope are checked with `FieldValidator::is_empty`;
+    /// credentials and endpoint are deliberately not required (see the note below).
+    ///
+    /// # Errors
+    ///
+    /// Returns the boxed `MissingConfig` error built by `validate_backend!`, whose message
+    /// lists every missing option.
+    pub fn validate(&self) -> Result<(), BoxedError> {
+        validate_backend!(
+            // Validation is unconditional at this level.
+            enable: true,
+            name: "GCS",
+            required: [
+                (&self.gcs_bucket, "bucket"),
+                (&self.gcs_root, "root"),
+                (&self.gcs_scope, "scope"),
+            ]
+            // No custom_validator needed: GCS supports Application Default Credentials (ADC)
+            // where neither credential_path nor credential is required.
+            // Endpoint is also optional (defaults to https://storage.googleapis.com).
+        )
+    }
+}
+
+/// Common config for object store.
+///
+/// # Dependency Enforcement
+///
+/// Each backend's configuration fields (e.g., `--s3-bucket`) require their corresponding
+/// enable flag (e.g., `--s3`) to be present. This is enforced by `clap` at parse time
+/// using the `requires` attribute.
+///
+/// For example, attempting to use `--s3-bucket my-bucket` without `--s3` will result in:
+/// ```text
+/// error: The argument '--s3-bucket <BUCKET>' requires '--s3'
+/// ```
+///
+/// This ensures that users cannot accidentally provide backend-specific configuration
+/// without explicitly enabling that backend.
 #[derive(clap::Parser, Debug, Clone, PartialEq, Default)]
+#[clap(group(clap::ArgGroup::new("storage_backend").required(false).multiple(false)))]
 pub struct ObjectStoreConfig {
    /// Whether to use S3 object store.
-    #[clap(long, alias = "s3")]
+    #[clap(long = "s3", group = "storage_backend")]
    pub enable_s3: bool,

    #[clap(flatten)]
    pub s3: PrefixedS3Connection,

    /// Whether to use OSS.
-    #[clap(long, alias = "oss")]
+    #[clap(long = "oss", group = "storage_backend")]
    pub enable_oss: bool,

    #[clap(flatten)]
    pub oss: PrefixedOssConnection,

    /// Whether to use GCS.
-    #[clap(long, alias = "gcs")]
+    #[clap(long = "gcs", group = "storage_backend")]
    pub enable_gcs: bool,

    #[clap(flatten)]
    pub gcs: PrefixedGcsConnection,

    /// Whether to use Azure Blob.
-    #[clap(long, alias = "azblob")]
+    #[clap(long = "azblob", group = "storage_backend")]
    pub enable_azblob: bool,

    #[clap(flatten)]
@@ -173,52 +405,66 @@ pub fn new_fs_object_store(root: &str) -> std::result::Result<ObjectStore, Boxed
    Ok(with_instrument_layers(object_store, false))
 }

+/// Generates a `pub fn $method(&self) -> Result<ObjectStore, BoxedError>` builder method.
+///
+/// The generated method clones `self.$field`, converts it into `$conn_type`, logs the
+/// resulting config, builds an `ObjectStore` from `$service_type::from(&config)` and wraps
+/// it with the retry and instrument layers.
+///
+/// - `$method`: name of the generated method
+/// - `$field`: field of `self` holding the prefixed connection options
+/// - `$conn_type`: connection type the field is converted into
+/// - `$service_type`: service builder type constructed from the connection
 macro_rules! gen_object_store_builder {
+    ($method:ident, $field:ident, $conn_type:ty, $service_type:ty) => {
+        pub fn $method(&self) -> Result<ObjectStore, BoxedError> {
+            let config = <$conn_type>::from(self.$field.clone());
+            // The config is logged with its `Debug` representation.
+            common_telemetry::info!(
+                "Building object store with {}: {:?}",
+                stringify!($field),
+                config
+            );
+            // Initialization failures are reported as `InitBackend` errors, boxed.
+            let object_store = ObjectStore::new(<$service_type>::from(&config))
+                .context(error::InitBackendSnafu)
+                .map_err(BoxedError::new)?
+                .finish();
+            Ok(with_instrument_layers(
+                with_retry_layers(object_store),
+                false,
+            ))
+        }
+    };
+}
+
 impl ObjectStoreConfig {
+    gen_object_store_builder!(build_s3, s3, S3Connection, S3);
+
+    gen_object_store_builder!(build_oss, oss, OssConnection, Oss);
+
+    gen_object_store_builder!(build_gcs, gcs, GcsConnection, Gcs);
+
+    gen_object_store_builder!(build_azblob, azblob, AzblobConnection, Azblob);
+
+    pub fn validate(&self) -> Result<(), BoxedError> {
+        if self.enable_s3 {
+            self.s3.validate()?;
+        }
+        if self.enable_oss {
+            self.oss.validate()?;
+        }
+        if self.enable_gcs {
+            self.gcs.validate()?;
+        }
+        if self.enable_azblob {
+            self.azblob.validate()?;
+        }
+        Ok(())
+    }
+
    /// Builds the object store from the config.
    pub fn build(&self) -> Result<Option<ObjectStore>, BoxedError> {
-        let object_store = if self.enable_s3 {
-            let s3 = S3Connection::from(self.s3.clone());
-            common_telemetry::info!("Building object store with s3: {:?}", s3);
-            Some(
-                ObjectStore::new(S3::from(&s3))
-                    .context(error::InitBackendSnafu)
-                    .map_err(BoxedError::new)?
-                    .finish(),
-            )
+        self.validate()?;
+
+        if self.enable_s3 {
+            self.build_s3().map(Some)
        } else if self.enable_oss {
-            let oss = OssConnection::from(self.oss.clone());
-            common_telemetry::info!("Building object store with oss: {:?}", oss);
-            Some(
-                ObjectStore::new(Oss::from(&oss))
-                    .context(error::InitBackendSnafu)
-                    .map_err(BoxedError::new)?
-                    .finish(),
-            )
+            self.build_oss().map(Some)
        } else if self.enable_gcs {
-            let gcs = GcsConnection::from(self.gcs.clone());
-            common_telemetry::info!("Building object store with gcs: {:?}", gcs);
-            Some(
-                ObjectStore::new(Gcs::from(&gcs))
-                    .context(error::InitBackendSnafu)
-                    .map_err(BoxedError::new)?
-                    .finish(),
-            )
+            self.build_gcs().map(Some)
        } else if self.enable_azblob {
-            let azblob = AzblobConnection::from(self.azblob.clone());
-            common_telemetry::info!("Building object store with azblob: {:?}", azblob);
-            Some(
-                ObjectStore::new(Azblob::from(&azblob))
-                    .context(error::InitBackendSnafu)
-                    .map_err(BoxedError::new)?
-                    .finish(),
-            )
+            self.build_azblob().map(Some)
        } else {
-            None
-        };
-
-        let object_store = object_store
-            .map(|object_store| with_instrument_layers(with_retry_layers(object_store), false));
-
-        Ok(object_store)
+            Ok(None)
+        }
    }
 }
--- a/src/cli/src/common/store.rs
+++ b/src/cli/src/common/store.rs
@@ -19,7 +19,7 @@ use common_error::ext::BoxedError;
 use common_meta::kv_backend::KvBackendRef;
 use common_meta::kv_backend::chroot::ChrootKvBackend;
 use common_meta::kv_backend::etcd::EtcdStore;
-use meta_srv::metasrv::BackendImpl;
+use meta_srv::metasrv::{BackendClientOptions, BackendImpl};
 use meta_srv::utils::etcd::create_etcd_client_with_tls;
 use servers::tls::{TlsMode, TlsOption};

@@ -112,9 +112,13 @@ impl StoreConfig {
            let kvbackend = match self.backend {
                BackendImpl::EtcdStore => {
                    let tls_config = self.tls_config();
-                    let etcd_client = create_etcd_client_with_tls(store_addrs, tls_config.as_ref())
-                        .await
-                        .map_err(BoxedError::new)?;
+                    let etcd_client = create_etcd_client_with_tls(
+                        store_addrs,
+                        &BackendClientOptions::default(),
+                        tls_config.as_ref(),
+                    )
+                    .await
+                    .map_err(BoxedError::new)?;
                    Ok(EtcdStore::with_etcd_client(etcd_client, max_txn_ops))
                }
                #[cfg(feature = "pg_kvbackend")]
--- a/src/cli/src/data.rs
+++ b/src/cli/src/data.rs
@@ -14,6 +14,7 @@

 mod export;
 mod import;
+mod storage_export;

 use clap::Subcommand;
 use client::DEFAULT_CATALOG_NAME;
--- a/src/cli/src/data/export.rs
+++ b/src/cli/src/data/export.rs
--- a/src/cli/src/data/storage_export.rs
+++ b/src/cli/src/data/storage_export.rs
@@ -0,0 +1,373 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::path::PathBuf;
+
+use common_base::secrets::{ExposeSecret, SecretString};
+use common_error::ext::BoxedError;
+
+use crate::common::{
+    PrefixedAzblobConnection, PrefixedGcsConnection, PrefixedOssConnection, PrefixedS3Connection,
+};
+
+/// Borrows the plain-text value of an optional secret.
+///
+/// Yields the empty string when no secret is present.
+fn expose_optional_secret(secret: &Option<SecretString>) -> &str {
+    match secret {
+        Some(inner) => inner.expose_secret().as_str(),
+        None => "",
+    }
+}
+
+/// Helper function to format the root path as a URI path segment.
+///
+/// Leading and trailing slashes of `root` are dropped before a single `/` is prefixed, so
+/// roots such as `/data` or `data/` do not produce empty segments (`//`) in the final URI.
+/// Returns an empty string when `root` is empty or consists only of slashes.
+fn format_root_path(root: &str) -> String {
+    let trimmed = root.trim_matches('/');
+    if trimmed.is_empty() {
+        String::new()
+    } else {
+        format!("/{trimmed}")
+    }
+}
+
+/// Helper function to mask multiple secrets in a string.
+///
+/// Every non-empty entry of `secrets` that occurs in `sql` is replaced with `[REDACTED]`.
+/// Longer secrets are masked first, so a secret that contains another secret as a
+/// substring is never left partially visible. Empty entries are ignored.
+fn mask_secrets(mut sql: String, secrets: &[&str]) -> String {
+    let mut ordered: Vec<&str> = secrets
+        .iter()
+        .copied()
+        .filter(|secret| !secret.is_empty())
+        .collect();
+    // Stable sort: secrets of equal length keep their original relative order.
+    ordered.sort_by(|a, b| b.len().cmp(&a.len()));
+    for secret in ordered {
+        sql = sql.replace(secret, "[REDACTED]");
+    }
+    sql
+}
+
+/// Builds a `scheme://bucket[/root]/path` storage URI.
+///
+/// The root segment is produced by `format_root_path`.
+fn format_uri(scheme: &str, bucket: &str, root: &str, path: &str) -> String {
+    let root_segment = format_root_path(root);
+    format!("{scheme}://{bucket}{root_segment}/{path}")
+}
+
+/// Trait for storage backends that can be used for data export.
+pub trait StorageExport: Send + Sync {
+    /// Generate the storage path for COPY DATABASE command.
+    ///
+    /// Returns `(path, connection_string)`. The connection string is either empty (the
+    /// backend needs no connection options) or a clause of the form ` CONNECTION (...)`
+    /// with a leading space.
+    fn get_storage_path(&self, catalog: &str, schema: &str) -> (String, String);
+
+    /// Format the output path for logging purposes.
+    fn format_output_path(&self, file_path: &str) -> String;
+
+    /// Mask sensitive information in SQL commands for safe logging.
+    ///
+    /// Returns a new string; the input is not modified.
+    fn mask_sensitive_info(&self, sql: &str) -> String;
+}
+
+/// Defines a cloneable backend struct `$name` that wraps a `$config` value.
+///
+/// The generated `new` constructor calls `config.validate()` first and propagates its
+/// error, so the struct is only built from a config that passed validation.
+macro_rules! define_backend {
+    ($name:ident, $config:ty) => {
+        #[derive(Clone)]
+        pub struct $name {
+            config: $config,
+        }
+
+        impl $name {
+            pub fn new(config: $config) -> Result<Self, BoxedError> {
+                config.validate()?;
+                Ok(Self { config })
+            }
+        }
+    };
+}
+
+/// Local file system storage backend.
+#[derive(Clone)]
+pub struct FsBackend {
+    // Directory under which exported data is placed.
+    output_dir: String,
+}
+
+impl FsBackend {
+    /// Creates a backend exporting into `output_dir`.
+    ///
+    /// No validation happens here; an empty `output_dir` is only rejected later, when
+    /// `get_storage_path` is called.
+    pub fn new(output_dir: String) -> Self {
+        Self { output_dir }
+    }
+}
+
+impl StorageExport for FsBackend {
+    /// Joins `output_dir`, `catalog` and `"{schema}/"` into a local path.
+    ///
+    /// The connection string is always empty for the local file system.
+    ///
+    /// # Panics
+    ///
+    /// Panics (via `unreachable!`) when `output_dir` is empty.
+    fn get_storage_path(&self, catalog: &str, schema: &str) -> (String, String) {
+        if self.output_dir.is_empty() {
+            unreachable!("output_dir must be set when not using remote storage")
+        }
+        let path = PathBuf::from(&self.output_dir)
+            .join(catalog)
+            .join(format!("{schema}/"))
+            .to_string_lossy()
+            .to_string();
+        (path, String::new())
+    }
+
+    /// Returns `output_dir` and `file_path` joined with a `/`.
+    fn format_output_path(&self, file_path: &str) -> String {
+        format!("{}/{}", self.output_dir, file_path)
+    }
+
+    /// Returns `sql` unchanged: this backend adds no credentials to the statement.
+    fn mask_sensitive_info(&self, sql: &str) -> String {
+        sql.to_string()
+    }
+}
+
+define_backend!(S3Backend, PrefixedS3Connection);
+
+impl StorageExport for S3Backend {
+    /// Builds the `s3://bucket[/root]/catalog/schema/` path and its CONNECTION clause.
+    ///
+    /// The clause always carries `ACCESS_KEY_ID` and `SECRET_ACCESS_KEY`; `REGION` and
+    /// `ENDPOINT` are appended only when they are set.
+    fn get_storage_path(&self, catalog: &str, schema: &str) -> (String, String) {
+        let s3_path = format_uri(
+            "s3",
+            &self.config.s3_bucket,
+            &self.config.s3_root,
+            &format!("{}/{}/", catalog, schema),
+        );
+
+        // NOTE(review): values are placed between single quotes without escaping.
+        let mut connection_options = vec![
+            format!(
+                "ACCESS_KEY_ID='{}'",
+                expose_optional_secret(&self.config.s3_access_key_id)
+            ),
+            format!(
+                "SECRET_ACCESS_KEY='{}'",
+                expose_optional_secret(&self.config.s3_secret_access_key)
+            ),
+        ];
+
+        if let Some(region) = &self.config.s3_region {
+            connection_options.push(format!("REGION='{}'", region));
+        }
+
+        if let Some(endpoint) = &self.config.s3_endpoint {
+            connection_options.push(format!("ENDPOINT='{}'", endpoint));
+        }
+
+        let connection_str = format!(" CONNECTION ({})", connection_options.join(", "));
+        (s3_path, connection_str)
+    }
+
+    /// Returns the `s3://` URI of `file_path` below the configured bucket and root.
+    fn format_output_path(&self, file_path: &str) -> String {
+        format_uri(
+            "s3",
+            &self.config.s3_bucket,
+            &self.config.s3_root,
+            file_path,
+        )
+    }
+
+    /// Replaces the access key ID and secret access key found in `sql` with `[REDACTED]`.
+    fn mask_sensitive_info(&self, sql: &str) -> String {
+        mask_secrets(
+            sql.to_string(),
+            &[
+                expose_optional_secret(&self.config.s3_access_key_id),
+                expose_optional_secret(&self.config.s3_secret_access_key),
+            ],
+        )
+    }
+}
+
+define_backend!(OssBackend, PrefixedOssConnection);
+
+impl StorageExport for OssBackend {
+    /// Builds the `oss://bucket[/root]/catalog/schema/` path and its CONNECTION clause.
+    ///
+    /// The clause carries `ACCESS_KEY_ID` and `ACCESS_KEY_SECRET` only.
+    fn get_storage_path(&self, catalog: &str, schema: &str) -> (String, String) {
+        let oss_path = format_uri(
+            "oss",
+            &self.config.oss_bucket,
+            &self.config.oss_root,
+            &format!("{}/{}/", catalog, schema),
+        );
+
+        // NOTE(review): `oss_endpoint` is required by validation but is not forwarded in
+        // the CONNECTION clause — confirm this is intended.
+        let connection_options = [
+            format!(
+                "ACCESS_KEY_ID='{}'",
+                expose_optional_secret(&self.config.oss_access_key_id)
+            ),
+            format!(
+                "ACCESS_KEY_SECRET='{}'",
+                expose_optional_secret(&self.config.oss_access_key_secret)
+            ),
+        ];
+
+        let connection_str = format!(" CONNECTION ({})", connection_options.join(", "));
+        (oss_path, connection_str)
+    }
+
+    /// Returns the `oss://` URI of `file_path` below the configured bucket and root.
+    fn format_output_path(&self, file_path: &str) -> String {
+        format_uri(
+            "oss",
+            &self.config.oss_bucket,
+            &self.config.oss_root,
+            file_path,
+        )
+    }
+
+    /// Replaces the access key ID and access key secret found in `sql` with `[REDACTED]`.
+    fn mask_sensitive_info(&self, sql: &str) -> String {
+        mask_secrets(
+            sql.to_string(),
+            &[
+                expose_optional_secret(&self.config.oss_access_key_id),
+                expose_optional_secret(&self.config.oss_access_key_secret),
+            ],
+        )
+    }
+}
+
+define_backend!(GcsBackend, PrefixedGcsConnection);
+
+impl StorageExport for GcsBackend {
+    /// Builds the `gcs://bucket[/root]/catalog/schema/` path and its CONNECTION clause.
+    ///
+    /// `CREDENTIAL_PATH`, `CREDENTIAL` and `ENDPOINT` are each included only when
+    /// non-empty; when none is set the connection string is empty.
+    fn get_storage_path(&self, catalog: &str, schema: &str) -> (String, String) {
+        let gcs_path = format_uri(
+            "gcs",
+            &self.config.gcs_bucket,
+            &self.config.gcs_root,
+            &format!("{}/{}/", catalog, schema),
+        );
+
+        let mut connection_options = Vec::new();
+
+        let credential_path = expose_optional_secret(&self.config.gcs_credential_path);
+        if !credential_path.is_empty() {
+            connection_options.push(format!("CREDENTIAL_PATH='{}'", credential_path));
+        }
+
+        let credential = expose_optional_secret(&self.config.gcs_credential);
+        if !credential.is_empty() {
+            connection_options.push(format!("CREDENTIAL='{}'", credential));
+        }
+
+        if !self.config.gcs_endpoint.is_empty() {
+            connection_options.push(format!("ENDPOINT='{}'", self.config.gcs_endpoint));
+        }
+
+        // Omit the clause entirely rather than emitting an empty ` CONNECTION ()`.
+        let connection_str = if connection_options.is_empty() {
+            String::new()
+        } else {
+            format!(" CONNECTION ({})", connection_options.join(", "))
+        };
+
+        (gcs_path, connection_str)
+    }
+
+    /// Returns the `gcs://` URI of `file_path` below the configured bucket and root.
+    fn format_output_path(&self, file_path: &str) -> String {
+        format_uri(
+            "gcs",
+            &self.config.gcs_bucket,
+            &self.config.gcs_root,
+            file_path,
+        )
+    }
+
+    /// Replaces the credential path and credential found in `sql` with `[REDACTED]`.
+    fn mask_sensitive_info(&self, sql: &str) -> String {
+        mask_secrets(
+            sql.to_string(),
+            &[
+                expose_optional_secret(&self.config.gcs_credential_path),
+                expose_optional_secret(&self.config.gcs_credential),
+            ],
+        )
+    }
+}
+
+define_backend!(AzblobBackend, PrefixedAzblobConnection);
+
+impl StorageExport for AzblobBackend {
+    /// Builds the `azblob://container[/root]/catalog/schema/` path and its CONNECTION clause.
+    ///
+    /// The clause always carries `ACCOUNT_NAME` and `ACCOUNT_KEY` (empty when unset);
+    /// `SAS_TOKEN` is appended only when a SAS token is configured.
+    fn get_storage_path(&self, catalog: &str, schema: &str) -> (String, String) {
+        let azblob_path = format_uri(
+            "azblob",
+            &self.config.azblob_container,
+            &self.config.azblob_root,
+            &format!("{}/{}/", catalog, schema),
+        );
+
+        // NOTE(review): `azblob_endpoint` is required by validation but is not forwarded
+        // in the CONNECTION clause — confirm this is intended.
+        let mut connection_options = vec![
+            format!(
+                "ACCOUNT_NAME='{}'",
+                expose_optional_secret(&self.config.azblob_account_name)
+            ),
+            format!(
+                "ACCOUNT_KEY='{}'",
+                expose_optional_secret(&self.config.azblob_account_key)
+            ),
+        ];
+
+        if let Some(sas_token) = &self.config.azblob_sas_token {
+            connection_options.push(format!("SAS_TOKEN='{}'", sas_token));
+        }
+
+        let connection_str = format!(" CONNECTION ({})", connection_options.join(", "));
+        (azblob_path, connection_str)
+    }
+
+    /// Returns the `azblob://` URI of `file_path` below the configured container and root.
+    fn format_output_path(&self, file_path: &str) -> String {
+        format_uri(
+            "azblob",
+            &self.config.azblob_container,
+            &self.config.azblob_root,
+            file_path,
+        )
+    }
+
+    /// Replaces the account name, account key and SAS token found in `sql` with
+    /// `[REDACTED]`.
+    ///
+    /// The SAS token is a credential that `get_storage_path` embeds in the CONNECTION
+    /// clause, so it must be masked together with the account credentials.
+    fn mask_sensitive_info(&self, sql: &str) -> String {
+        mask_secrets(
+            sql.to_string(),
+            &[
+                expose_optional_secret(&self.config.azblob_account_name),
+                expose_optional_secret(&self.config.azblob_account_key),
+                self.config.azblob_sas_token.as_deref().unwrap_or(""),
+            ],
+        )
+    }
+}
+
+/// The storage backend selected for an export, one variant per supported backend.
+#[derive(Clone)]
+pub enum StorageType {
+    /// Local file system.
+    Fs(FsBackend),
+    /// S3 object storage.
+    S3(S3Backend),
+    /// OSS object storage.
+    Oss(OssBackend),
+    /// GCS object storage.
+    Gcs(GcsBackend),
+    /// Azure Blob storage.
+    Azblob(AzblobBackend),
+}
+
+// Every method simply forwards to the backend held by the active variant.
+impl StorageExport for StorageType {
+    fn get_storage_path(&self, catalog: &str, schema: &str) -> (String, String) {
+        match self {
+            StorageType::Fs(backend) => backend.get_storage_path(catalog, schema),
+            StorageType::S3(backend) => backend.get_storage_path(catalog, schema),
+            StorageType::Oss(backend) => backend.get_storage_path(catalog, schema),
+            StorageType::Gcs(backend) => backend.get_storage_path(catalog, schema),
+            StorageType::Azblob(backend) => backend.get_storage_path(catalog, schema),
+        }
+    }
+
+    fn format_output_path(&self, file_path: &str) -> String {
+        match self {
+            StorageType::Fs(backend) => backend.format_output_path(file_path),
+            StorageType::S3(backend) => backend.format_output_path(file_path),
+            StorageType::Oss(backend) => backend.format_output_path(file_path),
+            StorageType::Gcs(backend) => backend.format_output_path(file_path),
+            StorageType::Azblob(backend) => backend.format_output_path(file_path),
+        }
+    }
+
+    fn mask_sensitive_info(&self, sql: &str) -> String {
+        match self {
+            StorageType::Fs(backend) => backend.mask_sensitive_info(sql),
+            StorageType::S3(backend) => backend.mask_sensitive_info(sql),
+            StorageType::Oss(backend) => backend.mask_sensitive_info(sql),
+            StorageType::Gcs(backend) => backend.mask_sensitive_info(sql),
+            StorageType::Azblob(backend) => backend.mask_sensitive_info(sql),
+        }
+    }
+}
+
+impl StorageType {
+    /// Tells whether the export targets an object store rather than the local filesystem.
+    pub fn is_remote_storage(&self) -> bool {
+        match self {
+            StorageType::Fs(_) => false,
+            StorageType::S3(_)
+            | StorageType::Oss(_)
+            | StorageType::Gcs(_)
+            | StorageType::Azblob(_) => true,
+        }
+    }
+}
--- a/src/cli/src/error.rs
+++ b/src/cli/src/error.rs
@@ -253,12 +253,6 @@ pub enum Error {
        error: ObjectStoreError,
    },

-    #[snafu(display("S3 config need be set"))]
-    S3ConfigNotSet {
-        #[snafu(implicit)]
-        location: Location,
-    },
-
    #[snafu(display("Output directory not set"))]
    OutputDirNotSet {
        #[snafu(implicit)]
@@ -364,9 +358,9 @@ impl ErrorExt for Error {

            Error::Other { source, .. } => source.status_code(),
            Error::OpenDal { .. } | Error::InitBackend { .. } => StatusCode::Internal,
-            Error::S3ConfigNotSet { .. }
-            | Error::OutputDirNotSet { .. }
-            | Error::EmptyStoreAddrs { .. } => StatusCode::InvalidArguments,
+            Error::OutputDirNotSet { .. } | Error::EmptyStoreAddrs { .. } => {
+                StatusCode::InvalidArguments
+            }

            Error::BuildRuntime { source, .. } => source.status_code(),

--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -145,6 +145,17 @@ impl ObjbenchCommand {
        let region_meta = extract_region_metadata(&self.source, &parquet_meta)?;
        let num_rows = parquet_meta.file_metadata().num_rows() as u64;
        let num_row_groups = parquet_meta.num_row_groups() as u64;
+        let max_row_group_uncompressed_size: u64 = parquet_meta
+            .row_groups()
+            .iter()
+            .map(|rg| {
+                rg.columns()
+                    .iter()
+                    .map(|c| c.uncompressed_size() as u64)
+                    .sum::<u64>()
+            })
+            .max()
+            .unwrap_or(0);

        println!(
            "{} Metadata loaded - rows: {}, size: {} bytes",
@@ -160,6 +171,7 @@ impl ObjbenchCommand {
            time_range: Default::default(),
            level: 0,
            file_size,
+            max_row_group_uncompressed_size,
            available_indexes: Default::default(),
            indexes: Default::default(),
            index_file_size: 0,
--- a/src/cmd/src/frontend.rs
+++ b/src/cmd/src/frontend.rs
@@ -52,7 +52,7 @@ use plugins::frontend::context::{
 };
 use servers::addrs;
 use servers::grpc::GrpcOptions;
-use servers::tls::{TlsMode, TlsOption};
+use servers::tls::{TlsMode, TlsOption, merge_tls_option};
 use snafu::{OptionExt, ResultExt};
 use tracing_appender::non_blocking::WorkerGuard;

@@ -256,7 +256,7 @@ impl StartCommand {

        if let Some(addr) = &self.rpc_bind_addr {
            opts.grpc.bind_addr.clone_from(addr);
-            opts.grpc.tls = tls_opts.clone();
+            opts.grpc.tls = merge_tls_option(&opts.grpc.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.rpc_server_addr {
@@ -291,13 +291,13 @@ impl StartCommand {
        if let Some(addr) = &self.mysql_addr {
            opts.mysql.enable = true;
            opts.mysql.addr.clone_from(addr);
-            opts.mysql.tls = tls_opts.clone();
+            opts.mysql.tls = merge_tls_option(&opts.mysql.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.postgres_addr {
            opts.postgres.enable = true;
            opts.postgres.addr.clone_from(addr);
-            opts.postgres.tls = tls_opts;
+            opts.postgres.tls = merge_tls_option(&opts.postgres.tls, tls_opts.clone());
        }

        if let Some(enable) = self.influxdb_enable {
--- a/src/cmd/src/metasrv.rs
+++ b/src/cmd/src/metasrv.rs
@@ -20,6 +20,7 @@ use async_trait::async_trait;
 use clap::Parser;
 use common_base::Plugins;
 use common_config::Configurable;
+use common_meta::distributed_time_constants::init_distributed_time_constants;
 use common_telemetry::info;
 use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions};
 use common_version::{short_version, verbose_version};
@@ -327,6 +328,7 @@ impl StartCommand {
        log_versions(verbose_version(), short_version(), APP_NAME);
        maybe_activate_heap_profile(&opts.component.memory);
        create_resource_limit_metrics(APP_NAME);
+        init_distributed_time_constants(opts.component.heartbeat_interval);

        info!("Metasrv start command: {:#?}", self);

--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -62,7 +62,7 @@ use plugins::frontend::context::{
    CatalogManagerConfigureContext, StandaloneCatalogManagerConfigureContext,
 };
 use plugins::standalone::context::DdlManagerConfigureContext;
-use servers::tls::{TlsMode, TlsOption};
+use servers::tls::{TlsMode, TlsOption, merge_tls_option};
 use snafu::ResultExt;
 use standalone::StandaloneInformationExtension;
 use standalone::options::StandaloneOptions;
@@ -293,19 +293,20 @@ impl StartCommand {
                    ),
                }.fail();
            }
-            opts.grpc.bind_addr.clone_from(addr)
+            opts.grpc.bind_addr.clone_from(addr);
+            opts.grpc.tls = merge_tls_option(&opts.grpc.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.mysql_addr {
            opts.mysql.enable = true;
            opts.mysql.addr.clone_from(addr);
-            opts.mysql.tls = tls_opts.clone();
+            opts.mysql.tls = merge_tls_option(&opts.mysql.tls, tls_opts.clone());
        }

        if let Some(addr) = &self.postgres_addr {
            opts.postgres.enable = true;
            opts.postgres.addr.clone_from(addr);
-            opts.postgres.tls = tls_opts;
+            opts.postgres.tls = merge_tls_option(&opts.postgres.tls, tls_opts.clone());
        }

        if self.influxdb_enable {
@@ -551,9 +552,8 @@ impl StartCommand {
        let grpc_handler = fe_instance.clone() as Arc<dyn GrpcQueryHandlerWithBoxedError>;
        let weak_grpc_handler = Arc::downgrade(&grpc_handler);
        frontend_instance_handler
-            .lock()
-            .unwrap()
-            .replace(weak_grpc_handler);
+            .set_handler(weak_grpc_handler)
+            .await;

        // set the frontend invoker for flownode
        let flow_streaming_engine = flownode.flow_engine().streaming_engine();
@@ -765,7 +765,6 @@ mod tests {
            user_provider: Some("static_user_provider:cmd:test=test".to_string()),
            mysql_addr: Some("127.0.0.1:4002".to_string()),
            postgres_addr: Some("127.0.0.1:4003".to_string()),
-            tls_watch: true,
            ..Default::default()
        };

@@ -782,8 +781,6 @@ mod tests {

        assert_eq!("./greptimedb_data/test/logs", opts.logging.dir);
        assert_eq!("debug", opts.logging.level.unwrap());
-        assert!(opts.mysql.tls.watch);
-        assert!(opts.postgres.tls.watch);
    }

    #[test]
--- a/src/common/config/src/error.rs
+++ b/src/common/config/src/error.rs
@@ -59,15 +59,6 @@ pub enum Error {
        location: Location,
    },

-    #[snafu(display("Failed to canonicalize path: {}", path))]
-    CanonicalizePath {
-        path: String,
-        #[snafu(source)]
-        error: std::io::Error,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
    #[snafu(display("Invalid path '{}': expected a file, not a directory", path))]
    InvalidPath {
        path: String,
@@ -82,8 +73,7 @@ impl ErrorExt for Error {
            Error::TomlFormat { .. }
            | Error::LoadLayeredConfig { .. }
            | Error::FileWatch { .. }
-            | Error::InvalidPath { .. }
-            | Error::CanonicalizePath { .. } => StatusCode::InvalidArguments,
+            | Error::InvalidPath { .. } => StatusCode::InvalidArguments,
            Error::SerdeJson { .. } => StatusCode::Unexpected,
        }
    }
--- a/src/common/config/src/file_watcher.rs
+++ b/src/common/config/src/file_watcher.rs
@@ -30,7 +30,7 @@ use common_telemetry::{error, info, warn};
 use notify::{EventKind, RecursiveMode, Watcher};
 use snafu::ResultExt;

-use crate::error::{CanonicalizePathSnafu, FileWatchSnafu, InvalidPathSnafu, Result};
+use crate::error::{FileWatchSnafu, InvalidPathSnafu, Result};

 /// Configuration for the file watcher behavior.
 #[derive(Debug, Clone, Default)]
@@ -41,15 +41,10 @@ pub struct FileWatcherConfig {

 impl FileWatcherConfig {
    pub fn new() -> Self {
-        Self::default()
+        Default::default()
    }

-    pub fn with_modify_and_create(mut self) -> Self {
-        self.include_remove_events = false;
-        self
-    }
-
-    pub fn with_remove_events(mut self) -> Self {
+    pub fn include_remove_events(mut self) -> Self {
        self.include_remove_events = true;
        self
    }
@@ -93,11 +88,8 @@ impl FileWatcherBuilder {
                path: path.display().to_string(),
            }
        );
-        // Canonicalize the path for reliable comparison with event paths
-        let canonical = path.canonicalize().context(CanonicalizePathSnafu {
-            path: path.display().to_string(),
-        })?;
-        self.file_paths.push(canonical);
+
+        self.file_paths.push(path.to_path_buf());
        Ok(self)
    }

@@ -144,7 +136,6 @@ impl FileWatcherBuilder {
        }

        let config = self.config;
-        let watched_files: HashSet<PathBuf> = self.file_paths.iter().cloned().collect();

        info!(
            "Spawning file watcher for paths: {:?} (watching parent directories)",
@@ -165,25 +156,7 @@ impl FileWatcherBuilder {
                            continue;
                        }

-                        // Check if any of the event paths match our watched files
-                        let is_watched_file = event.paths.iter().any(|event_path| {
-                            // Try to canonicalize the event path for comparison
-                            // If the file was deleted, canonicalize will fail, so we also
-                            // compare the raw path
-                            if let Ok(canonical) = event_path.canonicalize()
-                                && watched_files.contains(&canonical)
-                            {
-                                return true;
-                            }
-                            // For deleted files, compare using the raw path
-                            watched_files.contains(event_path)
-                        });
-
-                        if !is_watched_file {
-                            continue;
-                        }
-
-                        info!(?event.kind, ?event.paths, "Detected file change");
+                        info!(?event.kind, ?event.paths, "Detected folder change");
                        callback();
                    }
                    Err(err) => {
@@ -301,55 +274,4 @@ mod tests {
            "Watcher should have detected file recreation"
        );
    }
-
-    #[test]
-    fn test_file_watcher_ignores_other_files() {
-        common_telemetry::init_default_ut_logging();
-
-        let dir = create_temp_dir("test_file_watcher_other");
-        let watched_file = dir.path().join("watched.txt");
-        let other_file = dir.path().join("other.txt");
-
-        // Create both files
-        std::fs::write(&watched_file, "watched content").unwrap();
-        std::fs::write(&other_file, "other content").unwrap();
-
-        let counter = Arc::new(AtomicUsize::new(0));
-        let counter_clone = counter.clone();
-
-        FileWatcherBuilder::new()
-            .watch_path(&watched_file)
-            .unwrap()
-            .config(FileWatcherConfig::new())
-            .spawn(move || {
-                counter_clone.fetch_add(1, Ordering::SeqCst);
-            })
-            .unwrap();
-
-        // Give watcher time to start
-        std::thread::sleep(Duration::from_millis(100));
-
-        // Modify the other file - should NOT trigger callback
-        std::fs::write(&other_file, "modified other content").unwrap();
-
-        // Wait for potential event
-        std::thread::sleep(Duration::from_millis(500));
-
-        assert_eq!(
-            counter.load(Ordering::SeqCst),
-            0,
-            "Watcher should not have detected changes to other files"
-        );
-
-        // Now modify the watched file - SHOULD trigger callback
-        std::fs::write(&watched_file, "modified watched content").unwrap();
-
-        // Wait for the event to be processed
-        std::thread::sleep(Duration::from_millis(500));
-
-        assert!(
-            counter.load(Ordering::SeqCst) >= 1,
-            "Watcher should have detected change to watched file"
-        );
-    }
 }
--- a/src/common/datasource/src/object_store/s3.rs
+++ b/src/common/datasource/src/object_store/s3.rs
@@ -27,6 +27,7 @@ const SECRET_ACCESS_KEY: &str = "secret_access_key";
 const SESSION_TOKEN: &str = "session_token";
 const REGION: &str = "region";
 const ENABLE_VIRTUAL_HOST_STYLE: &str = "enable_virtual_host_style";
+const DISABLE_EC2_METADATA: &str = "disable_ec2_metadata";

 pub fn is_supported_in_s3(key: &str) -> bool {
    [
@@ -36,6 +37,7 @@ pub fn is_supported_in_s3(key: &str) -> bool {
        SESSION_TOKEN,
        REGION,
        ENABLE_VIRTUAL_HOST_STYLE,
+        DISABLE_EC2_METADATA,
    ]
    .contains(&key)
 }
@@ -82,6 +84,21 @@ pub fn build_s3_backend(
        }
    }

+    if let Some(disable_str) = connection.get(DISABLE_EC2_METADATA) {
+        let disable = disable_str.as_str().parse::<bool>().map_err(|e| {
+            error::InvalidConnectionSnafu {
+                msg: format!(
+                    "failed to parse the option {}={}, {}",
+                    DISABLE_EC2_METADATA, disable_str, e
+                ),
+            }
+            .build()
+        })?;
+        if disable {
+            builder = builder.disable_ec2_metadata();
+        }
+    }
+
    // TODO(weny): Consider finding a better way to eliminate duplicate code.
    Ok(ObjectStore::new(builder)
        .context(error::BuildBackendSnafu)?
@@ -109,6 +126,7 @@ mod tests {
        assert!(is_supported_in_s3(SESSION_TOKEN));
        assert!(is_supported_in_s3(REGION));
        assert!(is_supported_in_s3(ENABLE_VIRTUAL_HOST_STYLE));
+        assert!(is_supported_in_s3(DISABLE_EC2_METADATA));
        assert!(!is_supported_in_s3("foo"))
    }
 }
--- a/src/common/function/Cargo.toml
+++ b/src/common/function/Cargo.toml
@@ -19,7 +19,7 @@ arc-swap = "1.0"
 arrow.workspace = true
 arrow-schema.workspace = true
 async-trait.workspace = true
-bincode = "1.3"
+bincode = "=1.3.3"
 catalog.workspace = true
 chrono.workspace = true
 common-base.workspace = true
--- a/src/common/memory-manager/Cargo.toml
+++ b/src/common/memory-manager/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "common-memory-manager"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[lints]
+workspace = true
+
+[dependencies]
+common-error = { workspace = true }
+common-macro = { workspace = true }
+common-telemetry = { workspace = true }
+humantime = { workspace = true }
+serde = { workspace = true }
+snafu = { workspace = true }
+tokio = { workspace = true, features = ["sync"] }
+
+[dev-dependencies]
+tokio = { workspace = true, features = ["rt", "macros"] }
--- a/src/common/memory-manager/src/error.rs
+++ b/src/common/memory-manager/src/error.rs
@@ -0,0 +1,63 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+use std::time::Duration;
+
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::Snafu;
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+// Error type of the memory manager. The Snafu derive provides the per-variant
+// context selectors (`MemoryLimitExceededSnafu`, ...) used by the manager code.
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error {
+    // Built by `acquire` when a request needs more permits than the total limit, and by
+    // `acquire_with_policy` under the `Fail` policy when `try_acquire` cannot grant the request.
+    #[snafu(display(
+        "Memory limit exceeded: requested {requested_bytes} bytes, limit {limit_bytes} bytes"
+    ))]
+    MemoryLimitExceeded {
+        requested_bytes: u64,
+        limit_bytes: u64,
+    },
+
+    // Built by `acquire` when waiting on the semaphore returns an error.
+    #[snafu(display("Memory semaphore unexpectedly closed"))]
+    MemorySemaphoreClosed,
+
+    // Built by `acquire_with_policy` under the `Wait` policy when the timeout elapses;
+    // `waited` carries the configured timeout value.
+    #[snafu(display(
+        "Timeout waiting for memory quota: requested {requested_bytes} bytes, waited {waited:?}"
+    ))]
+    MemoryAcquireTimeout {
+        requested_bytes: u64,
+        waited: Duration,
+    },
+}
+
+impl ErrorExt for Error {
+    /// Maps each error variant to the status code reported for it.
+    fn status_code(&self) -> StatusCode {
+        match self {
+            // Exceeding the limit and timing out while waiting both report resource exhaustion.
+            Self::MemoryLimitExceeded { .. } | Self::MemoryAcquireTimeout { .. } => {
+                StatusCode::RuntimeResourcesExhausted
+            }
+            Self::MemorySemaphoreClosed => StatusCode::Unexpected,
+        }
+    }
+
+    /// Returns this error as a `dyn Any` reference.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
--- a/src/common/memory-manager/src/granularity.rs
+++ b/src/common/memory-manager/src/granularity.rs
@@ -0,0 +1,168 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::fmt;
+
+/// Memory permit granularity for different use cases.
+///
+/// Each variant fixes how many bytes a single semaphore permit represents:
+/// 1024 for `Kilobyte` and 1024 * 1024 for `Megabyte`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum PermitGranularity {
+    /// 1 KB per permit
+    ///
+    /// Use for:
+    /// - HTTP/gRPC request limiting (small, high-concurrency operations)
+    /// - Small batch operations
+    /// - Scenarios requiring fine-grained fairness
+    Kilobyte,
+
+    /// 1 MB per permit (default)
+    ///
+    /// Use for:
+    /// - Query execution memory management
+    /// - Compaction memory control
+    /// - Large, long-running operations
+    #[default]
+    Megabyte,
+}
+
+impl PermitGranularity {
+    /// Size of a single permit, in bytes.
+    #[inline]
+    pub const fn bytes(self) -> u64 {
+        const KIB: u64 = 1024;
+        match self {
+            Self::Kilobyte => KIB,
+            Self::Megabyte => KIB * KIB,
+        }
+    }
+
+    /// Short human-readable label for this granularity ("1KB" or "1MB").
+    pub const fn as_str(self) -> &'static str {
+        match self {
+            Self::Megabyte => "1MB",
+            Self::Kilobyte => "1KB",
+        }
+    }
+
+    /// Number of permits needed to cover `bytes`.
+    ///
+    /// The byte count is rounded up to a whole number of permits (the addition
+    /// saturates instead of overflowing), and the result is capped at both
+    /// `Semaphore::MAX_PERMITS` and `u32::MAX`.
+    #[inline]
+    pub fn bytes_to_permits(self, bytes: u64) -> u32 {
+        use tokio::sync::Semaphore;
+
+        let unit = self.bytes();
+        let cap = (Semaphore::MAX_PERMITS as u64).min(u32::MAX as u64);
+        let rounded_up = bytes.saturating_add(unit - 1) / unit;
+        rounded_up.min(cap) as u32
+    }
+
+    /// Number of bytes represented by `permits`, saturating on overflow.
+    #[inline]
+    pub fn permits_to_bytes(self, permits: u32) -> u64 {
+        u64::from(permits).saturating_mul(self.bytes())
+    }
+}
+
+impl fmt::Display for PermitGranularity {
+    /// Writes the same label that `as_str` returns.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const KB: PermitGranularity = PermitGranularity::Kilobyte;
+    const MB: PermitGranularity = PermitGranularity::Megabyte;
+
+    /// Checks `bytes_to_permits` against a table of `(bytes, expected permits)` pairs.
+    fn assert_permits(granularity: PermitGranularity, table: &[(u64, u32)]) {
+        for &(bytes, expected) in table {
+            assert_eq!(granularity.bytes_to_permits(bytes), expected);
+        }
+    }
+
+    #[test]
+    fn test_bytes_to_permits_kilobyte() {
+        assert_permits(
+            KB,
+            &[
+                // Exact multiples
+                (1024, 1),
+                (2048, 2),
+                (10 * 1024, 10),
+                // Rounds up
+                (1, 1),
+                (1025, 2),
+                (2047, 2),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_bytes_to_permits_megabyte() {
+        assert_permits(
+            MB,
+            &[
+                // Exact multiples
+                (1024 * 1024, 1),
+                (2 * 1024 * 1024, 2),
+                // Rounds up
+                (1, 1),
+                (1024, 1),
+                (1024 * 1024 + 1, 2),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_bytes_to_permits_zero_bytes() {
+        for granularity in [KB, MB] {
+            assert_eq!(granularity.bytes_to_permits(0), 0);
+        }
+    }
+
+    #[test]
+    fn test_bytes_to_permits_clamps_to_maximum() {
+        use tokio::sync::Semaphore;
+
+        let ceiling = (Semaphore::MAX_PERMITS as u64).min(u32::MAX as u64) as u32;
+
+        for granularity in [KB, MB] {
+            assert_eq!(granularity.bytes_to_permits(u64::MAX), ceiling);
+        }
+    }
+
+    #[test]
+    fn test_permits_to_bytes() {
+        let table: [(PermitGranularity, u32, u64); 4] = [
+            (KB, 1, 1024),
+            (KB, 10, 10 * 1024),
+            (MB, 1, 1024 * 1024),
+            (MB, 10, 10 * 1024 * 1024),
+        ];
+        for (granularity, permits, expected_bytes) in table {
+            assert_eq!(granularity.permits_to_bytes(permits), expected_bytes);
+        }
+    }
+
+    #[test]
+    fn test_round_trip_conversion() {
+        // bytes -> permits -> bytes must cover the request, rounded up to whole permits:
+        // 1500 bytes becomes 2KB at kilobyte granularity and 1MB at megabyte granularity.
+        let table: [(PermitGranularity, u64); 2] = [(KB, 2048), (MB, 1024 * 1024)];
+        for (granularity, expected_bytes) in table {
+            let permits = granularity.bytes_to_permits(1500);
+            let covered = granularity.permits_to_bytes(permits);
+            assert!(covered >= 1500);
+            assert_eq!(covered, expected_bytes);
+        }
+    }
+}
--- a/src/common/memory-manager/src/guard.rs
+++ b/src/common/memory-manager/src/guard.rs
@@ -0,0 +1,141 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::{fmt, mem};
+
+use common_telemetry::debug;
+use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};
+
+use crate::manager::{MemoryMetrics, MemoryQuota};
+
+/// Guard representing a slice of reserved memory.
+///
+/// When the guard is dropped, any semaphore permit it holds is released and the
+/// quota's in-use metric is refreshed.
+pub struct MemoryGuard<M: MemoryMetrics> {
+    /// Whether this guard holds permits (`Limited`) or none at all (`Unlimited`).
+    pub(crate) state: GuardState<M>,
+}
+
+/// Internal state of a `MemoryGuard`.
+pub(crate) enum GuardState<M: MemoryMetrics> {
+    /// No permits are held; used for guards handed out when no limit is configured.
+    Unlimited,
+    /// Permits are held against a quota.
+    Limited {
+        /// Owned permit(s) taken from the quota's semaphore.
+        permit: OwnedSemaphorePermit,
+        /// Clone of the quota the permit came from, used for unit conversion and metrics.
+        quota: MemoryQuota<M>,
+    },
+}
+
+impl<M: MemoryMetrics> MemoryGuard<M> {
+    /// Builds a guard that holds no permits.
+    pub(crate) fn unlimited() -> Self {
+        let state = GuardState::Unlimited;
+        Self { state }
+    }
+
+    /// Builds a guard that owns `permit`, remembering the `quota` it belongs to.
+    pub(crate) fn limited(permit: OwnedSemaphorePermit, quota: MemoryQuota<M>) -> Self {
+        let state = GuardState::Limited { permit, quota };
+        Self { state }
+    }
+
+    /// Returns the amount of memory held by this guard, in bytes.
+    ///
+    /// An unlimited guard holds nothing and reports 0.
+    pub fn granted_bytes(&self) -> u64 {
+        if let GuardState::Limited { permit, quota } = &self.state {
+            quota.permits_to_bytes(permit.num_permits() as u32)
+        } else {
+            0
+        }
+    }
+
+    /// Attempts to grow this guard by `bytes` without waiting.
+    ///
+    /// Returns true when the extra memory was obtained and merged into the guard
+    /// (always the case for unlimited guards and for zero-byte requests).
+    /// Returns false, leaving the guard untouched, when the permits cannot be acquired.
+    pub fn request_additional(&mut self, bytes: u64) -> bool {
+        let GuardState::Limited { permit, quota } = &mut self.state else {
+            return true;
+        };
+        if bytes == 0 {
+            return true;
+        }
+
+        let wanted = quota.bytes_to_permits(bytes);
+        let semaphore = quota.semaphore.clone();
+        match semaphore.try_acquire_many_owned(wanted) {
+            Ok(extra) => {
+                permit.merge(extra);
+                quota.update_in_use_metric();
+                debug!("Allocated additional {} bytes", bytes);
+                true
+            }
+            Err(TryAcquireError::NoPermits | TryAcquireError::Closed) => {
+                quota.metrics.inc_rejected("request_additional");
+                false
+            }
+        }
+    }
+
+    /// Hands part of the granted memory back to the pool before the guard is dropped.
+    ///
+    /// `bytes` is converted to permits with the quota's round-up conversion, so whole
+    /// permits are released. Returns true on success or when there is nothing to do
+    /// (unlimited guard, zero bytes); returns false when more is requested than is held.
+    pub fn early_release_partial(&mut self, bytes: u64) -> bool {
+        let GuardState::Limited { permit, quota } = &mut self.state else {
+            return true;
+        };
+        if bytes == 0 {
+            return true;
+        }
+
+        let to_release = quota.bytes_to_permits(bytes) as usize;
+        let Some(released) = permit.split(to_release) else {
+            return false;
+        };
+
+        let released_bytes = quota.permits_to_bytes(released.num_permits() as u32);
+        // Drop the split-off permit first so the metric refresh sees it as available.
+        drop(released);
+        quota.update_in_use_metric();
+        debug!("Early released {} bytes from memory guard", released_bytes);
+        true
+    }
+}
+
+impl<M: MemoryMetrics> Drop for MemoryGuard<M> {
+    /// Releases the held permit (if any) and refreshes the in-use metric.
+    fn drop(&mut self) {
+        // Move the state out so the permit can be dropped explicitly before the metric update.
+        let state = mem::replace(&mut self.state, GuardState::Unlimited);
+        let GuardState::Limited { permit, quota } = state else {
+            return;
+        };
+
+        let released_bytes = quota.permits_to_bytes(permit.num_permits() as u32);
+        drop(permit);
+        quota.update_in_use_metric();
+        debug!("Released memory: {} bytes", released_bytes);
+    }
+}
+
+impl<M: MemoryMetrics> fmt::Debug for MemoryGuard<M> {
+    /// Formats the guard as `MemoryGuard { granted_bytes: .. }`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let granted = self.granted_bytes();
+        let mut out = f.debug_struct("MemoryGuard");
+        out.field("granted_bytes", &granted);
+        out.finish()
+    }
+}
--- a/src/common/memory-manager/src/lib.rs
+++ b/src/common/memory-manager/src/lib.rs
@@ -0,0 +1,49 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Generic memory management for resource-constrained operations.
+//!
+//! This crate provides a reusable memory quota system based on semaphores,
+//! allowing different subsystems (compaction, flush, index build, etc.) to
+//! share the same allocation logic while using their own metrics.
+
+mod error;
+mod granularity;
+mod guard;
+mod manager;
+mod policy;
+
+#[cfg(test)]
+mod tests;
+
+pub use error::{Error, Result};
+pub use granularity::PermitGranularity;
+pub use guard::MemoryGuard;
+pub use manager::{MemoryManager, MemoryMetrics};
+pub use policy::{DEFAULT_MEMORY_WAIT_TIMEOUT, OnExhaustedPolicy};
+
+/// Metrics sink that discards every update; intended for tests.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct NoOpMetrics;
+
+impl MemoryMetrics for NoOpMetrics {
+    /// Ignores the reported limit.
+    #[inline(always)]
+    fn set_limit(&self, _bytes: i64) {}
+
+    /// Ignores the reported in-use amount.
+    #[inline(always)]
+    fn set_in_use(&self, _bytes: i64) {}
+
+    /// Ignores the rejection.
+    #[inline(always)]
+    fn inc_rejected(&self, _reason: &str) {}
+}
--- a/src/common/memory-manager/src/manager.rs
+++ b/src/common/memory-manager/src/manager.rs
@@ -0,0 +1,216 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use snafu::ensure;
+use tokio::sync::{Semaphore, TryAcquireError};
+
+use crate::error::{
+    MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
+};
+use crate::granularity::PermitGranularity;
+use crate::guard::MemoryGuard;
+use crate::policy::OnExhaustedPolicy;
+
+/// Trait for recording memory usage metrics.
+pub trait MemoryMetrics: Clone + Send + Sync + 'static {
+    /// Records the configured limit in bytes (the manager reports 0 when no limit is set).
+    fn set_limit(&self, bytes: i64);
+    /// Records the number of bytes currently accounted as in use.
+    fn set_in_use(&self, bytes: i64);
+    /// Counts a rejected request; `reason` names the rejecting operation
+    /// (the callers in this crate pass "try_acquire" and "request_additional").
+    fn inc_rejected(&self, reason: &str);
+}
+
+/// Generic memory manager for quota-controlled operations.
+///
+/// The quota keeps its semaphore in an `Arc`, so clones of a manager share the
+/// same pool of permits.
+#[derive(Clone)]
+pub struct MemoryManager<M: MemoryMetrics> {
+    /// `None` when no limit is configured; acquisitions then return unlimited guards.
+    quota: Option<MemoryQuota<M>>,
+}
+
+/// Semaphore-backed quota shared between a manager and the guards it hands out.
+#[derive(Clone)]
+pub(crate) struct MemoryQuota<M: MemoryMetrics> {
+    /// Semaphore whose permits represent units of memory.
+    pub(crate) semaphore: Arc<Semaphore>,
+    /// Number of permits the semaphore was created with.
+    pub(crate) limit_permits: u32,
+    /// Bytes-per-permit conversion used by this quota.
+    pub(crate) granularity: PermitGranularity,
+    /// Sink for limit / in-use / rejection metrics.
+    pub(crate) metrics: M,
+}
+
+impl<M: MemoryMetrics> MemoryManager<M> {
+    /// Creates a manager limited to `limit_bytes`, using the default permit granularity.
+    ///
+    /// Passing `limit_bytes = 0` disables the limit.
+    pub fn new(limit_bytes: u64, metrics: M) -> Self {
+        let granularity = PermitGranularity::default();
+        Self::with_granularity(limit_bytes, granularity, metrics)
+    }
+
+    /// Creates a manager limited to `limit_bytes`, accounted in units of `granularity`.
+    ///
+    /// The limit is rounded up to a whole number of permits; the rounded value is what
+    /// gets reported through `metrics.set_limit`. A zero limit yields an unlimited manager.
+    pub fn with_granularity(limit_bytes: u64, granularity: PermitGranularity, metrics: M) -> Self {
+        if limit_bytes == 0 {
+            metrics.set_limit(0);
+            return Self { quota: None };
+        }
+
+        let limit_permits = granularity.bytes_to_permits(limit_bytes);
+        metrics.set_limit(granularity.permits_to_bytes(limit_permits) as i64);
+
+        let semaphore = Arc::new(Semaphore::new(limit_permits as usize));
+        let quota = MemoryQuota {
+            semaphore,
+            limit_permits,
+            granularity,
+            metrics,
+        };
+        Self { quota: Some(quota) }
+    }
+
+    /// Returns the configured limit in bytes, or 0 when no limit is set.
+    pub fn limit_bytes(&self) -> u64 {
+        match &self.quota {
+            Some(quota) => quota.permits_to_bytes(quota.limit_permits),
+            None => 0,
+        }
+    }
+
+    /// Returns the bytes currently in use, or 0 when no limit is set.
+    pub fn used_bytes(&self) -> u64 {
+        match &self.quota {
+            Some(quota) => quota.permits_to_bytes(quota.used_permits()),
+            None => 0,
+        }
+    }
+
+    /// Returns the bytes still available under the limit.
+    ///
+    /// Note that an unlimited manager also reports 0 here.
+    pub fn available_bytes(&self) -> u64 {
+        match &self.quota {
+            Some(quota) => quota.permits_to_bytes(quota.available_permits_clamped()),
+            None => 0,
+        }
+    }
+
+    /// Acquires memory, waiting if necessary until enough is available.
+    ///
+    /// # Errors
+    /// - `MemoryLimitExceeded` if the request needs more permits than the total limit
+    /// - `MemorySemaphoreClosed` if waiting on the semaphore fails
+    pub async fn acquire(&self, bytes: u64) -> Result<MemoryGuard<M>> {
+        let Some(quota) = &self.quota else {
+            return Ok(MemoryGuard::unlimited());
+        };
+
+        let permits = quota.bytes_to_permits(bytes);
+        // Reject up front: a request larger than the whole pool could never be satisfied.
+        ensure!(
+            permits <= quota.limit_permits,
+            MemoryLimitExceededSnafu {
+                requested_bytes: bytes,
+                limit_bytes: self.limit_bytes()
+            }
+        );
+
+        let semaphore = quota.semaphore.clone();
+        let permit = match semaphore.acquire_many_owned(permits).await {
+            Ok(permit) => permit,
+            Err(_) => return Err(MemorySemaphoreClosedSnafu.build()),
+        };
+        quota.update_in_use_metric();
+        Ok(MemoryGuard::limited(permit, quota.clone()))
+    }
+
+    /// Tries to acquire memory without waiting.
+    ///
+    /// Returns `Some(guard)` on success and `None` (after counting a rejection) otherwise.
+    pub fn try_acquire(&self, bytes: u64) -> Option<MemoryGuard<M>> {
+        let Some(quota) = &self.quota else {
+            return Some(MemoryGuard::unlimited());
+        };
+
+        let permits = quota.bytes_to_permits(bytes);
+        let semaphore = quota.semaphore.clone();
+        match semaphore.try_acquire_many_owned(permits) {
+            Ok(permit) => {
+                quota.update_in_use_metric();
+                Some(MemoryGuard::limited(permit, quota.clone()))
+            }
+            Err(TryAcquireError::NoPermits | TryAcquireError::Closed) => {
+                quota.metrics.inc_rejected("try_acquire");
+                None
+            }
+        }
+    }
+
+    /// Acquires memory according to `policy`.
+    ///
+    /// - `OnExhaustedPolicy::Wait`: waits up to the timeout for memory to become available
+    /// - `OnExhaustedPolicy::Fail`: returns immediately if memory is not available
+    ///
+    /// # Errors
+    /// - `MemoryLimitExceeded`: the request exceeds the total limit (both policies), or
+    ///   memory is currently exhausted (Fail policy only)
+    /// - `MemoryAcquireTimeout`: the timeout elapsed while waiting (Wait policy only)
+    /// - `MemorySemaphoreClosed`: the internal semaphore is unexpectedly closed
+    pub async fn acquire_with_policy(
+        &self,
+        bytes: u64,
+        policy: OnExhaustedPolicy,
+    ) -> Result<MemoryGuard<M>> {
+        let timeout = match policy {
+            OnExhaustedPolicy::Wait { timeout } => timeout,
+            OnExhaustedPolicy::Fail => {
+                return match self.try_acquire(bytes) {
+                    Some(guard) => Ok(guard),
+                    None => Err(MemoryLimitExceededSnafu {
+                        requested_bytes: bytes,
+                        limit_bytes: self.limit_bytes(),
+                    }
+                    .build()),
+                };
+            }
+        };
+
+        match tokio::time::timeout(timeout, self.acquire(bytes)).await {
+            // `acquire` finished in time; pass its own result (success or error) through.
+            Ok(acquired) => acquired,
+            // Timeout elapsed while waiting
+            Err(_elapsed) => MemoryAcquireTimeoutSnafu {
+                requested_bytes: bytes,
+                waited: timeout,
+            }
+            .fail(),
+        }
+    }
+}
+
+impl<M: MemoryMetrics> MemoryQuota<M> {
+    /// Converts a byte count to permits using this quota's granularity.
+    pub(crate) fn bytes_to_permits(&self, bytes: u64) -> u32 {
+        self.granularity.bytes_to_permits(bytes)
+    }
+
+    /// Converts a permit count to bytes using this quota's granularity.
+    pub(crate) fn permits_to_bytes(&self, permits: u32) -> u64 {
+        self.granularity.permits_to_bytes(permits)
+    }
+
+    /// Permits currently taken: the limit minus what the semaphore still has available.
+    pub(crate) fn used_permits(&self) -> u32 {
+        let available = self.available_permits_clamped();
+        self.limit_permits.saturating_sub(available)
+    }
+
+    /// Permits the semaphore reports as available, capped at the configured limit.
+    pub(crate) fn available_permits_clamped(&self) -> u32 {
+        let ceiling = self.limit_permits as usize;
+        let available = self.semaphore.available_permits();
+        available.min(ceiling) as u32
+    }
+
+    /// Pushes the current in-use byte count to the metrics sink.
+    pub(crate) fn update_in_use_metric(&self) {
+        let in_use = self.permits_to_bytes(self.used_permits());
+        self.metrics.set_in_use(in_use as i64);
+    }
+}
--- a/src/common/memory-manager/src/policy.rs
+++ b/src/common/memory-manager/src/policy.rs
@@ -0,0 +1,83 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Duration;
+
+use humantime::{format_duration, parse_duration};
+use serde::{Deserialize, Serialize};
+
+/// Default wait timeout for memory acquisition (10 seconds).
+///
+/// Used by `OnExhaustedPolicy::default()` and when the policy is given as plain "wait".
+pub const DEFAULT_MEMORY_WAIT_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// Defines how to react when memory cannot be acquired immediately.
+///
+/// The serde representation is a string: "wait", "wait(<duration>)" or "fail".
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum OnExhaustedPolicy {
+    /// Wait until enough memory is released, bounded by timeout.
+    Wait { timeout: Duration },
+
+    /// Fail immediately if memory is not available.
+    Fail,
+}
+
+impl Default for OnExhaustedPolicy {
+    /// Defaults to waiting for `DEFAULT_MEMORY_WAIT_TIMEOUT`.
+    fn default() -> Self {
+        let timeout = DEFAULT_MEMORY_WAIT_TIMEOUT;
+        Self::Wait { timeout }
+    }
+}
+
+impl Serialize for OnExhaustedPolicy {
+    /// Serializes the policy as a string: "fail", "wait" (default timeout) or
+    /// "wait(<duration>)" for any other timeout.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        match *self {
+            OnExhaustedPolicy::Fail => serializer.serialize_str("fail"),
+            OnExhaustedPolicy::Wait { timeout } if timeout == DEFAULT_MEMORY_WAIT_TIMEOUT => {
+                serializer.serialize_str("wait")
+            }
+            OnExhaustedPolicy::Wait { timeout } => {
+                let text = format!("wait({})", format_duration(timeout));
+                serializer.serialize_str(&text)
+            }
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for OnExhaustedPolicy {
+    /// Parses the policy from a string, matching keywords case-insensitively.
+    ///
+    /// Accepted forms: "fail" (or legacy "skip"), "wait", and "wait(<duration>)".
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let raw = String::deserialize(deserializer)?;
+        let lower = raw.to_ascii_lowercase();
+
+        match lower.as_str() {
+            // Accept both "skip" (legacy) and "fail".
+            "skip" | "fail" => Ok(OnExhaustedPolicy::Fail),
+            "wait" => Ok(OnExhaustedPolicy::default()),
+            text if text.starts_with("wait(") && text.ends_with(')') => {
+                // The slice is taken from `raw`, so the duration text reaches the parser
+                // with its original casing. Offsets match `lower` because ASCII lowercasing
+                // does not change byte lengths.
+                let inner = &raw[5..raw.len() - 1];
+                parse_duration(inner)
+                    .map(|timeout| OnExhaustedPolicy::Wait { timeout })
+                    .map_err(serde::de::Error::custom)
+            }
+            _ => Err(serde::de::Error::custom(format!(
+                "invalid memory policy: {}, expected wait, wait(<duration>), fail",
+                raw
+            ))),
+        }
+    }
+}
--- a/src/common/memory-manager/src/tests.rs
+++ b/src/common/memory-manager/src/tests.rs
@@ -0,0 +1,250 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use tokio::time::{Duration, sleep};
+
+use crate::{MemoryManager, NoOpMetrics, PermitGranularity};
+
+// Bytes per permit at the default Megabyte granularity; tests size requests in multiples of it.
+const PERMIT_GRANULARITY_BYTES: u64 = PermitGranularity::Megabyte.bytes();
+
+#[test]
+fn test_try_acquire_unlimited() {
+    // A limit of 0 means "unlimited": the request succeeds without granting any bytes.
+    let unlimited = MemoryManager::new(0, NoOpMetrics);
+    let permit = unlimited.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap();
+    assert_eq!((unlimited.limit_bytes(), permit.granted_bytes()), (0, 0));
+}
+
+#[test]
+fn test_try_acquire_limited_success_and_release() {
+    let limit = 2 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+    // Take one permit's worth and check that both the guard and the manager account for it.
+    let guard = manager.try_acquire(PERMIT_GRANULARITY_BYTES).unwrap();
+    assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), PERMIT_GRANULARITY_BYTES);
+    // Dropping the guard hands the bytes back to the manager.
+    drop(guard);
+    assert_eq!(manager.used_bytes(), 0);
+}
+
+#[test]
+fn test_try_acquire_exceeds_limit() {
+    let limit = PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(limit, NoOpMetrics);
+    // Asking for one permit more than the whole budget must be refused.
+    assert!(manager.try_acquire(limit + PERMIT_GRANULARITY_BYTES).is_none());
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn test_acquire_blocks_and_unblocks() {
+    let bytes = 2 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(bytes, NoOpMetrics);
+    // Hold the entire budget so that any further acquire() has to wait.
+    let guard = manager.try_acquire(bytes).unwrap();
+
+    let waiter = {
+        let manager = manager.clone();
+        tokio::spawn(async move {
+            // Parks until the guard above is dropped.
+            let _guard = manager.acquire(bytes).await.unwrap();
+        })
+    };
+
+    // Let the waiter run; with the budget exhausted it must still be pending.
+    sleep(Duration::from_millis(10)).await;
+    assert!(!waiter.is_finished(), "acquire() returned while no memory was available");
+    // Releasing the guard frees the budget, which lets the waiter finish.
+    drop(guard);
+    waiter.await.unwrap();
+}
+
+#[test]
+fn test_request_additional_success() {
+    const MB: u64 = PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(10 * MB, NoOpMetrics);
+
+    // Start from a 5MB base grant out of the 10MB budget.
+    let mut guard = manager.try_acquire(5 * MB).unwrap();
+    assert_eq!(guard.granted_bytes(), 5 * MB);
+    assert_eq!(manager.used_bytes(), 5 * MB);
+
+    // Growing by 3MB still fits, and the extra bytes are merged into the same guard.
+    let grew = guard.request_additional(3 * MB);
+    assert!(grew);
+    assert_eq!(guard.granted_bytes(), 8 * MB);
+    assert_eq!(manager.used_bytes(), 8 * MB);
+}
+
+#[test]
+fn test_request_additional_exceeds_limit() {
+    const MB: u64 = PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(10 * MB, NoOpMetrics);
+
+    // 5MB base grant.
+    let mut guard = manager.try_acquire(5 * MB).unwrap();
+
+    // 5MB + 3MB = 8MB fits within the 10MB budget.
+    let first = guard.request_additional(3 * MB);
+    assert!(first);
+    assert_eq!(manager.used_bytes(), 8 * MB);
+
+    // 8MB + 3MB = 11MB would overshoot the budget, so the request is rejected...
+    let second = guard.request_additional(3 * MB);
+    assert!(!second);
+
+    // ...and the rejected request leaves the accounting untouched at 8MB.
+    assert_eq!(manager.used_bytes(), 8 * MB);
+    assert_eq!(guard.granted_bytes(), 8 * MB);
+}
+
+#[test]
+fn test_request_additional_auto_release_on_guard_drop() {
+    const MB: u64 = PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(10 * MB, NoOpMetrics);
+
+    // Base grant of 5MB.
+    let mut guard = manager.try_acquire(5 * MB).unwrap();
+
+    // The extra 3MB is merged into the existing guard.
+    let grew = guard.request_additional(3 * MB);
+    assert!(grew);
+    assert_eq!(manager.used_bytes(), 8 * MB);
+
+    // A single drop therefore returns the base and the additional bytes together.
+    drop(guard);
+    // Nothing should remain in use afterwards.
+    assert_eq!(manager.used_bytes(), 0);
+}
+
+#[test]
+fn test_request_additional_unlimited() {
+    // Limit 0 means unlimited.
+    let manager = MemoryManager::new(0, NoOpMetrics);
+    let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
+    // Growth always succeeds here, yet no bytes are recorded on the guard or the manager.
+    let grew = guard.request_additional(100 * PERMIT_GRANULARITY_BYTES);
+    assert!(grew);
+    assert_eq!((guard.granted_bytes(), manager.used_bytes()), (0, 0));
+}
+
+#[test]
+fn test_request_additional_zero_bytes() {
+    let base = 5 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(2 * base, NoOpMetrics);
+    let mut guard = manager.try_acquire(base).unwrap();
+
+    // A zero-byte request is accepted and leaves the accounting unchanged.
+    let accepted = guard.request_additional(0);
+    assert!(accepted);
+    assert_eq!(guard.granted_bytes(), base);
+    assert_eq!(manager.used_bytes(), base);
+}
+
+#[test]
+fn test_early_release_partial_success() {
+    const MB: u64 = PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(10 * MB, NoOpMetrics);
+
+    let mut first = manager.try_acquire(8 * MB).unwrap();
+    assert_eq!(manager.used_bytes(), 8 * MB);
+
+    // Give back half of the grant ahead of the drop.
+    let released = first.early_release_partial(4 * MB);
+    assert!(released);
+    assert_eq!((first.granted_bytes(), manager.used_bytes()), (4 * MB, 4 * MB));
+
+    // The bytes returned early can be handed to another guard right away.
+    let _second = manager.try_acquire(4 * MB).unwrap();
+    assert_eq!(manager.used_bytes(), 8 * MB);
+}
+
+#[test]
+fn test_early_release_partial_exceeds_granted() {
+    let granted = 5 * PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(2 * granted, NoOpMetrics);
+    let mut guard = manager.try_acquire(granted).unwrap();
+
+    // Only 5MB is held, so asking to release 10MB is refused and changes nothing.
+    assert!(!guard.early_release_partial(2 * granted));
+    assert_eq!((guard.granted_bytes(), manager.used_bytes()), (granted, granted));
+}
+
+#[test]
+fn test_early_release_partial_unlimited() {
+    // With limit 0 (unlimited) the guard holds no bytes, so releasing is a successful no-op.
+    let manager = MemoryManager::new(0, NoOpMetrics);
+    let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap();
+    let released = guard.early_release_partial(50 * PERMIT_GRANULARITY_BYTES);
+    assert!(released);
+    assert_eq!(guard.granted_bytes(), 0);
+}
+
+#[test]
+fn test_request_and_early_release_symmetry() {
+    const MB: u64 = PERMIT_GRANULARITY_BYTES;
+    let manager = MemoryManager::new(20 * MB, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(5 * MB).unwrap();
+
+    // Grow: 5MB + 5MB = 10MB.
+    assert!(guard.request_additional(5 * MB));
+    assert_eq!(guard.granted_bytes(), 10 * MB);
+    assert_eq!(manager.used_bytes(), 10 * MB);
+
+    // Shrink: 10MB - 3MB = 7MB.
+    assert!(guard.early_release_partial(3 * MB));
+    assert_eq!(guard.granted_bytes(), 7 * MB);
+    assert_eq!(manager.used_bytes(), 7 * MB);
+
+    // Grow again: 7MB + 2MB = 9MB.
+    assert!(guard.request_additional(2 * MB));
+    assert_eq!(guard.granted_bytes(), 9 * MB);
+    assert_eq!(manager.used_bytes(), 9 * MB);
+
+    // Shrink again: 9MB - 4MB = 5MB.
+    assert!(guard.early_release_partial(4 * MB));
+    assert_eq!(guard.granted_bytes(), 5 * MB);
+    assert_eq!(manager.used_bytes(), 5 * MB);
+
+    drop(guard);
+    assert_eq!(manager.used_bytes(), 0);
+}
+
+#[test]
+fn test_small_allocation_rounds_up() {
+    // A request below one permit is rounded up to a whole permit, and the resulting
+    // guard can still grow through request_additional().
+    let small_request = 512 * 1024; // 512KB
+    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
+
+    let mut guard = manager.try_acquire(small_request).unwrap();
+    assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); // 512KB -> 1 permit
+    assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES); // 1 + 2 permits
+}
+
+#[test]
+fn test_acquire_zero_bytes_lazy_allocation() {
+    // Acquiring 0 bytes yields an empty guard that can be grown later on demand.
+    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
+    let mut guard = manager.try_acquire(0).unwrap();
+    // Nothing is consumed up front...
+    assert_eq!((guard.granted_bytes(), manager.used_bytes()), (0, 0));
+
+    // ...and the first real allocation happens through request_additional().
+    assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
+    assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
+}
--- a/src/common/meta/src/distributed_time_constants.rs
+++ b/src/common/meta/src/distributed_time_constants.rs
@@ -12,27 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::sync::OnceLock;
 use std::time::Duration;

-use etcd_client::ConnectOptions;
-
-/// Heartbeat interval time (is the basic unit of various time).
-pub const HEARTBEAT_INTERVAL_MILLIS: u64 = 3000;
-
-/// The frontend will also send heartbeats to Metasrv, sending an empty
-/// heartbeat every HEARTBEAT_INTERVAL_MILLIS * 6 seconds.
-pub const FRONTEND_HEARTBEAT_INTERVAL_MILLIS: u64 = HEARTBEAT_INTERVAL_MILLIS * 6;
-
-/// The lease seconds of a region. It's set by 3 heartbeat intervals
-/// (HEARTBEAT_INTERVAL_MILLIS × 3), plus some extra buffer (1 second).
-pub const REGION_LEASE_SECS: u64 =
-    Duration::from_millis(HEARTBEAT_INTERVAL_MILLIS * 3).as_secs() + 1;
-
-/// When creating table or region failover, a target node needs to be selected.
-/// If the node's lease has expired, the `Selector` will not select it.
-pub const DATANODE_LEASE_SECS: u64 = REGION_LEASE_SECS;
-
-pub const FLOWNODE_LEASE_SECS: u64 = DATANODE_LEASE_SECS;
+pub const BASE_HEARTBEAT_INTERVAL: Duration = Duration::from_secs(3);

 /// The lease seconds of metasrv leader.
 pub const META_LEASE_SECS: u64 = 5;
@@ -52,14 +35,6 @@ pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration = Duration::from_
 /// The keep-alive timeout of the heartbeat channel.
 pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration = Duration::from_secs(5);

-/// The default options for the etcd client.
-pub fn default_etcd_client_options() -> ConnectOptions {
-    ConnectOptions::new()
-        .with_keep_alive_while_idle(true)
-        .with_keep_alive(Duration::from_secs(15), Duration::from_secs(5))
-        .with_connect_timeout(Duration::from_secs(10))
-}
-
 /// The default mailbox round-trip timeout.
 pub const MAILBOX_RTT_SECS: u64 = 1;

@@ -68,3 +43,60 @@ pub const TOPIC_STATS_REPORT_INTERVAL_SECS: u64 = 15;

 /// The retention seconds of topic stats.
 pub const TOPIC_STATS_RETENTION_SECS: u64 = TOPIC_STATS_REPORT_INTERVAL_SECS * 100;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+/// Heartbeat intervals and lease durations, normally derived from one heartbeat interval.
+pub struct DistributedTimeConstants {
+    pub heartbeat_interval: Duration,
+    pub frontend_heartbeat_interval: Duration,
+    pub region_lease: Duration,
+    pub datanode_lease: Duration,
+    pub flownode_lease: Duration,
+}
+
+/// Returns the frontend heartbeat interval, which is six base heartbeat intervals.
+pub fn frontend_heartbeat_interval(base_heartbeat_interval: Duration) -> Duration {
+    6 * base_heartbeat_interval
+}
+
+impl DistributedTimeConstants {
+    /// Derives every interval and lease from the given heartbeat interval.
+    pub fn from_heartbeat_interval(heartbeat_interval: Duration) -> Self {
+        // Region lease = three heartbeat intervals plus one extra second.
+        let lease = heartbeat_interval * 3 + Duration::from_secs(1);
+        Self {
+            heartbeat_interval,
+            frontend_heartbeat_interval: frontend_heartbeat_interval(heartbeat_interval),
+            region_lease: lease,
+            // Datanode and flownode leases mirror the region lease.
+            datanode_lease: lease,
+            flownode_lease: lease,
+        }
+    }
+}
+
+impl Default for DistributedTimeConstants {
+    fn default() -> Self {
+        Self::from_heartbeat_interval(BASE_HEARTBEAT_INTERVAL) // defaults use the base interval
+    }
+}
+
+static DEFAULT_DISTRIBUTED_TIME_CONSTANTS: OnceLock<DistributedTimeConstants> = OnceLock::new();
+
+/// Returns the global constants, initializing them from `Default` on first use if unset.
+pub fn default_distributed_time_constants() -> &'static DistributedTimeConstants {
+    DEFAULT_DISTRIBUTED_TIME_CONSTANTS.get_or_init(DistributedTimeConstants::default)
+}
+
+/// Sets the global distributed time constants derived from `base_heartbeat_interval`.
+///
+/// # Panics
+///
+/// Panics if the constants already hold a value, which includes an earlier call to
+/// [`default_distributed_time_constants`] having filled in the defaults.
+pub fn init_distributed_time_constants(base_heartbeat_interval: Duration) {
+    let constants = DistributedTimeConstants::from_heartbeat_interval(base_heartbeat_interval);
+    let outcome = DEFAULT_DISTRIBUTED_TIME_CONSTANTS.set(constants);
+    outcome.expect("Failed to set default distributed time constants");
+    common_telemetry::info!("Initialized default distributed time constants: {:#?}", constants);
+}
--- a/src/common/meta/src/wal_options_allocator/topic_creator.rs
+++ b/src/common/meta/src/wal_options_allocator/topic_creator.rs
@@ -14,7 +14,7 @@

 use common_telemetry::{debug, error, info};
 use common_wal::config::kafka::common::{
-    DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT, KafkaConnectionConfig, KafkaTopicConfig,
+    DEFAULT_BACKOFF_CONFIG, KafkaConnectionConfig, KafkaTopicConfig,
 };
 use rskafka::client::error::Error as RsKafkaError;
 use rskafka::client::error::ProtocolError::TopicAlreadyExists;
@@ -211,7 +211,8 @@ pub async fn build_kafka_client(connection: &KafkaConnectionConfig) -> Result<Cl
    // Builds an kafka controller client for creating topics.
    let mut builder = ClientBuilder::new(connection.broker_endpoints.clone())
        .backoff_config(DEFAULT_BACKOFF_CONFIG)
-        .connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
+        .connect_timeout(Some(connection.connect_timeout))
+        .timeout(Some(connection.timeout));
    if let Some(sasl) = &connection.sasl {
        builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
    };
--- a/src/common/sql/Cargo.toml
+++ b/src/common/sql/Cargo.toml
@@ -5,10 +5,12 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+arrow-schema.workspace = true
 common-base.workspace = true
 common-decimal.workspace = true
 common-error.workspace = true
 common-macro.workspace = true
+common-telemetry.workspace = true
 common-time.workspace = true
 datafusion-sql.workspace = true
 datatypes.workspace = true
--- a/src/common/sql/src/convert.rs
+++ b/src/common/sql/src/convert.rs
@@ -14,11 +14,12 @@

 use std::str::FromStr;

+use arrow_schema::extension::ExtensionType;
 use common_time::Timestamp;
 use common_time::timezone::Timezone;
-use datatypes::json::JsonStructureSettings;
+use datatypes::extension::json::JsonExtensionType;
 use datatypes::prelude::ConcreteDataType;
-use datatypes::schema::ColumnDefaultConstraint;
+use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema};
 use datatypes::types::{JsonFormat, parse_string_to_jsonb, parse_string_to_vector_type_value};
 use datatypes::value::{OrderedF32, OrderedF64, Value};
 use snafu::{OptionExt, ResultExt, ensure};
@@ -124,13 +125,14 @@ pub(crate) fn sql_number_to_value(data_type: &ConcreteDataType, n: &str) -> Resu
 /// If `auto_string_to_numeric` is true, tries to cast the string value to numeric values,
 /// and returns error if the cast fails.
 pub fn sql_value_to_value(
-    column_name: &str,
-    data_type: &ConcreteDataType,
+    column_schema: &ColumnSchema,
    sql_val: &SqlValue,
    timezone: Option<&Timezone>,
    unary_op: Option<UnaryOperator>,
    auto_string_to_numeric: bool,
 ) -> Result<Value> {
+    let column_name = &column_schema.name;
+    let data_type = &column_schema.data_type;
    let mut value = match sql_val {
        SqlValue::Number(n, _) => sql_number_to_value(data_type, n)?,
        SqlValue::Null => Value::Null,
@@ -146,13 +148,9 @@ pub fn sql_value_to_value(

            (*b).into()
        }
-        SqlValue::DoubleQuotedString(s) | SqlValue::SingleQuotedString(s) => parse_string_to_value(
-            column_name,
-            s.clone(),
-            data_type,
-            timezone,
-            auto_string_to_numeric,
-        )?,
+        SqlValue::DoubleQuotedString(s) | SqlValue::SingleQuotedString(s) => {
+            parse_string_to_value(column_schema, s.clone(), timezone, auto_string_to_numeric)?
+        }
        SqlValue::HexStringLiteral(s) => {
            // Should not directly write binary into json column
            ensure!(
@@ -244,12 +242,12 @@ pub fn sql_value_to_value(
 }

 pub(crate) fn parse_string_to_value(
-    column_name: &str,
+    column_schema: &ColumnSchema,
    s: String,
-    data_type: &ConcreteDataType,
    timezone: Option<&Timezone>,
    auto_string_to_numeric: bool,
 ) -> Result<Value> {
+    let data_type = &column_schema.data_type;
    if auto_string_to_numeric && let Some(value) = auto_cast_to_numeric(&s, data_type)? {
        return Ok(value);
    }
@@ -257,7 +255,7 @@ pub(crate) fn parse_string_to_value(
    ensure!(
        data_type.is_stringifiable(),
        ColumnTypeMismatchSnafu {
-            column_name,
+            column_name: column_schema.name.clone(),
            expect: data_type.clone(),
            actual: ConcreteDataType::string_datatype(),
        }
@@ -303,23 +301,21 @@ pub(crate) fn parse_string_to_value(
            }
        }
        ConcreteDataType::Binary(_) => Ok(Value::Binary(s.as_bytes().into())),
-        ConcreteDataType::Json(j) => {
-            match &j.format {
-                JsonFormat::Jsonb => {
-                    let v = parse_string_to_jsonb(&s).context(DatatypeSnafu)?;
-                    Ok(Value::Binary(v.into()))
-                }
-                JsonFormat::Native(_inner) => {
-                    // Always use the structured version at this level.
-                    let serde_json_value =
-                        serde_json::from_str(&s).context(DeserializeSnafu { json: s })?;
-                    let json_structure_settings = JsonStructureSettings::Structured(None);
-                    json_structure_settings
-                        .encode(serde_json_value)
-                        .context(DatatypeSnafu)
-                }
+        ConcreteDataType::Json(j) => match &j.format {
+            JsonFormat::Jsonb => {
+                let v = parse_string_to_jsonb(&s).context(DatatypeSnafu)?;
+                Ok(Value::Binary(v.into()))
            }
-        }
+            JsonFormat::Native(_) => {
+                let extension_type: Option<JsonExtensionType> =
+                    column_schema.extension_type().context(DatatypeSnafu)?;
+                let json_structure_settings = extension_type
+                    .and_then(|x| x.metadata().json_structure_settings.clone())
+                    .unwrap_or_default();
+                let v = serde_json::from_str(&s).context(DeserializeSnafu { json: s })?;
+                json_structure_settings.encode(v).context(DatatypeSnafu)
+            }
+        },
        ConcreteDataType::Vector(d) => {
            let v = parse_string_to_vector_type_value(&s, Some(d.dim)).context(DatatypeSnafu)?;
            Ok(Value::Binary(v.into()))
@@ -417,305 +413,265 @@ mod test {

    use super::*;

+    // Test helper: builds a `ColumnSchema` and calls `parse_string_to_value` on it.
+    macro_rules! call_parse_string_to_value {
+        ($name: expr, $input: expr, $ty: expr) => {
+            call_parse_string_to_value!($name, $input, $ty, None)
+        };
+        ($name: expr, $input: expr, $ty: expr, timezone = $tz: expr) => {
+            call_parse_string_to_value!($name, $input, $ty, Some($tz))
+        };
+        ($name: expr, $input: expr, $ty: expr, $tz: expr) => {
+            parse_string_to_value(&ColumnSchema::new($name, $ty, true), $input, $tz, true)
+        };
+    }
+
    #[test]
-    fn test_string_to_value_auto_numeric() {
+    fn test_string_to_value_auto_numeric() -> Result<()> {
        // Test string to boolean with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "true".to_string(),
-            &ConcreteDataType::boolean_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::boolean_datatype()
+        )?;
        assert_eq!(Value::Boolean(true), result);

        // Test invalid string to boolean with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_a_boolean".to_string(),
-            &ConcreteDataType::boolean_datatype(),
-            None,
-            true,
+            ConcreteDataType::boolean_datatype()
        );
        assert!(result.is_err());

        // Test string to int8
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "42".to_string(),
-            &ConcreteDataType::int8_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::int8_datatype()
+        )?;
        assert_eq!(Value::Int8(42), result);

        // Test invalid string to int8 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_an_int8".to_string(),
-            &ConcreteDataType::int8_datatype(),
-            None,
-            true,
+            ConcreteDataType::int8_datatype()
        );
        assert!(result.is_err());

        // Test string to int16
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "1000".to_string(),
-            &ConcreteDataType::int16_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::int16_datatype()
+        )?;
        assert_eq!(Value::Int16(1000), result);

        // Test invalid string to int16 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_an_int16".to_string(),
-            &ConcreteDataType::int16_datatype(),
-            None,
-            true,
+            ConcreteDataType::int16_datatype()
        );
        assert!(result.is_err());

        // Test string to int32
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "100000".to_string(),
-            &ConcreteDataType::int32_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::int32_datatype()
+        )?;
        assert_eq!(Value::Int32(100000), result);

        // Test invalid string to int32 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_an_int32".to_string(),
-            &ConcreteDataType::int32_datatype(),
-            None,
-            true,
+            ConcreteDataType::int32_datatype()
        );
        assert!(result.is_err());

        // Test string to int64
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "1000000".to_string(),
-            &ConcreteDataType::int64_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::int64_datatype()
+        )?;
        assert_eq!(Value::Int64(1000000), result);

        // Test invalid string to int64 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_an_int64".to_string(),
-            &ConcreteDataType::int64_datatype(),
-            None,
-            true,
+            ConcreteDataType::int64_datatype()
        );
        assert!(result.is_err());

        // Test string to uint8
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "200".to_string(),
-            &ConcreteDataType::uint8_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::uint8_datatype()
+        )?;
        assert_eq!(Value::UInt8(200), result);

        // Test invalid string to uint8 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_a_uint8".to_string(),
-            &ConcreteDataType::uint8_datatype(),
-            None,
-            true,
+            ConcreteDataType::uint8_datatype()
        );
        assert!(result.is_err());

        // Test string to uint16
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "60000".to_string(),
-            &ConcreteDataType::uint16_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::uint16_datatype()
+        )?;
        assert_eq!(Value::UInt16(60000), result);

        // Test invalid string to uint16 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_a_uint16".to_string(),
-            &ConcreteDataType::uint16_datatype(),
-            None,
-            true,
+            ConcreteDataType::uint16_datatype()
        );
        assert!(result.is_err());

        // Test string to uint32
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "4000000000".to_string(),
-            &ConcreteDataType::uint32_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::uint32_datatype()
+        )?;
        assert_eq!(Value::UInt32(4000000000), result);

        // Test invalid string to uint32 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_a_uint32".to_string(),
-            &ConcreteDataType::uint32_datatype(),
-            None,
-            true,
+            ConcreteDataType::uint32_datatype()
        );
        assert!(result.is_err());

        // Test string to uint64
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "18446744073709551615".to_string(),
-            &ConcreteDataType::uint64_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::uint64_datatype()
+        )?;
        assert_eq!(Value::UInt64(18446744073709551615), result);

        // Test invalid string to uint64 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_a_uint64".to_string(),
-            &ConcreteDataType::uint64_datatype(),
-            None,
-            true,
+            ConcreteDataType::uint64_datatype()
        );
        assert!(result.is_err());

        // Test string to float32
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "3.5".to_string(),
-            &ConcreteDataType::float32_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::float32_datatype()
+        )?;
        assert_eq!(Value::Float32(OrderedF32::from(3.5)), result);

        // Test invalid string to float32 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_a_float32".to_string(),
-            &ConcreteDataType::float32_datatype(),
-            None,
-            true,
+            ConcreteDataType::float32_datatype()
        );
        assert!(result.is_err());

        // Test string to float64
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "3.5".to_string(),
-            &ConcreteDataType::float64_datatype(),
-            None,
-            true,
-        )
-        .unwrap();
+            ConcreteDataType::float64_datatype()
+        )?;
        assert_eq!(Value::Float64(OrderedF64::from(3.5)), result);

        // Test invalid string to float64 with auto cast
-        let result = parse_string_to_value(
+        let result = call_parse_string_to_value!(
            "col",
            "not_a_float64".to_string(),
-            &ConcreteDataType::float64_datatype(),
-            None,
-            true,
+            ConcreteDataType::float64_datatype()
        );
        assert!(result.is_err());
+        Ok(())
    }

-    #[test]
-    fn test_sql_value_to_value() {
-        let sql_val = SqlValue::Null;
-        assert_eq!(
-            Value::Null,
-            sql_value_to_value(
-                "a",
-                &ConcreteDataType::float64_datatype(),
-                &sql_val,
-                None,
+    macro_rules! call_sql_value_to_value {
+        ($column_name: expr, $data_type: expr, $sql_value: expr) => {
+            call_sql_value_to_value!($column_name, $data_type, $sql_value, None, None, false)
+        };
+        ($column_name: expr, $data_type: expr, $sql_value: expr, timezone = $timezone: expr) => {
+            call_sql_value_to_value!(
+                $column_name,
+                $data_type,
+                $sql_value,
+                Some($timezone),
                None,
                false
            )
-            .unwrap()
+        };
+        ($column_name: expr, $data_type: expr, $sql_value: expr, unary_op = $unary_op: expr) => {
+            call_sql_value_to_value!(
+                $column_name,
+                $data_type,
+                $sql_value,
+                None,
+                Some($unary_op),
+                false
+            )
+        };
+        ($column_name: expr, $data_type: expr, $sql_value: expr, auto_string_to_numeric) => {
+            call_sql_value_to_value!($column_name, $data_type, $sql_value, None, None, true)
+        };
+        ($column_name: expr, $data_type: expr, $sql_value: expr, $timezone: expr, $unary_op: expr, $auto_string_to_numeric: expr) => {{
+            let column_schema = ColumnSchema::new($column_name, $data_type, true);
+            sql_value_to_value(
+                &column_schema,
+                $sql_value,
+                $timezone,
+                $unary_op,
+                $auto_string_to_numeric,
+            )
+        }};
+    }
+
+    #[test]
+    fn test_sql_value_to_value() -> Result<()> {
+        let sql_val = SqlValue::Null;
+        assert_eq!(
+            Value::Null,
+            call_sql_value_to_value!("a", ConcreteDataType::float64_datatype(), &sql_val)?
        );

        let sql_val = SqlValue::Boolean(true);
        assert_eq!(
            Value::Boolean(true),
-            sql_value_to_value(
-                "a",
-                &ConcreteDataType::boolean_datatype(),
-                &sql_val,
-                None,
-                None,
-                false
-            )
-            .unwrap()
+            call_sql_value_to_value!("a", ConcreteDataType::boolean_datatype(), &sql_val)?
        );

        let sql_val = SqlValue::Number("3.0".to_string(), false);
        assert_eq!(
            Value::Float64(OrderedFloat(3.0)),
-            sql_value_to_value(
-                "a",
-                &ConcreteDataType::float64_datatype(),
-                &sql_val,
-                None,
-                None,
-                false
-            )
-            .unwrap()
+            call_sql_value_to_value!("a", ConcreteDataType::float64_datatype(), &sql_val)?
        );

        let sql_val = SqlValue::Number("3.0".to_string(), false);
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::boolean_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        );
+        let v = call_sql_value_to_value!("a", ConcreteDataType::boolean_datatype(), &sql_val);
        assert!(v.is_err());
        assert!(format!("{v:?}").contains("Failed to parse number '3.0' to boolean column type"));

        let sql_val = SqlValue::Boolean(true);
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::float64_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        );
+        let v = call_sql_value_to_value!("a", ConcreteDataType::float64_datatype(), &sql_val);
        assert!(v.is_err());
        assert!(
            format!("{v:?}").contains(
@@ -725,41 +681,18 @@ mod test {
        );

        let sql_val = SqlValue::HexStringLiteral("48656c6c6f20776f726c6421".to_string());
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::binary_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        )
-        .unwrap();
+        let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val)?;
        assert_eq!(Value::Binary(Bytes::from(b"Hello world!".as_slice())), v);

        let sql_val = SqlValue::DoubleQuotedString("MorningMyFriends".to_string());
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::binary_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        )
-        .unwrap();
+        let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val)?;
        assert_eq!(
            Value::Binary(Bytes::from(b"MorningMyFriends".as_slice())),
            v
        );

        let sql_val = SqlValue::HexStringLiteral("9AF".to_string());
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::binary_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        );
+        let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val);
        assert!(v.is_err());
        assert!(
            format!("{v:?}").contains("odd number of digits"),
@@ -767,38 +700,16 @@ mod test {
        );

        let sql_val = SqlValue::HexStringLiteral("AG".to_string());
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::binary_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        );
+        let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val);
        assert!(v.is_err());
        assert!(format!("{v:?}").contains("invalid character"), "v is {v:?}",);

        let sql_val = SqlValue::DoubleQuotedString("MorningMyFriends".to_string());
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::json_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        );
+        let v = call_sql_value_to_value!("a", ConcreteDataType::json_datatype(), &sql_val);
        assert!(v.is_err());

        let sql_val = SqlValue::DoubleQuotedString(r#"{"a":"b"}"#.to_string());
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::json_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        )
-        .unwrap();
+        let v = call_sql_value_to_value!("a", ConcreteDataType::json_datatype(), &sql_val)?;
        assert_eq!(
            Value::Binary(Bytes::from(
                jsonb::parse_value(r#"{"a":"b"}"#.as_bytes())
@@ -808,16 +719,15 @@ mod test {
            )),
            v
        );
+        Ok(())
    }

    #[test]
    fn test_parse_json_to_jsonb() {
-        match parse_string_to_value(
+        match call_parse_string_to_value!(
            "json_col",
            r#"{"a": "b"}"#.to_string(),
-            &ConcreteDataType::json_datatype(),
-            None,
-            false,
+            ConcreteDataType::json_datatype()
        ) {
            Ok(Value::Binary(b)) => {
                assert_eq!(
@@ -833,12 +743,10 @@ mod test {
        }

        assert!(
-            parse_string_to_value(
+            call_parse_string_to_value!(
                "json_col",
                r#"Nicola Kovac is the best rifler in the world"#.to_string(),
-                &ConcreteDataType::json_datatype(),
-                None,
-                false,
+                ConcreteDataType::json_datatype()
            )
            .is_err()
        )
@@ -878,13 +786,10 @@ mod test {

    #[test]
    fn test_parse_date_literal() {
-        let value = sql_value_to_value(
+        let value = call_sql_value_to_value!(
            "date",
-            &ConcreteDataType::date_datatype(),
-            &SqlValue::DoubleQuotedString("2022-02-22".to_string()),
-            None,
-            None,
-            false,
+            ConcreteDataType::date_datatype(),
+            &SqlValue::DoubleQuotedString("2022-02-22".to_string())
        )
        .unwrap();
        assert_eq!(ConcreteDataType::date_datatype(), value.data_type());
@@ -895,13 +800,11 @@ mod test {
        }

        // with timezone
-        let value = sql_value_to_value(
+        let value = call_sql_value_to_value!(
            "date",
-            &ConcreteDataType::date_datatype(),
+            ConcreteDataType::date_datatype(),
            &SqlValue::DoubleQuotedString("2022-02-22".to_string()),
-            Some(&Timezone::from_tz_string("+07:00").unwrap()),
-            None,
-            false,
+            timezone = &Timezone::from_tz_string("+07:00").unwrap()
        )
        .unwrap();
        assert_eq!(ConcreteDataType::date_datatype(), value.data_type());
@@ -913,16 +816,12 @@ mod test {
    }

    #[test]
-    fn test_parse_timestamp_literal() {
-        match parse_string_to_value(
+    fn test_parse_timestamp_literal() -> Result<()> {
+        match call_parse_string_to_value!(
            "timestamp_col",
            "2022-02-22T00:01:01+08:00".to_string(),
-            &ConcreteDataType::timestamp_millisecond_datatype(),
-            None,
-            false,
-        )
-        .unwrap()
-        {
+            ConcreteDataType::timestamp_millisecond_datatype()
+        )? {
            Value::Timestamp(ts) => {
                assert_eq!(1645459261000, ts.value());
                assert_eq!(TimeUnit::Millisecond, ts.unit());
@@ -932,15 +831,11 @@ mod test {
            }
        }

-        match parse_string_to_value(
+        match call_parse_string_to_value!(
            "timestamp_col",
            "2022-02-22T00:01:01+08:00".to_string(),
-            &ConcreteDataType::timestamp_datatype(TimeUnit::Second),
-            None,
-            false,
-        )
-        .unwrap()
-        {
+            ConcreteDataType::timestamp_datatype(TimeUnit::Second)
+        )? {
            Value::Timestamp(ts) => {
                assert_eq!(1645459261, ts.value());
                assert_eq!(TimeUnit::Second, ts.unit());
@@ -950,15 +845,11 @@ mod test {
            }
        }

-        match parse_string_to_value(
+        match call_parse_string_to_value!(
            "timestamp_col",
            "2022-02-22T00:01:01+08:00".to_string(),
-            &ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond),
-            None,
-            false,
-        )
-        .unwrap()
-        {
+            ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond)
+        )? {
            Value::Timestamp(ts) => {
                assert_eq!(1645459261000000, ts.value());
                assert_eq!(TimeUnit::Microsecond, ts.unit());
@@ -968,15 +859,11 @@ mod test {
            }
        }

-        match parse_string_to_value(
+        match call_parse_string_to_value!(
            "timestamp_col",
            "2022-02-22T00:01:01+08:00".to_string(),
-            &ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
-            None,
-            false,
-        )
-        .unwrap()
-        {
+            ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond)
+        )? {
            Value::Timestamp(ts) => {
                assert_eq!(1645459261000000000, ts.value());
                assert_eq!(TimeUnit::Nanosecond, ts.unit());
@@ -987,26 +874,21 @@ mod test {
        }

        assert!(
-            parse_string_to_value(
+            call_parse_string_to_value!(
                "timestamp_col",
                "2022-02-22T00:01:01+08".to_string(),
-                &ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
-                None,
-                false,
+                ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond)
            )
            .is_err()
        );

        // with timezone
-        match parse_string_to_value(
+        match call_parse_string_to_value!(
            "timestamp_col",
            "2022-02-22T00:01:01".to_string(),
-            &ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
-            Some(&Timezone::from_tz_string("Asia/Shanghai").unwrap()),
-            false,
-        )
-        .unwrap()
-        {
+            ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
+            timezone = &Timezone::from_tz_string("Asia/Shanghai").unwrap()
+        )? {
            Value::Timestamp(ts) => {
                assert_eq!(1645459261000000000, ts.value());
                assert_eq!("2022-02-21 16:01:01+0000", ts.to_iso8601_string());
@@ -1016,51 +898,42 @@ mod test {
                unreachable!()
            }
        }
+        Ok(())
    }

    #[test]
    fn test_parse_placeholder_value() {
        assert!(
-            sql_value_to_value(
+            call_sql_value_to_value!(
                "test",
-                &ConcreteDataType::string_datatype(),
+                ConcreteDataType::string_datatype(),
+                &SqlValue::Placeholder("default".into())
+            )
+            .is_err()
+        );
+        assert!(
+            call_sql_value_to_value!(
+                "test",
+                ConcreteDataType::string_datatype(),
                &SqlValue::Placeholder("default".into()),
-                None,
-                None,
-                false
+                unary_op = UnaryOperator::Minus
            )
            .is_err()
        );
        assert!(
-            sql_value_to_value(
+            call_sql_value_to_value!(
                "test",
-                &ConcreteDataType::string_datatype(),
-                &SqlValue::Placeholder("default".into()),
-                None,
-                Some(UnaryOperator::Minus),
-                false
-            )
-            .is_err()
-        );
-        assert!(
-            sql_value_to_value(
-                "test",
-                &ConcreteDataType::uint16_datatype(),
+                ConcreteDataType::uint16_datatype(),
                &SqlValue::Number("3".into(), false),
-                None,
-                Some(UnaryOperator::Minus),
-                false
+                unary_op = UnaryOperator::Minus
            )
            .is_err()
        );
        assert!(
-            sql_value_to_value(
+            call_sql_value_to_value!(
                "test",
-                &ConcreteDataType::uint16_datatype(),
-                &SqlValue::Number("3".into(), false),
-                None,
-                None,
-                false
+                ConcreteDataType::uint16_datatype(),
+                &SqlValue::Number("3".into(), false)
            )
            .is_ok()
        );
@@ -1070,77 +943,60 @@ mod test {
    fn test_auto_string_to_numeric() {
        // Test with auto_string_to_numeric=true
        let sql_val = SqlValue::SingleQuotedString("123".to_string());
-        let v = sql_value_to_value(
+        let v = call_sql_value_to_value!(
            "a",
-            &ConcreteDataType::int32_datatype(),
+            ConcreteDataType::int32_datatype(),
            &sql_val,
-            None,
-            None,
-            true,
+            auto_string_to_numeric
        )
        .unwrap();
        assert_eq!(Value::Int32(123), v);

        // Test with a float string
        let sql_val = SqlValue::SingleQuotedString("3.5".to_string());
-        let v = sql_value_to_value(
+        let v = call_sql_value_to_value!(
            "a",
-            &ConcreteDataType::float64_datatype(),
+            ConcreteDataType::float64_datatype(),
            &sql_val,
-            None,
-            None,
-            true,
+            auto_string_to_numeric
        )
        .unwrap();
        assert_eq!(Value::Float64(OrderedFloat(3.5)), v);

        // Test with auto_string_to_numeric=false
        let sql_val = SqlValue::SingleQuotedString("123".to_string());
-        let v = sql_value_to_value(
-            "a",
-            &ConcreteDataType::int32_datatype(),
-            &sql_val,
-            None,
-            None,
-            false,
-        );
+        let v = call_sql_value_to_value!("a", ConcreteDataType::int32_datatype(), &sql_val);
        assert!(v.is_err());

        // Test with an invalid numeric string but auto_string_to_numeric=true
        // Should return an error now with the new auto_cast_to_numeric behavior
        let sql_val = SqlValue::SingleQuotedString("not_a_number".to_string());
-        let v = sql_value_to_value(
+        let v = call_sql_value_to_value!(
            "a",
-            &ConcreteDataType::int32_datatype(),
+            ConcreteDataType::int32_datatype(),
            &sql_val,
-            None,
-            None,
-            true,
+            auto_string_to_numeric
        );
        assert!(v.is_err());

        // Test with boolean type
        let sql_val = SqlValue::SingleQuotedString("true".to_string());
-        let v = sql_value_to_value(
+        let v = call_sql_value_to_value!(
            "a",
-            &ConcreteDataType::boolean_datatype(),
+            ConcreteDataType::boolean_datatype(),
            &sql_val,
-            None,
-            None,
-            true,
+            auto_string_to_numeric
        )
        .unwrap();
        assert_eq!(Value::Boolean(true), v);

        // Non-numeric types should still be handled normally
        let sql_val = SqlValue::SingleQuotedString("hello".to_string());
-        let v = sql_value_to_value(
+        let v = call_sql_value_to_value!(
            "a",
-            &ConcreteDataType::string_datatype(),
+            ConcreteDataType::string_datatype(),
            &sql_val,
-            None,
-            None,
-            true,
+            auto_string_to_numeric
        );
        assert!(v.is_ok());
    }
--- a/src/common/sql/src/default_constraint.rs
+++ b/src/common/sql/src/default_constraint.rs
@@ -14,8 +14,8 @@

 use common_time::timezone::Timezone;
 use datatypes::prelude::ConcreteDataType;
-use datatypes::schema::ColumnDefaultConstraint;
 use datatypes::schema::constraint::{CURRENT_TIMESTAMP, CURRENT_TIMESTAMP_FN};
+use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema};
 use snafu::ensure;
 use sqlparser::ast::ValueWithSpan;
 pub use sqlparser::ast::{
@@ -47,9 +47,12 @@ pub fn parse_column_default_constraint(
        );

        let default_constraint = match &opt.option {
-            ColumnOption::Default(Expr::Value(v)) => ColumnDefaultConstraint::Value(
-                sql_value_to_value(column_name, data_type, &v.value, timezone, None, false)?,
-            ),
+            ColumnOption::Default(Expr::Value(v)) => {
+                let schema = ColumnSchema::new(column_name, data_type.clone(), true);
+                ColumnDefaultConstraint::Value(sql_value_to_value(
+                    &schema, &v.value, timezone, None, false,
+                )?)
+            }
            ColumnOption::Default(Expr::Function(func)) => {
                let mut func = format!("{func}").to_lowercase();
                // normalize CURRENT_TIMESTAMP to CURRENT_TIMESTAMP()
@@ -80,8 +83,7 @@ pub fn parse_column_default_constraint(

                if let Expr::Value(v) = &**expr {
                    let value = sql_value_to_value(
-                        column_name,
-                        data_type,
+                        &ColumnSchema::new(column_name, data_type.clone(), true),
                        &v.value,
                        timezone,
                        Some(*op),
--- a/src/common/stat/src/resource.rs
+++ b/src/common/stat/src/resource.rs
@@ -58,10 +58,14 @@ pub fn get_total_memory_bytes() -> i64 {
    }
 }

-/// Get the total CPU cores. The result will be rounded to the nearest integer.
-/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2.
+/// Get the total CPU cores. The result is rounded up (ceiling) to a whole number of cores.
+/// For example, 1.1 cores (1100 millicores) and 1.5 cores (1500 millicores) both give 2, while exactly 1 core (1000 millicores) gives 1.
 pub fn get_total_cpu_cores() -> usize {
-    ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize
+    cpu_cores(get_total_cpu_millicores())
+}
+
+fn cpu_cores(cpu_millicores: i64) -> usize {
+    ((cpu_millicores as f64) / 1_000.0).ceil() as usize
 }

 /// Get the total memory in readable size.
@@ -178,6 +182,13 @@ mod tests {
    #[test]
    fn test_get_total_cpu_cores() {
        assert!(get_total_cpu_cores() > 0);
+        assert_eq!(cpu_cores(1), 1);
+        assert_eq!(cpu_cores(100), 1);
+        assert_eq!(cpu_cores(500), 1);
+        assert_eq!(cpu_cores(1000), 1);
+        assert_eq!(cpu_cores(1100), 2);
+        assert_eq!(cpu_cores(1900), 2);
+        assert_eq!(cpu_cores(10_000), 10);
    }

    #[test]
--- a/src/common/telemetry/src/metric.rs
+++ b/src/common/telemetry/src/metric.rs
@@ -71,6 +71,7 @@ pub fn convert_metric_to_write_request(
                        timestamp,
                    }],
                    exemplars: vec![],
+                    histograms: vec![],
                }),
                MetricType::GAUGE => timeseries.push(TimeSeries {
                    labels: convert_label(m.get_label(), mf_name, None),
@@ -79,6 +80,7 @@ pub fn convert_metric_to_write_request(
                        timestamp,
                    }],
                    exemplars: vec![],
+                    histograms: vec![],
                }),
                MetricType::HISTOGRAM => {
                    let h = m.get_histogram();
@@ -97,6 +99,7 @@ pub fn convert_metric_to_write_request(
                                timestamp,
                            }],
                            exemplars: vec![],
+                            histograms: vec![],
                        });
                        if upper_bound.is_sign_positive() && upper_bound.is_infinite() {
                            inf_seen = true;
@@ -114,6 +117,7 @@ pub fn convert_metric_to_write_request(
                                timestamp,
                            }],
                            exemplars: vec![],
+                            histograms: vec![],
                        });
                    }
                    timeseries.push(TimeSeries {
@@ -127,6 +131,7 @@ pub fn convert_metric_to_write_request(
                            timestamp,
                        }],
                        exemplars: vec![],
+                        histograms: vec![],
                    });
                    timeseries.push(TimeSeries {
                        labels: convert_label(
@@ -139,6 +144,7 @@ pub fn convert_metric_to_write_request(
                            timestamp,
                        }],
                        exemplars: vec![],
+                        histograms: vec![],
                    });
                }
                MetricType::SUMMARY => {
@@ -155,6 +161,7 @@ pub fn convert_metric_to_write_request(
                                timestamp,
                            }],
                            exemplars: vec![],
+                            histograms: vec![],
                        });
                    }
                    timeseries.push(TimeSeries {
@@ -168,6 +175,7 @@ pub fn convert_metric_to_write_request(
                            timestamp,
                        }],
                        exemplars: vec![],
+                        histograms: vec![],
                    });
                    timeseries.push(TimeSeries {
                        labels: convert_label(
@@ -180,6 +188,7 @@ pub fn convert_metric_to_write_request(
                            timestamp,
                        }],
                        exemplars: vec![],
+                        histograms: vec![],
                    });
                }
                MetricType::UNTYPED => {
@@ -274,7 +283,7 @@ mod test {

        assert_eq!(
            format!("{:?}", write_quest.timeseries),
-            r#"[TimeSeries { labels: [Label { name: "__name__", value: "test_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }]"#
+            r#"[TimeSeries { labels: [Label { name: "__name__", value: "test_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }]"#
        );

        let gauge_opts = Opts::new("test_gauge", "test help")
@@ -288,7 +297,7 @@ mod test {
        let write_quest = convert_metric_to_write_request(mf, None, 0);
        assert_eq!(
            format!("{:?}", write_quest.timeseries),
-            r#"[TimeSeries { labels: [Label { name: "__name__", value: "test_gauge" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 42.0, timestamp: 0 }], exemplars: [] }]"#
+            r#"[TimeSeries { labels: [Label { name: "__name__", value: "test_gauge" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 42.0, timestamp: 0 }], exemplars: [], histograms: [] }]"#
        );
    }

@@ -305,20 +314,20 @@ mod test {
            .iter()
            .map(|x| format!("{:?}", x))
            .collect();
-        let ans = r#"TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.005" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.01" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.025" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.05" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.1" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.25" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.5" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "1" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "2.5" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "5" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "10" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "+Inf" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_sum" }, Label { name: "a", value: "1" }], samples: [Sample { value: 0.25, timestamp: 0 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_count" }, Label { name: "a", value: "1" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }"#;
+        let ans = r#"TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.005" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.01" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.025" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.05" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.1" }], samples: [Sample { value: 0.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.25" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "0.5" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "1" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "2.5" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "5" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "10" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_bucket" }, Label { name: "a", value: "1" }, Label { name: "le", value: "+Inf" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_sum" }, Label { name: "a", value: "1" }], samples: [Sample { value: 0.25, timestamp: 0 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_count" }, Label { name: "a", value: "1" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }"#;
        assert_eq!(write_quest_str.join("\n"), ans);
    }

@@ -355,10 +364,10 @@ TimeSeries { labels: [Label { name: "__name__", value: "test_histogram_count" },
            .iter()
            .map(|x| format!("{:?}", x))
            .collect();
-        let ans = r#"TimeSeries { labels: [Label { name: "__name__", value: "test_summary" }, Label { name: "quantile", value: "50" }], samples: [Sample { value: 3.0, timestamp: 20 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_summary" }, Label { name: "quantile", value: "100" }], samples: [Sample { value: 5.0, timestamp: 20 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_summary_sum" }], samples: [Sample { value: 15.0, timestamp: 20 }], exemplars: [] }
-TimeSeries { labels: [Label { name: "__name__", value: "test_summary_count" }], samples: [Sample { value: 5.0, timestamp: 20 }], exemplars: [] }"#;
+        let ans = r#"TimeSeries { labels: [Label { name: "__name__", value: "test_summary" }, Label { name: "quantile", value: "50" }], samples: [Sample { value: 3.0, timestamp: 20 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_summary" }, Label { name: "quantile", value: "100" }], samples: [Sample { value: 5.0, timestamp: 20 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_summary_sum" }], samples: [Sample { value: 15.0, timestamp: 20 }], exemplars: [], histograms: [] }
+TimeSeries { labels: [Label { name: "__name__", value: "test_summary_count" }], samples: [Sample { value: 5.0, timestamp: 20 }], exemplars: [], histograms: [] }"#;
        assert_eq!(write_quest_str.join("\n"), ans);
    }

@@ -385,11 +394,11 @@ TimeSeries { labels: [Label { name: "__name__", value: "test_summary_count" }],
        let write_quest2 = convert_metric_to_write_request(mf, Some(&filter), 0);
        assert_eq!(
            format!("{:?}", write_quest1.timeseries),
-            r#"[TimeSeries { labels: [Label { name: "__name__", value: "filter_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [] }, TimeSeries { labels: [Label { name: "__name__", value: "test_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 2.0, timestamp: 0 }], exemplars: [] }]"#
+            r#"[TimeSeries { labels: [Label { name: "__name__", value: "filter_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 1.0, timestamp: 0 }], exemplars: [], histograms: [] }, TimeSeries { labels: [Label { name: "__name__", value: "test_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 2.0, timestamp: 0 }], exemplars: [], histograms: [] }]"#
        );
        assert_eq!(
            format!("{:?}", write_quest2.timeseries),
-            r#"[TimeSeries { labels: [Label { name: "__name__", value: "test_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 2.0, timestamp: 0 }], exemplars: [] }]"#
+            r#"[TimeSeries { labels: [Label { name: "__name__", value: "test_counter" }, Label { name: "a", value: "1" }, Label { name: "b", value: "2" }], samples: [Sample { value: 2.0, timestamp: 0 }], exemplars: [], histograms: [] }]"#
        );
    }
 }
--- a/src/common/wal/src/config.rs
+++ b/src/common/wal/src/config.rs
@@ -206,6 +206,8 @@ mod tests {
                    client_cert_path: None,
                    client_key_path: None,
                }),
+                connect_timeout: Duration::from_secs(3),
+                timeout: Duration::from_secs(3),
            },
            kafka_topic: KafkaTopicConfig {
                num_topics: 32,
@@ -239,6 +241,8 @@ mod tests {
                    client_cert_path: None,
                    client_key_path: None,
                }),
+                connect_timeout: Duration::from_secs(3),
+                timeout: Duration::from_secs(3),
            },
            max_batch_bytes: ReadableSize::mb(1),
            consumer_wait_timeout: Duration::from_millis(100),
--- a/src/common/wal/src/config/kafka/common.rs
+++ b/src/common/wal/src/config/kafka/common.rs
@@ -36,9 +36,6 @@ pub const DEFAULT_BACKOFF_CONFIG: BackoffConfig = BackoffConfig {
    deadline: Some(Duration::from_secs(3)),
 };

-/// The default connect timeout for kafka client.
-pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
-
 /// Default interval for auto WAL pruning.
 pub const DEFAULT_AUTO_PRUNE_INTERVAL: Duration = Duration::from_mins(30);
 /// Default limit for concurrent auto pruning tasks.
@@ -167,6 +164,12 @@ pub struct KafkaConnectionConfig {
    pub sasl: Option<KafkaClientSasl>,
    /// Client TLS config
    pub tls: Option<KafkaClientTls>,
+    /// The connect timeout for kafka client.
+    #[serde(with = "humantime_serde")]
+    pub connect_timeout: Duration,
+    /// The timeout for kafka client.
+    #[serde(with = "humantime_serde")]
+    pub timeout: Duration,
 }

 impl Default for KafkaConnectionConfig {
@@ -175,6 +178,8 @@ impl Default for KafkaConnectionConfig {
            broker_endpoints: vec![BROKER_ENDPOINT.to_string()],
            sasl: None,
            tls: None,
+            connect_timeout: Duration::from_secs(3),
+            timeout: Duration::from_secs(3),
        }
    }
 }
--- a/src/datanode/src/error.rs
+++ b/src/datanode/src/error.rs
@@ -410,14 +410,6 @@ pub enum Error {
        location: Location,
    },

-    #[snafu(display("Failed to build cache store"))]
-    BuildCacheStore {
-        #[snafu(source)]
-        error: object_store::Error,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
    #[snafu(display("Not yet implemented: {what}"))]
    NotYetImplemented { what: String },
 }
@@ -493,7 +485,6 @@ impl ErrorExt for Error {
            SerializeJson { .. } => StatusCode::Internal,

            ObjectStore { source, .. } => source.status_code(),
-            BuildCacheStore { .. } => StatusCode::StorageUnavailable,
        }
    }

--- a/src/datanode/src/store.rs
+++ b/src/datanode/src/store.rs
@@ -14,15 +14,10 @@

 //! object storage utilities

-use std::sync::Arc;
-
-use common_telemetry::info;
-use object_store::config::ObjectStorageCacheConfig;
+use common_telemetry::{info, warn};
 use object_store::factory::new_raw_object_store;
-use object_store::layers::LruCacheLayer;
-use object_store::services::Fs;
 use object_store::util::{clean_temp_dir, join_dir, with_instrument_layers, with_retry_layers};
-use object_store::{ATOMIC_WRITE_DIR, Access, ObjectStore, ObjectStoreBuilder};
+use object_store::{ATOMIC_WRITE_DIR, ObjectStore};
 use snafu::prelude::*;

 use crate::config::ObjectStoreConfig;
@@ -47,23 +42,58 @@ pub(crate) async fn new_object_store_without_cache(
    Ok(object_store)
 }

+/// Removes the on-disk directories left behind by the removed LRU read cache.
+///
+/// Does nothing unless `store` is an object storage backend whose cache config
+/// has `enable_read_cache` set. The cache base directory is
+/// `cache_config.cache_path`, falling back to `data_home` when that path is
+/// empty. Cleanup failures are logged as warnings and never propagated.
+fn clean_old_read_cache(store: &ObjectStoreConfig, data_home: &str) {
+    if !store.is_object_storage() {
+        return;
+    }
+
+    let Some(cache_config) = store.cache_config() else {
+        return;
+    };
+
+    // Only cleans if read cache was enabled
+    if !cache_config.enable_read_cache {
+        return;
+    }
+
+    // An empty `cache_path` means the cache lived under the data home.
+    let cache_base_dir = if cache_config.cache_path.is_empty() {
+        data_home
+    } else {
+        &cache_config.cache_path
+    };
+
+    // Cleans up the old read cache directory
+    let old_read_cache_dir = join_dir(cache_base_dir, "cache/object/read");
+    info!(
+        "Cleaning up old read cache directory: {}",
+        old_read_cache_dir
+    );
+    if let Err(e) = clean_temp_dir(&old_read_cache_dir) {
+        warn!(e; "Failed to clean old read cache directory {}", old_read_cache_dir);
+    }
+
+    // Cleans up the atomic temp dir used by the cache layer
+    let cache_atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR);
+    info!(
+        "Cleaning up old cache atomic temp directory: {}",
+        cache_atomic_temp_dir
+    );
+    if let Err(e) = clean_temp_dir(&cache_atomic_temp_dir) {
+        warn!(e; "Failed to clean old cache atomic temp directory {}", cache_atomic_temp_dir);
+    }
+}
+
 pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Result<ObjectStore> {
+    // Cleans up old LRU read cache directories.
+    // TODO: Remove this line after the 1.0 release.
+    clean_old_read_cache(&store, data_home);
+
    let object_store = new_raw_object_store(&store, data_home)
        .await
        .context(error::ObjectStoreSnafu)?;
-    // Enable retry layer and cache layer for non-fs object storages
+    // Enables retry layer for non-fs object storages
    let object_store = if store.is_object_storage() {
-        let object_store = {
-            // It's safe to unwrap here because we already checked above.
-            let cache_config = store.cache_config().unwrap();
-            if let Some(cache_layer) = build_cache_layer(cache_config, data_home).await? {
-                // Adds cache layer
-                object_store.layer(cache_layer)
-            } else {
-                object_store
-            }
-        };
-
        // Adds retry layer
        with_retry_layers(object_store)
    } else {
@@ -73,40 +103,3 @@ pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Resu
    let object_store = with_instrument_layers(object_store, true);
    Ok(object_store)
 }
-
-async fn build_cache_layer(
-    cache_config: &ObjectStorageCacheConfig,
-    data_home: &str,
-) -> Result<Option<LruCacheLayer<impl Access>>> {
-    // No need to build cache layer if read cache is disabled.
-    if !cache_config.enable_read_cache {
-        return Ok(None);
-    }
-    let cache_base_dir = if cache_config.cache_path.is_empty() {
-        data_home
-    } else {
-        &cache_config.cache_path
-    };
-    let atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR);
-    clean_temp_dir(&atomic_temp_dir).context(error::ObjectStoreSnafu)?;
-
-    let cache_store = Fs::default()
-        .root(cache_base_dir)
-        .atomic_write_dir(&atomic_temp_dir)
-        .build()
-        .context(error::BuildCacheStoreSnafu)?;
-
-    let cache_layer = LruCacheLayer::new(
-        Arc::new(cache_store),
-        cache_config.cache_capacity.0 as usize,
-    )
-    .context(error::BuildCacheStoreSnafu)?;
-    cache_layer.recover_cache(false).await;
-
-    info!(
-        "Enabled local object storage cache, path: {}, capacity: {}.",
-        cache_config.cache_path, cache_config.cache_capacity
-    );
-
-    Ok(Some(cache_layer))
-}
--- a/src/datanode/src/tests.rs
+++ b/src/datanode/src/tests.rs
@@ -33,9 +33,9 @@ use servers::grpc::FlightCompression;
 use session::context::QueryContextRef;
 use store_api::metadata::RegionMetadataRef;
 use store_api::region_engine::{
-    RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
-    RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
-    SettableRegionRoleState, SyncManifestResponse,
+    CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
+    RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
+    SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
 };
 use store_api::region_request::{AffectedRows, RegionRequest};
 use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -299,6 +299,14 @@ impl RegionEngine for MockRegionEngine {
        unimplemented!()
    }

+    // Region copying is not implemented by the mock engine; calling this panics.
+    async fn copy_region_from(
+        &self,
+        _region_id: RegionId,
+        _request: CopyRegionFromRequest,
+    ) -> Result<CopyRegionFromResponse, BoxedError> {
+        unimplemented!()
+    }
+
    fn as_any(&self) -> &dyn Any {
        self
    }
--- a/src/datatypes/src/schema.rs
+++ b/src/datatypes/src/schema.rs
@@ -33,7 +33,8 @@ pub use crate::schema::column_schema::{
    COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
    COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
    FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
-    SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY,
+    SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, VECTOR_INDEX_KEY,
+    VectorDistanceMetric, VectorIndexEngineType, VectorIndexOptions,
 };
 pub use crate::schema::constraint::ColumnDefaultConstraint;
 pub use crate::schema::raw::RawSchema;
--- a/src/datatypes/src/schema/column_schema.rs
+++ b/src/datatypes/src/schema/column_schema.rs
@@ -46,6 +46,8 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
 pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
 /// Key used to store skip options in arrow field's metadata.
 pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
+/// Key used to store vector index options in arrow field's metadata.
+pub const VECTOR_INDEX_KEY: &str = "greptime:vector_index";

 /// Keys used in fulltext options
 pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
@@ -216,6 +218,53 @@ impl ColumnSchema {
        self.metadata.contains_key(INVERTED_INDEX_KEY)
    }

+    /// Checks if this column has a vector index.
+    ///
+    /// Returns `true` only when vector index options are present in the
+    /// metadata and deserialize successfully. A deserialization failure is
+    /// logged as a warning and reported as `false` rather than as an error.
+    pub fn is_vector_indexed(&self) -> bool {
+        match self.vector_index_options() {
+            Ok(opts) => opts.is_some(),
+            Err(e) => {
+                common_telemetry::warn!(
+                    "Failed to deserialize vector_index_options for column '{}': {}",
+                    self.name,
+                    e
+                );
+                false
+            }
+        }
+    }
+
+    /// Returns the vector index options stored in this column's metadata.
+    ///
+    /// Yields `Ok(None)` when there is no entry under `VECTOR_INDEX_KEY`, and an
+    /// error when the stored JSON cannot be deserialized.
+    pub fn vector_index_options(&self) -> Result<Option<VectorIndexOptions>> {
+        let Some(json) = self.metadata.get(VECTOR_INDEX_KEY) else {
+            return Ok(None);
+        };
+        let options = serde_json::from_str(json).context(error::DeserializeSnafu { json })?;
+        Ok(Some(options))
+    }
+
+    /// Stores `options` as JSON under `VECTOR_INDEX_KEY` in the column metadata.
+    ///
+    /// Fails without touching the metadata if serialization fails.
+    pub fn set_vector_index_options(&mut self, options: &VectorIndexOptions) -> Result<()> {
+        let serialized = serde_json::to_string(options).context(error::SerializeSnafu)?;
+        self.metadata
+            .insert(VECTOR_INDEX_KEY.to_string(), serialized);
+        Ok(())
+    }
+
+    /// Removes the vector index options by dropping the `VECTOR_INDEX_KEY`
+    /// entry from the column metadata.
+    pub fn unset_vector_index_options(&mut self) {
+        self.metadata.remove(VECTOR_INDEX_KEY);
+    }
+
+    /// Sets vector index options and returns self for chaining.
+    ///
+    /// Consumes `self`; if storing the options fails, the error is returned and
+    /// the column schema is dropped.
+    pub fn with_vector_index_options(mut self, options: &VectorIndexOptions) -> Result<Self> {
+        self.set_vector_index_options(options)?;
+        Ok(self)
+    }
+
    /// Set default constraint.
    ///
    /// If a default constraint exists for the column, this method will
@@ -964,6 +1013,181 @@ impl TryFrom<HashMap<String, String>> for SkippingIndexOptions {
    }
 }

+/// Distance metric for vector similarity search.
+///
+/// With `rename_all = "lowercase"` the variants serialize as `"l2sq"`,
+/// `"cosine"` and `"innerproduct"`; `"ip"` is additionally accepted as an
+/// alias for `InnerProduct` when deserializing.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default, Visit, VisitMut)]
+#[serde(rename_all = "lowercase")]
+pub enum VectorDistanceMetric {
+    /// Squared Euclidean distance (L2^2).
+    #[default]
+    L2sq,
+    /// Cosine distance (1 - cosine similarity).
+    Cosine,
+    /// Inner product (negative, for maximum inner product search).
+    #[serde(alias = "ip")]
+    InnerProduct,
+}
+
+impl fmt::Display for VectorDistanceMetric {
+    /// Writes the short metric name: `l2sq`, `cosine` or `ip`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let name = match self {
+            Self::L2sq => "l2sq",
+            Self::Cosine => "cosine",
+            Self::InnerProduct => "ip",
+        };
+        f.write_str(name)
+    }
+}
+
+impl std::str::FromStr for VectorDistanceMetric {
+    type Err = String;
+
+    /// Parses a metric name case-insensitively.
+    ///
+    /// Accepted names:
+    /// - `l2sq`, `l2`, `euclidean`
+    /// - `cosine`, `cos`
+    /// - `ip`, `inner_product`, `innerproduct`, `dot`
+    ///
+    /// `innerproduct` is the name serde emits for `InnerProduct` under
+    /// `rename_all = "lowercase"`; accepting it keeps serialized values
+    /// parseable through this impl as well.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "l2sq" | "l2" | "euclidean" => Ok(Self::L2sq),
+            "cosine" | "cos" => Ok(Self::Cosine),
+            "inner_product" | "innerproduct" | "ip" | "dot" => Ok(Self::InnerProduct),
+            _ => Err(format!(
+                "Unknown distance metric: {}. Expected: l2sq, cosine, or ip",
+                s
+            )),
+        }
+    }
+}
+
+impl VectorDistanceMetric {
+    /// Returns the metric as u8 for blob serialization.
+    ///
+    /// The codes are spelled out explicitly (`L2sq` = 0, `Cosine` = 1,
+    /// `InnerProduct` = 2) and are the inverse of [`Self::try_from_u8`].
+    pub fn as_u8(&self) -> u8 {
+        match self {
+            Self::L2sq => 0,
+            Self::Cosine => 1,
+            Self::InnerProduct => 2,
+        }
+    }
+
+    /// Parses metric from u8 (used when reading blob).
+    ///
+    /// Returns `None` for any code not produced by [`Self::as_u8`].
+    pub fn try_from_u8(v: u8) -> Option<Self> {
+        match v {
+            0 => Some(Self::L2sq),
+            1 => Some(Self::Cosine),
+            2 => Some(Self::InnerProduct),
+            _ => None,
+        }
+    }
+}
+
+/// Default HNSW connectivity parameter.
+const DEFAULT_VECTOR_INDEX_CONNECTIVITY: u32 = 16;
+/// Default expansion factor during index construction.
+const DEFAULT_VECTOR_INDEX_EXPANSION_ADD: u32 = 128;
+/// Default expansion factor during search.
+const DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH: u32 = 64;
+
+// The functions below only wrap the constants above so they can be named in
+// `#[serde(default = "...")]` attributes, which take a function path.
+
+fn default_vector_index_connectivity() -> u32 {
+    DEFAULT_VECTOR_INDEX_CONNECTIVITY
+}
+
+fn default_vector_index_expansion_add() -> u32 {
+    DEFAULT_VECTOR_INDEX_EXPANSION_ADD
+}
+
+fn default_vector_index_expansion_search() -> u32 {
+    DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH
+}
+
+/// Supported vector index engine types.
+///
+/// Serialized in lowercase (`"usearch"`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, Visit, VisitMut)]
+#[serde(rename_all = "lowercase")]
+pub enum VectorIndexEngineType {
+    /// USearch HNSW implementation.
+    #[default]
+    Usearch,
+    // Future: Vsag,
+}
+
+impl VectorIndexEngineType {
+    /// Returns the engine type as u8 for blob serialization.
+    ///
+    /// `Usearch` maps to 0; this is the inverse of [`Self::try_from_u8`].
+    pub fn as_u8(&self) -> u8 {
+        match self {
+            Self::Usearch => 0,
+        }
+    }
+
+    /// Parses engine type from u8 (used when reading blob).
+    ///
+    /// Returns `None` for any code other than 0.
+    pub fn try_from_u8(v: u8) -> Option<Self> {
+        match v {
+            0 => Some(Self::Usearch),
+            _ => None,
+        }
+    }
+}
+
+impl fmt::Display for VectorIndexEngineType {
+    /// Writes the lowercase engine name (`usearch`).
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let name = match self {
+            Self::Usearch => "usearch",
+        };
+        f.write_str(name)
+    }
+}
+
+impl std::str::FromStr for VectorIndexEngineType {
+    type Err = String;
+
+    /// Parses an engine name case-insensitively; only `usearch` is recognized.
+    /// Any other input yields an error message that echoes the original string.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "usearch" => Ok(Self::Usearch),
+            _ => Err(format!(
+                "Unknown vector index engine: {}. Expected: usearch",
+                s
+            )),
+        }
+    }
+}
+
+/// Options for vector index (HNSW).
+///
+/// Serialized with kebab-case keys, so `expansion_add` and `expansion_search`
+/// appear as `expansion-add` and `expansion-search`. Every field has a serde
+/// default, so any field may be omitted when deserializing.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
+#[serde(rename_all = "kebab-case")]
+pub struct VectorIndexOptions {
+    /// Vector index engine type (default: usearch).
+    #[serde(default)]
+    pub engine: VectorIndexEngineType,
+    /// Distance metric for similarity search.
+    #[serde(default)]
+    pub metric: VectorDistanceMetric,
+    /// HNSW connectivity parameter (M in the paper).
+    /// Higher values improve recall but increase memory usage.
+    #[serde(default = "default_vector_index_connectivity")]
+    pub connectivity: u32,
+    /// Expansion factor during index construction (ef_construction).
+    /// Higher values improve index quality but slow down construction.
+    #[serde(default = "default_vector_index_expansion_add")]
+    pub expansion_add: u32,
+    /// Expansion factor during search (ef_search).
+    /// Higher values improve recall but slow down search.
+    #[serde(default = "default_vector_index_expansion_search")]
+    pub expansion_search: u32,
+}
+
+impl Default for VectorIndexOptions {
+    /// Builds options from the default engine and metric plus the
+    /// `DEFAULT_VECTOR_INDEX_*` constants, i.e. the same values serde fills in
+    /// for fields missing from the input.
+    fn default() -> Self {
+        Self {
+            engine: VectorIndexEngineType::default(),
+            metric: VectorDistanceMetric::default(),
+            connectivity: DEFAULT_VECTOR_INDEX_CONNECTIVITY,
+            expansion_add: DEFAULT_VECTOR_INDEX_EXPANSION_ADD,
+            expansion_search: DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH,
+        }
+    }
+}
+
+impl fmt::Display for VectorIndexOptions {
+    /// Formats the options as comma-separated `key=value` pairs.
+    ///
+    /// Keys are written with underscores (`expansion_add`), unlike the
+    /// kebab-case keys used by the serde representation.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "engine={}, metric={}, connectivity={}, expansion_add={}, expansion_search={}",
+            self.engine, self.metric, self.connectivity, self.expansion_add, self.expansion_search
+        )
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::sync::Arc;
--- a/src/file-engine/src/engine.rs
+++ b/src/file-engine/src/engine.rs
@@ -26,10 +26,10 @@ use object_store::ObjectStore;
 use snafu::{OptionExt, ensure};
 use store_api::metadata::RegionMetadataRef;
 use store_api::region_engine::{
-    RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
-    RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
-    SetRegionRoleStateSuccess, SettableRegionRoleState, SinglePartitionScanner,
-    SyncManifestResponse,
+    CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
+    RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
+    SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
+    SinglePartitionScanner, SyncManifestResponse,
 };
 use store_api::region_request::{
    AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
@@ -163,6 +163,19 @@ impl RegionEngine for FileRegionEngine {
        ))
    }

+    // The file engine does not support copying regions: this always returns an
+    // `Unsupported` error naming the `copy_region_from` operation.
+    async fn copy_region_from(
+        &self,
+        _region_id: RegionId,
+        _request: CopyRegionFromRequest,
+    ) -> Result<CopyRegionFromResponse, BoxedError> {
+        Err(BoxedError::new(
+            UnsupportedSnafu {
+                operation: "copy_region_from",
+            }
+            .build(),
+        ))
+    }
+
    fn role(&self, region_id: RegionId) -> Option<RegionRole> {
        self.inner.state(region_id)
    }
--- a/src/flow/src/batching_mode/frontend_client.rs
+++ b/src/flow/src/batching_mode/frontend_client.rs
@@ -15,7 +15,7 @@
 //! Frontend client to run flow as batching task which is time-window-aware normal query triggered every tick set by user

 use std::collections::HashMap;
-use std::sync::{Arc, Weak};
+use std::sync::{Arc, Mutex, Weak};
 use std::time::SystemTime;

 use api::v1::greptime_request::Request;
@@ -38,6 +38,7 @@ use servers::query_handler::grpc::GrpcQueryHandler;
 use session::context::{QueryContextBuilder, QueryContextRef};
 use session::hints::READ_PREFERENCE_HINT;
 use snafu::{OptionExt, ResultExt};
+use tokio::sync::SetOnce;

 use crate::batching_mode::BatchingModeOptions;
 use crate::error::{
@@ -75,7 +76,19 @@ impl<E: ErrorExt + Send + Sync + 'static, T: GrpcQueryHandler<Error = E> + Send
    }
 }

-type HandlerMutable = Arc<std::sync::Mutex<Option<Weak<dyn GrpcQueryHandlerWithBoxedError>>>>;
+/// Shared, late-bound slot for the gRPC query handler.
+///
+/// Clones share the same underlying slot and the same initialization flag.
+#[derive(Debug, Clone)]
+pub struct HandlerMutable {
+    /// The handler, held weakly; `None` until one has been set.
+    handler: Arc<Mutex<Option<Weak<dyn GrpcQueryHandlerWithBoxedError>>>>,
+    /// Set the first time a handler is installed, so that waiters can be woken.
+    is_initialized: Arc<SetOnce<()>>,
+}
+
+impl HandlerMutable {
+    /// Installs `handler`, replacing any previously stored one, and marks the
+    /// slot as initialized.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the handler mutex is poisoned.
+    pub async fn set_handler(&self, handler: Weak<dyn GrpcQueryHandlerWithBoxedError>) {
+        *self.handler.lock().unwrap() = Some(handler);
+        // Ignore the error, as we allow the handler to be set multiple times.
+        let _ = self.is_initialized.set(());
+    }
+}

 /// A simple frontend client able to execute sql using grpc protocol
 ///
@@ -100,7 +113,11 @@ pub enum FrontendClient {
 impl FrontendClient {
    /// Create a new empty frontend client, with a `HandlerMutable` to set the grpc handler later
    pub fn from_empty_grpc_handler(query: QueryOptions) -> (Self, HandlerMutable) {
-        let handler = Arc::new(std::sync::Mutex::new(None));
+        let is_initialized = Arc::new(SetOnce::new());
+        let handler = HandlerMutable {
+            handler: Arc::new(Mutex::new(None)),
+            is_initialized,
+        };
        (
            Self::Standalone {
                database_client: handler.clone(),
@@ -110,6 +127,16 @@ impl FrontendClient {
        )
    }

+    /// Waits until the frontend client is initialized.
+    ///
+    /// Only the `Standalone` variant can be uninitialized: it waits until its
+    /// handler slot has been marked as set. Every other variant returns
+    /// immediately.
+    pub async fn wait_initialized(&self) {
+        if let FrontendClient::Standalone {
+            database_client, ..
+        } = self
+        {
+            database_client.is_initialized.wait().await;
+        }
+    }
+
    pub fn from_meta_client(
        meta_client: Arc<MetaClient>,
        auth: Option<FlowAuthHeader>,
@@ -138,8 +165,14 @@ impl FrontendClient {
        grpc_handler: Weak<dyn GrpcQueryHandlerWithBoxedError>,
        query: QueryOptions,
    ) -> Self {
+        let is_initialized = Arc::new(SetOnce::new_with(Some(())));
+        let handler = HandlerMutable {
+            handler: Arc::new(Mutex::new(Some(grpc_handler))),
+            is_initialized: is_initialized.clone(),
+        };
+
        Self::Standalone {
-            database_client: Arc::new(std::sync::Mutex::new(Some(grpc_handler))),
+            database_client: handler,
            query,
        }
    }
@@ -321,6 +354,7 @@ impl FrontendClient {
                {
                    let database_client = {
                        database_client
+                            .handler
                            .lock()
                            .map_err(|e| {
                                UnexpectedSnafu {
@@ -398,6 +432,7 @@ impl FrontendClient {
                {
                    let database_client = {
                        database_client
+                            .handler
                            .lock()
                            .map_err(|e| {
                                UnexpectedSnafu {
@@ -460,3 +495,73 @@ impl std::fmt::Display for PeerDesc {
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+
+    use common_query::Output;
+    use tokio::time::timeout;
+
+    use super::*;
+
+    /// Handler stub that answers every query with zero affected rows.
+    #[derive(Debug)]
+    struct NoopHandler;
+
+    #[async_trait::async_trait]
+    impl GrpcQueryHandlerWithBoxedError for NoopHandler {
+        async fn do_query(
+            &self,
+            _query: Request,
+            _ctx: QueryContextRef,
+        ) -> std::result::Result<Output, BoxedError> {
+            Ok(Output::new_with_affected_rows(0))
+        }
+    }
+
+    /// Exercises `wait_initialized` for each way of constructing a client.
+    #[tokio::test]
+    async fn wait_initialized() {
+        // Empty standalone client: waiting must block until a handler is set.
+        let (client, handler_mut) =
+            FrontendClient::from_empty_grpc_handler(QueryOptions::default());
+
+        assert!(
+            timeout(Duration::from_millis(50), client.wait_initialized())
+                .await
+                .is_err()
+        );
+
+        // Setting the handler unblocks waiters.
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(NoopHandler);
+        handler_mut.set_handler(Arc::downgrade(&handler)).await;
+
+        timeout(Duration::from_secs(1), client.wait_initialized())
+            .await
+            .expect("wait_initialized should complete after handler is set");
+
+        timeout(Duration::from_millis(10), client.wait_initialized())
+            .await
+            .expect("wait_initialized should be a no-op once initialized");
+
+        // A standalone client built with a handler is initialized from the start.
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(NoopHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        assert!(
+            timeout(Duration::from_millis(10), client.wait_initialized())
+                .await
+                .is_ok()
+        );
+
+        // A client built from a meta client has nothing to wait for.
+        let meta_client = Arc::new(MetaClient::default());
+        let client = FrontendClient::from_meta_client(
+            meta_client,
+            None,
+            QueryOptions::default(),
+            BatchingModeOptions::default(),
+        )
+        .unwrap();
+        assert!(
+            timeout(Duration::from_millis(10), client.wait_initialized())
+                .await
+                .is_ok()
+        );
+    }
+}
--- a/src/frontend/src/frontend.rs
+++ b/src/frontend/src/frontend.rs
@@ -157,7 +157,6 @@ mod tests {
    use common_error::from_header_to_err_code_msg;
    use common_error::status_code::StatusCode;
    use common_grpc::channel_manager::ChannelManager;
-    use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS;
    use common_meta::heartbeat::handler::HandlerGroupExecutor;
    use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
    use common_meta::heartbeat::handler::suspend::SuspendHandler;
@@ -400,6 +399,10 @@ mod tests {
                ..Default::default()
            },
            meta_client: Some(meta_client_options.clone()),
+            heartbeat: HeartbeatOptions {
+                interval: Duration::from_secs(1),
+                ..Default::default()
+            },
            ..Default::default()
        };

@@ -409,7 +412,8 @@ mod tests {
        let meta_client = create_meta_client(&meta_client_options, server.clone()).await;
        let frontend = create_frontend(&options, meta_client).await?;

-        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
+        let frontend_heartbeat_interval = options.heartbeat.interval;
+        tokio::time::sleep(frontend_heartbeat_interval).await;
        // initial state: not suspend:
        assert!(!frontend.instance.is_suspended());
        verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
@@ -426,7 +430,7 @@ mod tests {

        // make heartbeat server returned "suspend" instruction,
        server.suspend.store(true, Ordering::Relaxed);
-        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
+        tokio::time::sleep(frontend_heartbeat_interval).await;
        // ... then the frontend is suspended:
        assert!(frontend.instance.is_suspended());
        verify_suspend_state_by_http(
@@ -442,7 +446,7 @@ mod tests {

        // make heartbeat server NOT returned "suspend" instruction,
        server.suspend.store(false, Ordering::Relaxed);
-        tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await;
+        tokio::time::sleep(frontend_heartbeat_interval).await;
        // ... then frontend's suspend state is cleared:
        assert!(!frontend.instance.is_suspended());
        verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await;
--- a/src/frontend/src/instance/grpc.rs
+++ b/src/frontend/src/instance/grpc.rs
@@ -301,39 +301,15 @@ impl GrpcQueryHandler for Instance {
        mut stream: servers::grpc::flight::PutRecordBatchRequestStream,
        ctx: QueryContextRef,
    ) -> Pin<Box<dyn Stream<Item = Result<DoPutResponse>> + Send>> {
-        // Resolve table once for the stream
        // Clone all necessary data to make it 'static
        let catalog_manager = self.catalog_manager().clone();
        let plugins = self.plugins.clone();
        let inserter = self.inserter.clone();
-        let table_name = stream.table_name().clone();
        let ctx = ctx.clone();
+        let mut table_ref: Option<TableRef> = None;
+        let mut table_checked = false;

        Box::pin(try_stream! {
-            plugins
-                .get::<PermissionCheckerRef>()
-                .as_ref()
-                .check_permission(ctx.current_user(), PermissionReq::BulkInsert)
-                .context(PermissionSnafu)?;
-            // Cache for resolved table reference - resolve once and reuse
-            let table_ref = catalog_manager
-                .table(
-                    &table_name.catalog_name,
-                    &table_name.schema_name,
-                    &table_name.table_name,
-                    None,
-                )
-                .await
-                .context(CatalogSnafu)?
-                .with_context(|| TableNotFoundSnafu {
-                    table_name: table_name.to_string(),
-                })?;
-
-            // Check permissions once for the stream
-            let interceptor_ref = plugins.get::<GrpcQueryInterceptorRef<Error>>();
-            let interceptor = interceptor_ref.as_ref();
-            interceptor.pre_bulk_insert(table_ref.clone(), ctx.clone())?;
-
            // Process each request in the stream
            while let Some(request_result) = stream.next().await {
                let request = request_result.map_err(|e| {
@@ -341,11 +317,45 @@ impl GrpcQueryHandler for Instance {
                    IncompleteGrpcRequestSnafu { err_msg: error_msg }.build()
                })?;

+                // Resolve table and check permissions on first RecordBatch (after schema is received)
+                if !table_checked {
+                    let table_name = &request.table_name;
+
+                    plugins
+                        .get::<PermissionCheckerRef>()
+                        .as_ref()
+                        .check_permission(ctx.current_user(), PermissionReq::BulkInsert)
+                        .context(PermissionSnafu)?;
+
+                    // Resolve table reference
+                    table_ref = Some(
+                        catalog_manager
+                            .table(
+                                &table_name.catalog_name,
+                                &table_name.schema_name,
+                                &table_name.table_name,
+                                None,
+                            )
+                            .await
+                            .context(CatalogSnafu)?
+                            .with_context(|| TableNotFoundSnafu {
+                                table_name: table_name.to_string(),
+                            })?,
+                    );
+
+                    // Check permissions for the table
+                    let interceptor_ref = plugins.get::<GrpcQueryInterceptorRef<Error>>();
+                    let interceptor = interceptor_ref.as_ref();
+                    interceptor.pre_bulk_insert(table_ref.clone().unwrap(), ctx.clone())?;
+
+                    table_checked = true;
+                }
+
                let request_id = request.request_id;
                let start = Instant::now();
                let rows = inserter
                    .handle_bulk_insert(
-                        table_ref.clone(),
+                        table_ref.clone().unwrap(),
                        request.flight_data,
                        request.record_batch,
                        request.schema_bytes,
--- a/src/index/Cargo.toml
+++ b/src/index/Cargo.toml
@@ -7,6 +7,9 @@ license.workspace = true
 [lints]
 workspace = true

+[features]
+vector_index = ["dep:usearch"]
+
 [dependencies]
 async-trait.workspace = true
 asynchronous-codec = "0.7.0"
@@ -17,6 +20,7 @@ common-error.workspace = true
 common-macro.workspace = true
 common-runtime.workspace = true
 common-telemetry.workspace = true
+datatypes.workspace = true
 fastbloom = "0.8"
 fst.workspace = true
 futures.workspace = true
@@ -25,6 +29,7 @@ itertools.workspace = true
 jieba-rs = "0.8"
 lazy_static.workspace = true
 mockall.workspace = true
+nalgebra.workspace = true
 pin-project.workspace = true
 prost.workspace = true
 puffin.workspace = true
@@ -39,6 +44,7 @@ tantivy = { version = "0.24", features = ["zstd-compression"] }
 tantivy-jieba = "0.16"
 tokio.workspace = true
 tokio-util.workspace = true
+usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
 uuid.workspace = true

 [dev-dependencies]
--- a/src/index/src/lib.rs
+++ b/src/index/src/lib.rs
@@ -22,6 +22,8 @@ pub mod external_provider;
 pub mod fulltext_index;
 pub mod inverted_index;
 pub mod target;
+#[cfg(feature = "vector_index")]
+pub mod vector;

 pub type Bytes = Vec<u8>;
 pub type BytesRef<'a> = &'a [u8];
--- a/src/index/src/vector.rs
+++ b/src/index/src/vector.rs
@@ -0,0 +1,163 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Vector index types and options.
+//!
+//! This module re-exports types from `datatypes` and provides conversions
+//! to USearch types, as well as distance computation functions.
+
+pub use datatypes::schema::{VectorDistanceMetric, VectorIndexOptions};
+use nalgebra::DVectorView;
+pub use usearch::MetricKind;
+
+/// Maps a [`VectorDistanceMetric`] to the corresponding USearch [`MetricKind`].
+pub fn distance_metric_to_usearch(metric: VectorDistanceMetric) -> MetricKind {
+    match metric {
+        VectorDistanceMetric::L2sq => MetricKind::L2sq,
+        VectorDistanceMetric::Cosine => MetricKind::Cos,
+        VectorDistanceMetric::InnerProduct => MetricKind::IP,
+    }
+}
+
+/// Computes the distance between `v1` and `v2` using the specified metric.
+///
+/// Delegates to nalgebra-backed helpers; the inner product is returned negated
+/// (presumably so that smaller always means more similar — confirm with callers).
+/// **Note:** The caller must ensure that the two vectors have the same length.
+/// If either vector is empty, 0.0 is returned for every metric.
+pub fn compute_distance(v1: &[f32], v2: &[f32], metric: VectorDistanceMetric) -> f32 {
+    // If either input is empty the distance is degenerate; return 0.0 for every metric.
+    if v1.is_empty() || v2.is_empty() {
+        return 0.0;
+    }
+
+    match metric {
+        VectorDistanceMetric::L2sq => l2sq(v1, v2),
+        VectorDistanceMetric::Cosine => cosine(v1, v2),
+        VectorDistanceMetric::InnerProduct => -dot(v1, v2),
+    }
+}
+
+/// Squared Euclidean (L2) distance: the squared norm of `lhs - rhs`.
+fn l2sq(lhs: &[f32], rhs: &[f32]) -> f32 {
+    let a = DVectorView::from_slice(lhs, lhs.len());
+    let b = DVectorView::from_slice(rhs, rhs.len());
+    (a - b).norm_squared()
+}
+
+/// Calculates the cosine distance (`1 - cos θ`) between two vectors.
+///
+/// Returns a value in `[0.0, 2.0]` where 0.0 means identical direction and 2.0 means
+/// opposite direction. If either vector has zero or near-zero magnitude its direction
+/// is undefined, so 1.0 (the neutral midpoint) is returned to avoid NaN.
+fn cosine(lhs: &[f32], rhs: &[f32]) -> f32 {
+    let lhs_vec = DVectorView::from_slice(lhs, lhs.len());
+    let rhs_vec = DVectorView::from_slice(rhs, rhs.len());
+
+    let dot_product = lhs_vec.dot(&rhs_vec);
+    let lhs_norm = lhs_vec.norm();
+    let rhs_norm = rhs_vec.norm();
+
+    // Only the norms decide degeneracy. A tiny dot product is not degenerate: it is
+    // either orthogonality (which yields 1.0 below anyway) or two small-magnitude
+    // vectors whose direction is still well defined.
+    if lhs_norm < f32::EPSILON || rhs_norm < f32::EPSILON {
+        return 1.0;
+    }
+
+    // Clamp so rounding error cannot push the result outside `[0.0, 2.0]`.
+    let cos_similar = (dot_product / (lhs_norm * rhs_norm)).clamp(-1.0, 1.0);
+    let res = 1.0 - cos_similar;
+    // Snap near-zero results to exactly 0.0 to avoid floating-point artifacts.
+    if res < f32::EPSILON { 0.0 } else { res }
+}
+
+/// Computes the inner (dot) product of two vectors.
+fn dot(lhs: &[f32], rhs: &[f32]) -> f32 {
+    let a = DVectorView::from_slice(lhs, lhs.len());
+    let b = DVectorView::from_slice(rhs, rhs.len());
+    a.dot(&b)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_distance_metric_to_usearch() {
+        assert_eq!(
+            distance_metric_to_usearch(VectorDistanceMetric::L2sq),
+            MetricKind::L2sq
+        );
+        assert_eq!(
+            distance_metric_to_usearch(VectorDistanceMetric::Cosine),
+            MetricKind::Cos
+        );
+        assert_eq!(
+            distance_metric_to_usearch(VectorDistanceMetric::InnerProduct),
+            MetricKind::IP
+        );
+    }
+
+    #[test]
+    fn test_vector_index_options_default() {
+        let options = VectorIndexOptions::default();
+        assert_eq!(options.metric, VectorDistanceMetric::L2sq);
+        assert_eq!(options.connectivity, 16);
+        assert_eq!(options.expansion_add, 128);
+        assert_eq!(options.expansion_search, 64);
+    }
+
+    #[test]
+    fn test_compute_distance_l2sq() {
+        let v1 = vec![1.0, 2.0, 3.0];
+        let v2 = vec![4.0, 5.0, 6.0];
+        // L2sq = (4-1)^2 + (5-2)^2 + (6-3)^2 = 9 + 9 + 9 = 27
+        let dist = compute_distance(&v1, &v2, VectorDistanceMetric::L2sq);
+        assert!((dist - 27.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn test_compute_distance_cosine() {
+        let v1 = vec![1.0, 0.0, 0.0];
+        let v2 = vec![0.0, 1.0, 0.0];
+        // Orthogonal vectors have cosine similarity of 0, distance of 1
+        let dist = compute_distance(&v1, &v2, VectorDistanceMetric::Cosine);
+        assert!((dist - 1.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn test_compute_distance_inner_product() {
+        let v1 = vec![1.0, 2.0, 3.0];
+        let v2 = vec![4.0, 5.0, 6.0];
+        // Inner product = 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
+        // Distance is negated: -32
+        let dist = compute_distance(&v1, &v2, VectorDistanceMetric::InnerProduct);
+        assert!((dist - (-32.0)).abs() < 1e-6);
+    }
+
+    #[test]
+    fn test_compute_distance_empty_vectors() {
+        // Empty vectors should return 0.0 uniformly for all metrics
+        assert_eq!(compute_distance(&[], &[], VectorDistanceMetric::L2sq), 0.0);
+        assert_eq!(
+            compute_distance(&[], &[], VectorDistanceMetric::Cosine),
+            0.0
+        );
+        assert_eq!(
+            compute_distance(&[], &[], VectorDistanceMetric::InnerProduct),
+            0.0
+        );
+    }
+}
--- a/src/log-store/src/kafka/client_manager.rs
+++ b/src/log-store/src/kafka/client_manager.rs
@@ -16,7 +16,7 @@ use std::collections::HashMap;
 use std::sync::Arc;

 use common_wal::config::kafka::DatanodeKafkaConfig;
-use common_wal::config::kafka::common::{DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT};
+use common_wal::config::kafka::common::DEFAULT_BACKOFF_CONFIG;
 use dashmap::DashMap;
 use rskafka::client::ClientBuilder;
 use rskafka::client::partition::{Compression, PartitionClient, UnknownTopicHandling};
@@ -79,7 +79,8 @@ impl ClientManager {
        // Sets backoff config for the top-level kafka client and all clients constructed by it.
        let mut builder = ClientBuilder::new(config.connection.broker_endpoints.clone())
            .backoff_config(DEFAULT_BACKOFF_CONFIG)
-            .connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT));
+            .connect_timeout(Some(config.connection.connect_timeout))
+            .timeout(Some(config.connection.timeout));
        if let Some(sasl) = &config.connection.sasl {
            builder = builder.sasl_config(sasl.config.clone().into_sasl_config());
        };
--- a/src/meta-srv/src/bootstrap.rs
+++ b/src/meta-srv/src/bootstrap.rs
@@ -14,7 +14,6 @@

 use std::net::SocketAddr;
 use std::sync::Arc;
-use std::time::Duration;

 use api::v1::meta::cluster_server::ClusterServer;
 use api::v1::meta::heartbeat_server::HeartbeatServer;
@@ -60,11 +59,6 @@ use crate::service::admin::admin_axum_router;
 use crate::utils::etcd::create_etcd_client_with_tls;
 use crate::{Result, error};

-/// The default keep-alive interval for gRPC.
-const DEFAULT_GRPC_KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(10);
-/// The default keep-alive timeout for gRPC.
-const DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(10);
-
 pub struct MetasrvInstance {
    metasrv: Arc<Metasrv>,

@@ -255,8 +249,8 @@ pub fn router(metasrv: Arc<Metasrv>) -> Router {
        // for admin services
        .accept_http1(true)
        // For quick network failures detection.
-        .http2_keepalive_interval(Some(DEFAULT_GRPC_KEEP_ALIVE_INTERVAL))
-        .http2_keepalive_timeout(Some(DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT));
+        .http2_keepalive_interval(Some(metasrv.options().grpc.http2_keep_alive_interval))
+        .http2_keepalive_timeout(Some(metasrv.options().grpc.http2_keep_alive_timeout));
    let router = add_compressed_service!(router, HeartbeatServer::from_arc(metasrv.clone()));
    let router = add_compressed_service!(router, StoreServer::from_arc(metasrv.clone()));
    let router = add_compressed_service!(router, ClusterServer::from_arc(metasrv.clone()));
@@ -273,8 +267,12 @@ pub async fn metasrv_builder(
        (Some(kv_backend), _) => (kv_backend, None),
        (None, BackendImpl::MemoryStore) => (Arc::new(MemoryKvBackend::new()) as _, None),
        (None, BackendImpl::EtcdStore) => {
-            let etcd_client =
-                create_etcd_client_with_tls(&opts.store_addrs, opts.backend_tls.as_ref()).await?;
+            let etcd_client = create_etcd_client_with_tls(
+                &opts.store_addrs,
+                &opts.backend_client,
+                opts.backend_tls.as_ref(),
+            )
+            .await?;
            let kv_backend = EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops);
            let election = EtcdElection::with_etcd_client(
                &opts.grpc.server_addr,
@@ -291,7 +289,7 @@ pub async fn metasrv_builder(

            use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS;
            use common_meta::kv_backend::rds::PgStore;
-            use deadpool_postgres::Config;
+            use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod};

            use crate::election::rds::postgres::{ElectionPgClient, PgElection};
            use crate::utils::postgres::create_postgres_pool;
@@ -305,9 +303,16 @@ pub async fn metasrv_builder(
            let mut cfg = Config::new();
            cfg.keepalives = Some(true);
            cfg.keepalives_idle = Some(Duration::from_secs(POSTGRES_KEEP_ALIVE_SECS));
-            // We use a separate pool for election since we need a different session keep-alive idle time.
-            let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
-                .await?;
+            cfg.manager = Some(ManagerConfig {
+                recycling_method: RecyclingMethod::Verified,
+            });
+            // Use a dedicated pool for the election client to allow customized session settings.
+            let pool = create_postgres_pool(
+                &opts.store_addrs,
+                Some(cfg.clone()),
+                opts.backend_tls.clone(),
+            )
+            .await?;

            let election_client = ElectionPgClient::new(
                pool,
@@ -327,8 +332,8 @@ pub async fn metasrv_builder(
            )
            .await?;

-            let pool =
-                create_postgres_pool(&opts.store_addrs, None, opts.backend_tls.clone()).await?;
+            let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
+                .await?;
            let kv_backend = PgStore::with_pg_pool(
                pool,
                opts.meta_schema_name.as_deref(),
--- a/src/meta-srv/src/discovery.rs
+++ b/src/meta-srv/src/discovery.rs
@@ -16,13 +16,9 @@ pub mod lease;
 pub mod node_info;
 pub mod utils;

-use std::time::Duration;
-
 use api::v1::meta::heartbeat_request::NodeWorkloads;
 use common_error::ext::BoxedError;
-use common_meta::distributed_time_constants::{
-    DATANODE_LEASE_SECS, FLOWNODE_LEASE_SECS, FRONTEND_HEARTBEAT_INTERVAL_MILLIS,
-};
+use common_meta::distributed_time_constants::default_distributed_time_constants;
 use common_meta::error::Result;
 use common_meta::peer::{Peer, PeerDiscovery, PeerResolver};
 use common_meta::{DatanodeId, FlownodeId};
@@ -38,7 +34,7 @@ impl PeerDiscovery for MetaPeerClient {
        utils::alive_frontends(
            &DefaultSystemTimer,
            self,
-            Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS),
+            default_distributed_time_constants().frontend_heartbeat_interval,
        )
        .await
        .map_err(BoxedError::new)
@@ -52,7 +48,7 @@ impl PeerDiscovery for MetaPeerClient {
        utils::alive_datanodes(
            &DefaultSystemTimer,
            self,
-            Duration::from_secs(DATANODE_LEASE_SECS),
+            default_distributed_time_constants().datanode_lease,
            filter,
        )
        .await
@@ -67,7 +63,7 @@ impl PeerDiscovery for MetaPeerClient {
        utils::alive_flownodes(
            &DefaultSystemTimer,
            self,
-            Duration::from_secs(FLOWNODE_LEASE_SECS),
+            default_distributed_time_constants().flownode_lease,
            filter,
        )
        .await
--- a/src/meta-srv/src/discovery/lease.rs
+++ b/src/meta-srv/src/discovery/lease.rs
@@ -102,7 +102,7 @@ mod tests {
    use api::v1::meta::heartbeat_request::NodeWorkloads;
    use api::v1::meta::{DatanodeWorkloads, FlownodeWorkloads};
    use common_meta::cluster::{FrontendStatus, NodeInfo, NodeInfoKey, NodeStatus, Role};
-    use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS;
+    use common_meta::distributed_time_constants::default_distributed_time_constants;
    use common_meta::kv_backend::ResettableKvBackendRef;
    use common_meta::peer::{Peer, PeerDiscovery};
    use common_meta::rpc::store::PutRequest;
@@ -473,8 +473,10 @@ mod tests {
        let client = create_meta_peer_client();
        let in_memory = client.memory_backend();

+        let frontend_heartbeat_interval =
+            default_distributed_time_constants().frontend_heartbeat_interval;
        let last_activity_ts =
-            current_time_millis() - FRONTEND_HEARTBEAT_INTERVAL_MILLIS as i64 - 1000;
+            current_time_millis() - frontend_heartbeat_interval.as_millis() as i64 - 1000;
        let active_frontend_node = NodeInfo {
            peer: Peer {
                id: 0,
--- a/src/meta-srv/src/failure_detector.rs
+++ b/src/meta-srv/src/failure_detector.rs
@@ -15,7 +15,6 @@
 use std::collections::VecDeque;
 use std::time::Duration;

-use common_meta::distributed_time_constants;
 use serde::{Deserialize, Serialize};

 const FIRST_HEARTBEAT_ESTIMATE_MILLIS: i64 = 1000;
@@ -79,9 +78,7 @@ impl Default for PhiAccrualFailureDetectorOptions {
        Self {
            threshold: 8_f32,
            min_std_deviation: Duration::from_millis(100),
-            acceptable_heartbeat_pause: Duration::from_secs(
-                distributed_time_constants::DATANODE_LEASE_SECS,
-            ),
+            acceptable_heartbeat_pause: Duration::from_secs(10),
        }
    }
 }
--- a/src/meta-srv/src/gc/mock/integration.rs
+++ b/src/meta-srv/src/gc/mock/integration.rs
@@ -135,6 +135,9 @@ async fn test_full_gc_workflow() {
    );
 }

+/// Due to https://github.com/rust-lang/rust/issues/100141, an `Instant` earlier than the process start time cannot be created on non-Linux OSes.
+/// This is fine since in real usage an `Instant` will always be after the process start time.
+#[cfg(target_os = "linux")]
 #[tokio::test]
 async fn test_tracker_cleanup() {
    init_default_ut_logging();
--- a/src/meta-srv/src/handler/region_lease_handler.rs
+++ b/src/meta-srv/src/handler/region_lease_handler.rs
@@ -134,7 +134,7 @@ mod test {
    use std::sync::Arc;

    use common_meta::datanode::{RegionManifestInfo, RegionStat, Stat};
-    use common_meta::distributed_time_constants;
+    use common_meta::distributed_time_constants::default_distributed_time_constants;
    use common_meta::key::TableMetadataManager;
    use common_meta::key::table_route::TableRouteValue;
    use common_meta::key::test_utils::new_test_table_info;
@@ -236,7 +236,7 @@ mod test {
        let opening_region_keeper = Arc::new(MemoryRegionKeeper::default());

        let handler = RegionLeaseHandler::new(
-            distributed_time_constants::REGION_LEASE_SECS,
+            default_distributed_time_constants().region_lease.as_secs(),
            table_metadata_manager.clone(),
            opening_region_keeper.clone(),
            None,
@@ -266,7 +266,7 @@ mod test {

        assert_eq!(
            acc.region_lease.as_ref().unwrap().lease_seconds,
-            distributed_time_constants::REGION_LEASE_SECS
+            default_distributed_time_constants().region_lease.as_secs()
        );

        assert_region_lease(
@@ -300,7 +300,7 @@ mod test {

        assert_eq!(
            acc.region_lease.as_ref().unwrap().lease_seconds,
-            distributed_time_constants::REGION_LEASE_SECS
+            default_distributed_time_constants().region_lease.as_secs()
        );

        assert_region_lease(
@@ -379,7 +379,7 @@ mod test {
        });

        let handler = RegionLeaseHandler::new(
-            distributed_time_constants::REGION_LEASE_SECS,
+            default_distributed_time_constants().region_lease.as_secs(),
            table_metadata_manager.clone(),
            Default::default(),
            None,
@@ -461,7 +461,7 @@ mod test {
            ..Default::default()
        });
        let handler = RegionLeaseHandler::new(
-            distributed_time_constants::REGION_LEASE_SECS,
+            default_distributed_time_constants().region_lease.as_secs(),
            table_metadata_manager.clone(),
            Default::default(),
            None,
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -27,7 +27,7 @@ use common_event_recorder::EventRecorderOptions;
 use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
 use common_meta::cache_invalidator::CacheInvalidatorRef;
 use common_meta::ddl_manager::DdlManagerRef;
-use common_meta::distributed_time_constants;
+use common_meta::distributed_time_constants::{self, default_distributed_time_constants};
 use common_meta::key::TableMetadataManagerRef;
 use common_meta::key::runtime_switch::RuntimeSwitchManagerRef;
 use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef};
@@ -121,6 +121,27 @@ impl Default for StatsPersistenceOptions {
    }
 }

+#[derive(Clone, PartialEq, Serialize, Deserialize, Debug)]
+#[serde(default)]
+pub struct BackendClientOptions {
+    #[serde(with = "humantime_serde")]
+    pub keep_alive_timeout: Duration,
+    #[serde(with = "humantime_serde")]
+    pub keep_alive_interval: Duration,
+    #[serde(with = "humantime_serde")]
+    pub connect_timeout: Duration,
+}
+
+impl Default for BackendClientOptions {
+    fn default() -> Self {
+        Self {
+            keep_alive_interval: Duration::from_secs(10),
+            keep_alive_timeout: Duration::from_secs(3),
+            connect_timeout: Duration::from_secs(3),
+        }
+    }
+}
+
 #[derive(Clone, PartialEq, Serialize, Deserialize)]
 #[serde(default)]
 pub struct MetasrvOptions {
@@ -136,12 +157,22 @@ pub struct MetasrvOptions {
    /// Only applicable when using PostgreSQL or MySQL as the metadata store
    #[serde(default)]
    pub backend_tls: Option<TlsOption>,
+    /// The backend client options.
+    /// Currently, only applicable when using etcd as the metadata store.
+    #[serde(default)]
+    pub backend_client: BackendClientOptions,
    /// The type of selector.
    pub selector: SelectorType,
    /// Whether to use the memory store.
    pub use_memory_store: bool,
    /// Whether to enable region failover.
    pub enable_region_failover: bool,
+    /// The base heartbeat interval.
+    ///
+    /// This value is used to calculate the distributed time constants for components.
+    /// e.g., the region lease time is `heartbeat_interval * 3 + Duration::from_secs(1)`.
+    #[serde(with = "humantime_serde")]
+    pub heartbeat_interval: Duration,
    /// The delay before starting region failure detection.
    /// This delay helps prevent Metasrv from triggering unnecessary region failovers before all Datanodes are fully started.
    /// Especially useful when the cluster is not deployed with GreptimeDB Operator and maintenance mode is not enabled.
@@ -240,7 +271,9 @@ impl fmt::Debug for MetasrvOptions {
            .field("tracing", &self.tracing)
            .field("backend", &self.backend)
            .field("event_recorder", &self.event_recorder)
-            .field("stats_persistence", &self.stats_persistence);
+            .field("stats_persistence", &self.stats_persistence)
+            .field("heartbeat_interval", &self.heartbeat_interval)
+            .field("backend_client", &self.backend_client);

        #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
        debug_struct.field("meta_table_name", &self.meta_table_name);
@@ -270,6 +303,7 @@ impl Default for MetasrvOptions {
            selector: SelectorType::default(),
            use_memory_store: false,
            enable_region_failover: false,
+            heartbeat_interval: distributed_time_constants::BASE_HEARTBEAT_INTERVAL,
            region_failure_detector_initialization_delay: Duration::from_secs(10 * 60),
            allow_region_failover_on_local_wal: false,
            grpc: GrpcOptions {
@@ -307,6 +341,7 @@ impl Default for MetasrvOptions {
            event_recorder: EventRecorderOptions::default(),
            stats_persistence: StatsPersistenceOptions::default(),
            gc: GcSchedulerOptions::default(),
+            backend_client: BackendClientOptions::default(),
        }
    }
 }
@@ -747,7 +782,7 @@ impl Metasrv {
            &DefaultSystemTimer,
            self.meta_peer_client.as_ref(),
            peer_id,
-            Duration::from_secs(distributed_time_constants::DATANODE_LEASE_SECS),
+            default_distributed_time_constants().datanode_lease,
        )
        .await
    }
--- a/src/meta-srv/src/metasrv/builder.rs
+++ b/src/meta-srv/src/metasrv/builder.rs
@@ -29,7 +29,7 @@ use common_meta::ddl::{
    DdlContext, NoopRegionFailureDetectorControl, RegionFailureDetectorControllerRef,
 };
 use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef};
-use common_meta::distributed_time_constants::{self};
+use common_meta::distributed_time_constants::default_distributed_time_constants;
 use common_meta::key::TableMetadataManager;
 use common_meta::key::flow::FlowMetadataManager;
 use common_meta::key::flow::flow_state::FlowStateManager;
@@ -513,7 +513,7 @@ impl MetasrvBuilder {
            Some(handler_group_builder) => handler_group_builder,
            None => {
                let region_lease_handler = RegionLeaseHandler::new(
-                    distributed_time_constants::REGION_LEASE_SECS,
+                    default_distributed_time_constants().region_lease.as_secs(),
                    table_metadata_manager.clone(),
                    memory_region_keeper.clone(),
                    customized_region_lease_renewer,
--- a/src/meta-srv/src/procedure/region_migration.rs
+++ b/src/meta-srv/src/procedure/region_migration.rs
@@ -921,7 +921,7 @@ mod tests {
    use std::assert_matches::assert_matches;
    use std::sync::Arc;

-    use common_meta::distributed_time_constants::REGION_LEASE_SECS;
+    use common_meta::distributed_time_constants::default_distributed_time_constants;
    use common_meta::instruction::Instruction;
    use common_meta::key::test_utils::new_test_table_info;
    use common_meta::rpc::router::{Region, RegionRoute};
@@ -1192,8 +1192,10 @@ mod tests {
            .run_once()
            .await;

+        let region_lease = default_distributed_time_constants().region_lease.as_secs();
+
        // Ensure it didn't run into the slow path.
-        assert!(timer.elapsed().as_secs() < REGION_LEASE_SECS / 2);
+        assert!(timer.elapsed().as_secs() < region_lease / 2);

        runner.suite.verify_table_metadata().await;
    }
@@ -1539,8 +1541,9 @@ mod tests {
            .run_once()
            .await;

+        let region_lease = default_distributed_time_constants().region_lease.as_secs();
        // Ensure it didn't run into the slow path.
-        assert!(timer.elapsed().as_secs() < REGION_LEASE_SECS);
+        assert!(timer.elapsed().as_secs() < region_lease);
        runner.suite.verify_table_metadata().await;
    }
 }
--- a/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs
+++ b/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs
@@ -13,11 +13,10 @@
 // limitations under the License.

 use std::any::Any;
-use std::time::Duration;

 use api::v1::meta::MailboxMessage;
 use common_meta::RegionIdent;
-use common_meta::distributed_time_constants::REGION_LEASE_SECS;
+use common_meta::distributed_time_constants::default_distributed_time_constants;
 use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
 use common_procedure::{Context as ProcedureContext, Status};
 use common_telemetry::{info, warn};
@@ -30,9 +29,6 @@ use crate::procedure::region_migration::migration_end::RegionMigrationEnd;
 use crate::procedure::region_migration::{Context, State};
 use crate::service::mailbox::Channel;

-/// Uses lease time of a region as the timeout of closing a downgraded region.
-const CLOSE_DOWNGRADED_REGION_TIMEOUT: Duration = Duration::from_secs(REGION_LEASE_SECS);
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct CloseDowngradedRegion;

@@ -112,7 +108,7 @@ impl CloseDowngradedRegion {
        let ch = Channel::Datanode(downgrade_leader_datanode.id);
        let receiver = ctx
            .mailbox
-            .send(&ch, msg, CLOSE_DOWNGRADED_REGION_TIMEOUT)
+            .send(&ch, msg, default_distributed_time_constants().region_lease)
            .await?;

        match receiver.await {
--- a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs
+++ b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs
@@ -17,7 +17,7 @@ use std::time::Duration;

 use api::v1::meta::MailboxMessage;
 use common_error::ext::BoxedError;
-use common_meta::distributed_time_constants::REGION_LEASE_SECS;
+use common_meta::distributed_time_constants::default_distributed_time_constants;
 use common_meta::instruction::{
    DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply,
 };
@@ -64,7 +64,7 @@ impl State for DowngradeLeaderRegion {
        let now = Instant::now();
        // Ensures the `leader_region_lease_deadline` must exist after recovering.
        ctx.volatile_ctx
-            .set_leader_region_lease_deadline(Duration::from_secs(REGION_LEASE_SECS));
+            .set_leader_region_lease_deadline(default_distributed_time_constants().region_lease);

        match self.downgrade_region_with_retry(ctx).await {
            Ok(_) => {
@@ -277,14 +277,14 @@ impl DowngradeLeaderRegion {
        if let Some(last_connection_at) = last_connection_at {
            let now = current_time_millis();
            let elapsed = now - last_connection_at;
-            let region_lease = Duration::from_secs(REGION_LEASE_SECS);
+            let region_lease = default_distributed_time_constants().region_lease;

            // It's safe to update the region leader lease deadline here because:
            // 1. The old region leader has already been marked as downgraded in metadata,
            //    which means any attempts to renew its lease will be rejected.
            // 2. The pusher disconnect time record only gets removed when the datanode (from_peer)
            //    establishes a new heartbeat connection stream.
-            if elapsed >= (REGION_LEASE_SECS * 1000) as i64 {
+            if elapsed >= (region_lease.as_secs() * 1000) as i64 {
                ctx.volatile_ctx.reset_leader_region_lease_deadline();
                info!(
                    "Datanode {}({}) has been disconnected for longer than the region lease period ({:?}), reset leader region lease deadline to None, region: {:?}",
@@ -697,7 +697,8 @@ mod tests {
        let procedure_ctx = new_procedure_context();
        let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap();
        let elapsed = timer.elapsed().as_secs();
-        assert!(elapsed < REGION_LEASE_SECS / 2);
+        let region_lease = default_distributed_time_constants().region_lease.as_secs();
+        assert!(elapsed < region_lease / 2);
        assert_eq!(
            ctx.volatile_ctx
                .leader_region_last_entry_ids
--- a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs
+++ b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs
@@ -14,11 +14,10 @@

 use std::any::Any;
 use std::ops::Div;
-use std::time::Duration;

 use api::v1::meta::MailboxMessage;
 use common_meta::RegionIdent;
-use common_meta::distributed_time_constants::REGION_LEASE_SECS;
+use common_meta::distributed_time_constants::default_distributed_time_constants;
 use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
 use common_meta::key::datanode_table::RegionInfo;
 use common_procedure::{Context as ProcedureContext, Status};
@@ -33,9 +32,6 @@ use crate::procedure::region_migration::flush_leader_region::PreFlushRegion;
 use crate::procedure::region_migration::{Context, State};
 use crate::service::mailbox::Channel;

-/// Uses lease time of a region as the timeout of opening a candidate region.
-const OPEN_CANDIDATE_REGION_TIMEOUT: Duration = Duration::from_secs(REGION_LEASE_SECS);
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct OpenCandidateRegion;

@@ -157,7 +153,9 @@ impl OpenCandidateRegion {
                .context(error::ExceededDeadlineSnafu {
                    operation: "Open candidate region",
                })?;
-        let operation_timeout = operation_timeout.div(2).max(OPEN_CANDIDATE_REGION_TIMEOUT);
+        let operation_timeout = operation_timeout
+            .div(2)
+            .max(default_distributed_time_constants().region_lease);
        let ch = Channel::Datanode(candidate.id);
        let now = Instant::now();
        let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?;
--- a/src/meta-srv/src/service/heartbeat.rs
+++ b/src/meta-srv/src/service/heartbeat.rs
@@ -99,6 +99,7 @@ impl heartbeat_server::Heartbeat for Metasrv {
                            error!("Client disconnected: broken pipe");
                            break;
                        }
+                        error!(err; "Sending heartbeat response error");

                        if tx.send(Err(err)).await.is_err() {
                            info!("ReceiverStream was dropped; shutting down");
--- a/src/meta-srv/src/utils/etcd.rs
+++ b/src/meta-srv/src/utils/etcd.rs
@@ -12,17 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use common_meta::distributed_time_constants::default_etcd_client_options;
 use common_meta::kv_backend::etcd::create_etcd_tls_options;
-use etcd_client::Client;
+use etcd_client::{Client, ConnectOptions};
 use servers::tls::{TlsMode, TlsOption};
 use snafu::ResultExt;

 use crate::error::{self, BuildTlsOptionsSnafu, Result};
+use crate::metasrv::BackendClientOptions;

 /// Creates an etcd client with TLS configuration.
 pub async fn create_etcd_client_with_tls(
    store_addrs: &[String],
+    client_options: &BackendClientOptions,
    tls_config: Option<&TlsOption>,
 ) -> Result<Client> {
    let etcd_endpoints = store_addrs
@@ -31,7 +32,12 @@ pub async fn create_etcd_client_with_tls(
        .filter(|x| !x.is_empty())
        .collect::<Vec<_>>();

-    let mut connect_options = default_etcd_client_options();
+    let mut connect_options = ConnectOptions::new()
+        .with_keep_alive_while_idle(true)
+        .with_keep_alive(
+            client_options.keep_alive_interval,
+            client_options.keep_alive_timeout,
+        );
    if let Some(tls_config) = tls_config
        && let Some(tls_options) = create_etcd_tls_options(&convert_tls_option(tls_config))
            .context(BuildTlsOptionsSnafu)?
--- a/src/metric-engine/src/engine.rs
+++ b/src/metric-engine/src/engine.rs
@@ -43,9 +43,10 @@ pub(crate) use state::MetricEngineState;
 use store_api::metadata::RegionMetadataRef;
 use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
 use store_api::region_engine::{
-    BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef,
-    RegionStatistic, RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
-    SetRegionRoleStateSuccess, SettableRegionRoleState, SyncManifestResponse,
+    BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine,
+    RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest,
+    RemapManifestsResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
+    SettableRegionRoleState, SyncManifestResponse,
 };
 use store_api::region_request::{
    BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
@@ -375,6 +376,14 @@ impl RegionEngine for MetricEngine {
        }
    }

+    async fn copy_region_from(
+        &self,
+        _region_id: RegionId,
+        _request: CopyRegionFromRequest,
+    ) -> Result<CopyRegionFromResponse, BoxedError> {
+        todo!()
+    }
+
    async fn set_region_role_state_gracefully(
        &self,
        region_id: RegionId,
--- a/src/mito-codec/src/index.rs
+++ b/src/mito-codec/src/index.rs
@@ -48,7 +48,7 @@ impl IndexValueCodec {
    ) -> Result<()> {
        ensure!(!value.is_null(), IndexEncodeNullSnafu);

-        if field.data_type().is_string() {
+        if field.encode_data_type().is_string() {
            let value = value
                .try_into_string()
                .context(FieldTypeMismatchSnafu)?
--- a/src/mito-codec/src/row_converter/dense.rs
+++ b/src/mito-codec/src/row_converter/dense.rs
@@ -57,15 +57,20 @@ impl SortField {
        &self.data_type
    }

-    pub fn estimated_size(&self) -> usize {
+    /// Returns the physical data type used to encode the field.
+    ///
+    /// For example, a dictionary field will be encoded as its value type.
+    pub fn encode_data_type(&self) -> &ConcreteDataType {
        match &self.data_type {
-            ConcreteDataType::Dictionary(dict_type) => {
-                Self::estimated_size_by_type(dict_type.value_type())
-            }
-            data_type => Self::estimated_size_by_type(data_type),
+            ConcreteDataType::Dictionary(dict_type) => dict_type.value_type(),
+            _ => &self.data_type,
        }
    }

+    pub fn estimated_size(&self) -> usize {
+        Self::estimated_size_by_type(self.encode_data_type())
+    }
+
    fn estimated_size_by_type(data_type: &ConcreteDataType) -> usize {
        match data_type {
            ConcreteDataType::Boolean(_) => 2,
@@ -98,12 +103,7 @@ impl SortField {
        serializer: &mut Serializer<&mut Vec<u8>>,
        value: &ValueRef,
    ) -> Result<()> {
-        match self.data_type() {
-            ConcreteDataType::Dictionary(dict_type) => {
-                Self::serialize_by_type(dict_type.value_type(), serializer, value)
-            }
-            data_type => Self::serialize_by_type(data_type, serializer, value),
-        }
+        Self::serialize_by_type(self.encode_data_type(), serializer, value)
    }

    fn serialize_by_type(
@@ -194,12 +194,7 @@ impl SortField {

    /// Deserialize a value from the deserializer.
    pub fn deserialize<B: Buf>(&self, deserializer: &mut Deserializer<B>) -> Result<Value> {
-        match &self.data_type {
-            ConcreteDataType::Dictionary(dict_type) => {
-                Self::deserialize_by_type(dict_type.value_type(), deserializer)
-            }
-            data_type => Self::deserialize_by_type(data_type, deserializer),
-        }
+        Self::deserialize_by_type(self.encode_data_type(), deserializer)
    }

    fn deserialize_by_type<B: Buf>(
@@ -301,12 +296,7 @@ impl SortField {
            return Ok(1);
        }

-        match &self.data_type {
-            ConcreteDataType::Dictionary(dict_type) => {
-                Self::skip_deserialize_by_type(dict_type.value_type(), bytes, deserializer)
-            }
-            data_type => Self::skip_deserialize_by_type(data_type, bytes, deserializer),
-        }
+        Self::skip_deserialize_by_type(self.encode_data_type(), bytes, deserializer)
    }

    fn skip_deserialize_by_type(
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -30,6 +30,7 @@ common-error.workspace = true
 common-grpc.workspace = true
 common-macro.workspace = true
 common-meta.workspace = true
+common-memory-manager.workspace = true
 common-query.workspace = true
 common-recordbatch.workspace = true
 common-runtime.workspace = true
@@ -48,6 +49,7 @@ dotenv.workspace = true
 either.workspace = true
 futures.workspace = true
 humantime-serde.workspace = true
+humantime.workspace = true
 index.workspace = true
 itertools.workspace = true
 greptime-proto.workspace = true
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -34,6 +34,7 @@ use index::bloom_filter_index::{BloomFilterIndexCache, BloomFilterIndexCacheRef}
 use index::result_cache::IndexResultCache;
 use moka::notification::RemovalCause;
 use moka::sync::Cache;
+use object_store::ObjectStore;
 use parquet::file::metadata::ParquetMetaData;
 use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
 use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector};
@@ -263,6 +264,26 @@ impl CacheStrategy {
            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
        }
    }
+
+    /// Triggers download if the strategy is [CacheStrategy::EnableAll] and write cache is available.
+    pub fn maybe_download_background(
+        &self,
+        index_key: IndexKey,
+        remote_path: String,
+        remote_store: ObjectStore,
+        file_size: u64,
+    ) {
+        if let CacheStrategy::EnableAll(cache_manager) = self
+            && let Some(write_cache) = cache_manager.write_cache()
+        {
+            write_cache.file_cache().maybe_download_background(
+                index_key,
+                remote_path,
+                remote_store,
+                file_size,
+            );
+        }
+    }
 }

 /// Manages cached data for the engine.
--- a/src/mito2/src/cache/file_cache.rs
+++ b/src/mito2/src/cache/file_cache.rs
@@ -31,7 +31,7 @@ use object_store::{ErrorKind, ObjectStore, Reader};
 use parquet::file::metadata::ParquetMetaData;
 use snafu::ResultExt;
 use store_api::storage::{FileId, RegionId};
-use tokio::sync::mpsc::UnboundedReceiver;
+use tokio::sync::mpsc::{Sender, UnboundedReceiver};

 use crate::access_layer::TempFileCleaner;
 use crate::cache::{FILE_TYPE, INDEX_TYPE};
@@ -55,6 +55,17 @@ pub(crate) const DEFAULT_INDEX_CACHE_PERCENT: u8 = 20;
 /// Minimum capacity for each cache (512MB).
 const MIN_CACHE_CAPACITY: u64 = 512 * 1024 * 1024;

+/// Channel capacity for background download tasks.
+const DOWNLOAD_TASK_CHANNEL_SIZE: usize = 64;
+
+/// A task to download a file in the background.
+struct DownloadTask {
+    index_key: IndexKey,
+    remote_path: String,
+    remote_store: ObjectStore,
+    file_size: u64,
+}
+
 /// Inner struct for FileCache that can be used in spawned tasks.
 #[derive(Debug)]
 struct FileCacheInner {
@@ -170,8 +181,8 @@ impl FileCacheInner {
        remote_path: &str,
        remote_store: &ObjectStore,
        file_size: u64,
+        concurrency: usize,
    ) -> Result<()> {
-        const DOWNLOAD_READER_CONCURRENCY: usize = 8;
        const DOWNLOAD_READER_CHUNK_SIZE: ReadableSize = ReadableSize::mb(8);

        let file_type = index_key.file_type;
@@ -184,7 +195,7 @@ impl FileCacheInner {

        let reader = remote_store
            .reader_with(remote_path)
-            .concurrent(DOWNLOAD_READER_CONCURRENCY)
+            .concurrent(concurrency)
            .chunk(DOWNLOAD_READER_CHUNK_SIZE.as_bytes() as usize)
            .await
            .context(error::OpenDalSnafu)?
@@ -238,11 +249,14 @@ impl FileCacheInner {
        remote_path: &str,
        remote_store: &ObjectStore,
        file_size: u64,
+        concurrency: usize,
    ) -> Result<()> {
        if let Err(e) = self
-            .download_without_cleaning(index_key, remote_path, remote_store, file_size)
+            .download_without_cleaning(index_key, remote_path, remote_store, file_size, concurrency)
            .await
        {
+            error!(e; "Failed to download file '{}' for region {}", remote_path, index_key.region_id);
+
            let filename = index_key.to_string();
            TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await;

@@ -251,6 +265,11 @@ impl FileCacheInner {

        Ok(())
    }
+
+    /// Checks if the key is in the file cache.
+    fn contains_key(&self, key: &IndexKey) -> bool {
+        self.memory_index(key.file_type).contains_key(key)
+    }
 }

 /// A file cache manages files on local store and evict files based
@@ -261,6 +280,8 @@ pub(crate) struct FileCache {
    inner: Arc<FileCacheInner>,
    /// Capacity of the puffin (index) cache in bytes.
    puffin_capacity: u64,
+    /// Channel for background download tasks. None if background worker is disabled.
+    download_task_tx: Option<Sender<DownloadTask>>,
 }

 pub(crate) type FileCacheRef = Arc<FileCache>;
@@ -272,6 +293,7 @@ impl FileCache {
        capacity: ReadableSize,
        ttl: Option<Duration>,
        index_cache_percent: Option<u8>,
+        enable_background_worker: bool,
    ) -> FileCache {
        // Validate and use the provided percent or default
        let index_percent = index_cache_percent
@@ -306,12 +328,54 @@ impl FileCache {
            puffin_index,
        });

+        // Only create channel and spawn worker if background download is enabled
+        let download_task_tx = if enable_background_worker {
+            let (tx, rx) = tokio::sync::mpsc::channel(DOWNLOAD_TASK_CHANNEL_SIZE);
+            Self::spawn_download_worker(inner.clone(), rx);
+            Some(tx)
+        } else {
+            None
+        };
+
        FileCache {
            inner,
            puffin_capacity,
+            download_task_tx,
        }
    }

+    /// Spawns a background worker to process download tasks.
+    fn spawn_download_worker(
+        inner: Arc<FileCacheInner>,
+        mut download_task_rx: tokio::sync::mpsc::Receiver<DownloadTask>,
+    ) {
+        tokio::spawn(async move {
+            info!("Background download worker started");
+            while let Some(task) = download_task_rx.recv().await {
+                // Check if the file is already in the cache
+                if inner.contains_key(&task.index_key) {
+                    debug!(
+                        "Skipping background download for region {}, file {} - already in cache",
+                        task.index_key.region_id, task.index_key.file_id
+                    );
+                    continue;
+                }
+
+                // Ignores background download errors.
+                let _ = inner
+                    .download(
+                        task.index_key,
+                        &task.remote_path,
+                        &task.remote_store,
+                        task.file_size,
+                        1, // Background downloads use concurrency=1
+                    )
+                    .await;
+            }
+            info!("Background download worker stopped");
+        });
+    }
+
    /// Builds a cache for a specific file type.
    fn build_cache(
        local_store: ObjectStore,
@@ -333,11 +397,9 @@ impl FileCache {
                let file_path = cache_file_path(FILE_DIR, *key);
                async move {
                    if let RemovalCause::Replaced = cause {
-                        // The cache is replaced by another file. This is unexpected, we don't remove the same
+                        // The cache entry is replaced by another file (e.g. downloaded again). We don't remove the same
                        // file but updates the metrics as the file is already replaced by users.
                        CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
-                        // TODO(yingwen): Don't log warn later.
-                        warn!("Replace existing cache {} for region {} unexpectedly", file_path, key.region_id);
                        return;
                    }

@@ -553,7 +615,7 @@ impl FileCache {

    /// Checks if the key is in the file cache.
    pub(crate) fn contains_key(&self, key: &IndexKey) -> bool {
-        self.inner.memory_index(key.file_type).contains_key(key)
+        self.inner.contains_key(key)
    }

    /// Returns the capacity of the puffin (index) cache in bytes.
@@ -576,9 +638,42 @@ impl FileCache {
        file_size: u64,
    ) -> Result<()> {
        self.inner
-            .download(index_key, remote_path, remote_store, file_size)
+            .download(index_key, remote_path, remote_store, file_size, 8) // Foreground uses concurrency=8
            .await
    }
+
+    /// Downloads a file in `remote_path` from the remote object store to the local cache
+    /// (specified by `index_key`) in the background. Errors are logged but not returned.
+    ///
+    /// This method attempts to send a download task to the background worker.
+    /// If the task cannot be queued (e.g. the channel is full), it is dropped with only a debug log.
+    pub(crate) fn maybe_download_background(
+        &self,
+        index_key: IndexKey,
+        remote_path: String,
+        remote_store: ObjectStore,
+        file_size: u64,
+    ) {
+        // Do nothing if background worker is disabled (channel is None)
+        let Some(tx) = &self.download_task_tx else {
+            return;
+        };
+
+        let task = DownloadTask {
+            index_key,
+            remote_path,
+            remote_store,
+            file_size,
+        };
+
+        // Try to send the task; if the channel is full, just drop it
+        if let Err(e) = tx.try_send(task) {
+            debug!(
+                "Failed to queue background download task for region {}, file {}: {:?}",
+                index_key.region_id, index_key.file_id, e
+            );
+        }
+    }
 }

 /// Key of file cache index.
@@ -708,6 +803,7 @@ mod tests {
            ReadableSize::mb(10),
            Some(Duration::from_millis(10)),
            None,
+            true, // enable_background_worker
        );
        let region_id = RegionId::new(2000, 0);
        let file_id = FileId::random();
@@ -744,7 +840,13 @@ mod tests {
        let dir = create_temp_dir("");
        let local_store = new_fs_store(dir.path().to_str().unwrap());

-        let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None);
+        let cache = FileCache::new(
+            local_store.clone(),
+            ReadableSize::mb(10),
+            None,
+            None,
+            true, // enable_background_worker
+        );
        let region_id = RegionId::new(2000, 0);
        let file_id = FileId::random();
        let key = IndexKey::new(region_id, file_id, FileType::Parquet);
@@ -792,7 +894,13 @@ mod tests {
        let dir = create_temp_dir("");
        let local_store = new_fs_store(dir.path().to_str().unwrap());

-        let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None);
+        let cache = FileCache::new(
+            local_store.clone(),
+            ReadableSize::mb(10),
+            None,
+            None,
+            true, // enable_background_worker
+        );
        let region_id = RegionId::new(2000, 0);
        let file_id = FileId::random();
        let key = IndexKey::new(region_id, file_id, FileType::Parquet);
@@ -824,7 +932,13 @@ mod tests {
    async fn test_file_cache_recover() {
        let dir = create_temp_dir("");
        let local_store = new_fs_store(dir.path().to_str().unwrap());
-        let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None);
+        let cache = FileCache::new(
+            local_store.clone(),
+            ReadableSize::mb(10),
+            None,
+            None,
+            true, // enable_background_worker
+        );

        let region_id = RegionId::new(2000, 0);
        let file_type = FileType::Parquet;
@@ -850,7 +964,13 @@ mod tests {
        }

        // Recover the cache.
-        let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None);
+        let cache = FileCache::new(
+            local_store.clone(),
+            ReadableSize::mb(10),
+            None,
+            None,
+            true, // enable_background_worker
+        );
        // No entry before recovery.
        assert!(
            cache
@@ -879,7 +999,13 @@ mod tests {
    async fn test_file_cache_read_ranges() {
        let dir = create_temp_dir("");
        let local_store = new_fs_store(dir.path().to_str().unwrap());
-        let file_cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None);
+        let file_cache = FileCache::new(
+            local_store.clone(),
+            ReadableSize::mb(10),
+            None,
+            None,
+            true, // enable_background_worker
+        );
        let region_id = RegionId::new(2000, 0);
        let file_id = FileId::random();
        let key = IndexKey::new(region_id, file_id, FileType::Parquet);
--- a/src/mito2/src/cache/manifest_cache.rs
+++ b/src/mito2/src/cache/manifest_cache.rs
@@ -370,7 +370,22 @@ impl ManifestCache {
    /// If `check_mtime` is true, only removes directories that have not been modified
    /// for at least 1 hour.
    fn clean_empty_dirs_sync(dir: &PathBuf, check_mtime: bool) -> std::io::Result<()> {
-        Self::remove_empty_dirs_recursive_sync(dir, check_mtime)?;
+        let is_empty = Self::remove_empty_dirs_recursive_sync(dir, check_mtime)?;
+        if is_empty {
+            if let Err(e) = std::fs::remove_dir(dir) {
+                if e.kind() != std::io::ErrorKind::NotFound {
+                    warn!(e; "Failed to remove empty root dir {}", dir.display());
+                    return Err(e);
+                } else {
+                    warn!("Empty root dir not found before removal {}", dir.display());
+                }
+            } else {
+                info!(
+                    "Removed empty root dir {} from manifest cache",
+                    dir.display()
+                );
+            }
+        }
        Ok(())
    }

@@ -412,11 +427,16 @@ impl ManifestCache {

                let subdir_empty = Self::remove_empty_dirs_recursive_sync(&path, check_mtime)?;
                if subdir_empty {
-                    if let Err(e) = std::fs::remove_dir(&path)
-                        && e.kind() != std::io::ErrorKind::NotFound
-                    {
-                        warn!(e; "Failed to remove empty directory {}", path.display());
-                        is_empty = false;
+                    if let Err(e) = std::fs::remove_dir(&path) {
+                        if e.kind() != std::io::ErrorKind::NotFound {
+                            warn!(e; "Failed to remove empty directory {}", path.display());
+                            is_empty = false;
+                        } else {
+                            info!(
+                                "Empty directory {} not found before removal",
+                                path.display()
+                            );
+                        }
                    } else {
                        info!(
                            "Removed empty directory {} from manifest cache",
@@ -571,4 +591,116 @@ mod tests {
            cache.cache_file_path("region_1/manifest/00000000000000000007.checkpoint")
        );
    }
+
+    #[tokio::test]
+    async fn test_clean_empty_dirs_sync_no_mtime_check() {
+        common_telemetry::init_default_ut_logging();
+
+        let dir = create_temp_dir("");
+        let root = PathBuf::from(dir.path());
+
+        // Create a directory structure:
+        // root/
+        //   empty_dir1/
+        //   empty_dir2/
+        //     empty_subdir/
+        //   non_empty_dir/
+        //     file.txt
+        //   nested/
+        //     empty_subdir1/
+        //     non_empty_subdir/
+        //       file.txt
+
+        let empty_dir1 = root.join("empty_dir1");
+        let empty_dir2 = root.join("empty_dir2");
+        let empty_subdir = empty_dir2.join("empty_subdir");
+        let non_empty_dir = root.join("non_empty_dir");
+        let nested = root.join("nested");
+        let nested_empty = nested.join("empty_subdir1");
+        let nested_non_empty = nested.join("non_empty_subdir");
+
+        // Create directories
+        std::fs::create_dir_all(&empty_dir1).unwrap();
+        std::fs::create_dir_all(&empty_subdir).unwrap();
+        std::fs::create_dir_all(&non_empty_dir).unwrap();
+        std::fs::create_dir_all(&nested_empty).unwrap();
+        std::fs::create_dir_all(&nested_non_empty).unwrap();
+
+        // Create files in non-empty directories
+        std::fs::write(non_empty_dir.join("file.txt"), b"content").unwrap();
+        std::fs::write(nested_non_empty.join("file.txt"), b"content").unwrap();
+
+        // Verify initial state
+        assert!(empty_dir1.exists());
+        assert!(empty_dir2.exists());
+        assert!(empty_subdir.exists());
+        assert!(non_empty_dir.exists());
+        assert!(nested.exists());
+        assert!(nested_empty.exists());
+        assert!(nested_non_empty.exists());
+
+        // Clean empty directories with check_mtime = false
+        ManifestCache::clean_empty_dirs_sync(&root, false).unwrap();
+
+        // Verify empty directories are removed
+        assert!(!empty_dir1.exists());
+        assert!(!empty_dir2.exists());
+        assert!(!empty_subdir.exists());
+        assert!(!nested_empty.exists());
+
+        // Verify non-empty directories still exist
+        assert!(non_empty_dir.exists());
+        assert!(non_empty_dir.join("file.txt").exists());
+        assert!(nested.exists());
+        assert!(nested_non_empty.exists());
+        assert!(nested_non_empty.join("file.txt").exists());
+    }
+
+    #[tokio::test]
+    async fn test_clean_empty_dirs_sync_with_mtime_check() {
+        common_telemetry::init_default_ut_logging();
+
+        let dir = create_temp_dir("");
+        let root = PathBuf::from(dir.path());
+
+        // Create a directory structure with recently created empty directories
+        // root/
+        //   empty_dir1/
+        //   empty_dir2/
+        //     empty_subdir/
+        //   non_empty_dir/
+        //     file.txt
+
+        let empty_dir1 = root.join("empty_dir1");
+        let empty_dir2 = root.join("empty_dir2");
+        let empty_subdir = empty_dir2.join("empty_subdir");
+        let non_empty_dir = root.join("non_empty_dir");
+
+        // Create directories
+        std::fs::create_dir_all(&empty_dir1).unwrap();
+        std::fs::create_dir_all(&empty_subdir).unwrap();
+        std::fs::create_dir_all(&non_empty_dir).unwrap();
+
+        // Create file in non-empty directory
+        std::fs::write(non_empty_dir.join("file.txt"), b"content").unwrap();
+
+        // Verify initial state
+        assert!(empty_dir1.exists());
+        assert!(empty_dir2.exists());
+        assert!(empty_subdir.exists());
+        assert!(non_empty_dir.exists());
+
+        // Clean empty directories with check_mtime = true
+        // Since the directories were just created (mtime < 1 hour), they should NOT be removed
+        ManifestCache::clean_empty_dirs_sync(&root, true).unwrap();
+
+        // Verify empty directories are NOT removed (they're too recent)
+        assert!(empty_dir1.exists());
+        assert!(empty_dir2.exists());
+        assert!(empty_subdir.exists());
+
+        // Verify non-empty directory still exists
+        assert!(non_empty_dir.exists());
+        assert!(non_empty_dir.join("file.txt").exists());
+    }
 }
--- a/src/mito2/src/cache/write_cache.rs
+++ b/src/mito2/src/cache/write_cache.rs
@@ -63,11 +63,13 @@ pub type WriteCacheRef = Arc<WriteCache>;
 impl WriteCache {
    /// Create the cache with a `local_store` to cache files and a
    /// `object_store_manager` for all object stores.
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        local_store: ObjectStore,
        cache_capacity: ReadableSize,
        ttl: Option<Duration>,
        index_cache_percent: Option<u8>,
+        enable_background_worker: bool,
        puffin_manager_factory: PuffinManagerFactory,
        intermediate_manager: IntermediateManager,
        manifest_cache: Option<ManifestCache>,
@@ -79,6 +81,7 @@ impl WriteCache {
            cache_capacity,
            ttl,
            index_cache_percent,
+            enable_background_worker,
        ));
        file_cache.recover(false, Some(task_receiver)).await;

@@ -92,11 +95,13 @@ impl WriteCache {
    }

    /// Creates a write cache based on local fs.
+    #[allow(clippy::too_many_arguments)]
    pub async fn new_fs(
        cache_dir: &str,
        cache_capacity: ReadableSize,
        ttl: Option<Duration>,
        index_cache_percent: Option<u8>,
+        enable_background_worker: bool,
        puffin_manager_factory: PuffinManagerFactory,
        intermediate_manager: IntermediateManager,
        manifest_cache_capacity: ReadableSize,
@@ -117,6 +122,7 @@ impl WriteCache {
            cache_capacity,
            ttl,
            index_cache_percent,
+            enable_background_worker,
            puffin_manager_factory,
            intermediate_manager,
            manifest_cache,
--- a/src/mito2/src/compaction.rs
+++ b/src/mito2/src/compaction.rs
@@ -14,6 +14,7 @@

 mod buckets;
 pub mod compactor;
+pub mod memory_manager;
 pub mod picker;
 pub mod run;
 mod task;
@@ -29,6 +30,7 @@ use std::time::Instant;
 use api::v1::region::compact_request;
 use api::v1::region::compact_request::Options;
 use common_base::Plugins;
+use common_memory_manager::OnExhaustedPolicy;
 use common_meta::key::SchemaMetadataManagerRef;
 use common_telemetry::{debug, error, info, warn};
 use common_time::range::TimestampRange;
@@ -46,7 +48,8 @@ use tokio::sync::mpsc::{self, Sender};
 use crate::access_layer::AccessLayerRef;
 use crate::cache::{CacheManagerRef, CacheStrategy};
 use crate::compaction::compactor::{CompactionRegion, CompactionVersion, DefaultCompactor};
-use crate::compaction::picker::{CompactionTask, new_picker};
+use crate::compaction::memory_manager::CompactionMemoryManager;
+use crate::compaction::picker::{CompactionTask, PickerOutput, new_picker};
 use crate::compaction::task::CompactionTaskImpl;
 use crate::config::MitoConfig;
 use crate::error::{
@@ -104,12 +107,15 @@ pub(crate) struct CompactionScheduler {
    request_sender: Sender<WorkerRequestWithTime>,
    cache_manager: CacheManagerRef,
    engine_config: Arc<MitoConfig>,
+    memory_manager: Arc<CompactionMemoryManager>,
+    memory_policy: OnExhaustedPolicy,
    listener: WorkerListener,
    /// Plugins for the compaction scheduler.
    plugins: Plugins,
 }

 impl CompactionScheduler {
+    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        scheduler: SchedulerRef,
        request_sender: Sender<WorkerRequestWithTime>,
@@ -117,6 +123,8 @@ impl CompactionScheduler {
        engine_config: Arc<MitoConfig>,
        listener: WorkerListener,
        plugins: Plugins,
+        memory_manager: Arc<CompactionMemoryManager>,
+        memory_policy: OnExhaustedPolicy,
    ) -> Self {
        Self {
            scheduler,
@@ -124,6 +132,8 @@ impl CompactionScheduler {
            request_sender,
            cache_manager,
            engine_config,
+            memory_manager,
+            memory_policy,
            listener,
            plugins,
        }
@@ -429,7 +439,8 @@ impl CompactionScheduler {
        };

        // Create a local compaction task.
-        let mut local_compaction_task = Box::new(CompactionTaskImpl {
+        let estimated_bytes = estimate_compaction_bytes(&picker_output);
+        let local_compaction_task = Box::new(CompactionTaskImpl {
            request_sender,
            waiters,
            start_time,
@@ -437,18 +448,27 @@ impl CompactionScheduler {
            picker_output,
            compaction_region,
            compactor: Arc::new(DefaultCompactor {}),
+            memory_manager: self.memory_manager.clone(),
+            memory_policy: self.memory_policy,
+            estimated_memory_bytes: estimated_bytes,
        });

-        // Submit the compaction task.
+        self.submit_compaction_task(local_compaction_task, region_id)
+    }
+
+    fn submit_compaction_task(
+        &mut self,
+        mut task: Box<CompactionTaskImpl>,
+        region_id: RegionId,
+    ) -> Result<()> {
        self.scheduler
            .schedule(Box::pin(async move {
                INFLIGHT_COMPACTION_COUNT.inc();
-                local_compaction_task.run().await;
+                task.run().await;
                INFLIGHT_COMPACTION_COUNT.dec();
            }))
            .map_err(|e| {
                error!(e; "Failed to submit compaction request for region {}", region_id);
-                // If failed to submit the job, we need to remove the region from the scheduler.
                self.region_status.remove(&region_id);
                e
            })
@@ -758,6 +778,20 @@ fn get_expired_ssts(
        .collect()
 }

+/// Estimates compaction memory as the sum of all input files' maximum row-group
+/// uncompressed sizes.
+fn estimate_compaction_bytes(picker_output: &PickerOutput) -> u64 {
+    picker_output
+        .outputs
+        .iter()
+        .flat_map(|output| output.inputs.iter())
+        .map(|file: &FileHandle| {
+            let meta = file.meta_ref();
+            meta.max_row_group_uncompressed_size
+        })
+        .sum()
+}
+
 /// Pending compaction request that is supposed to run after current task is finished,
 /// typically used for manual compactions.
 struct PendingCompaction {
@@ -773,9 +807,10 @@ struct PendingCompaction {
 mod tests {
    use api::v1::region::StrictWindow;
    use common_datasource::compression::CompressionType;
-    use tokio::sync::oneshot;
+    use tokio::sync::{Barrier, oneshot};

    use super::*;
+    use crate::compaction::memory_manager::{CompactionMemoryGuard, new_compaction_memory_manager};
    use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
    use crate::region::ManifestContext;
    use crate::sst::FormatType;
@@ -1145,4 +1180,39 @@ mod tests {
        assert_eq!(result.unwrap(), 0); // is there a better way to check this?
        assert_eq!(0, scheduler.region_status.len());
    }
+
+    #[tokio::test]
+    async fn test_concurrent_memory_competition() {
+        let manager = Arc::new(new_compaction_memory_manager(3 * 1024 * 1024)); // 3MB
+        let barrier = Arc::new(Barrier::new(3));
+        let mut handles = vec![];
+
+        // Spawn 3 tasks competing for memory, each trying to acquire 2MB
+        for _i in 0..3 {
+            let mgr = manager.clone();
+            let bar = barrier.clone();
+            let handle = tokio::spawn(async move {
+                bar.wait().await; // Synchronize start
+                mgr.try_acquire(2 * 1024 * 1024)
+            });
+            handles.push(handle);
+        }
+
+        let results: Vec<Option<CompactionMemoryGuard>> = futures::future::join_all(handles)
+            .await
+            .into_iter()
+            .map(|r| r.unwrap())
+            .collect();
+
+        // Only 1 should succeed (3MB limit, 2MB request, can only fit one)
+        let succeeded = results.iter().filter(|r| r.is_some()).count();
+        let failed = results.iter().filter(|r| r.is_none()).count();
+
+        assert_eq!(succeeded, 1, "Expected exactly 1 task to acquire memory");
+        assert_eq!(failed, 2, "Expected 2 tasks to fail");
+
+        // Clean up
+        drop(results);
+        assert_eq!(manager.used_bytes(), 0);
+    }
 }
--- a/src/mito2/src/compaction/compactor.rs
+++ b/src/mito2/src/compaction/compactor.rs
@@ -396,6 +396,7 @@ impl DefaultCompactor {
                time_range: sst_info.time_range,
                level: output.output_level,
                file_size: sst_info.file_size,
+                max_row_group_uncompressed_size: sst_info.max_row_group_uncompressed_size,
                available_indexes: sst_info.index_metadata.build_available_indexes(),
                indexes: sst_info.index_metadata.build_indexes(),
                index_file_size: sst_info.index_metadata.file_size,
--- a/src/mito2/src/compaction/memory_manager.rs
+++ b/src/mito2/src/compaction/memory_manager.rs
@@ -0,0 +1,50 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use common_memory_manager::{MemoryGuard, MemoryManager, MemoryMetrics};
+
+use crate::metrics::{
+    COMPACTION_MEMORY_IN_USE, COMPACTION_MEMORY_LIMIT, COMPACTION_MEMORY_REJECTED,
+};
+
+/// [`MemoryMetrics`] implementation that reports into the compaction metrics
+/// (`COMPACTION_MEMORY_LIMIT`, `COMPACTION_MEMORY_IN_USE`, `COMPACTION_MEMORY_REJECTED`).
+#[derive(Clone, Copy, Debug, Default)]
+pub struct CompactionMemoryMetrics;
+
+impl MemoryMetrics for CompactionMemoryMetrics {
+    /// Stores `bytes` in `COMPACTION_MEMORY_LIMIT`.
+    fn set_limit(&self, bytes: i64) {
+        COMPACTION_MEMORY_LIMIT.set(bytes);
+    }
+
+    /// Stores `bytes` in `COMPACTION_MEMORY_IN_USE`.
+    fn set_in_use(&self, bytes: i64) {
+        COMPACTION_MEMORY_IN_USE.set(bytes);
+    }
+
+    /// Increments `COMPACTION_MEMORY_REJECTED` for the series labelled with `reason`.
+    fn inc_rejected(&self, reason: &str) {
+        let labels = [reason];
+        let rejected = COMPACTION_MEMORY_REJECTED.with_label_values(&labels);
+        rejected.inc();
+    }
+}
+
+/// Compaction memory manager.
+pub type CompactionMemoryManager = MemoryManager<CompactionMemoryMetrics>;
+
+/// Compaction memory guard.
+pub type CompactionMemoryGuard = MemoryGuard<CompactionMemoryMetrics>;
+
+/// Builds a [`CompactionMemoryManager`] with `limit_bytes` as its limit, supplying
+/// [`CompactionMemoryMetrics`] so callers do not have to pass the metrics themselves.
+pub fn new_compaction_memory_manager(limit_bytes: u64) -> CompactionMemoryManager {
+    let metrics = CompactionMemoryMetrics;
+    MemoryManager::new(limit_bytes, metrics)
+}
--- a/src/mito2/src/compaction/task.rs
+++ b/src/mito2/src/compaction/task.rs
@@ -16,16 +16,18 @@ use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 use std::time::Instant;

+use common_memory_manager::OnExhaustedPolicy;
 use common_telemetry::{error, info, warn};
 use itertools::Itertools;
 use snafu::ResultExt;
 use tokio::sync::mpsc;

 use crate::compaction::compactor::{CompactionRegion, Compactor};
+use crate::compaction::memory_manager::{CompactionMemoryGuard, CompactionMemoryManager};
 use crate::compaction::picker::{CompactionTask, PickerOutput};
-use crate::error::CompactRegionSnafu;
+use crate::error::{CompactRegionSnafu, CompactionMemoryExhaustedSnafu};
 use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
-use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED};
+use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_MEMORY_WAIT, COMPACTION_STAGE_ELAPSED};
 use crate::region::RegionRoleState;
 use crate::request::{
    BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, RegionEditResult,
@@ -52,6 +54,12 @@ pub(crate) struct CompactionTaskImpl {
    pub(crate) compactor: Arc<dyn Compactor>,
    /// Output of the picker.
    pub(crate) picker_output: PickerOutput,
+    /// Memory manager to acquire memory budget.
+    pub(crate) memory_manager: Arc<CompactionMemoryManager>,
+    /// Policy when memory is exhausted.
+    pub(crate) memory_policy: OnExhaustedPolicy,
+    /// Estimated memory bytes needed for this compaction.
+    pub(crate) estimated_memory_bytes: u64,
 }

 impl Debug for CompactionTaskImpl {
@@ -81,6 +89,24 @@ impl CompactionTaskImpl {
            .for_each(|o| o.inputs.iter().for_each(|f| f.set_compacting(compacting)));
    }

+    /// Acquires memory budget based on the configured policy.
+    ///
+    /// Requests `estimated_memory_bytes` from the memory manager under `memory_policy`.
+    /// A `COMPACTION_MEMORY_WAIT` timer is started before the request and lives until
+    /// this function returns.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `CompactionMemoryExhausted` error carrying the region id and the
+    /// debug-formatted policy if the memory manager fails the request.
+    async fn acquire_memory_with_policy(&self) -> error::Result<CompactionMemoryGuard> {
+        let policy = self.memory_policy;
+
+        // Kept alive for the whole acquisition; presumably records the elapsed time
+        // when dropped -- confirm against the metric type.
+        let _timer = COMPACTION_MEMORY_WAIT.start_timer();
+        self.memory_manager
+            .acquire_with_policy(self.estimated_memory_bytes, policy)
+            .await
+            // Build the context lazily: the formatted policy string is only needed on
+            // failure, so the success path no longer allocates it.
+            .with_context(|_| CompactionMemoryExhaustedSnafu {
+                region_id: self.compaction_region.region_id,
+                policy: format!("{policy:?}"),
+            })
+    }
+
    /// Remove expired ssts files, update manifest immediately
    /// and apply the edit to region version.
    ///
@@ -222,7 +248,7 @@ impl CompactionTaskImpl {
    }

    /// Handles compaction failure, notifies all waiters.
-    fn on_failure(&mut self, err: Arc<error::Error>) {
+    pub(crate) fn on_failure(&mut self, err: Arc<error::Error>) {
        COMPACTION_FAILURE_COUNT.inc();
        for waiter in self.waiters.drain(..) {
            waiter.send(Err(err.clone()).context(CompactRegionSnafu {
@@ -249,6 +275,26 @@ impl CompactionTaskImpl {
 #[async_trait::async_trait]
 impl CompactionTask for CompactionTaskImpl {
    async fn run(&mut self) {
+        // Acquire memory budget before starting compaction
+        let _memory_guard = match self.acquire_memory_with_policy().await {
+            Ok(guard) => guard,
+            Err(e) => {
+                error!(e; "Failed to acquire memory for compaction, region id: {}", self.compaction_region.region_id);
+                let err = Arc::new(e);
+                self.on_failure(err.clone());
+                let notify = BackgroundNotify::CompactionFailed(CompactionFailed {
+                    region_id: self.compaction_region.region_id,
+                    err,
+                });
+                self.send_to_worker(WorkerRequest::Background {
+                    region_id: self.compaction_region.region_id,
+                    notify,
+                })
+                .await;
+                return;
+            }
+        };
+
        let notify = match self.handle_expiration_and_compaction().await {
            Ok(edit) => BackgroundNotify::CompactionFinished(CompactionFinished {
                region_id: self.compaction_region.region_id,
--- a/src/mito2/src/compaction/test_util.rs
+++ b/src/mito2/src/compaction/test_util.rs
@@ -74,6 +74,7 @@ pub fn new_file_handle_with_size_and_sequence(
            ),
            level,
            file_size,
+            max_row_group_uncompressed_size: file_size,
            available_indexes: Default::default(),
            indexes: Default::default(),
            index_file_size: 0,
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -20,6 +20,7 @@ use std::time::Duration;

 use common_base::memory_limit::MemoryLimit;
 use common_base::readable_size::ReadableSize;
+use common_memory_manager::OnExhaustedPolicy;
 use common_stat::{get_total_cpu_cores, get_total_memory_readable};
 use common_telemetry::warn;
 use serde::{Deserialize, Serialize};
@@ -92,6 +93,10 @@ pub struct MitoConfig {
    pub max_background_compactions: usize,
    /// Max number of running background purge jobs (default: number of cpu cores).
    pub max_background_purges: usize,
+    /// Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit.
+    pub experimental_compaction_memory_limit: MemoryLimit,
+    /// Behavior when compaction cannot acquire memory from the budget.
+    pub experimental_compaction_on_exhausted: OnExhaustedPolicy,

    // Flush configs:
    /// Interval to auto flush a region if it has not flushed yet (default 30 min).
@@ -126,6 +131,11 @@ pub struct MitoConfig {
    /// The remaining capacity is used for data (parquet) files.
    /// Must be between 0 and 100 (exclusive).
    pub index_cache_percent: u8,
+    /// Enable background downloading of files to the local cache when accessed during queries (default: true).
+    /// When enabled, files will be asynchronously downloaded to improve performance for subsequent reads.
+    pub enable_refill_cache_on_read: bool,
+    /// Capacity for manifest cache (default: 256MB).
+    pub manifest_cache_size: ReadableSize,

    // Other configs:
    /// Buffer size for SST writing.
@@ -178,6 +188,8 @@ impl Default for MitoConfig {
            max_background_flushes: divide_num_cpus(2),
            max_background_compactions: divide_num_cpus(4),
            max_background_purges: get_total_cpu_cores(),
+            experimental_compaction_memory_limit: MemoryLimit::Unlimited,
+            experimental_compaction_on_exhausted: OnExhaustedPolicy::default(),
            auto_flush_interval: Duration::from_secs(30 * 60),
            global_write_buffer_size: ReadableSize::gb(1),
            global_write_buffer_reject_size: ReadableSize::gb(2),
@@ -191,6 +203,8 @@ impl Default for MitoConfig {
            write_cache_ttl: None,
            preload_index_cache: true,
            index_cache_percent: DEFAULT_INDEX_CACHE_PERCENT,
+            enable_refill_cache_on_read: true,
+            manifest_cache_size: ReadableSize::mb(256),
            sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE,
            parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
            max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES,
--- a/src/mito2/src/engine.rs
+++ b/src/mito2/src/engine.rs
@@ -71,6 +71,8 @@ mod sync_test;
 #[cfg(test)]
 mod truncate_test;

+#[cfg(test)]
+mod copy_region_from_test;
 #[cfg(test)]
 mod remap_manifests_test;

@@ -103,8 +105,9 @@ use store_api::metric_engine_consts::{
    MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY,
 };
 use store_api::region_engine::{
-    BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef,
-    RegionStatistic, RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
+    BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, MitoCopyRegionFromResponse,
+    RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
+    RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
    SettableRegionRoleState, SyncManifestResponse,
 };
 use store_api::region_request::{
@@ -119,8 +122,8 @@ use crate::cache::{CacheManagerRef, CacheStrategy};
 use crate::config::MitoConfig;
 use crate::engine::puffin_index::{IndexEntryContext, collect_index_entries_from_puffin};
 use crate::error::{
-    InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, Result,
-    SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu,
+    self, InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu,
+    Result, SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu,
 };
 #[cfg(feature = "enterprise")]
 use crate::extension::BoxedExtensionRangeProviderFactory;
@@ -421,6 +424,17 @@ impl MitoEngine {
        rx.await.context(RecvSnafu)?
    }

+    /// Handles a copy-region-from request for region `region_id`.
+    ///
+    /// Internal-only entry point: the `RegionEngine` trait implementation of
+    /// `copy_region_from` does not expose this operation, so callers have to go
+    /// through this inherent method.
+    pub async fn copy_region_from(
+        &self,
+        region_id: RegionId,
+        request: CopyRegionFromRequest,
+    ) -> Result<MitoCopyRegionFromResponse> {
+        let inner = &self.inner;
+        inner.copy_region_from(region_id, request).await
+    }
+
    #[cfg(test)]
    pub(crate) fn get_region(&self, id: RegionId) -> Option<crate::region::MitoRegionRef> {
        self.find_region(id)
@@ -621,7 +635,9 @@ impl MitoEngine {
    }
 }

-/// Check whether the region edit is valid. Only adding files to region is considered valid now.
+/// Check whether the region edit is valid.
+///
+/// Only adding or removing files to region is considered valid now.
 fn is_valid_region_edit(edit: &RegionEdit) -> bool {
    !edit.files_to_add.is_empty()
        && edit.files_to_remove.is_empty()
@@ -1054,6 +1070,18 @@ impl EngineInner {
        Ok(RemapManifestsResponse { new_manifests })
    }

+    /// Converts `request` into a worker request, submits it for `region_id` and waits
+    /// for the worker's reply.
+    ///
+    /// Fails if the request cannot be converted, cannot be submitted, the reply
+    /// channel is closed before an answer arrives, or the worker reports an error.
+    async fn copy_region_from(
+        &self,
+        region_id: RegionId,
+        request: CopyRegionFromRequest,
+    ) -> Result<MitoCopyRegionFromResponse> {
+        let (worker_request, response_rx) =
+            WorkerRequest::try_from_copy_region_from_request(region_id, request)?;
+        self.workers
+            .submit_to_worker(region_id, worker_request)
+            .await?;
+        // Outer error: receiving failed; inner error: the operation itself failed.
+        let reply = response_rx.await.context(RecvSnafu)?;
+        Ok(reply?)
+    }
+
    fn role(&self, region_id: RegionId) -> Option<RegionRole> {
        self.workers.get_region(region_id).map(|region| {
            if region.is_follower() {
@@ -1240,6 +1268,19 @@ impl RegionEngine for MitoEngine {
            .map_err(BoxedError::new)
    }

+    /// Always fails with an `UnsupportedOperation` error: this engine does not offer
+    /// `copy_region_from` through the trait, so both arguments are ignored.
+    async fn copy_region_from(
+        &self,
+        _region_id: RegionId,
+        _request: CopyRegionFromRequest,
+    ) -> Result<CopyRegionFromResponse, BoxedError> {
+        let unsupported = error::UnsupportedOperationSnafu {
+            err_msg: "copy_region_from is not supported",
+        }
+        .build();
+        Err(BoxedError::new(unsupported))
+    }
+
    fn role(&self, region_id: RegionId) -> Option<RegionRole> {
        self.inner.role(region_id)
    }
--- a/src/mito2/src/engine/basic_test.rs
+++ b/src/mito2/src/engine/basic_test.rs
@@ -872,9 +872,9 @@ StorageSstEntry { file_path: "test/11_0000000002/index/<file_id>.puffin", file_s
 StorageSstEntry { file_path: "test/22_0000000042/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
 StorageSstEntry { file_path: "test/22_0000000042/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }"#).await;
    test_list_ssts_with_format(true, r#"
-ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
-ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, 
+ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
+ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, 
 r#"
 StorageSstEntry { file_path: "test/11_0000000001/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
 StorageSstEntry { file_path: "test/11_0000000001/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }
--- a/src/mito2/src/engine/copy_region_from_test.rs
+++ b/src/mito2/src/engine/copy_region_from_test.rs
@@ -0,0 +1,361 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::assert_matches::assert_matches;
+use std::fs;
+use std::sync::Arc;
+
+use api::v1::Rows;
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use object_store::layers::mock::{Error as MockError, ErrorKind, MockLayerBuilder};
+use store_api::region_engine::{CopyRegionFromRequest, RegionEngine, RegionRole};
+use store_api::region_request::{RegionFlushRequest, RegionRequest};
+use store_api::storage::RegionId;
+
+use crate::config::MitoConfig;
+use crate::error::Error;
+use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema};
+
+/// Runs the successful copy scenario for every `(flat_format, with_index)` combination.
+#[tokio::test]
+async fn test_engine_copy_region_from() {
+    common_telemetry::init_default_ut_logging();
+
+    for flat_format in [true, false] {
+        for with_index in [true, false] {
+            test_engine_copy_region_from_with_format(flat_format, with_index).await;
+        }
+    }
+}
+
+/// Copies a flushed source region into a newly created target region and verifies the result.
+///
+/// * `flat_format` - value used for `default_experimental_flat_format` in the engine config.
+/// * `with_index` - whether an inverted index is enabled on column `tag_0` before the
+///   regions are created.
+///
+/// After the first copy, the target manifest must be non-empty, every file meta must belong
+/// to the target region, and the file listings of both region directories must match. A
+/// second copy must report the same `copied_file_ids` as the first one.
+async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index: bool) {
+    let mut env = TestEnv::with_prefix("copy-region-from").await;
+    let engine = env
+        .create_engine(MitoConfig {
+            default_experimental_flat_format: flat_format,
+            ..Default::default()
+        })
+        .await;
+    // Creates a source region and adds some data
+    let source_region_id = RegionId::new(1, 1);
+    let mut request = CreateRequestBuilder::new().build();
+    if with_index {
+        // Enable an inverted index on `tag_0` so that flushing also produces index files.
+        request
+            .column_metadatas
+            .iter_mut()
+            .find(|c| c.column_schema.name == "tag_0")
+            .unwrap()
+            .column_schema
+            .set_inverted_index(true);
+    }
+
+    let column_schemas = rows_schema(&request);
+    engine
+        .handle_request(source_region_id, RegionRequest::Create(request.clone()))
+        .await
+        .unwrap();
+    let rows = Rows {
+        schema: column_schemas,
+        rows: build_rows(0, 42),
+    };
+    put_rows(&engine, source_region_id, rows).await;
+    // Flush so the written rows end up in files under the source region directory.
+    engine
+        .handle_request(
+            source_region_id,
+            RegionRequest::Flush(RegionFlushRequest {
+                row_group_size: None,
+            }),
+        )
+        .await
+        .unwrap();
+
+    // Creates a target region from the same create request.
+    // NOTE(review): this comment used to add "and enters staging mode", but nothing
+    // in this test switches the region into staging mode -- confirm the intent.
+    let target_region_id = RegionId::new(1, 2);
+    engine
+        .handle_request(target_region_id, RegionRequest::Create(request))
+        .await
+        .unwrap();
+    common_telemetry::debug!("copy region from");
+    let resp = engine
+        .copy_region_from(
+            target_region_id,
+            CopyRegionFromRequest {
+                source_region_id,
+                parallelism: 1,
+            },
+        )
+        .await
+        .unwrap();
+
+    // The target manifest must now list files, all owned by the target region.
+    let manifest = engine
+        .get_region(target_region_id)
+        .unwrap()
+        .manifest_ctx
+        .manifest()
+        .await;
+    assert!(!manifest.files.is_empty());
+    for meta in manifest.files.values() {
+        assert_eq!(meta.region_id, target_region_id);
+        assert_eq!(meta.exists_index(), with_index);
+    }
+
+    // Compare the file listings of the source and target region directories.
+    let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display());
+    let source_region_files = collect_filename_in_dir(&source_region_dir);
+    let target_region_dir = format!("{}/data/test/1_0000000002", env.data_home().display());
+    let target_region_files = collect_filename_in_dir(&target_region_dir);
+    assert_eq!(source_region_files, target_region_files);
+
+    if with_index {
+        // Index files live in the `index` sub-directory of each region directory.
+        let source_region_index_files =
+            collect_filename_in_dir(&format!("{}/index", source_region_dir));
+        let target_region_index_files =
+            collect_filename_in_dir(&format!("{}/index", target_region_dir));
+        assert_eq!(source_region_index_files, target_region_index_files);
+    }
+    // Copying a second time must succeed and report the same file ids.
+    common_telemetry::debug!("copy region from again");
+    let resp2 = engine
+        .copy_region_from(
+            target_region_id,
+            CopyRegionFromRequest {
+                source_region_id,
+                parallelism: 1,
+            },
+        )
+        .await
+        .unwrap();
+    assert_eq!(resp.copied_file_ids, resp2.copied_file_ids);
+}
+
+/// Runs the copy-failure scenario once per storage format (non-flat first, then flat).
+#[tokio::test]
+async fn test_engine_copy_region_failure() {
+    common_telemetry::init_default_ut_logging();
+    for flat_format in [false, true] {
+        test_engine_copy_region_failure_with_format(flat_format).await;
+    }
+}
+
+/// Verifies that a failed copy leaves the target region directory empty and the source
+/// region untouched.
+///
+/// * `flat_format` - value used for `default_experimental_flat_format` in the engine config.
+///
+/// The object store is wrapped in a mock layer whose copy interceptor returns an error for
+/// every source path containing `.puffin`; `copy_region_from` is then expected to fail with
+/// `StatusCode::StorageUnavailable`.
+async fn test_engine_copy_region_failure_with_format(flat_format: bool) {
+    // Fail copies whose source path contains ".puffin"; for other paths the interceptor
+    // returns `None` (presumably letting the copy proceed normally -- confirm against
+    // the mock layer).
+    let mock_layer = MockLayerBuilder::default()
+        .copy_interceptor(Arc::new(|from, _, _args| {
+            if from.contains(".puffin") {
+                Some(Err(MockError::new(ErrorKind::Unexpected, "mock err")))
+            } else {
+                None
+            }
+        }))
+        .build()
+        .unwrap();
+    let mut env = TestEnv::new().await.with_mock_layer(mock_layer);
+    let engine = env
+        .create_engine(MitoConfig {
+            default_experimental_flat_format: flat_format,
+            ..Default::default()
+        })
+        .await;
+    // Creates a source region and adds some data
+    let source_region_id = RegionId::new(1, 1);
+    let mut request = CreateRequestBuilder::new().build();
+    // Enable an inverted index on `tag_0` so that a `.puffin` index file exists to copy.
+    request
+        .column_metadatas
+        .iter_mut()
+        .find(|c| c.column_schema.name == "tag_0")
+        .unwrap()
+        .column_schema
+        .set_inverted_index(true);
+
+    let column_schemas = rows_schema(&request);
+    engine
+        .handle_request(source_region_id, RegionRequest::Create(request.clone()))
+        .await
+        .unwrap();
+    let rows = Rows {
+        schema: column_schemas,
+        rows: build_rows(0, 42),
+    };
+    put_rows(&engine, source_region_id, rows).await;
+    engine
+        .handle_request(
+            source_region_id,
+            RegionRequest::Flush(RegionFlushRequest {
+                row_group_size: None,
+            }),
+        )
+        .await
+        .unwrap();
+    // Snapshot the source region: one file in the region directory and one in `index`.
+    let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display());
+    assert_file_num_in_dir(&source_region_dir, 1);
+    assert_file_num_in_dir(&format!("{}/index", source_region_dir), 1);
+    let source_region_files = collect_filename_in_dir(&source_region_dir);
+    let source_region_index_files =
+        collect_filename_in_dir(&format!("{}/index", source_region_dir));
+
+    // Creates a target region from the same create request.
+    // NOTE(review): this comment used to add "and enters staging mode", but nothing
+    // in this test switches the region into staging mode -- confirm the intent.
+    let target_region_id = RegionId::new(1, 2);
+    engine
+        .handle_request(target_region_id, RegionRequest::Create(request))
+        .await
+        .unwrap();
+    // The copy must fail because of the injected error.
+    let err = engine
+        .copy_region_from(
+            target_region_id,
+            CopyRegionFromRequest {
+                source_region_id,
+                parallelism: 1,
+            },
+        )
+        .await
+        .unwrap_err();
+    assert_eq!(err.status_code(), StatusCode::StorageUnavailable);
+
+    // Check target region directory is empty
+    let target_region_dir = format!("{}/data/test/1_0000000002", env.data_home().display());
+    assert_file_num_in_dir(&target_region_dir, 0);
+    assert!(!fs::exists(format!("{}/index", target_region_dir)).unwrap());
+
+    // Check source region directory is not affected
+    let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display());
+    assert_file_num_in_dir(&source_region_dir, 1);
+    assert_file_num_in_dir(&format!("{}/index", source_region_dir), 1);
+
+    // The listings must still equal the snapshots taken before the failed copy.
+    assert_eq!(
+        source_region_files,
+        collect_filename_in_dir(&source_region_dir)
+    );
+    assert_eq!(
+        source_region_index_files,
+        collect_filename_in_dir(&format!("{}/index", source_region_dir))
+    );
+}
+
+/// Asserts that `dir` directly contains exactly `expected_num` regular files.
+///
+/// Entries that are not files (for example sub-directories) are not counted.
+///
+/// # Panics
+///
+/// Panics if `dir` or the metadata of one of its entries cannot be read, or if the
+/// number of files differs from `expected_num`.
+fn assert_file_num_in_dir(dir: &str, expected_num: usize) {
+    let mut files = fs::read_dir(dir)
+        .unwrap()
+        .collect::<Result<Vec<_>, _>>()
+        .unwrap();
+    // Keep regular files only.
+    files.retain(|entry| entry.metadata().unwrap().is_file());
+
+    let actual_num = files.len();
+    assert_eq!(
+        actual_num, expected_num,
+        "The number of files in the directory should be {}, got: {:?}",
+        expected_num, files
+    );
+}
+
+/// Returns the sorted names of the regular files located directly inside `dir`.
+///
+/// Only the final path component of each entry is returned (e.g. `a.parquet`), so the
+/// listings of two different directories can be compared with each other. Entries that
+/// are not files (for example sub-directories) are skipped.
+///
+/// # Panics
+///
+/// Panics if `dir` or the metadata of one of its entries cannot be read.
+fn collect_filename_in_dir(dir: &str) -> Vec<String> {
+    let mut files = fs::read_dir(dir)
+        .unwrap()
+        .collect::<Result<Vec<_>, _>>()
+        .unwrap()
+        .into_iter()
+        .filter(|f| f.metadata().unwrap().is_file())
+        // `DirEntry::file_name` is the last path component. The previous
+        // `rsplit("/").last()` returned the *first* component instead (an empty string
+        // for absolute paths), so every entry collapsed to the same value and callers
+        // effectively compared file counts only.
+        .map(|f| f.file_name().to_string_lossy().into_owned())
+        .collect::<Vec<_>>();
+    files.sort_unstable();
+
+    files
+}
+
+/// Runs the invalid-argument scenario once per storage format (non-flat first, then flat).
+#[tokio::test]
+async fn test_engine_copy_region_invalid_args() {
+    common_telemetry::init_default_ut_logging();
+    for flat_format in [false, true] {
+        test_engine_copy_region_invalid_args_with_format(flat_format).await;
+    }
+}
+
+/// Checks that `copy_region_from` rejects unsuitable source regions with
+/// `StatusCode::InvalidArguments`.
+///
+/// * `flat_format` - value used for `default_experimental_flat_format` in the engine config.
+async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) {
+    let mut env = TestEnv::new().await;
+    let config = MitoConfig {
+        default_experimental_flat_format: flat_format,
+        ..Default::default()
+    };
+    let engine = env.create_engine(config).await;
+
+    let region_id = RegionId::new(1, 1);
+    let request = CreateRequestBuilder::new().build();
+    engine
+        .handle_request(region_id, RegionRequest::Create(request.clone()))
+        .await
+        .unwrap();
+
+    // Both sources must be rejected: the first id differs from the target in its first
+    // component, the second one is the target region itself.
+    for source_region_id in [RegionId::new(2, 1), RegionId::new(1, 1)] {
+        let copy_request = CopyRegionFromRequest {
+            source_region_id,
+            parallelism: 1,
+        };
+        let err = engine
+            .copy_region_from(region_id, copy_request)
+            .await
+            .unwrap_err();
+        assert_eq!(err.status_code(), StatusCode::InvalidArguments);
+    }
+}
+
+/// Runs the unexpected-state scenario once per storage format (non-flat first, then flat).
+#[tokio::test]
+async fn test_engine_copy_region_unexpected_state() {
+    common_telemetry::init_default_ut_logging();
+    for flat_format in [false, true] {
+        test_engine_copy_region_unexpected_state_with_format(flat_format).await;
+    }
+}
+
+/// Checks that `copy_region_from` fails with `Error::RegionState` when the target region
+/// has been switched to the follower role.
+///
+/// * `flat_format` - value used for `default_experimental_flat_format` in the engine config.
+async fn test_engine_copy_region_unexpected_state_with_format(flat_format: bool) {
+    let mut env = TestEnv::new().await;
+    let config = MitoConfig {
+        default_experimental_flat_format: flat_format,
+        ..Default::default()
+    };
+    let engine = env.create_engine(config).await;
+
+    let region_id = RegionId::new(1, 1);
+    let create_request = CreateRequestBuilder::new().build();
+    engine
+        .handle_request(region_id, RegionRequest::Create(create_request.clone()))
+        .await
+        .unwrap();
+    // Switch the target region to the follower role before attempting the copy.
+    engine
+        .set_region_role(region_id, RegionRole::Follower)
+        .unwrap();
+
+    let copy_request = CopyRegionFromRequest {
+        source_region_id: RegionId::new(1, 2),
+        parallelism: 1,
+    };
+    let err = engine
+        .copy_region_from(region_id, copy_request)
+        .await
+        .unwrap_err();
+    let mito_err = err.as_any().downcast_ref::<Error>().unwrap();
+    assert_matches!(mito_err, Error::RegionState { .. });
+}
--- a/src/mito2/src/engine/index_build_test.rs
+++ b/src/mito2/src/engine/index_build_test.rs
@@ -160,6 +160,8 @@ async fn test_index_build_type_flush() {

 #[tokio::test]
 async fn test_index_build_type_compact() {
+    common_telemetry::init_default_ut_logging();
+
    let mut env = TestEnv::with_prefix("test_index_build_type_compact_").await;
    let listener = Arc::new(IndexBuildListener::default());
    let engine = env
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -19,6 +19,7 @@ use common_datasource::compression::CompressionType;
 use common_error::ext::{BoxedError, ErrorExt};
 use common_error::status_code::StatusCode;
 use common_macro::stack_trace_debug;
+use common_memory_manager;
 use common_runtime::JoinError;
 use common_time::Timestamp;
 use common_time::timestamp::TimeUnit;
@@ -1041,6 +1042,16 @@ pub enum Error {
    #[snafu(display("Manual compaction is override by following operations."))]
    ManualCompactionOverride {},

+    #[snafu(display("Compaction memory exhausted for region {region_id} (policy: {policy})",))]
+    CompactionMemoryExhausted {
+        region_id: RegionId,
+        policy: String,
+        #[snafu(source)]
+        source: common_memory_manager::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display(
        "Incompatible WAL provider change. This is typically caused by changing WAL provider in database config file without completely cleaning existing files. Global provider: {}, region provider: {}",
        global,
@@ -1162,6 +1173,18 @@ pub enum Error {
        #[snafu(implicit)]
        location: Location,
    },
+
+    #[snafu(display(
+        "Invalid source and target region, source: {}, target: {}",
+        source_region_id,
+        target_region_id
+    ))]
+    InvalidSourceAndTargetRegion {
+        source_region_id: RegionId,
+        target_region_id: RegionId,
+        #[snafu(implicit)]
+        location: Location,
+    },
 }

 pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -1230,7 +1253,8 @@ impl ErrorExt for Error {
            | MissingManifest { .. }
            | NoOldManifests { .. }
            | MissingPartitionExpr { .. }
-            | SerializePartitionExpr { .. } => StatusCode::InvalidArguments,
+            | SerializePartitionExpr { .. }
+            | InvalidSourceAndTargetRegion { .. } => StatusCode::InvalidArguments,

            RegionMetadataNotFound { .. }
            | Join { .. }
@@ -1323,6 +1347,8 @@ impl ErrorExt for Error {

            ManualCompactionOverride {} => StatusCode::Cancelled,

+            CompactionMemoryExhausted { source, .. } => source.status_code(),
+
            IncompatibleWalProviderChange { .. } => StatusCode::InvalidArguments,

            ScanSeries { source, .. } => source.status_code(),
--- a/src/mito2/src/flush.rs
+++ b/src/mito2/src/flush.rs
@@ -640,6 +640,7 @@ impl RegionFlushTask {
            time_range: sst_info.time_range,
            level: 0,
            file_size: sst_info.file_size,
+            max_row_group_uncompressed_size: sst_info.max_row_group_uncompressed_size,
            available_indexes: sst_info.index_metadata.build_available_indexes(),
            indexes: sst_info.index_metadata.build_indexes(),
            index_file_size: sst_info.index_metadata.file_size,
@@ -773,7 +774,12 @@ fn memtable_flat_sources(
            let iter = only_range.build_record_batch_iter(None)?;
            // Dedup according to append mode and merge mode.
            // Even single range may have duplicate rows.
-            let iter = maybe_dedup_one(options, field_column_start, iter);
+            let iter = maybe_dedup_one(
+                options.append_mode,
+                options.merge_mode(),
+                field_column_start,
+                iter,
+            );
            flat_sources.sources.push(FlatSource::Iter(iter));
        };
    } else {
@@ -795,7 +801,8 @@ fn memtable_flat_sources(
            if last_iter_rows > min_flush_rows {
                let maybe_dedup = merge_and_dedup(
                    &schema,
-                    options,
+                    options.append_mode,
+                    options.merge_mode(),
                    field_column_start,
                    std::mem::replace(&mut input_iters, Vec::with_capacity(num_ranges)),
                )?;
@@ -807,7 +814,13 @@ fn memtable_flat_sources(

        // Handle remaining iters.
        if !input_iters.is_empty() {
-            let maybe_dedup = merge_and_dedup(&schema, options, field_column_start, input_iters)?;
+            let maybe_dedup = merge_and_dedup(
+                &schema,
+                options.append_mode,
+                options.merge_mode(),
+                field_column_start,
+                input_iters,
+            )?;

            flat_sources.sources.push(FlatSource::Iter(maybe_dedup));
        }
@@ -816,19 +829,64 @@ fn memtable_flat_sources(
    Ok(flat_sources)
 }

-fn merge_and_dedup(
+/// Merges multiple record batch iterators and applies deduplication based on the specified mode.
+///
+/// This function is used during the flush process to combine data from multiple memtable ranges
+/// into a single stream while handling duplicate records according to the configured merge strategy.
+///
+/// # Arguments
+///
+/// * `schema` - The Arrow schema reference that defines the structure of the record batches
+/// * `append_mode` - When true, no deduplication is performed and all records are preserved.
+///   This is used for append-only workloads where duplicate handling is not required.
+/// * `merge_mode` - The strategy used for deduplication when not in append mode:
+///   - `MergeMode::LastRow`: Keeps the last record for each primary key
+///   - `MergeMode::LastNonNull`: Keeps the last non-null values for each field
+/// * `field_column_start` - The starting column index for fields in the record batch.
+///   Used with `MergeMode::LastNonNull` to identify which columns
+///   contain field values versus primary key columns.
+/// * `input_iters` - A vector of record batch iterators to be merged and deduplicated
+///
+/// # Returns
+///
+/// Returns a boxed record batch iterator that yields the merged and potentially deduplicated
+/// record batches.
+///
+/// # Behavior
+///
+/// 1. Creates a `FlatMergeIterator` to merge all input iterators in sorted order based on
+///    primary key and timestamp
+/// 2. If `append_mode` is true, returns the merge iterator directly without deduplication
+/// 3. If `append_mode` is false, wraps the merge iterator with a `FlatDedupIterator` that
+///    applies the specified merge mode:
+///    - `LastRow`: Removes duplicate rows, keeping only the last one
+///    - `LastNonNull`: Removes duplicates but preserves the last non-null value for each field
+///
+/// # Examples
+///
+/// ```ignore
+/// let merged_iter = merge_and_dedup(
+///     &schema,
+///     false,  // not append mode, apply dedup
+///     MergeMode::LastRow,
+///     2,  // fields start at column 2 after primary key columns
+///     vec![iter1, iter2, iter3],
+/// )?;
+/// ```
+pub fn merge_and_dedup(
    schema: &SchemaRef,
-    options: &RegionOptions,
+    append_mode: bool,
+    merge_mode: MergeMode,
    field_column_start: usize,
    input_iters: Vec<BoxedRecordBatchIterator>,
 ) -> Result<BoxedRecordBatchIterator> {
    let merge_iter = FlatMergeIterator::new(schema.clone(), input_iters, DEFAULT_READ_BATCH_SIZE)?;
-    let maybe_dedup = if options.append_mode {
+    let maybe_dedup = if append_mode {
        // No dedup in append mode
        Box::new(merge_iter) as _
    } else {
        // Dedup according to merge mode.
-        match options.merge_mode() {
+        match merge_mode {
            MergeMode::LastRow => {
                Box::new(FlatDedupIterator::new(merge_iter, FlatLastRow::new(false))) as _
            }
@@ -841,17 +899,18 @@ fn merge_and_dedup(
    Ok(maybe_dedup)
 }

-fn maybe_dedup_one(
-    options: &RegionOptions,
+pub fn maybe_dedup_one(
+    append_mode: bool,
+    merge_mode: MergeMode,
    field_column_start: usize,
    input_iter: BoxedRecordBatchIterator,
 ) -> BoxedRecordBatchIterator {
-    if options.append_mode {
+    if append_mode {
        // No dedup in append mode
        input_iter
    } else {
        // Dedup according to merge mode.
-        match options.merge_mode() {
+        match merge_mode {
            MergeMode::LastRow => {
                Box::new(FlatDedupIterator::new(input_iter, FlatLastRow::new(false)))
            }
--- a/src/mito2/src/gc.rs
+++ b/src/mito2/src/gc.rs
@@ -540,7 +540,7 @@ impl LocalGcWorker {
    fn filter_deletable_files(
        &self,
        entries: Vec<Entry>,
-        in_use_filenames: &HashSet<&FileId>,
+        in_use_filenames: &HashSet<FileId>,
        may_linger_filenames: &HashSet<&FileId>,
        eligible_for_removal: &HashSet<&FileId>,
        unknown_file_may_linger_until: chrono::DateTime<chrono::Utc>,
@@ -641,9 +641,6 @@ impl LocalGcWorker {
            .flatten()
            .collect::<HashSet<_>>();

-        // in use filenames, include sst and index files
-        let in_use_filenames = in_used.iter().collect::<HashSet<_>>();
-
        // When full_file_listing is false, skip expensive list operations and only delete
        // files that are tracked in recently_removed_files
        if !self.full_file_listing {
@@ -653,7 +650,7 @@ impl LocalGcWorker {
            // 3. Have passed the lingering time
            let files_to_delete: Vec<FileId> = eligible_for_removal
                .iter()
-                .filter(|file_id| !in_use_filenames.contains(*file_id))
+                .filter(|file_id| !in_used.contains(*file_id))
                .map(|&f| *f)
                .collect();

@@ -672,7 +669,7 @@ impl LocalGcWorker {
        let (all_unused_files_ready_for_delete, all_in_exist_linger_files) = self
            .filter_deletable_files(
                all_entries,
-                &in_use_filenames,
+                in_used,
                &may_linger_filenames,
                &eligible_for_removal,
                unknown_file_may_linger_until,
--- a/src/mito2/src/manifest/storage.rs
+++ b/src/mito2/src/manifest/storage.rs
@@ -157,6 +157,8 @@ impl ManifestObjectStore {
        total_manifest_size: Arc<AtomicU64>,
        manifest_cache: Option<ManifestCache>,
    ) -> Self {
+        common_telemetry::info!("Create manifest store, cache: {}", manifest_cache.is_some());
+
        let path = util::normalize_dir(path);
        let staging_path = {
            // Convert "region_dir/manifest/" to "region_dir/staging/manifest/"
--- a/src/mito2/src/manifest/tests/checkpoint.rs
+++ b/src/mito2/src/manifest/tests/checkpoint.rs
@@ -244,6 +244,7 @@ async fn checkpoint_with_different_compression_types() {
            time_range: (0.into(), 10000000.into()),
            level: 0,
            file_size: 1024000,
+            max_row_group_uncompressed_size: 1024000,
            available_indexes: Default::default(),
            indexes: Default::default(),
            index_file_size: 0,
@@ -309,6 +310,7 @@ fn generate_action_lists(num: usize) -> (Vec<FileId>, Vec<RegionMetaActionList>)
            time_range: (0.into(), 10000000.into()),
            level: 0,
            file_size: 1024000,
+            max_row_group_uncompressed_size: 1024000,
            available_indexes: Default::default(),
            indexes: Default::default(),
            index_file_size: 0,
--- a/src/mito2/src/memtable.rs
+++ b/src/mito2/src/memtable.rs
@@ -55,10 +55,8 @@ pub mod time_partition;
 pub mod time_series;
 pub(crate) mod version;

-#[cfg(any(test, feature = "test"))]
-pub use bulk::part::BulkPart;
 pub use bulk::part::{
-    BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
+    BulkPart, BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
    sort_primary_key_record_batch,
 };
 #[cfg(any(test, feature = "test"))]
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -668,10 +668,10 @@ impl BulkMemtable {
 }

 /// Iterator builder for bulk range
-struct BulkRangeIterBuilder {
-    part: BulkPart,
-    context: Arc<BulkIterContext>,
-    sequence: Option<SequenceRange>,
+pub struct BulkRangeIterBuilder {
+    pub part: BulkPart,
+    pub context: Arc<BulkIterContext>,
+    pub sequence: Option<SequenceRange>,
 }

 impl IterBuilder for BulkRangeIterBuilder {
@@ -1188,7 +1188,6 @@ impl MemtableBuilder for BulkMemtableBuilder {

 #[cfg(test)]
 mod tests {
-
    use mito_codec::row_converter::build_primary_key_codec;

    use super::*;
--- a/src/mito2/src/memtable/bulk/part.rs
+++ b/src/mito2/src/memtable/bulk/part.rs
@@ -974,6 +974,19 @@ impl EncodedBulkPart {
    /// Returns a `SstInfo` instance with information derived from this bulk part's metadata
    pub(crate) fn to_sst_info(&self, file_id: FileId) -> SstInfo {
        let unit = self.metadata.region_metadata.time_index_type().unit();
+        let max_row_group_uncompressed_size: u64 = self
+            .metadata
+            .parquet_metadata
+            .row_groups()
+            .iter()
+            .map(|rg| {
+                rg.columns()
+                    .iter()
+                    .map(|c| c.uncompressed_size() as u64)
+                    .sum::<u64>()
+            })
+            .max()
+            .unwrap_or(0);
        SstInfo {
            file_id,
            time_range: (
@@ -981,6 +994,7 @@ impl EncodedBulkPart {
                Timestamp::new(self.metadata.max_timestamp, unit),
            ),
            file_size: self.data.len() as u64,
+            max_row_group_uncompressed_size,
            num_rows: self.metadata.num_rows,
            num_row_groups: self.metadata.parquet_metadata.num_row_groups() as u64,
            file_metadata: Some(self.metadata.parquet_metadata.clone()),
@@ -1197,343 +1211,24 @@ impl BulkPartEncoder {
    }
 }

-/// Converts mutations to record batches.
-fn mutations_to_record_batch(
-    mutations: &[Mutation],
-    metadata: &RegionMetadataRef,
-    pk_encoder: &DensePrimaryKeyCodec,
-    dedup: bool,
-) -> Result<Option<(RecordBatch, i64, i64)>> {
-    let total_rows: usize = mutations
-        .iter()
-        .map(|m| m.rows.as_ref().map(|r| r.rows.len()).unwrap_or(0))
-        .sum();
-
-    if total_rows == 0 {
-        return Ok(None);
-    }
-
-    let mut pk_builder = BinaryBuilder::with_capacity(total_rows, 0);
-
-    let mut ts_vector: Box<dyn MutableVector> = metadata
-        .time_index_column()
-        .column_schema
-        .data_type
-        .create_mutable_vector(total_rows);
-    let mut sequence_builder = UInt64Builder::with_capacity(total_rows);
-    let mut op_type_builder = UInt8Builder::with_capacity(total_rows);
-
-    let mut field_builders: Vec<Box<dyn MutableVector>> = metadata
-        .field_columns()
-        .map(|f| f.column_schema.data_type.create_mutable_vector(total_rows))
-        .collect();
-
-    let mut pk_buffer = vec![];
-    for m in mutations {
-        let Some(key_values) = KeyValuesRef::new(metadata, m) else {
-            continue;
-        };
-
-        for row in key_values.iter() {
-            pk_buffer.clear();
-            pk_encoder
-                .encode_to_vec(row.primary_keys(), &mut pk_buffer)
-                .context(EncodeSnafu)?;
-            pk_builder.append_value(pk_buffer.as_bytes());
-            ts_vector.push_value_ref(&row.timestamp());
-            sequence_builder.append_value(row.sequence());
-            op_type_builder.append_value(row.op_type() as u8);
-            for (builder, field) in field_builders.iter_mut().zip(row.fields()) {
-                builder.push_value_ref(&field);
-            }
-        }
-    }
-
-    let arrow_schema = to_sst_arrow_schema(metadata);
-    // safety: timestamp column must be valid, and values must not be None.
-    let timestamp_unit = metadata
-        .time_index_column()
-        .column_schema
-        .data_type
-        .as_timestamp()
-        .unwrap()
-        .unit();
-    let sorter = ArraysSorter {
-        encoded_primary_keys: pk_builder.finish(),
-        timestamp_unit,
-        timestamp: ts_vector.to_vector().to_arrow_array(),
-        sequence: sequence_builder.finish(),
-        op_type: op_type_builder.finish(),
-        fields: field_builders
-            .iter_mut()
-            .map(|f| f.to_vector().to_arrow_array()),
-        dedup,
-        arrow_schema,
-    };
-
-    sorter.sort().map(Some)
-}
-
-struct ArraysSorter<I> {
-    encoded_primary_keys: BinaryArray,
-    timestamp_unit: TimeUnit,
-    timestamp: ArrayRef,
-    sequence: UInt64Array,
-    op_type: UInt8Array,
-    fields: I,
-    dedup: bool,
-    arrow_schema: SchemaRef,
-}
-
-impl<I> ArraysSorter<I>
-where
-    I: Iterator<Item = ArrayRef>,
-{
-    /// Converts arrays to record batch.
-    fn sort(self) -> Result<(RecordBatch, i64, i64)> {
-        debug_assert!(!self.timestamp.is_empty());
-        debug_assert!(self.timestamp.len() == self.sequence.len());
-        debug_assert!(self.timestamp.len() == self.op_type.len());
-        debug_assert!(self.timestamp.len() == self.encoded_primary_keys.len());
-
-        let timestamp_iter = timestamp_array_to_iter(self.timestamp_unit, &self.timestamp);
-        let (mut min_timestamp, mut max_timestamp) = (i64::MAX, i64::MIN);
-        let mut to_sort = self
-            .encoded_primary_keys
-            .iter()
-            .zip(timestamp_iter)
-            .zip(self.sequence.iter())
-            .map(|((pk, timestamp), sequence)| {
-                max_timestamp = max_timestamp.max(*timestamp);
-                min_timestamp = min_timestamp.min(*timestamp);
-                (pk, timestamp, sequence)
-            })
-            .enumerate()
-            .collect::<Vec<_>>();
-
-        to_sort.sort_unstable_by(|(_, (l_pk, l_ts, l_seq)), (_, (r_pk, r_ts, r_seq))| {
-            l_pk.cmp(r_pk)
-                .then(l_ts.cmp(r_ts))
-                .then(l_seq.cmp(r_seq).reverse())
-        });
-
-        if self.dedup {
-            // Dedup by timestamps while ignore sequence.
-            to_sort.dedup_by(|(_, (l_pk, l_ts, _)), (_, (r_pk, r_ts, _))| {
-                l_pk == r_pk && l_ts == r_ts
-            });
-        }
-
-        let indices = UInt32Array::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
-
-        let pk_dictionary = Arc::new(binary_array_to_dictionary(
-            // safety: pk must be BinaryArray
-            arrow::compute::take(
-                &self.encoded_primary_keys,
-                &indices,
-                Some(TakeOptions {
-                    check_bounds: false,
-                }),
-            )
-            .context(ComputeArrowSnafu)?
-            .as_any()
-            .downcast_ref::<BinaryArray>()
-            .unwrap(),
-        )?) as ArrayRef;
-
-        let mut arrays = Vec::with_capacity(self.arrow_schema.fields.len());
-        for arr in self.fields {
-            arrays.push(
-                arrow::compute::take(
-                    &arr,
-                    &indices,
-                    Some(TakeOptions {
-                        check_bounds: false,
-                    }),
-                )
-                .context(ComputeArrowSnafu)?,
-            );
-        }
-
-        let timestamp = arrow::compute::take(
-            &self.timestamp,
-            &indices,
-            Some(TakeOptions {
-                check_bounds: false,
-            }),
-        )
-        .context(ComputeArrowSnafu)?;
-
-        arrays.push(timestamp);
-        arrays.push(pk_dictionary);
-        arrays.push(
-            arrow::compute::take(
-                &self.sequence,
-                &indices,
-                Some(TakeOptions {
-                    check_bounds: false,
-                }),
-            )
-            .context(ComputeArrowSnafu)?,
-        );
-
-        arrays.push(
-            arrow::compute::take(
-                &self.op_type,
-                &indices,
-                Some(TakeOptions {
-                    check_bounds: false,
-                }),
-            )
-            .context(ComputeArrowSnafu)?,
-        );
-
-        let batch = RecordBatch::try_new(self.arrow_schema, arrays).context(NewRecordBatchSnafu)?;
-        Ok((batch, min_timestamp, max_timestamp))
-    }
-}
-
-/// Converts timestamp array to an iter of i64 values.
-fn timestamp_array_to_iter(
-    timestamp_unit: TimeUnit,
-    timestamp: &ArrayRef,
-) -> impl Iterator<Item = &i64> {
-    match timestamp_unit {
-        // safety: timestamp column must be valid.
-        TimeUnit::Second => timestamp
-            .as_any()
-            .downcast_ref::<TimestampSecondArray>()
-            .unwrap()
-            .values()
-            .iter(),
-        TimeUnit::Millisecond => timestamp
-            .as_any()
-            .downcast_ref::<TimestampMillisecondArray>()
-            .unwrap()
-            .values()
-            .iter(),
-        TimeUnit::Microsecond => timestamp
-            .as_any()
-            .downcast_ref::<TimestampMicrosecondArray>()
-            .unwrap()
-            .values()
-            .iter(),
-        TimeUnit::Nanosecond => timestamp
-            .as_any()
-            .downcast_ref::<TimestampNanosecondArray>()
-            .unwrap()
-            .values()
-            .iter(),
-    }
-}
-
-/// Converts a **sorted** [BinaryArray] to [DictionaryArray].
-fn binary_array_to_dictionary(input: &BinaryArray) -> Result<PrimaryKeyArray> {
-    if input.is_empty() {
-        return Ok(DictionaryArray::new(
-            UInt32Array::from(Vec::<u32>::new()),
-            Arc::new(BinaryArray::from_vec(vec![])) as ArrayRef,
-        ));
-    }
-    let mut keys = Vec::with_capacity(16);
-    let mut values = BinaryBuilder::new();
-    let mut prev: usize = 0;
-    keys.push(prev as u32);
-    values.append_value(input.value(prev));
-
-    for current_bytes in input.iter().skip(1) {
-        // safety: encoded pk must present.
-        let current_bytes = current_bytes.unwrap();
-        let prev_bytes = input.value(prev);
-        if current_bytes != prev_bytes {
-            values.append_value(current_bytes);
-            prev += 1;
-        }
-        keys.push(prev as u32);
-    }
-
-    Ok(DictionaryArray::new(
-        UInt32Array::from(keys),
-        Arc::new(values.finish()) as ArrayRef,
-    ))
-}
-
 #[cfg(test)]
 mod tests {
-    use std::collections::VecDeque;
-
    use api::v1::{Row, SemanticType, WriteHint};
    use datafusion_common::ScalarValue;
    use datatypes::arrow::array::Float64Array;
-    use datatypes::prelude::{ConcreteDataType, ScalarVector, Value};
+    use datatypes::prelude::{ConcreteDataType, Value};
    use datatypes::schema::ColumnSchema;
-    use datatypes::vectors::{Float64Vector, TimestampMillisecondVector};
    use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
    use store_api::storage::RegionId;
    use store_api::storage::consts::ReservedColumnId;

    use super::*;
    use crate::memtable::bulk::context::BulkIterContext;
-    use crate::sst::parquet::format::{PrimaryKeyReadFormat, ReadFormat};
    use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
    use crate::test_util::memtable_util::{
        build_key_values_with_ts_seq_values, metadata_for_test, region_metadata_to_row_schema,
    };

-    fn check_binary_array_to_dictionary(
-        input: &[&[u8]],
-        expected_keys: &[u32],
-        expected_values: &[&[u8]],
-    ) {
-        let input = BinaryArray::from_iter_values(input.iter());
-        let array = binary_array_to_dictionary(&input).unwrap();
-        assert_eq!(
-            &expected_keys,
-            &array.keys().iter().map(|v| v.unwrap()).collect::<Vec<_>>()
-        );
-        assert_eq!(
-            expected_values,
-            &array
-                .values()
-                .as_any()
-                .downcast_ref::<BinaryArray>()
-                .unwrap()
-                .iter()
-                .map(|v| v.unwrap())
-                .collect::<Vec<_>>()
-        );
-    }
-
-    #[test]
-    fn test_binary_array_to_dictionary() {
-        check_binary_array_to_dictionary(&[], &[], &[]);
-
-        check_binary_array_to_dictionary(&["a".as_bytes()], &[0], &["a".as_bytes()]);
-
-        check_binary_array_to_dictionary(
-            &["a".as_bytes(), "a".as_bytes()],
-            &[0, 0],
-            &["a".as_bytes()],
-        );
-
-        check_binary_array_to_dictionary(
-            &["a".as_bytes(), "a".as_bytes(), "b".as_bytes()],
-            &[0, 0, 1],
-            &["a".as_bytes(), "b".as_bytes()],
-        );
-
-        check_binary_array_to_dictionary(
-            &[
-                "a".as_bytes(),
-                "a".as_bytes(),
-                "b".as_bytes(),
-                "c".as_bytes(),
-            ],
-            &[0, 0, 1, 2],
-            &["a".as_bytes(), "b".as_bytes(), "c".as_bytes()],
-        );
-    }
-
    struct MutationInput<'a> {
        k0: &'a str,
        k1: u32,
@@ -1549,232 +1244,6 @@ mod tests {
        v1: &'a [Option<f64>],
    }

-    fn check_mutations_to_record_batches(
-        input: &[MutationInput],
-        expected: &[BatchOutput],
-        expected_timestamp: (i64, i64),
-        dedup: bool,
-    ) {
-        let metadata = metadata_for_test();
-        let mutations = input
-            .iter()
-            .map(|m| {
-                build_key_values_with_ts_seq_values(
-                    &metadata,
-                    m.k0.to_string(),
-                    m.k1,
-                    m.timestamps.iter().copied(),
-                    m.v1.iter().copied(),
-                    m.sequence,
-                )
-                .mutation
-            })
-            .collect::<Vec<_>>();
-        let total_rows: usize = mutations
-            .iter()
-            .flat_map(|m| m.rows.iter())
-            .map(|r| r.rows.len())
-            .sum();
-
-        let pk_encoder = DensePrimaryKeyCodec::new(&metadata);
-
-        let (batch, _, _) = mutations_to_record_batch(&mutations, &metadata, &pk_encoder, dedup)
-            .unwrap()
-            .unwrap();
-        let read_format = PrimaryKeyReadFormat::new_with_all_columns(metadata.clone());
-        let mut batches = VecDeque::new();
-        read_format
-            .convert_record_batch(&batch, None, &mut batches)
-            .unwrap();
-        if !dedup {
-            assert_eq!(
-                total_rows,
-                batches.iter().map(|b| { b.num_rows() }).sum::<usize>()
-            );
-        }
-        let batch_values = batches
-            .into_iter()
-            .map(|b| {
-                let pk_values = pk_encoder.decode(b.primary_key()).unwrap().into_dense();
-                let timestamps = b
-                    .timestamps()
-                    .as_any()
-                    .downcast_ref::<TimestampMillisecondVector>()
-                    .unwrap()
-                    .iter_data()
-                    .map(|v| v.unwrap().0.value())
-                    .collect::<Vec<_>>();
-                let float_values = b.fields()[1]
-                    .data
-                    .as_any()
-                    .downcast_ref::<Float64Vector>()
-                    .unwrap()
-                    .iter_data()
-                    .collect::<Vec<_>>();
-
-                (pk_values, timestamps, float_values)
-            })
-            .collect::<Vec<_>>();
-        assert_eq!(expected.len(), batch_values.len());
-
-        for idx in 0..expected.len() {
-            assert_eq!(expected[idx].pk_values, &batch_values[idx].0);
-            assert_eq!(expected[idx].timestamps, &batch_values[idx].1);
-            assert_eq!(expected[idx].v1, &batch_values[idx].2);
-        }
-    }
-
-    #[test]
-    fn test_mutations_to_record_batch() {
-        check_mutations_to_record_batches(
-            &[MutationInput {
-                k0: "a",
-                k1: 0,
-                timestamps: &[0],
-                v1: &[Some(0.1)],
-                sequence: 0,
-            }],
-            &[BatchOutput {
-                pk_values: &[Value::String("a".into()), Value::UInt32(0)],
-                timestamps: &[0],
-                v1: &[Some(0.1)],
-            }],
-            (0, 0),
-            true,
-        );
-
-        check_mutations_to_record_batches(
-            &[
-                MutationInput {
-                    k0: "a",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.1)],
-                    sequence: 0,
-                },
-                MutationInput {
-                    k0: "b",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.0)],
-                    sequence: 0,
-                },
-                MutationInput {
-                    k0: "a",
-                    k1: 0,
-                    timestamps: &[1],
-                    v1: &[Some(0.2)],
-                    sequence: 1,
-                },
-                MutationInput {
-                    k0: "a",
-                    k1: 1,
-                    timestamps: &[1],
-                    v1: &[Some(0.3)],
-                    sequence: 2,
-                },
-            ],
-            &[
-                BatchOutput {
-                    pk_values: &[Value::String("a".into()), Value::UInt32(0)],
-                    timestamps: &[0, 1],
-                    v1: &[Some(0.1), Some(0.2)],
-                },
-                BatchOutput {
-                    pk_values: &[Value::String("a".into()), Value::UInt32(1)],
-                    timestamps: &[1],
-                    v1: &[Some(0.3)],
-                },
-                BatchOutput {
-                    pk_values: &[Value::String("b".into()), Value::UInt32(0)],
-                    timestamps: &[0],
-                    v1: &[Some(0.0)],
-                },
-            ],
-            (0, 1),
-            true,
-        );
-
-        check_mutations_to_record_batches(
-            &[
-                MutationInput {
-                    k0: "a",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.1)],
-                    sequence: 0,
-                },
-                MutationInput {
-                    k0: "b",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.0)],
-                    sequence: 0,
-                },
-                MutationInput {
-                    k0: "a",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.2)],
-                    sequence: 1,
-                },
-            ],
-            &[
-                BatchOutput {
-                    pk_values: &[Value::String("a".into()), Value::UInt32(0)],
-                    timestamps: &[0],
-                    v1: &[Some(0.2)],
-                },
-                BatchOutput {
-                    pk_values: &[Value::String("b".into()), Value::UInt32(0)],
-                    timestamps: &[0],
-                    v1: &[Some(0.0)],
-                },
-            ],
-            (0, 0),
-            true,
-        );
-        check_mutations_to_record_batches(
-            &[
-                MutationInput {
-                    k0: "a",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.1)],
-                    sequence: 0,
-                },
-                MutationInput {
-                    k0: "b",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.0)],
-                    sequence: 0,
-                },
-                MutationInput {
-                    k0: "a",
-                    k1: 0,
-                    timestamps: &[0],
-                    v1: &[Some(0.2)],
-                    sequence: 1,
-                },
-            ],
-            &[
-                BatchOutput {
-                    pk_values: &[Value::String("a".into()), Value::UInt32(0)],
-                    timestamps: &[0, 0],
-                    v1: &[Some(0.2), Some(0.1)],
-                },
-                BatchOutput {
-                    pk_values: &[Value::String("b".into()), Value::UInt32(0)],
-                    timestamps: &[0],
-                    v1: &[Some(0.0)],
-                },
-            ],
-            (0, 0),
-            false,
-        );
-    }
-
    fn encode(input: &[MutationInput]) -> EncodedBulkPart {
        let metadata = metadata_for_test();
        let kvs = input
--- a/src/mito2/src/metrics.rs
+++ b/src/mito2/src/metrics.rs
@@ -157,6 +157,35 @@ lazy_static! {
            "greptime_mito_inflight_compaction_count",
            "inflight compaction count",
        ).unwrap();
+
+    /// Bytes reserved by compaction memory manager.
+    pub static ref COMPACTION_MEMORY_IN_USE: IntGauge =
+        register_int_gauge!(
+            "greptime_mito_compaction_memory_in_use_bytes",
+            "bytes currently reserved for compaction tasks",
+        )
+        .unwrap();
+    /// Configured compaction memory limit.
+    pub static ref COMPACTION_MEMORY_LIMIT: IntGauge =
+        register_int_gauge!(
+            "greptime_mito_compaction_memory_limit_bytes",
+            "maximum bytes allowed for compaction tasks",
+        )
+        .unwrap();
+    /// Histogram of the time (in seconds) spent waiting to obtain compaction memory.
+    pub static ref COMPACTION_MEMORY_WAIT: Histogram = register_histogram!(
+        "greptime_mito_compaction_memory_wait_seconds",
+        "time waiting for compaction memory",
+        // 0.01s ~ 5.12s (10 buckets, each twice the previous one)
+        exponential_buckets(0.01, 2.0, 10).unwrap(),
+    ).unwrap();
+    /// Counter of rejected compaction memory allocations.
+    pub static ref COMPACTION_MEMORY_REJECTED: IntCounterVec =
+        register_int_counter_vec!(
+            "greptime_mito_compaction_memory_rejected_total",
+            "number of compaction tasks rejected due to memory limit",
+            &[TYPE_LABEL]
+        ).unwrap();
 }

 // Query metrics.
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -1137,6 +1137,12 @@ impl ScanInput {
        self.files.len()
    }

+    /// Gets the file handle from a row group index. Panics if the index does not map into `files`.
+    pub(crate) fn file_from_index(&self, index: RowGroupIndex) -> &FileHandle {
+        let file_index = index.index - self.num_memtables();
+        &self.files[file_index]
+    }
+
    pub fn region_metadata(&self) -> &RegionMetadataRef {
        self.mapper.metadata()
    }
--- a/Show More
+++ b/Show More